mm/swapfile.c
  // SPDX-License-Identifier: GPL-2.0-only
  /*
   *  linux/mm/swapfile.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   *  Swap reorganised 29.12.95, Stephen Tweedie
   */
  #include <linux/mm.h>
  #include <linux/sched/mm.h>
  #include <linux/sched/task.h>
  #include <linux/hugetlb.h>
  #include <linux/mman.h>
  #include <linux/slab.h>
  #include <linux/kernel_stat.h>
  #include <linux/swap.h>
  #include <linux/vmalloc.h>
  #include <linux/pagemap.h>
  #include <linux/namei.h>
  #include <linux/shmem_fs.h>
  #include <linux/blkdev.h>
  #include <linux/random.h>
  #include <linux/writeback.h>
  #include <linux/proc_fs.h>
  #include <linux/seq_file.h>
  #include <linux/init.h>
  #include <linux/ksm.h>
  #include <linux/rmap.h>
  #include <linux/security.h>
  #include <linux/backing-dev.h>
  #include <linux/mutex.h>
  #include <linux/capability.h>
  #include <linux/syscalls.h>
  #include <linux/memcontrol.h>
  #include <linux/poll.h>
  #include <linux/oom.h>
  #include <linux/frontswap.h>
  #include <linux/swapfile.h>
  #include <linux/export.h>
  #include <linux/swap_slots.h>
  #include <linux/sort.h>
  #include <linux/completion.h>

  #include <asm/tlbflush.h>
  #include <linux/swapops.h>
  #include <linux/swap_cgroup.h>

  static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
  				 unsigned char);
  static void free_swap_count_continuations(struct swap_info_struct *);
  DEFINE_SPINLOCK(swap_lock);
  static unsigned int nr_swapfiles;
  atomic_long_t nr_swap_pages;
  /*
   * Some modules use swappable objects and may try to swap them out under
   * memory pressure (via the shrinker). Before doing so, they may wish to
   * check to see if any swap space is available.
   */
  EXPORT_SYMBOL_GPL(nr_swap_pages);
  /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
  long total_swap_pages;
  static int least_priority = -1;

  static const char Bad_file[] = "Bad swap file entry ";
  static const char Unused_file[] = "Unused swap file entry ";
  static const char Bad_offset[] = "Bad swap offset entry ";
  static const char Unused_offset[] = "Unused swap offset entry ";
  /*
   * all active swap_info_structs
   * protected with swap_lock, and ordered by priority.
   */
  PLIST_HEAD(swap_active_head);
  
  /*
   * all available (active, not full) swap_info_structs
   * protected with swap_avail_lock, ordered by priority.
   * This is used by get_swap_page() instead of swap_active_head
   * because swap_active_head includes all swap_info_structs,
   * but get_swap_page() doesn't need to look at full ones.
   * This uses its own lock instead of swap_lock because when a
   * swap_info_struct changes between not-full/full, it needs to
   * add/remove itself to/from this list, but the swap_info_struct->lock
   * is held and the locking order requires swap_lock to be taken
   * before any swap_info_struct->lock.
   */
  static struct plist_head *swap_avail_heads;
  static DEFINE_SPINLOCK(swap_avail_lock);

  struct swap_info_struct *swap_info[MAX_SWAPFILES];

  static DEFINE_MUTEX(swapon_mutex);

  static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
  /* Activity counter to indicate that a swapon or swapoff has occurred */
  static atomic_t proc_poll_event = ATOMIC_INIT(0);
  atomic_t nr_rotate_swap = ATOMIC_INIT(0);
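/* Return the swap_info_struct for @type, or NULL if @type is out of range. */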
  static struct swap_info_struct *swap_type_to_swap_info(int type)
  {
  	if (type >= MAX_SWAPFILES)
  		return NULL;
  	return READ_ONCE(swap_info[type]); /* rcu_dereference() */
  }
  static inline unsigned char swap_count(unsigned char ent)
  {
  	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
  }
  /* Reclaim the swap entry anyway if possible */
  #define TTRS_ANYWAY		0x1
  /*
   * Reclaim the swap entry if there are no more mappings of the
   * corresponding page
   */
  #define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
  #define TTRS_FULL		0x4
  /* returns 1 if swap entry is freed */
  static int __try_to_reclaim_swap(struct swap_info_struct *si,
  				 unsigned long offset, unsigned long flags)
  {
  	swp_entry_t entry = swp_entry(si->type, offset);
  	struct page *page;
  	int ret = 0;
  	page = find_get_page(swap_address_space(entry), offset);
  	if (!page)
  		return 0;
  	/*
	 * This function is called from scan_swap_map_slots(), which in turn
	 * can be reached from vmscan.c while reclaiming pages, so a page
	 * lock may already be held; use trylock to avoid deadlock. This is
	 * a special case: usual operations should use try_to_free_swap()
	 * with an explicit lock_page().
	 */
  	if (trylock_page(page)) {
  		if ((flags & TTRS_ANYWAY) ||
  		    ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
  		    ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
  			ret = try_to_free_swap(page);
  		unlock_page(page);
  	}
  	put_page(page);
  	return ret;
  }

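/* Helpers to walk the rbtree of swap extents in page order. */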
  static inline struct swap_extent *first_se(struct swap_info_struct *sis)
  {
  	struct rb_node *rb = rb_first(&sis->swap_extent_root);
  	return rb_entry(rb, struct swap_extent, rb_node);
  }
  
  static inline struct swap_extent *next_se(struct swap_extent *se)
  {
  	struct rb_node *rb = rb_next(&se->rb_node);
  	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
  }
  /*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
   */
  static int discard_swap(struct swap_info_struct *si)
  {
  	struct swap_extent *se;
  	sector_t start_block;
  	sector_t nr_blocks;
  	int err = 0;
  	/* Do not discard the swap header page! */
  	se = first_se(si);
  	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
  	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
  	if (nr_blocks) {
  		err = blkdev_issue_discard(si->bdev, start_block,
  				nr_blocks, GFP_KERNEL, 0);
  		if (err)
  			return err;
  		cond_resched();
  	}

  	for (se = next_se(se); se; se = next_se(se)) {
  		start_block = se->start_block << (PAGE_SHIFT - 9);
  		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
  
  		err = blkdev_issue_discard(si->bdev, start_block,
  				nr_blocks, GFP_KERNEL, 0);
  		if (err)
  			break;
  
  		cond_resched();
  	}
  	return err;		/* That will often be -EOPNOTSUPP */
  }
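/*
 * Find the swap extent that covers @offset, by walking the rbtree of
 * extents keyed by start_page.
 */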
  static struct swap_extent *
  offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
  {
  	struct swap_extent *se;
  	struct rb_node *rb;
  
  	rb = sis->swap_extent_root.rb_node;
  	while (rb) {
  		se = rb_entry(rb, struct swap_extent, rb_node);
  		if (offset < se->start_page)
  			rb = rb->rb_left;
  		else if (offset >= se->start_page + se->nr_pages)
  			rb = rb->rb_right;
  		else
  			return se;
  	}
  	/* It *must* be present */
  	BUG();
  }
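/* Translate a swap page to its starting 512-byte sector on the device. */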
  sector_t swap_page_sector(struct page *page)
  {
  	struct swap_info_struct *sis = page_swap_info(page);
  	struct swap_extent *se;
  	sector_t sector;
  	pgoff_t offset;
  
  	offset = __page_file_index(page);
  	se = offset_to_swap_extent(sis, offset);
  	sector = se->start_block + (offset - se->start_page);
  	return sector << (PAGE_SHIFT - 9);
  }
  /*
 * swap allocation tells the device that a cluster of swap can now be discarded,
   * to allow the swap device to optimize its wear-levelling.
   */
  static void discard_swap_cluster(struct swap_info_struct *si,
  				 pgoff_t start_page, pgoff_t nr_pages)
  {
  	struct swap_extent *se = offset_to_swap_extent(si, start_page);
  
  	while (nr_pages) {
  		pgoff_t offset = start_page - se->start_page;
  		sector_t start_block = se->start_block + offset;
  		sector_t nr_blocks = se->nr_pages - offset;
  
  		if (nr_blocks > nr_pages)
  			nr_blocks = nr_pages;
  		start_page += nr_blocks;
  		nr_pages -= nr_blocks;
  
  		start_block <<= PAGE_SHIFT - 9;
  		nr_blocks <<= PAGE_SHIFT - 9;
  		if (blkdev_issue_discard(si->bdev, start_block,
  					nr_blocks, GFP_NOIO, 0))
  			break;

  		se = next_se(se);
  	}
  }
  #ifdef CONFIG_THP_SWAP
  #define SWAPFILE_CLUSTER	HPAGE_PMD_NR
  
  #define swap_entry_size(size)	(size)
  #else
  #define SWAPFILE_CLUSTER	256
  
  /*
 * Define swap_entry_size() as a constant to let the compiler optimize
 * out some code if !CONFIG_THP_SWAP
   */
  #define swap_entry_size(size)	1
  #endif
  #define LATENCY_LIMIT		256
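/*
 * Accessors for swap_cluster_info: info->data holds either a usage count
 * or the index of the next cluster in a list, with state bits kept in
 * info->flags.
 */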
  static inline void cluster_set_flag(struct swap_cluster_info *info,
  	unsigned int flag)
  {
  	info->flags = flag;
  }
  
  static inline unsigned int cluster_count(struct swap_cluster_info *info)
  {
  	return info->data;
  }
  
  static inline void cluster_set_count(struct swap_cluster_info *info,
  				     unsigned int c)
  {
  	info->data = c;
  }
  
  static inline void cluster_set_count_flag(struct swap_cluster_info *info,
  					 unsigned int c, unsigned int f)
  {
  	info->flags = f;
  	info->data = c;
  }
  
  static inline unsigned int cluster_next(struct swap_cluster_info *info)
  {
  	return info->data;
  }
  
  static inline void cluster_set_next(struct swap_cluster_info *info,
  				    unsigned int n)
  {
  	info->data = n;
  }
  
  static inline void cluster_set_next_flag(struct swap_cluster_info *info,
  					 unsigned int n, unsigned int f)
  {
  	info->flags = f;
  	info->data = n;
  }
  
  static inline bool cluster_is_free(struct swap_cluster_info *info)
  {
  	return info->flags & CLUSTER_FLAG_FREE;
  }
  
  static inline bool cluster_is_null(struct swap_cluster_info *info)
  {
  	return info->flags & CLUSTER_FLAG_NEXT_NULL;
  }
  
  static inline void cluster_set_null(struct swap_cluster_info *info)
  {
  	info->flags = CLUSTER_FLAG_NEXT_NULL;
  	info->data = 0;
  }
  static inline bool cluster_is_huge(struct swap_cluster_info *info)
  {
  	if (IS_ENABLED(CONFIG_THP_SWAP))
  		return info->flags & CLUSTER_FLAG_HUGE;
  	return false;
  }
  
  static inline void cluster_clear_huge(struct swap_cluster_info *info)
  {
  	info->flags &= ~CLUSTER_FLAG_HUGE;
  }
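/*
 * Lock the cluster containing @offset when cluster-level locking is in
 * use (si->cluster_info is non-NULL); otherwise return NULL.
 */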
  static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
  						     unsigned long offset)
  {
  	struct swap_cluster_info *ci;
  
  	ci = si->cluster_info;
  	if (ci) {
  		ci += offset / SWAPFILE_CLUSTER;
  		spin_lock(&ci->lock);
  	}
  	return ci;
  }
  
  static inline void unlock_cluster(struct swap_cluster_info *ci)
  {
  	if (ci)
  		spin_unlock(&ci->lock);
  }
  /*
   * Determine the locking method in use for this device.  Return
   * swap_cluster_info if SSD-style cluster-based locking is in place.
   */
  static inline struct swap_cluster_info *lock_cluster_or_swap_info(
  		struct swap_info_struct *si, unsigned long offset)
  {
  	struct swap_cluster_info *ci;
  	/* Try to use fine-grained SSD-style locking if available: */
  	ci = lock_cluster(si, offset);
  	/* Otherwise, fall back to traditional, coarse locking: */
  	if (!ci)
  		spin_lock(&si->lock);
  
  	return ci;
  }
  
  static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
  					       struct swap_cluster_info *ci)
  {
  	if (ci)
  		unlock_cluster(ci);
  	else
  		spin_unlock(&si->lock);
  }
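/* Operations on lists of clusters, chained through the clusters' data fields. */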
  static inline bool cluster_list_empty(struct swap_cluster_list *list)
  {
  	return cluster_is_null(&list->head);
  }
  
  static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
  {
  	return cluster_next(&list->head);
  }
  
  static void cluster_list_init(struct swap_cluster_list *list)
  {
  	cluster_set_null(&list->head);
  	cluster_set_null(&list->tail);
  }
  
  static void cluster_list_add_tail(struct swap_cluster_list *list,
  				  struct swap_cluster_info *ci,
  				  unsigned int idx)
  {
  	if (cluster_list_empty(list)) {
  		cluster_set_next_flag(&list->head, idx, 0);
  		cluster_set_next_flag(&list->tail, idx, 0);
  	} else {
  		struct swap_cluster_info *ci_tail;
  		unsigned int tail = cluster_next(&list->tail);
  		/*
		 * Nested cluster lock, but both cluster locks are
		 * only acquired while holding swap_info_struct->lock
  		 */
  		ci_tail = ci + tail;
  		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
  		cluster_set_next(ci_tail, idx);
  		spin_unlock(&ci_tail->lock);
  		cluster_set_next_flag(&list->tail, idx, 0);
  	}
  }
  
  static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
  					   struct swap_cluster_info *ci)
  {
  	unsigned int idx;
  
  	idx = cluster_next(&list->head);
  	if (cluster_next(&list->tail) == idx) {
  		cluster_set_null(&list->head);
  		cluster_set_null(&list->tail);
  	} else
  		cluster_set_next_flag(&list->head,
  				      cluster_next(&ci[idx]), 0);
  
  	return idx;
  }
/* Add a cluster to the discard list and schedule the discard work */
  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
  		unsigned int idx)
  {
  	/*
	 * If scan_swap_map_slots() can't find a free cluster, it will check
	 * si->swap_map directly. To make sure a cluster that is being
	 * discarded isn't taken by scan_swap_map_slots(), mark its swap
	 * entries bad (occupied). They will be cleared after the discard.
  	 */
  	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
  			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
  	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
  
  	schedule_work(&si->discard_work);
  }
  static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
  {
  	struct swap_cluster_info *ci = si->cluster_info;
  
  	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
  	cluster_list_add_tail(&si->free_clusters, ci, idx);
  }
  /*
 * Actually do the scheduled discards. After a cluster discard is finished,
 * the cluster will be added to the free cluster list. The caller should
 * hold si->lock.
 */
  static void swap_do_scheduled_discard(struct swap_info_struct *si)
  {
  	struct swap_cluster_info *info, *ci;
  	unsigned int idx;
  
  	info = si->cluster_info;
  	while (!cluster_list_empty(&si->discard_clusters)) {
  		idx = cluster_list_del_first(&si->discard_clusters, info);
  		spin_unlock(&si->lock);
  
  		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
  				SWAPFILE_CLUSTER);
  
  		spin_lock(&si->lock);
  		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
  		__free_cluster(si, idx);
  		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
  				0, SWAPFILE_CLUSTER);
  		unlock_cluster(ci);
  	}
  }
  
  static void swap_discard_work(struct work_struct *work)
  {
  	struct swap_info_struct *si;
  
  	si = container_of(work, struct swap_info_struct, discard_work);
  
  	spin_lock(&si->lock);
  	swap_do_scheduled_discard(si);
  	spin_unlock(&si->lock);
  }
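/* percpu_ref release callback: the last user of the swap device is gone. */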
  static void swap_users_ref_free(struct percpu_ref *ref)
  {
  	struct swap_info_struct *si;
  
  	si = container_of(ref, struct swap_info_struct, users);
  	complete(&si->comp);
  }
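/* Take the first cluster off the free list; @idx must be that cluster. */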
  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
  {
  	struct swap_cluster_info *ci = si->cluster_info;
  
  	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
  	cluster_list_del_first(&si->free_clusters, ci);
  	cluster_set_count_flag(ci + idx, 0, 0);
  }
  
  static void free_cluster(struct swap_info_struct *si, unsigned long idx)
  {
  	struct swap_cluster_info *ci = si->cluster_info + idx;
  
  	VM_BUG_ON(cluster_count(ci) != 0);
	/*
	 * If the swap is discardable, schedule a discard of the cluster
	 * instead of freeing it immediately. The cluster will be freed
	 * after the discard.
	 */
  	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
  	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
  		swap_cluster_schedule_discard(si, idx);
  		return;
  	}
  
  	__free_cluster(si, idx);
  }
  /*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from the free cluster list and its usage counter will be increased.
   */
  static void inc_cluster_info_page(struct swap_info_struct *p,
  	struct swap_cluster_info *cluster_info, unsigned long page_nr)
  {
  	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
  
  	if (!cluster_info)
  		return;
  	if (cluster_is_free(&cluster_info[idx]))
  		alloc_cluster(p, idx);
  
  	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
  	cluster_set_count(&cluster_info[idx],
  		cluster_count(&cluster_info[idx]) + 1);
  }
  
  /*
 * The cluster corresponding to page_nr has its usage counter decreased by
 * one. If the counter becomes 0, meaning no page in the cluster is in use,
 * we can optionally discard the cluster and add it to the free cluster list.
   */
  static void dec_cluster_info_page(struct swap_info_struct *p,
  	struct swap_cluster_info *cluster_info, unsigned long page_nr)
  {
  	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
  
  	if (!cluster_info)
  		return;
  
  	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
  	cluster_set_count(&cluster_info[idx],
  		cluster_count(&cluster_info[idx]) - 1);
  	if (cluster_count(&cluster_info[idx]) == 0)
  		free_cluster(p, idx);
  }
  
  /*
 * It's possible for scan_swap_map_slots() to use a free cluster from the
 * middle of the free cluster list. Avoid such abuse to prevent list
 * corruption.
   */
  static bool
  scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
  	unsigned long offset)
  {
  	struct percpu_cluster *percpu_cluster;
  	bool conflict;
  	offset /= SWAPFILE_CLUSTER;
  	conflict = !cluster_list_empty(&si->free_clusters) &&
  		offset != cluster_list_first(&si->free_clusters) &&
  		cluster_is_free(&si->cluster_info[offset]);
  
  	if (!conflict)
  		return false;
  
  	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
  	cluster_set_null(&percpu_cluster->index);
  	return true;
  }
  
  /*
   * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
   * might involve allocating a new cluster for current CPU too.
   */
  static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
  	unsigned long *offset, unsigned long *scan_base)
  {
  	struct percpu_cluster *cluster;
  	struct swap_cluster_info *ci;
  	unsigned long tmp, max;
  
  new_cluster:
  	cluster = this_cpu_ptr(si->percpu_cluster);
  	if (cluster_is_null(&cluster->index)) {
  		if (!cluster_list_empty(&si->free_clusters)) {
  			cluster->index = si->free_clusters.head;
  			cluster->next = cluster_next(&cluster->index) *
  					SWAPFILE_CLUSTER;
  		} else if (!cluster_list_empty(&si->discard_clusters)) {
  			/*
			 * we don't have a free cluster but have some clusters
			 * being discarded: do the discard now and reclaim
			 * them, then reread cluster_next_cpu since we dropped
			 * si->lock
  			 */
  			swap_do_scheduled_discard(si);
  			*scan_base = this_cpu_read(*si->cluster_next_cpu);
  			*offset = *scan_base;
  			goto new_cluster;
  		} else
  			return false;
  	}
  	/*
	 * Other CPUs can use our cluster if they can't find a free one, so
	 * check whether there are still free entries in the cluster
  	 */
  	tmp = cluster->next;
  	max = min_t(unsigned long, si->max,
  		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
  	if (tmp < max) {
  		ci = lock_cluster(si, tmp);
  		while (tmp < max) {
  			if (!si->swap_map[tmp])
  				break;
  			tmp++;
  		}
  		unlock_cluster(ci);
  	}
  	if (tmp >= max) {
  		cluster_set_null(&cluster->index);
  		goto new_cluster;
  	}
  	cluster->next = tmp + 1;
  	*offset = tmp;
  	*scan_base = tmp;
  	return true;
  }
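/* Remove @p from the per-node lists of available swap devices. */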
  static void __del_from_avail_list(struct swap_info_struct *p)
  {
  	int nid;
  
  	for_each_node(nid)
  		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
  }
  
  static void del_from_avail_list(struct swap_info_struct *p)
  {
  	spin_lock(&swap_avail_lock);
  	__del_from_avail_list(p);
  	spin_unlock(&swap_avail_lock);
  }
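/*
 * Account for @nr_entries entries allocated at @offset: update the
 * lowest/highest free offsets and drop the device from the avail lists
 * once it is full.
 */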
  static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
  			     unsigned int nr_entries)
  {
  	unsigned int end = offset + nr_entries - 1;
  
  	if (offset == si->lowest_bit)
  		si->lowest_bit += nr_entries;
  	if (end == si->highest_bit)
  		WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
  	si->inuse_pages += nr_entries;
  	if (si->inuse_pages == si->pages) {
  		si->lowest_bit = si->max;
  		si->highest_bit = 0;
  		del_from_avail_list(si);
  	}
  }
  static void add_to_avail_list(struct swap_info_struct *p)
  {
  	int nid;
  
  	spin_lock(&swap_avail_lock);
  	for_each_node(nid) {
  		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
  		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
  	}
  	spin_unlock(&swap_avail_lock);
  }
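/*
 * Return @nr_entries entries starting at @offset to the free pool,
 * notifying the block driver and invalidating frontswap and swap cache
 * shadow entries for each freed slot.
 */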
  static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
  			    unsigned int nr_entries)
  {
  	unsigned long begin = offset;
  	unsigned long end = offset + nr_entries - 1;
  	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
  
  	if (offset < si->lowest_bit)
  		si->lowest_bit = offset;
  	if (end > si->highest_bit) {
  		bool was_full = !si->highest_bit;
  		WRITE_ONCE(si->highest_bit, end);
  		if (was_full && (si->flags & SWP_WRITEOK))
  			add_to_avail_list(si);
  	}
  	atomic_long_add(nr_entries, &nr_swap_pages);
  	si->inuse_pages -= nr_entries;
  	if (si->flags & SWP_BLKDEV)
  		swap_slot_free_notify =
  			si->bdev->bd_disk->fops->swap_slot_free_notify;
  	else
  		swap_slot_free_notify = NULL;
  	while (offset <= end) {
  		arch_swap_invalidate_page(si->type, offset);
  		frontswap_invalidate_page(si->type, offset);
  		if (swap_slot_free_notify)
  			swap_slot_free_notify(si->bdev, offset);
  		offset++;
  	}
  	clear_shadow_from_swap_cache(si->type, begin, end);
  }
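/* Record the next allocation scan position: per-CPU for SSDs, global otherwise. */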
  static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
  {
  	unsigned long prev;
  
  	if (!(si->flags & SWP_SOLIDSTATE)) {
  		si->cluster_next = next;
  		return;
  	}
  
  	prev = this_cpu_read(*si->cluster_next_cpu);
  	/*
	 * When crossing a swap-address-space-size aligned chunk, choose
	 * another chunk randomly to avoid lock contention on the swap
	 * address space if possible.
  	 */
  	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
  	    (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
  		/* No free swap slots available */
  		if (si->highest_bit <= si->lowest_bit)
  			return;
  		next = si->lowest_bit +
  			prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
  		next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
  		next = max_t(unsigned int, next, si->lowest_bit);
  	}
  	this_cpu_write(*si->cluster_next_cpu, next);
  }
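/*
 * Scan the swap map for free slots, allocating up to @nr entries with
 * count @usage into @slots[].  Returns the number of slots allocated.
 * Called and returns with si->lock held, though the lock may be dropped
 * internally.
 */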
  static int scan_swap_map_slots(struct swap_info_struct *si,
  			       unsigned char usage, int nr,
  			       swp_entry_t slots[])
  {
  	struct swap_cluster_info *ci;
  	unsigned long offset;
  	unsigned long scan_base;
  	unsigned long last_in_cluster = 0;
  	int latency_ration = LATENCY_LIMIT;
  	int n_ret = 0;
  	bool scanned_many = false;

  	/*
  	 * We try to cluster swap pages by allocating them sequentially
  	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
  	 * way, however, we resort to first-free allocation, starting
  	 * a new cluster.  This prevents us from scattering swap pages
  	 * all over the entire swap partition, so that we reduce
  	 * overall disk seek times between swap pages.  -- sct
  	 * But we do now try to find an empty cluster.  -Andrea
  	 * And we let swap pages go all over an SSD partition.  Hugh
  	 */
  	si->flags += SWP_SCANNING;
  	/*
  	 * Use percpu scan base for SSD to reduce lock contention on
  	 * cluster and swap cache.  For HDD, sequential access is more
  	 * important.
  	 */
  	if (si->flags & SWP_SOLIDSTATE)
  		scan_base = this_cpu_read(*si->cluster_next_cpu);
  	else
  		scan_base = si->cluster_next;
  	offset = scan_base;

  	/* SSD algorithm */
  	if (si->cluster_info) {
  		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
  			goto scan;
  	} else if (unlikely(!si->cluster_nr--)) {
  		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
  			si->cluster_nr = SWAPFILE_CLUSTER - 1;
  			goto checks;
  		}

  		spin_unlock(&si->lock);

  		/*
  		 * If seek is expensive, start searching for new cluster from
  		 * start of partition, to minimize the span of allocated swap.
  		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
  		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
  		 */
  		scan_base = offset = si->lowest_bit;
  		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
  
  		/* Locate the first empty (unaligned) cluster */
  		for (; last_in_cluster <= si->highest_bit; offset++) {
  			if (si->swap_map[offset])
  				last_in_cluster = offset + SWAPFILE_CLUSTER;
  			else if (offset == last_in_cluster) {
  				spin_lock(&si->lock);
  				offset -= SWAPFILE_CLUSTER - 1;
  				si->cluster_next = offset;
  				si->cluster_nr = SWAPFILE_CLUSTER - 1;
  				goto checks;
  			}
  			if (unlikely(--latency_ration < 0)) {
  				cond_resched();
  				latency_ration = LATENCY_LIMIT;
  			}
  		}
  
  		offset = scan_base;
  		spin_lock(&si->lock);
  		si->cluster_nr = SWAPFILE_CLUSTER - 1;
  	}

  checks:
  	if (si->cluster_info) {
  		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
			/* take a break if we already got some slots */
  			if (n_ret)
  				goto done;
  			if (!scan_swap_map_try_ssd_cluster(si, &offset,
  							&scan_base))
  				goto scan;
  		}
  	}
  	if (!(si->flags & SWP_WRITEOK))
  		goto no_page;
  	if (!si->highest_bit)
  		goto no_page;
  	if (offset > si->highest_bit)
  		scan_base = offset = si->lowest_bit;

  	ci = lock_cluster(si, offset);
  	/* reuse swap entry of cache-only swap if not busy. */
  	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
  		int swap_was_freed;
  		unlock_cluster(ci);
  		spin_unlock(&si->lock);
  		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
  		spin_lock(&si->lock);
  		/* entry was freed successfully, try to use this again */
  		if (swap_was_freed)
  			goto checks;
  		goto scan; /* check next one */
  	}
  	if (si->swap_map[offset]) {
  		unlock_cluster(ci);
  		if (!n_ret)
  			goto scan;
  		else
  			goto done;
  	}
  	WRITE_ONCE(si->swap_map[offset], usage);
  	inc_cluster_info_page(si, si->cluster_info, offset);
  	unlock_cluster(ci);

  	swap_range_alloc(si, offset, 1);
  	slots[n_ret++] = swp_entry(si->type, offset);
  
	/* got enough slots or reached max slots? */
  	if ((n_ret == nr) || (offset >= si->highest_bit))
  		goto done;
  
  	/* search for next available slot */
  
  	/* time to take a break? */
  	if (unlikely(--latency_ration < 0)) {
  		if (n_ret)
  			goto done;
  		spin_unlock(&si->lock);
  		cond_resched();
  		spin_lock(&si->lock);
  		latency_ration = LATENCY_LIMIT;
  	}
  
  	/* try to get more slots in cluster */
  	if (si->cluster_info) {
  		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
  			goto checks;
  	} else if (si->cluster_nr && !si->swap_map[++offset]) {
  		/* non-ssd case, still more slots in cluster? */
  		--si->cluster_nr;
  		goto checks;
  	}

  	/*
	 * Even if there are no free clusters available (fragmented),
  	 * try to scan a little more quickly with lock held unless we
  	 * have scanned too many slots already.
  	 */
  	if (!scanned_many) {
  		unsigned long scan_limit;
  
  		if (offset < scan_base)
  			scan_limit = scan_base;
  		else
  			scan_limit = si->highest_bit;
  		for (; offset <= scan_limit && --latency_ration > 0;
  		     offset++) {
  			if (!si->swap_map[offset])
  				goto checks;
  		}
  	}
  done:
  	set_cluster_next(si, offset + 1);
  	si->flags -= SWP_SCANNING;
  	return n_ret;

  scan:
  	spin_unlock(&si->lock);
  	while (++offset <= READ_ONCE(si->highest_bit)) {
  		if (data_race(!si->swap_map[offset])) {
  			spin_lock(&si->lock);
  			goto checks;
  		}
  		if (vm_swap_full() &&
  		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
  			spin_lock(&si->lock);
  			goto checks;
  		}
  		if (unlikely(--latency_ration < 0)) {
  			cond_resched();
  			latency_ration = LATENCY_LIMIT;
  			scanned_many = true;
  		}
  	}
  	offset = si->lowest_bit;
  	while (offset < scan_base) {
  		if (data_race(!si->swap_map[offset])) {
  			spin_lock(&si->lock);
  			goto checks;
  		}
  		if (vm_swap_full() &&
  		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
  			spin_lock(&si->lock);
  			goto checks;
  		}
  		if (unlikely(--latency_ration < 0)) {
  			cond_resched();
  			latency_ration = LATENCY_LIMIT;
  			scanned_many = true;
  		}
  		offset++;
  	}
  	spin_lock(&si->lock);
  
  no_page:
  	si->flags -= SWP_SCANNING;
  	return n_ret;
  }
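/* Allocate a whole free cluster to back one huge (PMD-sized) swap entry. */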
  static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
  {
  	unsigned long idx;
  	struct swap_cluster_info *ci;
  	unsigned long offset;

  	/*
  	 * Should not even be attempting cluster allocations when huge
  	 * page swap is disabled.  Warn and fail the allocation.
  	 */
  	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
  		VM_WARN_ON_ONCE(1);
  		return 0;
  	}
  	if (cluster_list_empty(&si->free_clusters))
  		return 0;
  
  	idx = cluster_list_first(&si->free_clusters);
  	offset = idx * SWAPFILE_CLUSTER;
  	ci = lock_cluster(si, offset);
  	alloc_cluster(si, idx);
  	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);

  	memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
  	unlock_cluster(ci);
  	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
  	*slot = swp_entry(si->type, offset);
  
  	return 1;
  }
  
  static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
  {
  	unsigned long offset = idx * SWAPFILE_CLUSTER;
  	struct swap_cluster_info *ci;
  
  	ci = lock_cluster(si, offset);
  	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
  	cluster_set_count_flag(ci, 0, 0);
  	free_cluster(si, idx);
  	unlock_cluster(ci);
  	swap_range_free(si, offset, SWAPFILE_CLUSTER);
  }

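/*
 * Allocate up to @n_goal swap entries of @entry_size pages each, trying
 * the available swap devices in priority order.  Returns the number of
 * entries stored in swp_entries[].
 */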
  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
  {
  	unsigned long size = swap_entry_size(entry_size);
  	struct swap_info_struct *si, *next;
  	long avail_pgs;
  	int n_ret = 0;
  	int node;

  	/* Only single cluster request supported */
  	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);

  	spin_lock(&swap_avail_lock);
  	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
  	if (avail_pgs <= 0) {
  		spin_unlock(&swap_avail_lock);
  		goto noswap;
  	}

  	n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);

  	atomic_long_sub(n_goal * size, &nr_swap_pages);

  start_over:
  	node = numa_node_id();
  	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
  		/* requeue si to after same-priority siblings */
  		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
  		spin_unlock(&swap_avail_lock);
  		spin_lock(&si->lock);
  		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
  			spin_lock(&swap_avail_lock);
  			if (plist_node_empty(&si->avail_lists[node])) {
  				spin_unlock(&si->lock);
  				goto nextsi;
  			}
  			WARN(!si->highest_bit,
  			     "swap_info %d in list but !highest_bit
  ",
  			     si->type);
  			WARN(!(si->flags & SWP_WRITEOK),
  			     "swap_info %d in list but !SWP_WRITEOK
  ",
  			     si->type);
  			__del_from_avail_list(si);
  			spin_unlock(&si->lock);
  			goto nextsi;
  		}
  		if (size == SWAPFILE_CLUSTER) {
  			if (si->flags & SWP_BLKDEV)
  				n_ret = swap_alloc_cluster(si, swp_entries);
  		} else
  			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
  						    n_goal, swp_entries);
  		spin_unlock(&si->lock);
  		if (n_ret || size == SWAPFILE_CLUSTER)
  			goto check_out;
  		pr_debug("scan_swap_map of si %d failed to find offset
  ",
  			si->type);
  		spin_lock(&swap_avail_lock);
  nextsi:
  		/*
  		 * if we got here, it's likely that si was almost full before,
  		 * and since scan_swap_map_slots() can drop the si->lock,
  		 * multiple callers probably all tried to get a page from the
  		 * same si and it filled up before we could get one; or, the si
  		 * filled up between us dropping swap_avail_lock and taking
  		 * si->lock. Since we dropped the swap_avail_lock, the
  		 * swap_avail_head list may have been modified; so if next is
  		 * still in the swap_avail_head list then try it, otherwise
  		 * start over if we have not gotten any slots.
  		 */
  		if (plist_node_empty(&next->avail_lists[node]))
  			goto start_over;
  	}

  	spin_unlock(&swap_avail_lock);
  check_out:
  	if (n_ret < n_goal)
  		atomic_long_add((long)(n_goal - n_ret) * size,
  				&nr_swap_pages);
  noswap:
  	return n_ret;
  }
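/*
 * Return the swap_info_struct for @entry after basic validity checks, or
 * NULL (with a diagnostic) for a bad entry.
 */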
  static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
  {
  	struct swap_info_struct *p;
  	unsigned long offset;
  
  	if (!entry.val)
  		goto out;
  	p = swp_swap_info(entry);
  	if (!p)
  		goto bad_nofile;
  	if (data_race(!(p->flags & SWP_USED)))
  		goto bad_device;
  	offset = swp_offset(entry);
  	if (offset >= p->max)
  		goto bad_offset;
  	return p;
  bad_offset:
  	pr_err("%s: %s%08lx
  ", __func__, Bad_offset, entry.val);
  	goto out;
  bad_device:
  	pr_err("%s: %s%08lx
  ", __func__, Unused_file, entry.val);
  	goto out;
  bad_nofile:
  	pr_err("%s: %s%08lx
  ", __func__, Bad_file, entry.val);
  out:
  	return NULL;
  }

  static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
  {
  	struct swap_info_struct *p;
  
  	p = __swap_info_get(entry);
  	if (!p)
  		goto out;
  	if (data_race(!p->swap_map[swp_offset(entry)]))
  		goto bad_free;
  	return p;
  
  bad_free:
  	pr_err("%s: %s%08lx
  ", __func__, Unused_offset, entry.val);
  out:
  	return NULL;
  }
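/* Like _swap_info_get(), but also take the device's lock on success. */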
  static struct swap_info_struct *swap_info_get(swp_entry_t entry)
  {
  	struct swap_info_struct *p;
  
  	p = _swap_info_get(entry);
  	if (p)
  		spin_lock(&p->lock);
  	return p;
  }
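/*
 * Get the swap device for @entry while holding its lock, releasing the
 * lock of the previously used device @q if @entry is on a different one.
 */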
  static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
  					struct swap_info_struct *q)
  {
  	struct swap_info_struct *p;
  
  	p = _swap_info_get(entry);
  
  	if (p != q) {
  		if (q != NULL)
  			spin_unlock(&q->lock);
  		if (p != NULL)
  			spin_lock(&p->lock);
  	}
  	return p;
  }
b32d5f32b   Huang Ying   mm/swapfile.c: ad...
1149
1150
1151
  static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
  					      unsigned long offset,
  					      unsigned char usage)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1152
  {
8d69aaee8   Hugh Dickins   swap_info: swap_m...
1153
1154
  	unsigned char count;
  	unsigned char has_cache;
235b62176   Huang, Ying   mm/swap: add clus...
1155

253d553ba   Hugh Dickins   swap_info: SWAP_H...
1156
  	count = p->swap_map[offset];
235b62176   Huang, Ying   mm/swap: add clus...
1157

253d553ba   Hugh Dickins   swap_info: SWAP_H...
1158
1159
  	has_cache = count & SWAP_HAS_CACHE;
  	count &= ~SWAP_HAS_CACHE;
355cfa73d   KAMEZAWA Hiroyuki   mm: modify swap_m...
1160

253d553ba   Hugh Dickins   swap_info: SWAP_H...
1161
  	if (usage == SWAP_HAS_CACHE) {
355cfa73d   KAMEZAWA Hiroyuki   mm: modify swap_m...
1162
  		VM_BUG_ON(!has_cache);
253d553ba   Hugh Dickins   swap_info: SWAP_H...
1163
  		has_cache = 0;
aaa468653   Hugh Dickins   swap_info: note S...
1164
1165
1166
1167
1168
1169
  	} else if (count == SWAP_MAP_SHMEM) {
  		/*
  		 * Or we could insist on shmem.c using a special
  		 * swap_shmem_free() and free_shmem_swap_and_cache()...
  		 */
  		count = 0;
570a335b8   Hugh Dickins   swap_info: swap c...
1170
1171
1172
1173
1174
1175
1176
1177
1178
  	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
  		if (count == COUNT_CONTINUED) {
  			if (swap_count_continued(p, offset, count))
  				count = SWAP_MAP_MAX | COUNT_CONTINUED;
  			else
  				count = SWAP_MAP_MAX;
  		} else
  			count--;
  	}
253d553ba   Hugh Dickins   swap_info: SWAP_H...
1179

253d553ba   Hugh Dickins   swap_info: SWAP_H...
1180
  	usage = count | has_cache;
a449bf58e   Qian Cai   mm/swapfile: fix ...
1181
1182
1183
1184
  	if (usage)
  		WRITE_ONCE(p->swap_map[offset], usage);
  	else
  		WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
7c00bafee   Tim Chen   mm/swap: free swa...
1185

b32d5f32b   Huang Ying   mm/swapfile.c: ad...
1186
1187
  	return usage;
  }
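  /*
   * Illustrative note (not part of the original source): each swap_map
   * byte packs a reference count in its low bits together with the
   * SWAP_HAS_CACHE flag.  Assuming the usual <linux/swap.h> constants,
   * a byte of (SWAP_HAS_CACHE | 1) means one map reference plus the
   * swap cache.  __swap_entry_free_locked(p, offset, 1) on such a slot
   * leaves SWAP_HAS_CACHE and returns it (nonzero, so nothing is freed
   * yet); a later free of SWAP_HAS_CACHE drops the last reference,
   * returns 0, and the caller then hands the slot to free_swap_slot().
   */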
eb085574a   Huang Ying   mm, swap: fix rac...
1188
1189
1190
1191
1192
1193
  /*
   * Check whether swap entry is valid in the swap device.  If so,
   * return pointer to swap_info_struct, and keep the swap entry valid
   * by preventing the swap device from being swapped off, until
   * put_swap_device() is called.  Otherwise return NULL.
   *
eb085574a   Huang Ying   mm, swap: fix rac...
1194
   * Notice that swapoff or swapoff+swapon can still happen before the
63d8620ec   Miaohe Lin   mm/swapfile: use ...
1195
1196
1197
1198
1199
   * percpu_ref_tryget_live() in get_swap_device() or after the
   * percpu_ref_put() in put_swap_device() if there isn't any other way
   * to prevent swapoff, such as page lock, page table lock, etc.  The
   * caller must be prepared for that.  For example, the following
   * situation is possible.
eb085574a   Huang Ying   mm, swap: fix rac...
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
   *
   *   CPU1				CPU2
   *   do_swap_page()
   *     ...				swapoff+swapon
   *     __read_swap_cache_async()
   *       swapcache_prepare()
   *         __swap_duplicate()
   *           // check swap_map
   *     // verify PTE not changed
   *
   * In __swap_duplicate(), the swap_map needs to be checked before
   * being changed, partly because the specified swap entry may be for
   * another swap device which has been swapped off.  And in
   * do_swap_page(), after the page is read from the swap device, the
   * PTE is verified, under the page table lock, not to have changed,
   * which detects whether the swap device has been swapped off or
   * swapped off and on again.
   */
  struct swap_info_struct *get_swap_device(swp_entry_t entry)
  {
  	struct swap_info_struct *si;
  	unsigned long offset;
  
  	if (!entry.val)
  		goto out;
  	si = swp_swap_info(entry);
  	if (!si)
  		goto bad_nofile;
63d8620ec   Miaohe Lin   mm/swapfile: use ...
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
  	if (!percpu_ref_tryget_live(&si->users))
  		goto out;
  	/*
  	 * Guarantee the si->users are checked before accessing other
  	 * fields of swap_info_struct.
  	 *
  	 * Paired with the spin_unlock() after setup_swap_info() in
  	 * enable_swap_info().
  	 */
  	smp_rmb();
eb085574a   Huang Ying   mm, swap: fix rac...
1237
1238
  	offset = swp_offset(entry);
  	if (offset >= si->max)
63d8620ec   Miaohe Lin   mm/swapfile: use ...
1239
  		goto put_out;
eb085574a   Huang Ying   mm, swap: fix rac...
1240
1241
1242
1243
1244
1245
1246
  
  	return si;
  bad_nofile:
  	pr_err("%s: %s%08lx
  ", __func__, Bad_file, entry.val);
  out:
  	return NULL;
63d8620ec   Miaohe Lin   mm/swapfile: use ...
1247
1248
  put_out:
  	percpu_ref_put(&si->users);
eb085574a   Huang Ying   mm, swap: fix rac...
1249
1250
  	return NULL;
  }
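  /*
   * Minimal usage sketch (illustration only; swap_map_peek() is a
   * hypothetical helper, not part of this file).  It mirrors what
   * __swap_count() below does: pin the device, read the map, unpin.
   */
  static inline int swap_map_peek(swp_entry_t entry)
  {
  	struct swap_info_struct *si;
  	int count = 0;
  
  	si = get_swap_device(entry);
  	if (si) {
  		count = swap_count(si->swap_map[swp_offset(entry)]);
  		put_swap_device(si);
  	}
  	return count;
  }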
b32d5f32b   Huang Ying   mm/swapfile.c: ad...
1251
  static unsigned char __swap_entry_free(struct swap_info_struct *p,
33e16272f   Wei Yang   mm/swapfile.c: __...
1252
  				       swp_entry_t entry)
b32d5f32b   Huang Ying   mm/swapfile.c: ad...
1253
1254
1255
  {
  	struct swap_cluster_info *ci;
  	unsigned long offset = swp_offset(entry);
33e16272f   Wei Yang   mm/swapfile.c: __...
1256
  	unsigned char usage;
b32d5f32b   Huang Ying   mm/swapfile.c: ad...
1257
1258
  
  	ci = lock_cluster_or_swap_info(p, offset);
33e16272f   Wei Yang   mm/swapfile.c: __...
1259
  	usage = __swap_entry_free_locked(p, offset, 1);
7c00bafee   Tim Chen   mm/swap: free swa...
1260
  	unlock_cluster_or_swap_info(p, ci);
10e364da1   Huang Ying   mm/swapfile.c: ca...
1261
1262
  	if (!usage)
  		free_swap_slot(entry);
7c00bafee   Tim Chen   mm/swap: free swa...
1263
1264
1265
  
  	return usage;
  }
355cfa73d   KAMEZAWA Hiroyuki   mm: modify swap_m...
1266

7c00bafee   Tim Chen   mm/swap: free swa...
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
  static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
  {
  	struct swap_cluster_info *ci;
  	unsigned long offset = swp_offset(entry);
  	unsigned char count;
  
  	ci = lock_cluster(p, offset);
  	count = p->swap_map[offset];
  	VM_BUG_ON(count != SWAP_HAS_CACHE);
  	p->swap_map[offset] = 0;
  	dec_cluster_info_page(p, p->cluster_info, offset);
235b62176   Huang, Ying   mm/swap: add clus...
1278
  	unlock_cluster(ci);
38d8b4e6b   Huang Ying   mm, THP, swap: de...
1279
1280
  	mem_cgroup_uncharge_swap(entry, 1);
  	swap_range_free(p, offset, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1281
1282
1283
  }
  
  /*
2de1a7e40   Seth Jennings   mm/swapfile.c: fi...
1284
   * Caller has made sure that the swap device corresponding to entry
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1285
1286
1287
1288
   * is still around or has not been recycled.
   */
  void swap_free(swp_entry_t entry)
  {
73c34b6ac   Hugh Dickins   swap_info: miscel...
1289
  	struct swap_info_struct *p;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1290

235b62176   Huang, Ying   mm/swap: add clus...
1291
  	p = _swap_info_get(entry);
10e364da1   Huang Ying   mm/swapfile.c: ca...
1292
  	if (p)
33e16272f   Wei Yang   mm/swapfile.c: __...
1293
  		__swap_entry_free(p, entry);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1294
1295
1296
  }
  
  /*
cb4b86ba4   KAMEZAWA Hiroyuki   mm: add swap cach...
1297
1298
   * Called after dropping swapcache to decrease refcnt to swap entries.
   */
a448f2d07   Huang Ying   mm/swapfile.c: un...
1299
  void put_swap_page(struct page *page, swp_entry_t entry)
38d8b4e6b   Huang Ying   mm, THP, swap: de...
1300
1301
1302
1303
1304
1305
  {
  	unsigned long offset = swp_offset(entry);
  	unsigned long idx = offset / SWAPFILE_CLUSTER;
  	struct swap_cluster_info *ci;
  	struct swap_info_struct *si;
  	unsigned char *map;
a3aea839e   Huang Ying   mm, THP, swap: su...
1306
1307
  	unsigned int i, free_entries = 0;
  	unsigned char val;
6c357848b   Matthew Wilcox (Oracle)   mm: replace hpage...
1308
  	int size = swap_entry_size(thp_nr_pages(page));
fe5266d5d   Huang Ying   mm/swapfile.c: re...
1309

a3aea839e   Huang Ying   mm, THP, swap: su...
1310
  	si = _swap_info_get(entry);
38d8b4e6b   Huang Ying   mm, THP, swap: de...
1311
1312
  	if (!si)
  		return;
c2343d276   Huang Ying   mm/swapfile.c: pu...
1313
  	ci = lock_cluster_or_swap_info(si, offset);
a448f2d07   Huang Ying   mm/swapfile.c: un...
1314
  	if (size == SWAPFILE_CLUSTER) {
a448f2d07   Huang Ying   mm/swapfile.c: un...
1315
1316
1317
1318
1319
1320
1321
1322
  		VM_BUG_ON(!cluster_is_huge(ci));
  		map = si->swap_map + offset;
  		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
  			val = map[i];
  			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
  			if (val == SWAP_HAS_CACHE)
  				free_entries++;
  		}
a448f2d07   Huang Ying   mm/swapfile.c: un...
1323
  		cluster_clear_huge(ci);
a448f2d07   Huang Ying   mm/swapfile.c: un...
1324
  		if (free_entries == SWAPFILE_CLUSTER) {
c2343d276   Huang Ying   mm/swapfile.c: pu...
1325
  			unlock_cluster_or_swap_info(si, ci);
a448f2d07   Huang Ying   mm/swapfile.c: un...
1326
  			spin_lock(&si->lock);
a448f2d07   Huang Ying   mm/swapfile.c: un...
1327
1328
1329
1330
1331
1332
  			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
  			swap_free_cluster(si, idx);
  			spin_unlock(&si->lock);
  			return;
  		}
  	}
c2343d276   Huang Ying   mm/swapfile.c: pu...
1333
1334
1335
1336
1337
1338
1339
  	for (i = 0; i < size; i++, entry.val++) {
  		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
  			unlock_cluster_or_swap_info(si, ci);
  			free_swap_slot(entry);
  			if (i == size - 1)
  				return;
  			lock_cluster_or_swap_info(si, offset);
a3aea839e   Huang Ying   mm, THP, swap: su...
1340
1341
  		}
  	}
c2343d276   Huang Ying   mm/swapfile.c: pu...
1342
  	unlock_cluster_or_swap_info(si, ci);
38d8b4e6b   Huang Ying   mm, THP, swap: de...
1343
  }
59807685a   Huang Ying   mm, THP, swap: su...
1344

fe5266d5d   Huang Ying   mm/swapfile.c: re...
1345
  #ifdef CONFIG_THP_SWAP
59807685a   Huang Ying   mm, THP, swap: su...
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
  int split_swap_cluster(swp_entry_t entry)
  {
  	struct swap_info_struct *si;
  	struct swap_cluster_info *ci;
  	unsigned long offset = swp_offset(entry);
  
  	si = _swap_info_get(entry);
  	if (!si)
  		return -EBUSY;
  	ci = lock_cluster(si, offset);
  	cluster_clear_huge(ci);
  	unlock_cluster(ci);
  	return 0;
  }
fe5266d5d   Huang Ying   mm/swapfile.c: re...
1360
  #endif
38d8b4e6b   Huang Ying   mm, THP, swap: de...
1361

155b5f88e   Huang Ying   mm/swapfile.c: so...
1362
1363
1364
1365
1366
1367
  static int swp_entry_cmp(const void *ent1, const void *ent2)
  {
  	const swp_entry_t *e1 = ent1, *e2 = ent2;
  
  	return (int)swp_type(*e1) - (int)swp_type(*e2);
  }
7c00bafee   Tim Chen   mm/swap: free swa...
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
  void swapcache_free_entries(swp_entry_t *entries, int n)
  {
  	struct swap_info_struct *p, *prev;
  	int i;
  
  	if (n <= 0)
  		return;
  
  	prev = NULL;
  	p = NULL;
155b5f88e   Huang Ying   mm/swapfile.c: so...
1378
1379
1380
1381
1382
1383
1384
1385
  
  	/*
  	 * Sort swap entries by swap device, so each lock is only taken once.
  	 * The nr_swapfiles check is racy rather than exact, but the overhead
  	 * of sort() is so low that it isn't worth optimizing further.
  	 */
  	if (nr_swapfiles > 1)
  		sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
7c00bafee   Tim Chen   mm/swap: free swa...
1386
1387
1388
1389
  	for (i = 0; i < n; ++i) {
  		p = swap_info_get_cont(entries[i], prev);
  		if (p)
  			swap_entry_free(p, entries[i]);
7c00bafee   Tim Chen   mm/swap: free swa...
1390
1391
  		prev = p;
  	}
235b62176   Huang, Ying   mm/swap: add clus...
1392
  	if (p)
7c00bafee   Tim Chen   mm/swap: free swa...
1393
  		spin_unlock(&p->lock);
cb4b86ba4   KAMEZAWA Hiroyuki   mm: add swap cach...
1394
1395
1396
  }
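  /*
   * Example of the sort above (illustration only): entries from swap
   * devices {1, 0, 1, 0} become {0, 0, 1, 1}, so swap_info_get_cont()
   * takes each si->lock once instead of four times.
   */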
  
  /*
c475a8ab6   Hugh Dickins   [PATCH] can_share...
1397
   * How many references to page are currently swapped out?
570a335b8   Hugh Dickins   swap_info: swap c...
1398
1399
   * This does not give an exact answer when swap count is continued,
   * but does include the high COUNT_CONTINUED flag to allow for that.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1400
   */
bde05d1cc   Hugh Dickins   shmem: replace pa...
1401
  int page_swapcount(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1402
  {
c475a8ab6   Hugh Dickins   [PATCH] can_share...
1403
1404
  	int count = 0;
  	struct swap_info_struct *p;
235b62176   Huang, Ying   mm/swap: add clus...
1405
  	struct swap_cluster_info *ci;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1406
  	swp_entry_t entry;
235b62176   Huang, Ying   mm/swap: add clus...
1407
  	unsigned long offset;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1408

4c21e2f24   Hugh Dickins   [PATCH] mm: split...
1409
  	entry.val = page_private(page);
235b62176   Huang, Ying   mm/swap: add clus...
1410
  	p = _swap_info_get(entry);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1411
  	if (p) {
235b62176   Huang, Ying   mm/swap: add clus...
1412
1413
1414
1415
  		offset = swp_offset(entry);
  		ci = lock_cluster_or_swap_info(p, offset);
  		count = swap_count(p->swap_map[offset]);
  		unlock_cluster_or_swap_info(p, ci);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1416
  	}
c475a8ab6   Hugh Dickins   [PATCH] can_share...
1417
  	return count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1418
  }
eb085574a   Huang Ying   mm, swap: fix rac...
1419
  int __swap_count(swp_entry_t entry)
aa8d22a11   Minchan Kim   mm: swap: SWP_SYN...
1420
  {
eb085574a   Huang Ying   mm, swap: fix rac...
1421
  	struct swap_info_struct *si;
aa8d22a11   Minchan Kim   mm: swap: SWP_SYN...
1422
  	pgoff_t offset = swp_offset(entry);
eb085574a   Huang Ying   mm, swap: fix rac...
1423
  	int count = 0;
aa8d22a11   Minchan Kim   mm: swap: SWP_SYN...
1424

eb085574a   Huang Ying   mm, swap: fix rac...
1425
1426
1427
1428
1429
1430
  	si = get_swap_device(entry);
  	if (si) {
  		count = swap_count(si->swap_map[offset]);
  		put_swap_device(si);
  	}
  	return count;
aa8d22a11   Minchan Kim   mm: swap: SWP_SYN...
1431
  }
322b8afe4   Huang Ying   mm, swap: Fix a r...
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
  static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
  {
  	int count = 0;
  	pgoff_t offset = swp_offset(entry);
  	struct swap_cluster_info *ci;
  
  	ci = lock_cluster_or_swap_info(si, offset);
  	count = swap_count(si->swap_map[offset]);
  	unlock_cluster_or_swap_info(si, ci);
  	return count;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1443
  /*
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1444
   * How many references to @entry are currently swapped out?
e8c26ab60   Tim Chen   mm/swap: skip rea...
1445
1446
1447
1448
1449
1450
   * This does not give an exact answer when swap count is continued,
   * but does include the high COUNT_CONTINUED flag to allow for that.
   */
  int __swp_swapcount(swp_entry_t entry)
  {
  	int count = 0;
e8c26ab60   Tim Chen   mm/swap: skip rea...
1451
  	struct swap_info_struct *si;
e8c26ab60   Tim Chen   mm/swap: skip rea...
1452

eb085574a   Huang Ying   mm, swap: fix rac...
1453
1454
  	si = get_swap_device(entry);
  	if (si) {
322b8afe4   Huang Ying   mm, swap: Fix a r...
1455
  		count = swap_swapcount(si, entry);
eb085574a   Huang Ying   mm, swap: fix rac...
1456
1457
  		put_swap_device(si);
  	}
e8c26ab60   Tim Chen   mm/swap: skip rea...
1458
1459
1460
1461
1462
  	return count;
  }
  
  /*
   * How many references to @entry are currently swapped out?
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1463
1464
1465
1466
1467
1468
   * This considers COUNT_CONTINUED so it returns exact answer.
   */
  int swp_swapcount(swp_entry_t entry)
  {
  	int count, tmp_count, n;
  	struct swap_info_struct *p;
235b62176   Huang, Ying   mm/swap: add clus...
1469
  	struct swap_cluster_info *ci;
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1470
1471
1472
  	struct page *page;
  	pgoff_t offset;
  	unsigned char *map;
235b62176   Huang, Ying   mm/swap: add clus...
1473
  	p = _swap_info_get(entry);
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1474
1475
  	if (!p)
  		return 0;
235b62176   Huang, Ying   mm/swap: add clus...
1476
1477
1478
1479
1480
  	offset = swp_offset(entry);
  
  	ci = lock_cluster_or_swap_info(p, offset);
  
  	count = swap_count(p->swap_map[offset]);
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1481
1482
1483
1484
1485
  	if (!(count & COUNT_CONTINUED))
  		goto out;
  
  	count &= ~COUNT_CONTINUED;
  	n = SWAP_MAP_MAX + 1;
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1486
1487
1488
1489
1490
  	page = vmalloc_to_page(p->swap_map + offset);
  	offset &= ~PAGE_MASK;
  	VM_BUG_ON(page_private(page) != SWP_CONTINUED);
  
  	do {
a8ae49917   Geliang Tang   mm/swapfile.c: us...
1491
  		page = list_next_entry(page, lru);
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1492
1493
1494
1495
1496
1497
1498
1499
  		map = kmap_atomic(page);
  		tmp_count = map[offset];
  		kunmap_atomic(map);
  
  		count += (tmp_count & ~COUNT_CONTINUED) * n;
  		n *= (SWAP_CONT_MAX + 1);
  	} while (tmp_count & COUNT_CONTINUED);
  out:
235b62176   Huang, Ying   mm/swap: add clus...
1500
  	unlock_cluster_or_swap_info(p, ci);
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1501
1502
  	return count;
  }
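  /*
   * Worked example (illustration only, assuming the usual constants
   * SWAP_MAP_MAX == 0x3e and SWAP_CONT_MAX == 0x7f): if the swap_map
   * byte is (SWAP_MAP_MAX | COUNT_CONTINUED) and the first continuation
   * byte is 2 without COUNT_CONTINUED set, the loop above computes
   * count = 0x3e + 2 * (SWAP_MAP_MAX + 1) = 62 + 126 = 188.
   */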
e07098294   Huang Ying   mm, THP, swap: su...
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
  static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
  					 swp_entry_t entry)
  {
  	struct swap_cluster_info *ci;
  	unsigned char *map = si->swap_map;
  	unsigned long roffset = swp_offset(entry);
  	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
  	int i;
  	bool ret = false;
  
  	ci = lock_cluster_or_swap_info(si, offset);
  	if (!ci || !cluster_is_huge(ci)) {
afa4711ef   Huang Ying   mm/swapfile.c: us...
1515
  		if (swap_count(map[roffset]))
e07098294   Huang Ying   mm, THP, swap: su...
1516
1517
1518
1519
  			ret = true;
  		goto unlock_out;
  	}
  	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
afa4711ef   Huang Ying   mm/swapfile.c: us...
1520
  		if (swap_count(map[offset + i])) {
e07098294   Huang Ying   mm, THP, swap: su...
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
  			ret = true;
  			break;
  		}
  	}
  unlock_out:
  	unlock_cluster_or_swap_info(si, ci);
  	return ret;
  }
  
  static bool page_swapped(struct page *page)
  {
  	swp_entry_t entry;
  	struct swap_info_struct *si;
fe5266d5d   Huang Ying   mm/swapfile.c: re...
1534
  	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
e07098294   Huang Ying   mm, THP, swap: su...
1535
1536
1537
1538
1539
1540
1541
1542
1543
  		return page_swapcount(page) != 0;
  
  	page = compound_head(page);
  	entry.val = page_private(page);
  	si = _swap_info_get(entry);
  	if (si)
  		return swap_page_trans_huge_swapped(si, entry);
  	return false;
  }
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
  
  static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
  					 int *total_swapcount)
  {
  	int i, map_swapcount, _total_mapcount, _total_swapcount;
  	unsigned long offset = 0;
  	struct swap_info_struct *si;
  	struct swap_cluster_info *ci = NULL;
  	unsigned char *map = NULL;
  	int mapcount, swapcount = 0;
  
  	/* hugetlbfs shouldn't call it */
  	VM_BUG_ON_PAGE(PageHuge(page), page);
fe5266d5d   Huang Ying   mm/swapfile.c: re...
1557
1558
  	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
  		mapcount = page_trans_huge_mapcount(page, total_mapcount);
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
  		if (PageSwapCache(page))
  			swapcount = page_swapcount(page);
  		if (total_swapcount)
  			*total_swapcount = swapcount;
  		return mapcount + swapcount;
  	}
  
  	page = compound_head(page);
  
  	_total_mapcount = _total_swapcount = map_swapcount = 0;
  	if (PageSwapCache(page)) {
  		swp_entry_t entry;
  
  		entry.val = page_private(page);
  		si = _swap_info_get(entry);
  		if (si) {
  			map = si->swap_map;
  			offset = swp_offset(entry);
  		}
  	}
  	if (map)
  		ci = lock_cluster(si, offset);
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		mapcount = atomic_read(&page[i]._mapcount) + 1;
  		_total_mapcount += mapcount;
  		if (map) {
  			swapcount = swap_count(map[offset + i]);
  			_total_swapcount += swapcount;
  		}
  		map_swapcount = max(map_swapcount, mapcount + swapcount);
  	}
  	unlock_cluster(ci);
  	if (PageDoubleMap(page)) {
  		map_swapcount -= 1;
  		_total_mapcount -= HPAGE_PMD_NR;
  	}
  	mapcount = compound_mapcount(page);
  	map_swapcount += mapcount;
  	_total_mapcount += mapcount;
  	if (total_mapcount)
  		*total_mapcount = _total_mapcount;
  	if (total_swapcount)
  		*total_swapcount = _total_swapcount;
  
  	return map_swapcount;
  }
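  /*
   * Illustrative note (not from the original source): for a THP the
   * loop above takes, across all subpages, the maximum of per-subpage
   * mapcount + swapcount, then adds the compound mapcount.  E.g. a THP
   * that is PMD-mapped once (compound_mapcount == 1), with no PTE maps
   * and one subpage still swapped out twice, yields
   * map_swapcount == max(0 + 2) + 1 == 3.
   */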
e07098294   Huang Ying   mm, THP, swap: su...
1605

8334b9622   Minchan Kim   mm: /proc/pid/sma...
1606
  /*
7b1fe5979   Hugh Dickins   mm: reuse_swap_pa...
1607
1608
1609
1610
   * We can write to an anon page without COW if there are no other references
   * to it.  And as a side-effect, free up its swap: because the old content
   * on disk will never be read, and seeking back there to write new content
   * later would only waste time away from clustering.
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1611
   *
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1612
   * NOTE: total_map_swapcount should not be relied upon by the caller if
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1613
1614
   * reuse_swap_page() returns false, but it may always be overwritten
   * (see the other implementation for CONFIG_SWAP=n).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1615
   */
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1616
  bool reuse_swap_page(struct page *page, int *total_map_swapcount)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1617
  {
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1618
  	int count, total_mapcount, total_swapcount;
c475a8ab6   Hugh Dickins   [PATCH] can_share...
1619

309381fea   Sasha Levin   mm: dump page whe...
1620
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
5ad646880   Hugh Dickins   ksm: let shared p...
1621
  	if (unlikely(PageKsm(page)))
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1622
  		return false;
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1623
1624
1625
1626
1627
1628
1629
1630
  	count = page_trans_huge_map_swapcount(page, &total_mapcount,
  					      &total_swapcount);
  	if (total_map_swapcount)
  		*total_map_swapcount = total_mapcount + total_swapcount;
  	if (count == 1 && PageSwapCache(page) &&
  	    (likely(!PageTransCompound(page)) ||
  	     /* The remaining swap count will be freed soon */
  	     total_swapcount == page_swapcount(page))) {
f05714293   Minchan Kim   mm: support anony...
1631
  		if (!PageWriteback(page)) {
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1632
  			page = compound_head(page);
7b1fe5979   Hugh Dickins   mm: reuse_swap_pa...
1633
1634
  			delete_from_swap_cache(page);
  			SetPageDirty(page);
f05714293   Minchan Kim   mm: support anony...
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
  		} else {
  			swp_entry_t entry;
  			struct swap_info_struct *p;
  
  			entry.val = page_private(page);
  			p = swap_info_get(entry);
  			if (p->flags & SWP_STABLE_WRITES) {
  				spin_unlock(&p->lock);
  				return false;
  			}
  			spin_unlock(&p->lock);
7b1fe5979   Hugh Dickins   mm: reuse_swap_pa...
1646
1647
  		}
  	}
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1648

5ad646880   Hugh Dickins   ksm: let shared p...
1649
  	return count <= 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1650
1651
1652
  }
  
  /*
a2c43eed8   Hugh Dickins   mm: try_to_free_s...
1653
1654
   * If swap is getting full, or if there are no more mappings of this page,
   * then try_to_free_swap is called to free its swap space.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1655
   */
a2c43eed8   Hugh Dickins   mm: try_to_free_s...
1656
  int try_to_free_swap(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1657
  {
309381fea   Sasha Levin   mm: dump page whe...
1658
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1659
1660
1661
1662
1663
  
  	if (!PageSwapCache(page))
  		return 0;
  	if (PageWriteback(page))
  		return 0;
e07098294   Huang Ying   mm, THP, swap: su...
1664
  	if (page_swapped(page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1665
  		return 0;
b73d7fcec   Hugh Dickins   swap: prevent reu...
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
  	/*
  	 * Once hibernation has begun to create its image of memory,
  	 * there's a danger that one of the calls to try_to_free_swap()
  	 * - most probably a call from __try_to_reclaim_swap() while
  	 * hibernation is allocating its own swap pages for the image,
  	 * but conceivably even a call from memory reclaim - will free
  	 * the swap from a page which has already been recorded in the
  	 * image as a clean swapcache page, and then reuse its swap for
  	 * another page of the image.  On waking from hibernation, the
  	 * original page might be freed under memory pressure, then
  	 * later read back in from swap, now with the wrong data.
  	 *
2de1a7e40   Seth Jennings   mm/swapfile.c: fi...
1678
  	 * Hibernation suspends storage while it is writing the image
f90ac3982   Mel Gorman   mm: avoid liveloc...
1679
  	 * to disk so check that here.
b73d7fcec   Hugh Dickins   swap: prevent reu...
1680
  	 */
f90ac3982   Mel Gorman   mm: avoid liveloc...
1681
  	if (pm_suspended_storage())
b73d7fcec   Hugh Dickins   swap: prevent reu...
1682
  		return 0;
e07098294   Huang Ying   mm, THP, swap: su...
1683
  	page = compound_head(page);
a2c43eed8   Hugh Dickins   mm: try_to_free_s...
1684
1685
1686
  	delete_from_swap_cache(page);
  	SetPageDirty(page);
  	return 1;
68a22394c   Rik van Riel   vmscan: free swap...
1687
1688
1689
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1690
1691
1692
   * Free the swap entry like above, but also try to
   * free the page cache entry if it is the last user.
   */
2509ef26d   Hugh Dickins   badpage: zap prin...
1693
  int free_swap_and_cache(swp_entry_t entry)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1694
  {
2509ef26d   Hugh Dickins   badpage: zap prin...
1695
  	struct swap_info_struct *p;
7c00bafee   Tim Chen   mm/swap: free swa...
1696
  	unsigned char count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1697

a7420aa54   Andi Kleen   HWPOISON: Add sup...
1698
  	if (non_swap_entry(entry))
2509ef26d   Hugh Dickins   badpage: zap prin...
1699
  		return 1;
0697212a4   Christoph Lameter   [PATCH] Swapless ...
1700

7c00bafee   Tim Chen   mm/swap: free swa...
1701
  	p = _swap_info_get(entry);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1702
  	if (p) {
33e16272f   Wei Yang   mm/swapfile.c: __...
1703
  		count = __swap_entry_free(p, entry);
e07098294   Huang Ying   mm, THP, swap: su...
1704
  		if (count == SWAP_HAS_CACHE &&
bcd49e867   Huang Ying   mm/swapfile.c: us...
1705
1706
1707
  		    !swap_page_trans_huge_swapped(p, entry))
  			__try_to_reclaim_swap(p, swp_offset(entry),
  					      TTRS_UNMAPPED | TTRS_FULL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1708
  	}
2509ef26d   Hugh Dickins   badpage: zap prin...
1709
  	return p != NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1710
  }
b0cb1a19d   Rafael J. Wysocki   Replace CONFIG_SO...
1711
  #ifdef CONFIG_HIBERNATION
bb243f7dc   Miaohe Lin   mm/swapfile: move...
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
  
  swp_entry_t get_swap_page_of_type(int type)
  {
  	struct swap_info_struct *si = swap_type_to_swap_info(type);
  	swp_entry_t entry = {0};
  
  	if (!si)
  		goto fail;
  
  	/* This is called for allocating swap entry, not cache */
  	spin_lock(&si->lock);
  	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
  		atomic_long_dec(&nr_swap_pages);
  	spin_unlock(&si->lock);
  fail:
  	return entry;
  }
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1729
  /*
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1730
   * Find the swap type that corresponds to given device (if any).
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1731
   *
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1732
1733
1734
1735
   * @offset - number of the PAGE_SIZE-sized block of the device, starting
   * from 0, in which the swap header is expected to be located.
   *
   * This is needed for the suspend to disk (aka swsusp).
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1736
   */
21bd90057   Christoph Hellwig   mm: split swap_ty...
1737
  int swap_type_of(dev_t device, sector_t offset)
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1738
  {
efa90a981   Hugh Dickins   swap_info: change...
1739
  	int type;
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1740

21bd90057   Christoph Hellwig   mm: split swap_ty...
1741
1742
  	if (!device)
  		return -1;
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1743

f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1744
  	spin_lock(&swap_lock);
efa90a981   Hugh Dickins   swap_info: change...
1745
1746
  	for (type = 0; type < nr_swapfiles; type++) {
  		struct swap_info_struct *sis = swap_info[type];
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1747

915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1748
  		if (!(sis->flags & SWP_WRITEOK))
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1749
  			continue;
b6b5bce35   Rafael J. Wysocki   [PATCH] swsusp: F...
1750

21bd90057   Christoph Hellwig   mm: split swap_ty...
1751
  		if (device == sis->bdev->bd_dev) {
4efaceb1c   Aaron Lu   mm, swap: use rbt...
1752
  			struct swap_extent *se = first_se(sis);
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1753

915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1754
1755
  			if (se->start_block == offset) {
  				spin_unlock(&swap_lock);
efa90a981   Hugh Dickins   swap_info: change...
1756
  				return type;
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1757
  			}
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1758
1759
1760
  		}
  	}
  	spin_unlock(&swap_lock);
21bd90057   Christoph Hellwig   mm: split swap_ty...
1761
1762
  	return -ENODEV;
  }
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1763

21bd90057   Christoph Hellwig   mm: split swap_ty...
1764
1765
1766
  int find_first_swap(dev_t *device)
  {
  	int type;
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1767

21bd90057   Christoph Hellwig   mm: split swap_ty...
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
  	spin_lock(&swap_lock);
  	for (type = 0; type < nr_swapfiles; type++) {
  		struct swap_info_struct *sis = swap_info[type];
  
  		if (!(sis->flags & SWP_WRITEOK))
  			continue;
  		*device = sis->bdev->bd_dev;
  		spin_unlock(&swap_lock);
  		return type;
  	}
  	spin_unlock(&swap_lock);
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1779
1780
1781
1782
  	return -ENODEV;
  }
  
  /*
73c34b6ac   Hugh Dickins   swap_info: miscel...
1783
1784
1785
1786
1787
   * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
   * corresponding to given index in swap_info (swap type).
   */
  sector_t swapdev_block(int type, pgoff_t offset)
  {
c10d38cc8   Daniel Jordan   mm, swap: bounds ...
1788
  	struct swap_info_struct *si = swap_type_to_swap_info(type);
f885056a4   Christoph Hellwig   mm: simplify swap...
1789
  	struct swap_extent *se;
73c34b6ac   Hugh Dickins   swap_info: miscel...
1790

c10d38cc8   Daniel Jordan   mm, swap: bounds ...
1791
  	if (!si || !(si->flags & SWP_WRITEOK))
73c34b6ac   Hugh Dickins   swap_info: miscel...
1792
  		return 0;
f885056a4   Christoph Hellwig   mm: simplify swap...
1793
1794
  	se = offset_to_swap_extent(si, offset);
  	return se->start_block + (offset - se->start_page);
73c34b6ac   Hugh Dickins   swap_info: miscel...
1795
1796
1797
  }
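  /*
   * Worked example (illustration only): with an extent of
   * { start_page = 0, nr_pages = 1024, start_block = 8192 },
   * swapdev_block(type, 10) returns 8192 + (10 - 0) = 8202.
   */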
  
  /*
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1798
1799
1800
1801
1802
1803
1804
1805
   * Return either the total number of swap pages of given type, or the number
   * of free pages of that type (depending on @free)
   *
   * This is needed for software suspend
   */
  unsigned int count_swap_pages(int type, int free)
  {
  	unsigned int n = 0;
efa90a981   Hugh Dickins   swap_info: change...
1806
1807
1808
  	spin_lock(&swap_lock);
  	if ((unsigned int)type < nr_swapfiles) {
  		struct swap_info_struct *sis = swap_info[type];
ec8acf20a   Shaohua Li   swap: add per-par...
1809
  		spin_lock(&sis->lock);
efa90a981   Hugh Dickins   swap_info: change...
1810
1811
  		if (sis->flags & SWP_WRITEOK) {
  			n = sis->pages;
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1812
  			if (free)
efa90a981   Hugh Dickins   swap_info: change...
1813
  				n -= sis->inuse_pages;
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1814
  		}
ec8acf20a   Shaohua Li   swap: add per-par...
1815
  		spin_unlock(&sis->lock);
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1816
  	}
efa90a981   Hugh Dickins   swap_info: change...
1817
  	spin_unlock(&swap_lock);
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1818
1819
  	return n;
  }
73c34b6ac   Hugh Dickins   swap_info: miscel...
1820
  #endif /* CONFIG_HIBERNATION */
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1821

9f8bdb3f3   Hugh Dickins   mm: make swapoff ...
1822
  static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
179ef71cb   Cyrill Gorcunov   mm: save soft-dir...
1823
  {
099dd6878   Peter Xu   mm/swap: fix pte_...
1824
  	return pte_same(pte_swp_clear_flags(pte), swp_pte);
179ef71cb   Cyrill Gorcunov   mm: save soft-dir...
1825
  }
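  /*
   * Illustrative note (not from the original source):
   * pte_swp_clear_flags() strips the software-only swap PTE bits (such
   * as soft-dirty and uffd-wp, where configured) so that two swap PTEs
   * for the same entry compare equal regardless of those flags.
   */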
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1826
  /*
72866f6f2   Hugh Dickins   [PATCH] mm: anon ...
1827
1828
1829
   * No need to decide whether this PTE shares the swap entry with others,
   * just let do_wp_page work it out if a write is requested later - to
   * force COW, vm_page_prot omits write permission from any private vma.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1830
   */
044d66c1d   Hugh Dickins   memcgroup: reinst...
1831
  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1832
1833
  		unsigned long addr, swp_entry_t entry, struct page *page)
  {
9e16b7fb1   Hugh Dickins   mm,ksm: swapoff m...
1834
  	struct page *swapcache;
044d66c1d   Hugh Dickins   memcgroup: reinst...
1835
1836
1837
  	spinlock_t *ptl;
  	pte_t *pte;
  	int ret = 1;
9e16b7fb1   Hugh Dickins   mm,ksm: swapoff m...
1838
1839
1840
1841
  	swapcache = page;
  	page = ksm_might_need_to_copy(page, vma, addr);
  	if (unlikely(!page))
  		return -ENOMEM;
044d66c1d   Hugh Dickins   memcgroup: reinst...
1842
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
9f8bdb3f3   Hugh Dickins   mm: make swapoff ...
1843
  	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
044d66c1d   Hugh Dickins   memcgroup: reinst...
1844
1845
1846
  		ret = 0;
  		goto out;
  	}
8a9f3ccd2   Balbir Singh   Memory controller...
1847

b084d4353   KAMEZAWA Hiroyuki   mm: count swap usage
1848
  	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
d559db086   KAMEZAWA Hiroyuki   mm: clean up mm_c...
1849
  	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1850
1851
1852
  	get_page(page);
  	set_pte_at(vma->vm_mm, addr, pte,
  		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
00501b531   Johannes Weiner   mm: memcontrol: r...
1853
  	if (page == swapcache) {
be5d0a74c   Johannes Weiner   mm: memcontrol: s...
1854
  		page_add_anon_rmap(page, vma, addr, false);
00501b531   Johannes Weiner   mm: memcontrol: r...
1855
  	} else { /* ksm created a completely new copy */
be5d0a74c   Johannes Weiner   mm: memcontrol: s...
1856
  		page_add_new_anon_rmap(page, vma, addr, false);
b518154e5   Joonsoo Kim   mm/vmscan: protec...
1857
  		lru_cache_add_inactive_or_unevictable(page, vma);
00501b531   Johannes Weiner   mm: memcontrol: r...
1858
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1859
  	swap_free(entry);
044d66c1d   Hugh Dickins   memcgroup: reinst...
1860
1861
  out:
  	pte_unmap_unlock(pte, ptl);
9e16b7fb1   Hugh Dickins   mm,ksm: swapoff m...
1862
1863
1864
1865
  	if (page != swapcache) {
  		unlock_page(page);
  		put_page(page);
  	}
044d66c1d   Hugh Dickins   memcgroup: reinst...
1866
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1867
1868
1869
  }
  
  static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1870
1871
1872
  			unsigned long addr, unsigned long end,
  			unsigned int type, bool frontswap,
  			unsigned long *fs_pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1873
  {
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1874
1875
  	struct page *page;
  	swp_entry_t entry;
705e87c0c   Hugh Dickins   [PATCH] mm: pte_o...
1876
  	pte_t *pte;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1877
1878
  	struct swap_info_struct *si;
  	unsigned long offset;
8a9f3ccd2   Balbir Singh   Memory controller...
1879
  	int ret = 0;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1880
  	volatile unsigned char *swap_map;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1881

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1882
  	si = swap_info[type];
044d66c1d   Hugh Dickins   memcgroup: reinst...
1883
  	pte = pte_offset_map(pmd, addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1884
  	do {
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
  		if (!is_swap_pte(*pte))
  			continue;
  
  		entry = pte_to_swp_entry(*pte);
  		if (swp_type(entry) != type)
  			continue;
  
  		offset = swp_offset(entry);
  		if (frontswap && !frontswap_test(si, offset))
  			continue;
  
  		pte_unmap(pte);
  		swap_map = &si->swap_map[offset];
ebc5951ee   Andrea Righi   mm: swap: properl...
1898
1899
  		page = lookup_swap_cache(entry, vma, addr);
  		if (!page) {
8c63ca5bc   Will Deacon   mm: Use static in...
1900
1901
1902
1903
1904
  			struct vm_fault vmf = {
  				.vma = vma,
  				.address = addr,
  				.pmd = pmd,
  			};
ebc5951ee   Andrea Righi   mm: swap: properl...
1905
1906
1907
  			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  						&vmf);
  		}
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
  		if (!page) {
  			if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
  				goto try_next;
  			return -ENOMEM;
  		}
  
  		lock_page(page);
  		wait_on_page_writeback(page);
  		ret = unuse_pte(vma, pmd, addr, entry, page);
  		if (ret < 0) {
  			unlock_page(page);
  			put_page(page);
  			goto out;
  		}
  
  		try_to_free_swap(page);
  		unlock_page(page);
  		put_page(page);
  
  		if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
  			ret = FRONTSWAP_PAGES_UNUSED;
  			goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1930
  		}
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1931
1932
  try_next:
  		pte = pte_offset_map(pmd, addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1933
  	} while (pte++, addr += PAGE_SIZE, addr != end);
044d66c1d   Hugh Dickins   memcgroup: reinst...
1934
  	pte_unmap(pte - 1);
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1935
1936
  
  	ret = 0;
044d66c1d   Hugh Dickins   memcgroup: reinst...
1937
  out:
8a9f3ccd2   Balbir Singh   Memory controller...
1938
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1939
1940
1941
1942
  }
  
  static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
  				unsigned long addr, unsigned long end,
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1943
1944
  				unsigned int type, bool frontswap,
  				unsigned long *fs_pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1945
1946
1947
  {
  	pmd_t *pmd;
  	unsigned long next;
8a9f3ccd2   Balbir Singh   Memory controller...
1948
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1949
1950
1951
  
  	pmd = pmd_offset(pud, addr);
  	do {
dc644a073   Hugh Dickins   mm: add three mor...
1952
  		cond_resched();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1953
  		next = pmd_addr_end(addr, end);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
1954
  		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1955
  			continue;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1956
1957
  		ret = unuse_pte_range(vma, pmd, addr, next, type,
  				      frontswap, fs_pages_to_unuse);
8a9f3ccd2   Balbir Singh   Memory controller...
1958
1959
  		if (ret)
  			return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1960
1961
1962
  	} while (pmd++, addr = next, addr != end);
  	return 0;
  }
c2febafc6   Kirill A. Shutemov   mm: convert gener...
1963
  static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1964
  				unsigned long addr, unsigned long end,
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1965
1966
  				unsigned int type, bool frontswap,
  				unsigned long *fs_pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1967
1968
1969
  {
  	pud_t *pud;
  	unsigned long next;
8a9f3ccd2   Balbir Singh   Memory controller...
1970
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1971

c2febafc6   Kirill A. Shutemov   mm: convert gener...
1972
  	pud = pud_offset(p4d, addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1973
1974
1975
1976
  	do {
  		next = pud_addr_end(addr, end);
  		if (pud_none_or_clear_bad(pud))
  			continue;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1977
1978
  		ret = unuse_pmd_range(vma, pud, addr, next, type,
  				      frontswap, fs_pages_to_unuse);
8a9f3ccd2   Balbir Singh   Memory controller...
1979
1980
  		if (ret)
  			return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1981
1982
1983
  	} while (pud++, addr = next, addr != end);
  	return 0;
  }
c2febafc6   Kirill A. Shutemov   mm: convert gener...
1984
1985
  static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
  				unsigned long addr, unsigned long end,
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1986
1987
  				unsigned int type, bool frontswap,
  				unsigned long *fs_pages_to_unuse)
c2febafc6   Kirill A. Shutemov   mm: convert gener...
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
  {
  	p4d_t *p4d;
  	unsigned long next;
  	int ret;
  
  	p4d = p4d_offset(pgd, addr);
  	do {
  		next = p4d_addr_end(addr, end);
  		if (p4d_none_or_clear_bad(p4d))
  			continue;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1998
1999
  		ret = unuse_pud_range(vma, p4d, addr, next, type,
  				      frontswap, fs_pages_to_unuse);
c2febafc6   Kirill A. Shutemov   mm: convert gener...
2000
2001
2002
2003
2004
  		if (ret)
  			return ret;
  	} while (p4d++, addr = next, addr != end);
  	return 0;
  }
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2005
2006
  static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
  		     bool frontswap, unsigned long *fs_pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2007
2008
2009
  {
  	pgd_t *pgd;
  	unsigned long addr, end, next;
8a9f3ccd2   Balbir Singh   Memory controller...
2010
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2011

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2012
2013
  	addr = vma->vm_start;
  	end = vma->vm_end;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2014
2015
2016
2017
2018
2019
  
  	pgd = pgd_offset(vma->vm_mm, addr);
  	do {
  		next = pgd_addr_end(addr, end);
  		if (pgd_none_or_clear_bad(pgd))
  			continue;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2020
2021
  		ret = unuse_p4d_range(vma, pgd, addr, next, type,
  				      frontswap, fs_pages_to_unuse);
8a9f3ccd2   Balbir Singh   Memory controller...
2022
2023
  		if (ret)
  			return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2024
2025
2026
  	} while (pgd++, addr = next, addr != end);
  	return 0;
  }
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2027
2028
  static int unuse_mm(struct mm_struct *mm, unsigned int type,
  		    bool frontswap, unsigned long *fs_pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2029
2030
  {
  	struct vm_area_struct *vma;
8a9f3ccd2   Balbir Singh   Memory controller...
2031
  	int ret = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2032

d8ed45c5d   Michel Lespinasse   mmap locking API:...
2033
  	mmap_read_lock(mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2034
  	for (vma = mm->mmap; vma; vma = vma->vm_next) {
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2035
2036
2037
2038
2039
2040
  		if (vma->anon_vma) {
  			ret = unuse_vma(vma, type, frontswap,
  					fs_pages_to_unuse);
  			if (ret)
  				break;
  		}
dc644a073   Hugh Dickins   mm: add three mor...
2041
  		cond_resched();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2042
  	}
d8ed45c5d   Michel Lespinasse   mmap locking API:...
2043
  	mmap_read_unlock(mm);
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2044
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2045
2046
2047
  }
  
  /*
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
2048
   * Scan swap_map (or frontswap_map if frontswap parameter is true)
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2049
2050
   * from the current position to the next entry still in use. Return 0
   * if there are no in-use entries between prev and the end of the map.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2051
   */
6eb396dc4   Hugh Dickins   [PATCH] swap: swa...
2052
  static unsigned int find_next_to_unuse(struct swap_info_struct *si,
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
2053
  					unsigned int prev, bool frontswap)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2054
  {
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2055
  	unsigned int i;
8d69aaee8   Hugh Dickins   swap_info: swap_m...
2056
  	unsigned char count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2057
2058
  
  	/*
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2059
  	 * No need for swap_lock here: we're just looking
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2060
2061
  	 * for whether an entry is in use, not modifying it; false
  	 * hits are okay, and sys_swapoff() has already prevented new
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2062
  	 * allocations from this area (while holding swap_lock).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2063
  	 */
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2064
  	for (i = prev + 1; i < si->max; i++) {
4db0c3c29   Jason Low   mm: remove rest o...
2065
  		count = READ_ONCE(si->swap_map[i]);
355cfa73d   KAMEZAWA Hiroyuki   mm: modify swap_m...
2066
  		if (count && swap_count(count) != SWAP_MAP_BAD)
dc644a073   Hugh Dickins   mm: add three mor...
2067
2068
2069
2070
  			if (!frontswap || frontswap_test(si, i))
  				break;
  		if ((i % LATENCY_LIMIT) == 0)
  			cond_resched();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2071
  	}
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2072
2073
2074
  
  	if (i == si->max)
  		i = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2075
2076
2077
2078
  	return i;
  }
  
  /*
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2079
   * If the boolean frontswap is true, only unuse pages_to_unuse pages;
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
2080
   * pages_to_unuse==0 means all pages; ignored if frontswap is false
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2081
   */
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
2082
2083
  int try_to_unuse(unsigned int type, bool frontswap,
  		 unsigned long pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2084
  {
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2085
2086
2087
2088
  	struct mm_struct *prev_mm;
  	struct mm_struct *mm;
  	struct list_head *p;
  	int retval = 0;
efa90a981   Hugh Dickins   swap_info: change...
2089
  	struct swap_info_struct *si = swap_info[type];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2090
2091
  	struct page *page;
  	swp_entry_t entry;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2092
  	unsigned int i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2093

218209487   Qian Cai   mm/swapfile: fix ...
2094
  	if (!READ_ONCE(si->inuse_pages))
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2095
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2096

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
  	if (!frontswap)
  		pages_to_unuse = 0;
  
  retry:
  	retval = shmem_unuse(type, frontswap, &pages_to_unuse);
  	if (retval)
  		goto out;
  
  	prev_mm = &init_mm;
  	mmget(prev_mm);
  
  	spin_lock(&mmlist_lock);
  	p = &init_mm.mmlist;
218209487   Qian Cai   mm/swapfile: fix ...
2110
  	while (READ_ONCE(si->inuse_pages) &&
64165b1af   Hugh Dickins   mm: swapoff: take...
2111
2112
  	       !signal_pending(current) &&
  	       (p = p->next) != &init_mm.mmlist) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2113

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2114
2115
2116
2117
2118
2119
2120
  		mm = list_entry(p, struct mm_struct, mmlist);
  		if (!mmget_not_zero(mm))
  			continue;
  		spin_unlock(&mmlist_lock);
  		mmput(prev_mm);
  		prev_mm = mm;
  		retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2121

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2122
2123
2124
  		if (retval) {
  			mmput(prev_mm);
  			goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2125
2126
2127
  		}
  
  		/*
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2128
2129
  		 * Make sure that we aren't completely killing
  		 * interactive performance.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2130
  		 */
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2131
2132
2133
2134
  		cond_resched();
  		spin_lock(&mmlist_lock);
  	}
  	spin_unlock(&mmlist_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2135

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2136
  	mmput(prev_mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2137

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2138
  	i = 0;
218209487   Qian Cai   mm/swapfile: fix ...
2139
  	while (READ_ONCE(si->inuse_pages) &&
64165b1af   Hugh Dickins   mm: swapoff: take...
2140
2141
  	       !signal_pending(current) &&
  	       (i = find_next_to_unuse(si, i, frontswap)) != 0) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2142

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2143
2144
2145
2146
  		entry = swp_entry(type, i);
  		page = find_get_page(swap_address_space(entry), i);
  		if (!page)
  			continue;
68bdc8d64   Hugh Dickins   mm: try_to_unuse ...
2147
2148
2149
  
  		/*
  		 * It is conceivable that a racing task removed this page from
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2150
2151
2152
  		 * swap cache just before we acquired the page lock. The page
  		 * might even be back in swap cache on another swap area. But
  		 * that is okay: try_to_free_swap() only removes stale pages.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2153
  		 */
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2154
2155
2156
  		lock_page(page);
  		wait_on_page_writeback(page);
  		try_to_free_swap(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2157
  		unlock_page(page);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2158
  		put_page(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2159
2160
  
  		/*
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2161
2162
2163
  		 * For frontswap, we just need to unuse pages_to_unuse, if it
  		 * was specified. No need to check frontswap again here, as
  		 * pages_to_unuse was already zeroed when frontswap is false.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2164
  		 */
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2165
2166
  		if (pages_to_unuse && --pages_to_unuse == 0)
  			goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2167
  	}
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2168
2169
2170
2171
2172
  	/*
  	 * Let's check again to see if there are still swap entries in the map.
  	 * If yes, we need to retry the unuse logic.
  	 * Under global memory pressure, swap entries can be reinserted back
  	 * into process space after the mmlist loop above passes over them.
dd862deb1   Hugh Dickins   mm: swapoff: remo...
2173
  	 *
af53d3e9e   Hugh Dickins   mm: swapoff: shme...
2174
2175
2176
2177
2178
  	 * Limit the number of retries? No: when mmget_not_zero() above fails,
  	 * that mm is likely to be freeing swap from exit_mmap(), which proceeds
  	 * at its own independent pace; and even shmem_writepage() could have
  	 * been preempted after get_swap_page(), temporarily hiding that swap.
  	 * It's easy and robust (though cpu-intensive) just to keep retrying.
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2179
  	 */
218209487   Qian Cai   mm/swapfile: fix ...
2180
  	if (READ_ONCE(si->inuse_pages)) {
64165b1af   Hugh Dickins   mm: swapoff: take...
2181
2182
2183
2184
  		if (!signal_pending(current))
  			goto retry;
  		retval = -EINTR;
  	}
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2185
2186
  out:
  	return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2187
2188
2189
  }
  
  /*
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2190
2191
2192
   * After a successful try_to_unuse, if no swap is now in use, we know
   * we can empty the mmlist.  swap_lock must be held on entry and exit.
   * Note that mmlist_lock nests inside swap_lock, and an mm must be
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2193
2194
2195
2196
2197
   * added to the mmlist just after page_duplicate - before would be racy.
   */
  static void drain_mmlist(void)
  {
  	struct list_head *p, *next;
efa90a981   Hugh Dickins   swap_info: change...
2198
  	unsigned int type;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2199

efa90a981   Hugh Dickins   swap_info: change...
2200
2201
  	for (type = 0; type < nr_swapfiles; type++)
  		if (swap_info[type]->inuse_pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2202
2203
2204
2205
2206
2207
  			return;
  	spin_lock(&mmlist_lock);
  	list_for_each_safe(p, next, &init_mm.mmlist)
  		list_del_init(p);
  	spin_unlock(&mmlist_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2208
2209
2210
2211
2212
  /*
   * Free all of a swapdev's extent information
   */
  static void destroy_swap_extents(struct swap_info_struct *sis)
  {
4efaceb1c   Aaron Lu   mm, swap: use rbt...
2213
2214
2215
  	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
  		struct rb_node *rb = sis->swap_extent_root.rb_node;
  		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2216

4efaceb1c   Aaron Lu   mm, swap: use rbt...
2217
  		rb_erase(rb, &sis->swap_extent_root);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2218
2219
  		kfree(se);
  	}
62c230bc1   Mel Gorman   mm: add support f...
2220

bc4ae27d8   Omar Sandoval   mm: split SWP_FIL...
2221
  	if (sis->flags & SWP_ACTIVATED) {
62c230bc1   Mel Gorman   mm: add support f...
2222
2223
  		struct file *swap_file = sis->swap_file;
  		struct address_space *mapping = swap_file->f_mapping;
bc4ae27d8   Omar Sandoval   mm: split SWP_FIL...
2224
2225
2226
  		sis->flags &= ~SWP_ACTIVATED;
  		if (mapping->a_ops->swap_deactivate)
  			mapping->a_ops->swap_deactivate(swap_file);
62c230bc1   Mel Gorman   mm: add support f...
2227
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2228
2229
2230
2231
  }
  
  /*
   * Add a block range (and the corresponding page range) into this swapdev's
4efaceb1c   Aaron Lu   mm, swap: use rbt...
2232
   * extent tree.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2233
   *
11d31886d   Hugh Dickins   [PATCH] swap: swa...
2234
   * This function rather assumes that it is called in ascending page order.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2235
   */
a509bc1a9   Mel Gorman   mm: swap: impleme...
2236
  int
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2237
2238
2239
  add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
  		unsigned long nr_pages, sector_t start_block)
  {
4efaceb1c   Aaron Lu   mm, swap: use rbt...
2240
  	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2241
2242
  	struct swap_extent *se;
  	struct swap_extent *new_se;
4efaceb1c   Aaron Lu   mm, swap: use rbt...
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
  
  	/*
  	 * Place the new node at the rightmost position, since the
  	 * function is called in ascending page order.
  	 */
  	while (*link) {
  		parent = *link;
  		link = &parent->rb_right;
  	}
  
  	if (parent) {
  		se = rb_entry(parent, struct swap_extent, rb_node);
11d31886d   Hugh Dickins   [PATCH] swap: swa...
2255
2256
  		BUG_ON(se->start_page + se->nr_pages != start_page);
  		if (se->start_block + se->nr_pages == start_block) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2257
2258
2259
2260
  			/* Merge it */
  			se->nr_pages += nr_pages;
  			return 0;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2261
  	}
4efaceb1c   Aaron Lu   mm, swap: use rbt...
2262
  	/* No merge, insert a new extent. */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2263
2264
2265
2266
2267
2268
  	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
  	if (new_se == NULL)
  		return -ENOMEM;
  	new_se->start_page = start_page;
  	new_se->nr_pages = nr_pages;
  	new_se->start_block = start_block;
4efaceb1c   Aaron Lu   mm, swap: use rbt...
2269
2270
  	rb_link_node(&new_se->rb_node, parent, link);
  	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
53092a740   Hugh Dickins   [PATCH] swap: sho...
2271
  	return 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2272
  }
aa8aa8a33   Omar Sandoval   mm: export add_sw...
2273
  EXPORT_SYMBOL_GPL(add_swap_extent);
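  /*
   * Worked example for add_swap_extent() (illustrative numbers): a call
   * with (start_page=0, nr_pages=16, start_block=1000) followed by
   * (start_page=16, nr_pages=16, start_block=1016) merges into a single
   * 32-page extent, because the second range continues both the page run
   * and the block run.  Had the second call used start_block=2000, a new
   * rightmost extent would have been inserted instead.
   */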
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
  
  /*
   * A `swap extent' is a simple thing which maps a contiguous range of pages
   * onto a contiguous range of disk blocks.  An ordered list of swap extents
   * is built at swapon time and is then used at swap_writepage/swap_readpage
   * time for locating where on disk a page belongs.
   *
   * If the swapfile is an S_ISBLK block device, a single extent is installed.
   * This is done so that the main operating code can treat S_ISBLK and S_ISREG
   * swap files identically.
   *
   * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
   * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
   * swapfiles are handled *identically* after swapon time.
   *
   * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
   * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
   * some stray blocks are found which do not fall within the PAGE_SIZE alignment
   * requirements, they are simply tossed out - we will never use those blocks
   * for swapping.
   *
1638045c3   Darrick J. Wong   mm: set S_SWAPFIL...
2295
2296
   * For all swap devices we set S_SWAPFILE across the life of the swapon.  This
   * prevents users from writing to the swap device, which will corrupt memory.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2297
2298
2299
2300
2301
2302
2303
2304
   *
   * The amount of disk space which a single swap extent represents varies.
   * Typically it is in the 1-4 megabyte range, so a large area can need
   * hundreds of extents.  To avoid much walking, the extents are kept in
   * an rbtree (swap_extent_root), so locating the extent for a page is a
   * logarithmic descent rather than a list scan.  (The old cached-list
   * scheme averaged about 0.3 iterations per page in map_swap_page(). - akpm)
   */
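  /*
   * Minimal lookup sketch (illustrative, not the kernel's helper; all
   * names here are hypothetical): given extents built as described
   * above, finding the disk block backing a swap page offset is a
   * simple range lookup.
   */
  #if 0	/* example only */
  struct example_extent {
  	unsigned long start_page;
  	unsigned long nr_pages;
  	unsigned long long start_block;
  };
  
  static unsigned long long example_page_to_block(const struct example_extent *ext,
  						int nr_extents, unsigned long page)
  {
  	int i;
  
  	for (i = 0; i < nr_extents; i++) {
  		if (page >= ext[i].start_page &&
  		    page < ext[i].start_page + ext[i].nr_pages)
  			return ext[i].start_block + (page - ext[i].start_page);
  	}
  	return 0;	/* offset not covered by any extent */
  }
  #endif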
53092a740   Hugh Dickins   [PATCH] swap: sho...
2305
  static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2306
  {
62c230bc1   Mel Gorman   mm: add support f...
2307
2308
2309
  	struct file *swap_file = sis->swap_file;
  	struct address_space *mapping = swap_file->f_mapping;
  	struct inode *inode = mapping->host;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2310
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2311
2312
  	if (S_ISBLK(inode->i_mode)) {
  		ret = add_swap_extent(sis, 0, sis->max, 0);
53092a740   Hugh Dickins   [PATCH] swap: sho...
2313
  		*span = sis->pages;
a509bc1a9   Mel Gorman   mm: swap: impleme...
2314
  		return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2315
  	}
62c230bc1   Mel Gorman   mm: add support f...
2316
  	if (mapping->a_ops->swap_activate) {
a509bc1a9   Mel Gorman   mm: swap: impleme...
2317
  		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
bc4ae27d8   Omar Sandoval   mm: split SWP_FIL...
2318
2319
  		if (ret >= 0)
  			sis->flags |= SWP_ACTIVATED;
62c230bc1   Mel Gorman   mm: add support f...
2320
  		if (!ret) {
326463154   Gao Xiang   swap: rename SWP_...
2321
  			sis->flags |= SWP_FS_OPS;
62c230bc1   Mel Gorman   mm: add support f...
2322
2323
2324
  			ret = add_swap_extent(sis, 0, sis->max, 0);
  			*span = sis->pages;
  		}
a509bc1a9   Mel Gorman   mm: swap: impleme...
2325
  		return ret;
62c230bc1   Mel Gorman   mm: add support f...
2326
  	}
a509bc1a9   Mel Gorman   mm: swap: impleme...
2327
  	return generic_swapfile_activate(sis, swap_file, span);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2328
  }
a2468cc9b   Aaron Lu   swap: choose swap...
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
  static int swap_node(struct swap_info_struct *p)
  {
  	struct block_device *bdev;
  
  	if (p->bdev)
  		bdev = p->bdev;
  	else
  		bdev = p->swap_file->f_inode->i_sb->s_bdev;
  
  	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
  }
eb085574a   Huang Ying   mm, swap: fix rac...
2340
2341
2342
  static void setup_swap_info(struct swap_info_struct *p, int prio,
  			    unsigned char *swap_map,
  			    struct swap_cluster_info *cluster_info)
40531542e   Cesar Eduardo Barros   sys_swapon: separ...
2343
  {
a2468cc9b   Aaron Lu   swap: choose swap...
2344
  	int i;
40531542e   Cesar Eduardo Barros   sys_swapon: separ...
2345
2346
2347
2348
  	if (prio >= 0)
  		p->prio = prio;
  	else
  		p->prio = --least_priority;
18ab4d4ce   Dan Streetman   swap: change swap...
2349
2350
2351
2352
2353
  	/*
  	 * the plist prio is negated because plist ordering is
  	 * low-to-high, while swap ordering is high-to-low
  	 */
  	p->list.prio = -p->prio;
a2468cc9b   Aaron Lu   swap: choose swap...
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
  	for_each_node(i) {
  		if (p->prio >= 0)
  			p->avail_lists[i].prio = -p->prio;
  		else {
  			if (swap_node(p) == i)
  				p->avail_lists[i].prio = 1;
  			else
  				p->avail_lists[i].prio = -p->prio;
  		}
  	}
40531542e   Cesar Eduardo Barros   sys_swapon: separ...
2364
  	p->swap_map = swap_map;
2a8f94493   Shaohua Li   swap: change bloc...
2365
  	p->cluster_info = cluster_info;
eb085574a   Huang Ying   mm, swap: fix rac...
2366
2367
2368
2369
  }
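  /*
   * Worked example of the negation above (illustrative): devices with
   * swap priorities 10 and 5 get plist priorities -10 and -5.  plists
   * sort ascending, so -10 orders first and the priority-10 device is
   * preferred, as intended.  For a negative (auto-assigned) priority,
   * avail_lists[nid] is set to 1 on the device's own node, so that node
   * prefers its local device.
   */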
  
  static void _enable_swap_info(struct swap_info_struct *p)
  {
63d8620ec   Miaohe Lin   mm/swapfile: use ...
2370
  	p->flags |= SWP_WRITEOK;
ec8acf20a   Shaohua Li   swap: add per-par...
2371
  	atomic_long_add(p->pages, &nr_swap_pages);
40531542e   Cesar Eduardo Barros   sys_swapon: separ...
2372
  	total_swap_pages += p->pages;
adfab836f   Dan Streetman   swap: change swap...
2373
  	assert_spin_locked(&swap_lock);
adfab836f   Dan Streetman   swap: change swap...
2374
  	/*
18ab4d4ce   Dan Streetman   swap: change swap...
2375
2376
2377
2378
2379
2380
2381
2382
  	 * both lists are plists, and thus priority ordered.
  	 * swap_active_head needs to be priority ordered for swapoff(),
  	 * which on removal of any swap_info_struct with an auto-assigned
  	 * (i.e. negative) priority increments the auto-assigned priority
  	 * of any lower-priority swap_info_structs.
  	 * swap_avail_head needs to be priority ordered for get_swap_page(),
  	 * which allocates swap pages from the highest available priority
  	 * swap_info_struct.
adfab836f   Dan Streetman   swap: change swap...
2383
  	 */
18ab4d4ce   Dan Streetman   swap: change swap...
2384
  	plist_add(&p->list, &swap_active_head);
a2468cc9b   Aaron Lu   swap: choose swap...
2385
  	add_to_avail_list(p);
cf0cac0a0   Cesar Eduardo Barros   mm: refactor rein...
2386
2387
2388
2389
  }
  
  static void enable_swap_info(struct swap_info_struct *p, int prio,
  				unsigned char *swap_map,
2a8f94493   Shaohua Li   swap: change bloc...
2390
  				struct swap_cluster_info *cluster_info,
cf0cac0a0   Cesar Eduardo Barros   mm: refactor rein...
2391
2392
  				unsigned long *frontswap_map)
  {
4f89849da   Minchan Kim   frontswap: get ri...
2393
  	frontswap_init(p->type, frontswap_map);
cf0cac0a0   Cesar Eduardo Barros   mm: refactor rein...
2394
  	spin_lock(&swap_lock);
ec8acf20a   Shaohua Li   swap: add per-par...
2395
  	spin_lock(&p->lock);
eb085574a   Huang Ying   mm, swap: fix rac...
2396
2397
2398
2399
  	setup_swap_info(p, prio, swap_map, cluster_info);
  	spin_unlock(&p->lock);
  	spin_unlock(&swap_lock);
  	/*
63d8620ec   Miaohe Lin   mm/swapfile: use ...
2400
  	 * Finished initializing swap device, now it's safe to reference it.
eb085574a   Huang Ying   mm, swap: fix rac...
2401
  	 */
63d8620ec   Miaohe Lin   mm/swapfile: use ...
2402
  	percpu_ref_resurrect(&p->users);
eb085574a   Huang Ying   mm, swap: fix rac...
2403
2404
2405
  	spin_lock(&swap_lock);
  	spin_lock(&p->lock);
  	_enable_swap_info(p);
ec8acf20a   Shaohua Li   swap: add per-par...
2406
  	spin_unlock(&p->lock);
cf0cac0a0   Cesar Eduardo Barros   mm: refactor rein...
2407
2408
2409
2410
2411
2412
  	spin_unlock(&swap_lock);
  }
  
  static void reinsert_swap_info(struct swap_info_struct *p)
  {
  	spin_lock(&swap_lock);
ec8acf20a   Shaohua Li   swap: add per-par...
2413
  	spin_lock(&p->lock);
eb085574a   Huang Ying   mm, swap: fix rac...
2414
2415
  	setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
  	_enable_swap_info(p);
ec8acf20a   Shaohua Li   swap: add per-par...
2416
  	spin_unlock(&p->lock);
40531542e   Cesar Eduardo Barros   sys_swapon: separ...
2417
2418
  	spin_unlock(&swap_lock);
  }
67afa38e0   Tim Chen   mm/swap: add cach...
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
  bool has_usable_swap(void)
  {
  	bool ret = true;
  
  	spin_lock(&swap_lock);
  	if (plist_head_empty(&swap_active_head))
  		ret = false;
  	spin_unlock(&swap_lock);
  	return ret;
  }
c4ea37c26   Heiko Carstens   [CVE-2009-0029] S...
2429
  SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2430
  {
73c34b6ac   Hugh Dickins   swap_info: miscel...
2431
  	struct swap_info_struct *p = NULL;
8d69aaee8   Hugh Dickins   swap_info: swap_m...
2432
  	unsigned char *swap_map;
2a8f94493   Shaohua Li   swap: change bloc...
2433
  	struct swap_cluster_info *cluster_info;
4f89849da   Minchan Kim   frontswap: get ri...
2434
  	unsigned long *frontswap_map;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2435
2436
2437
  	struct file *swap_file, *victim;
  	struct address_space *mapping;
  	struct inode *inode;
91a27b2a7   Jeff Layton   vfs: define struc...
2438
  	struct filename *pathname;
adfab836f   Dan Streetman   swap: change swap...
2439
  	int err, found = 0;
5b808a230   Krzysztof Kozlowski   swap: fix set_blo...
2440
  	unsigned int old_block_size;
886bb7e9c   Hugh Dickins   swapfile: remove ...
2441

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2442
2443
  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;
191c54244   Al Viro   mm: collapse secu...
2444
  	BUG_ON(!current->mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2445
  	pathname = getname(specialfile);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2446
  	if (IS_ERR(pathname))
f58b59c1d   Xiaotian Feng   swapfile: fix nam...
2447
  		return PTR_ERR(pathname);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2448

669abf4e5   Jeff Layton   vfs: make path_op...
2449
  	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2450
2451
2452
2453
2454
  	err = PTR_ERR(victim);
  	if (IS_ERR(victim))
  		goto out;
  
  	mapping = victim->f_mapping;
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2455
  	spin_lock(&swap_lock);
18ab4d4ce   Dan Streetman   swap: change swap...
2456
  	plist_for_each_entry(p, &swap_active_head, list) {
22c6f8fdb   Hugh Dickins   swapfile: remove ...
2457
  		if (p->flags & SWP_WRITEOK) {
adfab836f   Dan Streetman   swap: change swap...
2458
2459
  			if (p->swap_file->f_mapping == mapping) {
  				found = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2460
  				break;
adfab836f   Dan Streetman   swap: change swap...
2461
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2462
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2463
  	}
adfab836f   Dan Streetman   swap: change swap...
2464
  	if (!found) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2465
  		err = -EINVAL;
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2466
  		spin_unlock(&swap_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2467
2468
  		goto out_dput;
  	}
191c54244   Al Viro   mm: collapse secu...
2469
  	if (!security_vm_enough_memory_mm(current->mm, p->pages))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2470
2471
2472
  		vm_unacct_memory(p->pages);
  	else {
  		err = -ENOMEM;
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2473
  		spin_unlock(&swap_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2474
2475
  		goto out_dput;
  	}
a2468cc9b   Aaron Lu   swap: choose swap...
2476
  	del_from_avail_list(p);
ec8acf20a   Shaohua Li   swap: add per-par...
2477
  	spin_lock(&p->lock);
78ecba081   Hugh Dickins   mm: fix ever-decr...
2478
  	if (p->prio < 0) {
adfab836f   Dan Streetman   swap: change swap...
2479
  		struct swap_info_struct *si = p;
a2468cc9b   Aaron Lu   swap: choose swap...
2480
  		int nid;
adfab836f   Dan Streetman   swap: change swap...
2481

18ab4d4ce   Dan Streetman   swap: change swap...
2482
  		plist_for_each_entry_continue(si, &swap_active_head, list) {
adfab836f   Dan Streetman   swap: change swap...
2483
  			si->prio++;
18ab4d4ce   Dan Streetman   swap: change swap...
2484
  			si->list.prio--;
a2468cc9b   Aaron Lu   swap: choose swap...
2485
2486
2487
2488
  			for_each_node(nid) {
  				if (si->avail_lists[nid].prio != 1)
  					si->avail_lists[nid].prio--;
  			}
adfab836f   Dan Streetman   swap: change swap...
2489
  		}
78ecba081   Hugh Dickins   mm: fix ever-decr...
2490
2491
  		least_priority++;
  	}
18ab4d4ce   Dan Streetman   swap: change swap...
2492
  	plist_del(&p->list, &swap_active_head);
ec8acf20a   Shaohua Li   swap: add per-par...
2493
  	atomic_long_sub(p->pages, &nr_swap_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2494
2495
  	total_swap_pages -= p->pages;
  	p->flags &= ~SWP_WRITEOK;
ec8acf20a   Shaohua Li   swap: add per-par...
2496
  	spin_unlock(&p->lock);
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2497
  	spin_unlock(&swap_lock);
fb4f88dca   Hugh Dickins   [PATCH] swap: get...
2498

039939a65   Tim Chen   mm/swap: enable s...
2499
  	disable_swap_slots_cache_lock();
e1e12d2f3   David Rientjes   mm, oom: fix race...
2500
  	set_current_oom_origin();
adfab836f   Dan Streetman   swap: change swap...
2501
  	err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
e1e12d2f3   David Rientjes   mm, oom: fix race...
2502
  	clear_current_oom_origin();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2503

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2504
2505
  	if (err) {
  		/* re-insert swap space back into swap_list */
cf0cac0a0   Cesar Eduardo Barros   mm: refactor rein...
2506
  		reinsert_swap_info(p);
039939a65   Tim Chen   mm/swap: enable s...
2507
  		reenable_swap_slots_cache_unlock();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2508
2509
  		goto out_dput;
  	}
52b7efdbe   Hugh Dickins   [PATCH] swap: sca...
2510

039939a65   Tim Chen   mm/swap: enable s...
2511
  	reenable_swap_slots_cache_unlock();
eb085574a   Huang Ying   mm, swap: fix rac...
2512
  	/*
63d8620ec   Miaohe Lin   mm/swapfile: use ...
2513
2514
2515
2516
2517
  	 * Wait for swap operations protected by get/put_swap_device()
  	 * to complete.
  	 *
  	 * We need synchronize_rcu() here to protect access to the
  	 * swap cache data structure.
eb085574a   Huang Ying   mm, swap: fix rac...
2518
  	 */
63d8620ec   Miaohe Lin   mm/swapfile: use ...
2519
  	percpu_ref_kill(&p->users);
eb085574a   Huang Ying   mm, swap: fix rac...
2520
  	synchronize_rcu();
63d8620ec   Miaohe Lin   mm/swapfile: use ...
2521
  	wait_for_completion(&p->comp);
eb085574a   Huang Ying   mm, swap: fix rac...
2522

815c2c543   Shaohua Li   swap: make swap d...
2523
  	flush_work(&p->discard_work);
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2524
  	destroy_swap_extents(p);
570a335b8   Hugh Dickins   swap_info: swap c...
2525
2526
  	if (p->flags & SWP_CONTINUED)
  		free_swap_count_continuations(p);
81a0298bd   Huang Ying   mm, swap: don't u...
2527
2528
  	if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
  		atomic_dec(&nr_rotate_swap);
fc0abb145   Ingo Molnar   [PATCH] sem2mutex...
2529
  	mutex_lock(&swapon_mutex);
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2530
  	spin_lock(&swap_lock);
ec8acf20a   Shaohua Li   swap: add per-par...
2531
  	spin_lock(&p->lock);
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2532
  	drain_mmlist();
bb243f7dc   Miaohe Lin   mm/swapfile: move...
2533
  	/* wait for anyone still in scan_swap_map_slots */
52b7efdbe   Hugh Dickins   [PATCH] swap: sca...
2534
2535
  	p->highest_bit = 0;		/* cuts scans short */
  	while (p->flags >= SWP_SCANNING) {
ec8acf20a   Shaohua Li   swap: add per-par...
2536
  		spin_unlock(&p->lock);
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2537
  		spin_unlock(&swap_lock);
13e4b57f6   Nishanth Aravamudan   [PATCH] mm: fix-u...
2538
  		schedule_timeout_uninterruptible(1);
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2539
  		spin_lock(&swap_lock);
ec8acf20a   Shaohua Li   swap: add per-par...
2540
  		spin_lock(&p->lock);
52b7efdbe   Hugh Dickins   [PATCH] swap: sca...
2541
  	}
52b7efdbe   Hugh Dickins   [PATCH] swap: sca...
2542

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2543
  	swap_file = p->swap_file;
5b808a230   Krzysztof Kozlowski   swap: fix set_blo...
2544
  	old_block_size = p->old_block_size;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2545
2546
2547
2548
  	p->swap_file = NULL;
  	p->max = 0;
  	swap_map = p->swap_map;
  	p->swap_map = NULL;
2a8f94493   Shaohua Li   swap: change bloc...
2549
2550
  	cluster_info = p->cluster_info;
  	p->cluster_info = NULL;
4f89849da   Minchan Kim   frontswap: get ri...
2551
  	frontswap_map = frontswap_map_get(p);
ec8acf20a   Shaohua Li   swap: add per-par...
2552
  	spin_unlock(&p->lock);
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2553
  	spin_unlock(&swap_lock);
8a84802e2   Steven Price   mm: Add arch hook...
2554
  	arch_swap_invalidate_area(p->type);
adfab836f   Dan Streetman   swap: change swap...
2555
  	frontswap_invalidate_area(p->type);
58e97ba6b   Krzysztof Kozlowski   frontswap: enable...
2556
  	frontswap_map_set(p, NULL);
fc0abb145   Ingo Molnar   [PATCH] sem2mutex...
2557
  	mutex_unlock(&swapon_mutex);
ebc2a1a69   Shaohua Li   swap: make cluste...
2558
2559
  	free_percpu(p->percpu_cluster);
  	p->percpu_cluster = NULL;
490705888   Huang Ying   swap: reduce lock...
2560
2561
  	free_percpu(p->cluster_next_cpu);
  	p->cluster_next_cpu = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2562
  	vfree(swap_map);
54f180d3c   Huang Ying   mm, swap: use kvz...
2563
2564
  	kvfree(cluster_info);
  	kvfree(frontswap_map);
2de1a7e40   Seth Jennings   mm/swapfile.c: fi...
2565
  	/* Destroy swap account information */
adfab836f   Dan Streetman   swap: change swap...
2566
  	swap_cgroup_swapoff(p->type);
4b3ef9daa   Huang, Ying   mm/swap: split sw...
2567
  	exit_swap_address_space(p->type);
27a7faa07   KAMEZAWA Hiroyuki   memcg: swap cgrou...
2568

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2569
2570
2571
  	inode = mapping->host;
  	if (S_ISBLK(inode->i_mode)) {
  		struct block_device *bdev = I_BDEV(inode);
1638045c3   Darrick J. Wong   mm: set S_SWAPFIL...
2572

5b808a230   Krzysztof Kozlowski   swap: fix set_blo...
2573
  		set_blocksize(bdev, old_block_size);
e525fd89d   Tejun Heo   block: make blkde...
2574
  		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2575
  	}
1638045c3   Darrick J. Wong   mm: set S_SWAPFIL...
2576
2577
2578
2579
  
  	inode_lock(inode);
  	inode->i_flags &= ~S_SWAPFILE;
  	inode_unlock(inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2580
  	filp_close(swap_file, NULL);
f893ab41e   Weijie Yang   mm/swap: fix race...
2581
2582
2583
2584
2585
2586
2587
2588
2589
  
  	/*
  	 * Clear the SWP_USED flag after all resources are freed so that swapon
  	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
  	 * not hold p->lock after we cleared its SWP_WRITEOK.
  	 */
  	spin_lock(&swap_lock);
  	p->flags = 0;
  	spin_unlock(&swap_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2590
  	err = 0;
66d7dd518   Kay Sievers   /proc/swaps: supp...
2591
2592
  	atomic_inc(&proc_poll_event);
  	wake_up_interruptible(&proc_poll_wait);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2593
2594
2595
2596
  
  out_dput:
  	filp_close(victim, NULL);
  out:
f58b59c1d   Xiaotian Feng   swapfile: fix nam...
2597
  	putname(pathname);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2598
2599
2600
2601
  	return err;
  }
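  /*
   * Usage note (illustrative): this syscall is normally reached via
   * swapoff(8), e.g. "swapoff /dev/sdb2", which passes the device path
   * as 'specialfile'.
   */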
  
  #ifdef CONFIG_PROC_FS
9dd957485   Al Viro   ipc, kernel, mm: ...
2602
  static __poll_t swaps_poll(struct file *file, poll_table *wait)
66d7dd518   Kay Sievers   /proc/swaps: supp...
2603
  {
f15146380   Kay Sievers   fs: seq_file - ad...
2604
  	struct seq_file *seq = file->private_data;
66d7dd518   Kay Sievers   /proc/swaps: supp...
2605
2606
  
  	poll_wait(file, &proc_poll_wait, wait);
f15146380   Kay Sievers   fs: seq_file - ad...
2607
2608
  	if (seq->poll_event != atomic_read(&proc_poll_event)) {
  		seq->poll_event = atomic_read(&proc_poll_event);
a9a08845e   Linus Torvalds   vfs: do bulk POLL...
2609
  		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
66d7dd518   Kay Sievers   /proc/swaps: supp...
2610
  	}
a9a08845e   Linus Torvalds   vfs: do bulk POLL...
2611
  	return EPOLLIN | EPOLLRDNORM;
66d7dd518   Kay Sievers   /proc/swaps: supp...
2612
  }
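  /*
   * Minimal userspace sketch (illustrative, not kernel code): thanks to
   * swaps_poll() above, a process can block until the set of active swap
   * devices changes, much as /proc/mounts is watched.
   */
  #if 0	/* example only */
  #include <fcntl.h>
  #include <poll.h>
  #include <stdio.h>
  #include <unistd.h>
  
  int main(void)
  {
  	char buf[4096];
  	int fd = open("/proc/swaps", O_RDONLY);
  	struct pollfd pfd = { .fd = fd, .events = POLLERR | POLLPRI };
  
  	if (fd < 0)
  		return 1;
  	for (;;) {
  		while (read(fd, buf, sizeof(buf)) > 0)
  			;			/* consume the current table */
  		poll(&pfd, 1, -1);		/* wakes on swapon/swapoff */
  		lseek(fd, 0, SEEK_SET);
  		puts("swap configuration changed");
  	}
  }
  #endif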
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2613
2614
2615
  /* iterator */
  static void *swap_start(struct seq_file *swap, loff_t *pos)
  {
efa90a981   Hugh Dickins   swap_info: change...
2616
2617
  	struct swap_info_struct *si;
  	int type;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2618
  	loff_t l = *pos;
fc0abb145   Ingo Molnar   [PATCH] sem2mutex...
2619
  	mutex_lock(&swapon_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2620

881e4aabe   Suleiman Souhlal   [PATCH] Always pr...
2621
2622
  	if (!l)
  		return SEQ_START_TOKEN;
c10d38cc8   Daniel Jordan   mm, swap: bounds ...
2623
  	for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
efa90a981   Hugh Dickins   swap_info: change...
2624
  		if (!(si->flags & SWP_USED) || !si->swap_map)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2625
  			continue;
881e4aabe   Suleiman Souhlal   [PATCH] Always pr...
2626
  		if (!--l)
efa90a981   Hugh Dickins   swap_info: change...
2627
  			return si;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2628
2629
2630
2631
2632
2633
2634
  	}
  
  	return NULL;
  }
  
  static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
  {
efa90a981   Hugh Dickins   swap_info: change...
2635
2636
  	struct swap_info_struct *si = v;
  	int type;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2637

881e4aabe   Suleiman Souhlal   [PATCH] Always pr...
2638
  	if (v == SEQ_START_TOKEN)
efa90a981   Hugh Dickins   swap_info: change...
2639
2640
2641
  		type = 0;
  	else
  		type = si->type + 1;
881e4aabe   Suleiman Souhlal   [PATCH] Always pr...
2642

10c8d69f3   Vasily Averin   mm/swapfile.c: sw...
2643
  	++(*pos);
c10d38cc8   Daniel Jordan   mm, swap: bounds ...
2644
  	for (; (si = swap_type_to_swap_info(type)); type++) {
efa90a981   Hugh Dickins   swap_info: change...
2645
  		if (!(si->flags & SWP_USED) || !si->swap_map)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2646
  			continue;
efa90a981   Hugh Dickins   swap_info: change...
2647
  		return si;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2648
2649
2650
2651
2652
2653
2654
  	}
  
  	return NULL;
  }
  
  static void swap_stop(struct seq_file *swap, void *v)
  {
fc0abb145   Ingo Molnar   [PATCH] sem2mutex...
2655
  	mutex_unlock(&swapon_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2656
2657
2658
2659
  }
  
  static int swap_show(struct seq_file *swap, void *v)
  {
efa90a981   Hugh Dickins   swap_info: change...
2660
  	struct swap_info_struct *si = v;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2661
2662
  	struct file *file;
  	int len;
6f7939405   Randy Dunlap   mm: swapfile: fix...
2663
  	unsigned int bytes, inuse;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2664

efa90a981   Hugh Dickins   swap_info: change...
2665
  	if (si == SEQ_START_TOKEN) {
68d68ff6e   Zhiyuan Dai   mm/mempool: minor...
2666
2667
  		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority
  ");
881e4aabe   Suleiman Souhlal   [PATCH] Always pr...
2668
2669
  		return 0;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2670

6f7939405   Randy Dunlap   mm: swapfile: fix...
2671
2672
  	bytes = si->pages << (PAGE_SHIFT - 10);
  	inuse = si->inuse_pages << (PAGE_SHIFT - 10);
efa90a981   Hugh Dickins   swap_info: change...
2673
  	file = si->swap_file;
2726d5662   Miklos Szeredi   vfs: add seq_file...
2674
2675
  	len = seq_file_path(swap, file, " \t\n\\");
6f7939405   Randy Dunlap   mm: swapfile: fix...
2676
2677
  	seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d
  ",
886bb7e9c   Hugh Dickins   swapfile: remove ...
2678
  			len < 40 ? 40 - len : 1, " ",
496ad9aa8   Al Viro   new helper: file_...
2679
  			S_ISBLK(file_inode(file)->i_mode) ?
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2680
  				"partition" : "file\t",
6f7939405   Randy Dunlap   mm: swapfile: fix...
2681
2682
  			bytes, bytes < 10000000 ? "\t" : "",
  			inuse, inuse < 10000000 ? "\t" : "",
efa90a981   Hugh Dickins   swap_info: change...
2683
  			si->prio);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2684
2685
  	return 0;
  }
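  /*
   * Example /proc/swaps output produced by swap_show() (the devices and
   * figures below are illustrative only):
   *
   * Filename                                Type            Size          Used          Priority
   * /dev/sda2                               partition       8388604       1053420       -2
   * /swapfile                               file            2097148       0             -3
   */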
15ad7cdcf   Helge Deller   [PATCH] struct se...
2686
  static const struct seq_operations swaps_op = {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2687
2688
2689
2690
2691
2692
2693
2694
  	.start =	swap_start,
  	.next =		swap_next,
  	.stop =		swap_stop,
  	.show =		swap_show
  };
  
  static int swaps_open(struct inode *inode, struct file *file)
  {
f15146380   Kay Sievers   fs: seq_file - ad...
2695
  	struct seq_file *seq;
66d7dd518   Kay Sievers   /proc/swaps: supp...
2696
  	int ret;
66d7dd518   Kay Sievers   /proc/swaps: supp...
2697
  	ret = seq_open(file, &swaps_op);
f15146380   Kay Sievers   fs: seq_file - ad...
2698
  	if (ret)
66d7dd518   Kay Sievers   /proc/swaps: supp...
2699
  		return ret;
66d7dd518   Kay Sievers   /proc/swaps: supp...
2700

f15146380   Kay Sievers   fs: seq_file - ad...
2701
2702
2703
  	seq = file->private_data;
  	seq->poll_event = atomic_read(&proc_poll_event);
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2704
  }
97a32539b   Alexey Dobriyan   proc: convert eve...
2705
  static const struct proc_ops swaps_proc_ops = {
d919b33da   Alexey Dobriyan   proc: faster open...
2706
  	.proc_flags	= PROC_ENTRY_PERMANENT,
97a32539b   Alexey Dobriyan   proc: convert eve...
2707
2708
2709
2710
2711
  	.proc_open	= swaps_open,
  	.proc_read	= seq_read,
  	.proc_lseek	= seq_lseek,
  	.proc_release	= seq_release,
  	.proc_poll	= swaps_poll,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2712
2713
2714
2715
  };
  
  static int __init procswaps_init(void)
  {
97a32539b   Alexey Dobriyan   proc: convert eve...
2716
  	proc_create("swaps", 0, NULL, &swaps_proc_ops);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2717
2718
2719
2720
  	return 0;
  }
  __initcall(procswaps_init);
  #endif /* CONFIG_PROC_FS */
1796316a8   Jan Beulich   x86: consolidate ...
2721
2722
2723
2724
2725
2726
2727
2728
  #ifdef MAX_SWAPFILES_CHECK
  static int __init max_swapfiles_check(void)
  {
  	MAX_SWAPFILES_CHECK();
  	return 0;
  }
  late_initcall(max_swapfiles_check);
  #endif
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
2729
  static struct swap_info_struct *alloc_swap_info(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2730
  {
73c34b6ac   Hugh Dickins   swap_info: miscel...
2731
  	struct swap_info_struct *p;
b11a76b37   Qian Cai   mm/swapfile: do n...
2732
  	struct swap_info_struct *defer = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2733
  	unsigned int type;
a2468cc9b   Aaron Lu   swap: choose swap...
2734
  	int i;
efa90a981   Hugh Dickins   swap_info: change...
2735

960087445   Gustavo A. R. Silva   mm/swapfile.c: us...
2736
  	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
efa90a981   Hugh Dickins   swap_info: change...
2737
  	if (!p)
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
2738
  		return ERR_PTR(-ENOMEM);
efa90a981   Hugh Dickins   swap_info: change...
2739

63d8620ec   Miaohe Lin   mm/swapfile: use ...
2740
2741
2742
2743
2744
  	if (percpu_ref_init(&p->users, swap_users_ref_free,
  			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
  		kvfree(p);
  		return ERR_PTR(-ENOMEM);
  	}
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2745
  	spin_lock(&swap_lock);
efa90a981   Hugh Dickins   swap_info: change...
2746
2747
  	for (type = 0; type < nr_swapfiles; type++) {
  		if (!(swap_info[type]->flags & SWP_USED))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2748
  			break;
efa90a981   Hugh Dickins   swap_info: change...
2749
  	}
0697212a4   Christoph Lameter   [PATCH] Swapless ...
2750
  	if (type >= MAX_SWAPFILES) {
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2751
  		spin_unlock(&swap_lock);
63d8620ec   Miaohe Lin   mm/swapfile: use ...
2752
  		percpu_ref_exit(&p->users);
873d7bcfd   Vasily Averin   mm/swapfile.c: us...
2753
  		kvfree(p);
730c0581c   Cesar Eduardo Barros   sys_swapon: simpl...
2754
  		return ERR_PTR(-EPERM);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2755
  	}
efa90a981   Hugh Dickins   swap_info: change...
2756
2757
  	if (type >= nr_swapfiles) {
  		p->type = type;
efa90a981   Hugh Dickins   swap_info: change...
2758
  		/*
a4b451143   Huang Ying   mm, swap: remove ...
2759
2760
  		 * Publish the swap_info_struct after initializing it.
  		 * Note that kvzalloc() above zeroes all its fields.
efa90a981   Hugh Dickins   swap_info: change...
2761
  		 */
a4b451143   Huang Ying   mm, swap: remove ...
2762
2763
  		smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
  		nr_swapfiles++;
efa90a981   Hugh Dickins   swap_info: change...
2764
  	} else {
b11a76b37   Qian Cai   mm/swapfile: do n...
2765
  		defer = p;
efa90a981   Hugh Dickins   swap_info: change...
2766
2767
2768
2769
2770
2771
  		p = swap_info[type];
  		/*
  		 * Do not memset this entry: a racing procfs swap_next()
  		 * would be relying on p->type to remain valid.
  		 */
  	}
4efaceb1c   Aaron Lu   mm, swap: use rbt...
2772
  	p->swap_extent_root = RB_ROOT;
18ab4d4ce   Dan Streetman   swap: change swap...
2773
  	plist_node_init(&p->list, 0);
a2468cc9b   Aaron Lu   swap: choose swap...
2774
2775
  	for_each_node(i)
  		plist_node_init(&p->avail_lists[i], 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2776
  	p->flags = SWP_USED;
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2777
  	spin_unlock(&swap_lock);
63d8620ec   Miaohe Lin   mm/swapfile: use ...
2778
2779
2780
2781
  	if (defer) {
  		percpu_ref_exit(&defer->users);
  		kvfree(defer);
  	}
ec8acf20a   Shaohua Li   swap: add per-par...
2782
  	spin_lock_init(&p->lock);
2628bd6fc   Huang Ying   mm, swap: fix rac...
2783
  	spin_lock_init(&p->cont_lock);
63d8620ec   Miaohe Lin   mm/swapfile: use ...
2784
  	init_completion(&p->comp);
efa90a981   Hugh Dickins   swap_info: change...
2785

53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
2786
  	return p;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
2787
  }
4d0e1e107   Cesar Eduardo Barros   sys_swapon: separ...
2788
2789
2790
2791
2792
  static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
  {
  	int error;
  
  	if (S_ISBLK(inode->i_mode)) {
ef16e1d98   Christoph Hellwig   mm: cleanup claim...
2793
  		p->bdev = blkdev_get_by_dev(inode->i_rdev,
6f179af88   Hugh Dickins   mm: fix potential...
2794
  				   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
ef16e1d98   Christoph Hellwig   mm: cleanup claim...
2795
2796
  		if (IS_ERR(p->bdev)) {
  			error = PTR_ERR(p->bdev);
4d0e1e107   Cesar Eduardo Barros   sys_swapon: separ...
2797
  			p->bdev = NULL;
6f179af88   Hugh Dickins   mm: fix potential...
2798
  			return error;
4d0e1e107   Cesar Eduardo Barros   sys_swapon: separ...
2799
2800
2801
2802
  		}
  		p->old_block_size = block_size(p->bdev);
  		error = set_blocksize(p->bdev, PAGE_SIZE);
  		if (error < 0)
87ade72a7   Cesar Eduardo Barros   sys_swapon: simpl...
2803
  			return error;
12d2966d8   Naohiro Aota   mm, swap: disallo...
2804
2805
2806
2807
2808
  		/*
  		 * Zoned block devices contain zones that have a sequential
  		 * write only restriction.  Hence zoned block devices are not
  		 * suitable for swapping.  Disallow them here.
  		 */
e556f6ba1   Christoph Hellwig   block: remove the...
2809
  		if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
12d2966d8   Naohiro Aota   mm, swap: disallo...
2810
  			return -EINVAL;
4d0e1e107   Cesar Eduardo Barros   sys_swapon: separ...
2811
2812
2813
  		p->flags |= SWP_BLKDEV;
  	} else if (S_ISREG(inode->i_mode)) {
  		p->bdev = inode->i_sb->s_bdev;
1638045c3   Darrick J. Wong   mm: set S_SWAPFIL...
2814
  	}
4d0e1e107   Cesar Eduardo Barros   sys_swapon: separ...
2815
  	return 0;
4d0e1e107   Cesar Eduardo Barros   sys_swapon: separ...
2816
  }
377eeaa8e   Andi Kleen   x86/speculation/l...
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
  
  /*
   * Find out how many pages are allowed for a single swap device. There
   * are two limiting factors:
   * 1) the number of bits for the swap offset in the swp_entry_t type, and
   * 2) the number of bits in the swap pte, as defined by the different
   * architectures.
   *
   * In order to find the largest possible bit mask, a swap entry with
   * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
   * decoded to a swp_entry_t again, and finally the swap offset is
   * extracted.
   *
   * This will mask all the bits from the initial ~0UL mask that can't
   * be encoded in either the swp_entry_t or the architecture definition
   * of a swap pte.
   */
  unsigned long generic_max_swapfile_size(void)
  {
  	return swp_offset(pte_to_swp_entry(
  			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
  }
  
  /* Can be overridden by an architecture for additional checks. */
  __weak unsigned long max_swapfile_size(void)
  {
  	return generic_max_swapfile_size();
  }
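  /*
   * Toy model of the round-trip above (illustrative; the 50 bits are
   * hypothetical): if an architecture's swap pte preserved only the low
   * 50 offset bits, pushing ~0UL through the encode/decode pair would
   * mask everything else off, yielding a limit of (1UL << 50) pages.
   */
  #if 0	/* example only */
  static unsigned long toy_pte_round_trip(unsigned long offset)
  {
  	return offset & ((1UL << 50) - 1);	/* bits surviving the swap pte */
  }
  
  static unsigned long toy_max_swapfile_size(void)
  {
  	return toy_pte_round_trip(~0UL) + 1;	/* == 1UL << 50 pages */
  }
  #endif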
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2845
2846
2847
2848
2849
2850
2851
  static unsigned long read_swap_header(struct swap_info_struct *p,
  					union swap_header *swap_header,
  					struct inode *inode)
  {
  	int i;
  	unsigned long maxpages;
  	unsigned long swapfilepages;
d6bbbd29b   Raymond Jennings   swap: warn when a...
2852
  	unsigned long last_page;
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2853
2854
  
  	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
465c47fd8   Andrew Morton   mm/swapfile.c: co...
2855
2856
  		pr_err("Unable to find swap-space signature
  ");
387190253   Cesar Eduardo Barros   sys_swapon: simpl...
2857
  		return 0;
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2858
  	}
041711ce7   Zhen Lei   mm: fix spelling ...
2859
  	/* swap partition endianness hack... */
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2860
2861
2862
2863
  	if (swab32(swap_header->info.version) == 1) {
  		swab32s(&swap_header->info.version);
  		swab32s(&swap_header->info.last_page);
  		swab32s(&swap_header->info.nr_badpages);
dd111be69   Jann Horn   swapfile: fix mem...
2864
2865
  		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
  			return 0;
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2866
2867
2868
2869
2870
  		for (i = 0; i < swap_header->info.nr_badpages; i++)
  			swab32s(&swap_header->info.badpages[i]);
  	}
  	/* Check the swap header's sub-version */
  	if (swap_header->info.version != 1) {
465c47fd8   Andrew Morton   mm/swapfile.c: co...
2871
2872
2873
  		pr_warn("Unable to handle swap header version %d
  ",
  			swap_header->info.version);
387190253   Cesar Eduardo Barros   sys_swapon: simpl...
2874
  		return 0;
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2875
2876
2877
2878
2879
  	}
  
  	p->lowest_bit  = 1;
  	p->cluster_next = 1;
  	p->cluster_nr = 0;
377eeaa8e   Andi Kleen   x86/speculation/l...
2880
  	maxpages = max_swapfile_size();
d6bbbd29b   Raymond Jennings   swap: warn when a...
2881
  	last_page = swap_header->info.last_page;
a06ad633a   Tom Abraham   swap: divide-by-z...
2882
2883
2884
2885
2886
  	if (!last_page) {
  		pr_warn("Empty swap-file
  ");
  		return 0;
  	}
d6bbbd29b   Raymond Jennings   swap: warn when a...
2887
  	if (last_page > maxpages) {
465c47fd8   Andrew Morton   mm/swapfile.c: co...
2888
2889
  		pr_warn("Truncating oversized swap area, only using %luk out of %luk
  ",
d6bbbd29b   Raymond Jennings   swap: warn when a...
2890
2891
2892
2893
2894
  			maxpages << (PAGE_SHIFT - 10),
  			last_page << (PAGE_SHIFT - 10));
  	}
  	if (maxpages > last_page) {
  		maxpages = last_page + 1;
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2895
2896
2897
2898
2899
2900
2901
  		/* p->max is an unsigned int: don't overflow it */
  		if ((unsigned int)maxpages == 0)
  			maxpages = UINT_MAX;
  	}
  	p->highest_bit = maxpages - 1;
  
  	if (!maxpages)
387190253   Cesar Eduardo Barros   sys_swapon: simpl...
2902
  		return 0;
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2903
2904
  	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
  	if (swapfilepages && maxpages > swapfilepages) {
465c47fd8   Andrew Morton   mm/swapfile.c: co...
2905
2906
  		pr_warn("Swap area shorter than signature indicates
  ");
387190253   Cesar Eduardo Barros   sys_swapon: simpl...
2907
  		return 0;
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2908
2909
  	}
  	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
387190253   Cesar Eduardo Barros   sys_swapon: simpl...
2910
  		return 0;
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2911
  	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
387190253   Cesar Eduardo Barros   sys_swapon: simpl...
2912
  		return 0;
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2913
2914
  
  	return maxpages;
ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
2915
  }
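  /*
   * Worked example (illustrative, hypothetical figures): if the header's
   * last_page is 2^33 but max_swapfile_size() reports 2^30, the warning
   * above fires and maxpages stays at the architectural limit; otherwise
   * maxpages becomes last_page + 1 so that the final page is usable.
   */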
4b3ef9daa   Huang, Ying   mm/swap: split sw...
2916
  #define SWAP_CLUSTER_INFO_COLS						\
235b62176   Huang, Ying   mm/swap: add clus...
2917
  	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
4b3ef9daa   Huang, Ying   mm/swap: split sw...
2918
2919
2920
2921
  #define SWAP_CLUSTER_SPACE_COLS						\
  	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
  #define SWAP_CLUSTER_COLS						\
  	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
235b62176   Huang, Ying   mm/swap: add clus...
2922

915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2923
2924
2925
  static int setup_swap_map_and_extents(struct swap_info_struct *p,
  					union swap_header *swap_header,
  					unsigned char *swap_map,
2a8f94493   Shaohua Li   swap: change bloc...
2926
  					struct swap_cluster_info *cluster_info,
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2927
2928
2929
  					unsigned long maxpages,
  					sector_t *span)
  {
235b62176   Huang, Ying   mm/swap: add clus...
2930
  	unsigned int j, k;
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2931
2932
  	unsigned int nr_good_pages;
  	int nr_extents;
2a8f94493   Shaohua Li   swap: change bloc...
2933
  	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
235b62176   Huang, Ying   mm/swap: add clus...
2934
2935
  	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
  	unsigned long i, idx;
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2936
2937
  
  	nr_good_pages = maxpages - 1;	/* omit header page */
6b5349159   Huang Ying   mm, swap: add swa...
2938
2939
  	cluster_list_init(&p->free_clusters);
  	cluster_list_init(&p->discard_clusters);
2a8f94493   Shaohua Li   swap: change bloc...
2940

915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2941
2942
  	for (i = 0; i < swap_header->info.nr_badpages; i++) {
  		unsigned int page_nr = swap_header->info.badpages[i];
bdb8e3f68   Cesar Eduardo Barros   sys_swapon: simpl...
2943
2944
  		if (page_nr == 0 || page_nr > swap_header->info.last_page)
  			return -EINVAL;
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2945
2946
2947
  		if (page_nr < maxpages) {
  			swap_map[page_nr] = SWAP_MAP_BAD;
  			nr_good_pages--;
2a8f94493   Shaohua Li   swap: change bloc...
2948
2949
2950
2951
2952
  			/*
  			 * Haven't marked the cluster free yet, no list
  			 * operation involved
  			 */
  			inc_cluster_info_page(p, cluster_info, page_nr);
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2953
2954
  		}
  	}
2a8f94493   Shaohua Li   swap: change bloc...
2955
2956
2957
  	/* Haven't marked the cluster free yet, no list operation involved */
  	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
  		inc_cluster_info_page(p, cluster_info, i);
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2958
2959
  	if (nr_good_pages) {
  		swap_map[0] = SWAP_MAP_BAD;
2a8f94493   Shaohua Li   swap: change bloc...
2960
2961
2962
2963
2964
  		/*
  		 * Haven't marked the cluster free yet, no list
  		 * operation involved
  		 */
  		inc_cluster_info_page(p, cluster_info, 0);
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2965
2966
2967
  		p->max = maxpages;
  		p->pages = nr_good_pages;
  		nr_extents = setup_swap_extents(p, span);
bdb8e3f68   Cesar Eduardo Barros   sys_swapon: simpl...
2968
2969
  		if (nr_extents < 0)
  			return nr_extents;
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2970
2971
2972
  		nr_good_pages = p->pages;
  	}
  	if (!nr_good_pages) {
465c47fd8   Andrew Morton   mm/swapfile.c: co...
2973
2974
  		pr_warn("Empty swap-file
  ");
bdb8e3f68   Cesar Eduardo Barros   sys_swapon: simpl...
2975
  		return -EINVAL;
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2976
  	}
2a8f94493   Shaohua Li   swap: change bloc...
2977
2978
  	if (!cluster_info)
  		return nr_extents;
235b62176   Huang, Ying   mm/swap: add clus...
2979

4b3ef9daa   Huang, Ying   mm/swap: split sw...
2980
2981
2982
2983
  	/*
  	 * Reduce false cache line sharing between cluster_info entries
  	 * serving the same swap address space.
  	 */
235b62176   Huang, Ying   mm/swap: add clus...
2984
2985
2986
2987
2988
2989
2990
2991
  	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
  		j = (k + col) % SWAP_CLUSTER_COLS;
  		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
  			idx = i * SWAP_CLUSTER_COLS + j;
  			if (idx >= nr_clusters)
  				continue;
  			if (cluster_count(&cluster_info[idx]))
  				continue;
2a8f94493   Shaohua Li   swap: change bloc...
2992
  			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
6b5349159   Huang Ying   mm, swap: add swa...
2993
2994
  			cluster_list_add_tail(&p->free_clusters, cluster_info,
  					      idx);
2a8f94493   Shaohua Li   swap: change bloc...
2995
  		}
2a8f94493   Shaohua Li   swap: change bloc...
2996
  	}
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2997
  	return nr_extents;
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
2998
  }
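  /*
   * Worked example of the interleaving above (illustrative): with
   * SWAP_CLUSTER_COLS == 4 and col == 1, the free list is built as
   * clusters 1,5,9,... then 2,6,10,..., 3,7,11,..., and finally
   * 0,4,8,...  Consecutive allocations therefore touch cluster_info
   * entries in different cache lines, which is the point of the
   * column-wise walk.
   */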
dcf6b7ddd   Rafael Aquini   swap: discard whi...
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
  /*
   * Helper for sys_swapon() to determine whether a given swap
   * backing device's queue supports DISCARD operations.
   */
  static bool swap_discardable(struct swap_info_struct *si)
  {
  	struct request_queue *q = bdev_get_queue(si->bdev);
  
  	if (!q || !blk_queue_discard(q))
  		return false;
  
  	return true;
  }
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3012
3013
3014
  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
  {
  	struct swap_info_struct *p;
91a27b2a7   Jeff Layton   vfs: define struc...
3015
  	struct filename *name;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3016
3017
  	struct file *swap_file = NULL;
  	struct address_space *mapping;
51cc3a662   Hugh Dickins   fs, mm: fix race ...
3018
  	struct dentry *dentry;
40531542e   Cesar Eduardo Barros   sys_swapon: separ...
3019
  	int prio;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3020
3021
  	int error;
  	union swap_header *swap_header;
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
3022
  	int nr_extents;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3023
3024
  	sector_t span;
  	unsigned long maxpages;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3025
  	unsigned char *swap_map = NULL;
2a8f94493   Shaohua Li   swap: change bloc...
3026
  	struct swap_cluster_info *cluster_info = NULL;
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
3027
  	unsigned long *frontswap_map = NULL;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3028
3029
  	struct page *page = NULL;
  	struct inode *inode = NULL;
7cbf31923   Omar Sandoval   mm: fix nr_rotate...
3030
  	bool inced_nr_rotate_swap = false;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3031

d15cab975   Hugh Dickins   swapon: check val...
3032
3033
  	if (swap_flags & ~SWAP_FLAGS_VALID)
  		return -EINVAL;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3034
3035
  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;
a2468cc9b   Aaron Lu   swap: choose swap...
3036
3037
  	if (!swap_avail_heads)
  		return -ENOMEM;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3038
  	p = alloc_swap_info();
2542e5134   Cesar Eduardo Barros   sys_swapon: simpl...
3039
3040
  	if (IS_ERR(p))
  		return PTR_ERR(p);
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3041

815c2c543   Shaohua Li   swap: make swap d...
3042
  	INIT_WORK(&p->discard_work, swap_discard_work);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3043
  	name = getname(specialfile);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3044
  	if (IS_ERR(name)) {
7de7fb6b3   Cesar Eduardo Barros   sys_swapon: move ...
3045
  		error = PTR_ERR(name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3046
  		name = NULL;
bd69010b0   Cesar Eduardo Barros   sys_swapon: use a...
3047
  		goto bad_swap;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3048
  	}
669abf4e5   Jeff Layton   vfs: make path_op...
3049
  	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3050
  	if (IS_ERR(swap_file)) {
7de7fb6b3   Cesar Eduardo Barros   sys_swapon: move ...
3051
  		error = PTR_ERR(swap_file);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3052
  		swap_file = NULL;
bd69010b0   Cesar Eduardo Barros   sys_swapon: use a...
3053
  		goto bad_swap;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3054
3055
3056
3057
  	}
  
  	p->swap_file = swap_file;
  	mapping = swap_file->f_mapping;
51cc3a662   Hugh Dickins   fs, mm: fix race ...
3058
  	dentry = swap_file->f_path.dentry;
2130781e2   Cesar Eduardo Barros   sys_swapon: fix i...
3059
  	inode = mapping->host;
6f179af88   Hugh Dickins   mm: fix potential...
3060

4d0e1e107   Cesar Eduardo Barros   sys_swapon: separ...
3061
3062
  	error = claim_swapfile(p, inode);
  	if (unlikely(error))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3063
  		goto bad_swap;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3064

d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3065
  	inode_lock(inode);
51cc3a662   Hugh Dickins   fs, mm: fix race ...
3066
3067
3068
3069
  	if (d_unlinked(dentry) || cant_mount(dentry)) {
  		error = -ENOENT;
  		goto bad_swap_unlock_inode;
  	}
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3070
3071
3072
3073
  	if (IS_SWAPFILE(inode)) {
  		error = -EBUSY;
  		goto bad_swap_unlock_inode;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3074
3075
3076
3077
3078
  	/*
  	 * Read the swap header.
  	 */
  	if (!mapping->a_ops->readpage) {
  		error = -EINVAL;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3079
  		goto bad_swap_unlock_inode;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3080
  	}
090d2b185   Pekka Enberg   [PATCH] read_mapp...
3081
  	page = read_mapping_page(mapping, 0, swap_file);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3082
3083
  	if (IS_ERR(page)) {
  		error = PTR_ERR(page);
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3084
  		goto bad_swap_unlock_inode;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3085
  	}
81e339712   Hugh Dickins   swapfile: remove ...
3086
  	swap_header = kmap(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3087

ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
3088
3089
  	maxpages = read_swap_header(p, swap_header, inode);
  	if (unlikely(!maxpages)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3090
  		error = -EINVAL;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3091
  		goto bad_swap_unlock_inode;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3092
  	}
886bb7e9c   Hugh Dickins   swapfile: remove ...
3093

81e339712   Hugh Dickins   swapfile: remove ...
3094
  	/* OK, set up the swap map and apply the bad block list */
803d0c835   Cesar Eduardo Barros   sys_swapon: use v...
3095
  	swap_map = vzalloc(maxpages);
81e339712   Hugh Dickins   swapfile: remove ...
3096
3097
  	if (!swap_map) {
  		error = -ENOMEM;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3098
  		goto bad_swap_unlock_inode;
81e339712   Hugh Dickins   swapfile: remove ...
3099
  	}
f05714293   Minchan Kim   mm: support anony...
3100

1cb039f3d   Christoph Hellwig   bdi: replace BDI_...
3101
  	if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
f05714293   Minchan Kim   mm: support anony...
3102
  		p->flags |= SWP_STABLE_WRITES;
a8b456d01   Christoph Hellwig   bdi: remove BDI_C...
3103
  	if (p->bdev && p->bdev->bd_disk->fops->rw_page)
539a6fea7   Minchan Kim   mm, swap: introdu...
3104
  		p->flags |= SWP_SYNCHRONOUS_IO;
2a8f94493   Shaohua Li   swap: change bloc...
3105
  	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
6f179af88   Hugh Dickins   mm: fix potential...
3106
  		int cpu;
235b62176   Huang, Ying   mm/swap: add clus...
3107
  		unsigned long ci, nr_cluster;
6f179af88   Hugh Dickins   mm: fix potential...
3108

2a8f94493   Shaohua Li   swap: change bloc...
3109
  		p->flags |= SWP_SOLIDSTATE;
490705888   Huang Ying   swap: reduce lock...
3110
3111
3112
3113
3114
  		p->cluster_next_cpu = alloc_percpu(unsigned int);
  		if (!p->cluster_next_cpu) {
  			error = -ENOMEM;
  			goto bad_swap_unlock_inode;
  		}
2a8f94493   Shaohua Li   swap: change bloc...
3115
3116
3117
3118
  		/*
  		 * Select a random position to start with, to help with
  		 * SSD wear leveling.
  		 */
490705888   Huang Ying   swap: reduce lock...
3119
3120
3121
3122
  		for_each_possible_cpu(cpu) {
  			per_cpu(*p->cluster_next_cpu, cpu) =
  				1 + prandom_u32_max(p->highest_bit);
  		}
235b62176   Huang, Ying   mm/swap: add clus...
3123
  		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2a8f94493   Shaohua Li   swap: change bloc...
3124

778e1cdd8   Kees Cook   treewide: kvzallo...
3125
  		cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
54f180d3c   Huang Ying   mm, swap: use kvz...
3126
  					GFP_KERNEL);
2a8f94493   Shaohua Li   swap: change bloc...
3127
3128
  		if (!cluster_info) {
  			error = -ENOMEM;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3129
  			goto bad_swap_unlock_inode;
2a8f94493   Shaohua Li   swap: change bloc...
3130
  		}
235b62176   Huang, Ying   mm/swap: add clus...
3131
3132
3133
  
  		for (ci = 0; ci < nr_cluster; ci++)
  			spin_lock_init(&((cluster_info + ci)->lock));
ebc2a1a69   Shaohua Li   swap: make cluste...
3134
3135
3136
  		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
  		if (!p->percpu_cluster) {
  			error = -ENOMEM;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3137
  			goto bad_swap_unlock_inode;
ebc2a1a69   Shaohua Li   swap: make cluste...
3138
  		}
6f179af88   Hugh Dickins   mm: fix potential...
3139
  		for_each_possible_cpu(cpu) {
ebc2a1a69   Shaohua Li   swap: make cluste...
3140
  			struct percpu_cluster *cluster;
6f179af88   Hugh Dickins   mm: fix potential...
3141
  			cluster = per_cpu_ptr(p->percpu_cluster, cpu);
ebc2a1a69   Shaohua Li   swap: make cluste...
3142
3143
  			cluster_set_null(&cluster->index);
  		}
7cbf31923   Omar Sandoval   mm: fix nr_rotate...
3144
  	} else {
81a0298bd   Huang Ying   mm, swap: don't u...
3145
  		atomic_inc(&nr_rotate_swap);
7cbf31923   Omar Sandoval   mm: fix nr_rotate...
3146
3147
  		inced_nr_rotate_swap = true;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3148

1421ef3cd   Cesar Eduardo Barros   sys_swapon: call ...
3149
3150
  	error = swap_cgroup_swapon(p->type, maxpages);
  	if (error)
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3151
  		goto bad_swap_unlock_inode;
1421ef3cd   Cesar Eduardo Barros   sys_swapon: call ...
3152

915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
3153
  	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2a8f94493   Shaohua Li   swap: change bloc...
3154
  		cluster_info, maxpages, &span);
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
3155
3156
  	if (unlikely(nr_extents < 0)) {
  		error = nr_extents;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3157
  		goto bad_swap_unlock_inode;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3158
  	}
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
3159
  	/* frontswap enabled? set up bit-per-page map for frontswap */
8ea1d2a19   Vlastimil Babka   mm, frontswap: co...
3160
  	if (IS_ENABLED(CONFIG_FRONTSWAP))
778e1cdd8   Kees Cook   treewide: kvzallo...
3161
3162
  		frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
  					 sizeof(long),
54f180d3c   Huang Ying   mm, swap: use kvz...
3163
  					 GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3164

68d68ff6e   Zhiyuan Dai   mm/mempool: minor...
3165
  	if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2a8f94493   Shaohua Li   swap: change bloc...
3166
3167
3168
3169
3170
3171
3172
3173
  		/*
  		 * When discard is enabled for swap with no particular
  		 * policy flagged, we set all swap discard flags here in
  		 * order to sustain backward compatibility with older
  		 * swapon(8) releases.
  		 */
  		p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
  			     SWP_PAGE_DISCARD);
dcf6b7ddd   Rafael Aquini   swap: discard whi...
3174

2a8f94493   Shaohua Li   swap: change bloc...
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
  		/*
  		 * By flagging sys_swapon, a sysadmin can tell us to
  		 * either do single-time area discards only, or to just
  		 * perform discards for released swap page-clusters.
  		 * Now it's time to adjust the p->flags accordingly.
  		 */
  		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
  			p->flags &= ~SWP_PAGE_DISCARD;
  		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
  			p->flags &= ~SWP_AREA_DISCARD;
  
  		/* issue a swapon-time discard if it's still required */
  		if (p->flags & SWP_AREA_DISCARD) {
  			int err = discard_swap(p);
  			if (unlikely(err))
  				pr_err("swapon: discard_swap(%p): %d
  ",
  					p, err);
dcf6b7ddd   Rafael Aquini   swap: discard whi...
3193
  		}
20137a490   Hugh Dickins   swapfile: swapon ...
3194
  	}
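  /*
   * Summary of the discard policy above (illustrative):
   *   SWAP_FLAG_DISCARD alone          -> area + page-cluster discards
   *   ... | SWAP_FLAG_DISCARD_ONCE    -> one swapon-time area discard only
   *   ... | SWAP_FLAG_DISCARD_PAGES   -> per page-cluster discards only
   */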

	error = init_swap_address_space(p->type, maxpages);
	if (error)
		goto bad_swap_unlock_inode;

	/*
	 * Flush any pending IO and dirty mappings before we start using this
	 * swap device.
	 */
	inode->i_flags |= S_SWAPFILE;
	error = inode_drain_writes(inode);
	if (error) {
		inode->i_flags &= ~S_SWAPFILE;
		goto free_swap_address_space;
	}

	mutex_lock(&swapon_mutex);
	prio = -1;
	if (swap_flags & SWAP_FLAG_PREFER)
		prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);

	pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "",
		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
		(frontswap_map) ? "FS" : "");

	mutex_unlock(&swapon_mutex);
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	error = 0;
	goto out;
free_swap_address_space:
	exit_swap_address_space(p->type);
bad_swap_unlock_inode:
	inode_unlock(inode);
bad_swap:
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;
	free_percpu(p->cluster_next_cpu);
	p->cluster_next_cpu = NULL;
	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
		set_blocksize(p->bdev, p->old_block_size);
		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}
	inode = NULL;
	destroy_swap_extents(p);
	swap_cgroup_swapoff(p->type);
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	kvfree(cluster_info);
	kvfree(frontswap_map);
	if (inced_nr_rotate_swap)
		atomic_dec(&nr_rotate_swap);
	if (swap_file)
		filp_close(swap_file, NULL);
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		put_page(page);
	}
	if (name)
		putname(name);
	if (inode)
		inode_unlock(inode);
	if (!error)
		enable_swap_slots_cache();
	return error;
}
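
/*
 * Illustrative sketch (editor's example, not in-tree code): the discard and
 * priority flags handled above come from userspace via swapon(2).  A caller
 * asking for page-cluster discards only, at priority 5, might do something
 * like the following, using the SWAP_FLAG_* bits this function parses:
 *
 *	int prio = 5;
 *
 *	if (swapon("/dev/sdb2", SWAP_FLAG_PREFER |
 *		   ((prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK) |
 *		   SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_PAGES) < 0)
 *		perror("swapon");
 *
 * With those flags the code above clears SWP_AREA_DISCARD and keeps
 * SWP_PAGE_DISCARD, so no swapon-time area discard is issued.
 */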
  
  void si_swapinfo(struct sysinfo *val)
  {
	unsigned int type;
	unsigned long nr_to_be_unused = 0;

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *si = swap_info[type];

		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
			nr_to_be_unused += si->inuse_pages;
	}
	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	spin_unlock(&swap_lock);
}
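
/*
 * Editor's sketch (hypothetical userspace reader, not in-tree code): the
 * freeswap/totalswap values filled in above reach userspace through
 * sysinfo(2), in units of mem_unit bytes:
 *
 *	struct sysinfo si;
 *
 *	if (sysinfo(&si) == 0)
 *		printf("swap: %llu/%llu kB free\n",
 *		       (unsigned long long)si.freeswap * si.mem_unit / 1024,
 *		       (unsigned long long)si.totalswap * si.mem_unit / 1024);
 */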
  
/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Returns an error code in the following cases:
 * - success -> 0
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is migration entry -> EINVAL
 * - swap-cache reference is requested but there is already one. -> EEXIST
 * - swap-cache reference is requested but the entry is not used. -> ENOENT
 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned char count;
	unsigned char has_cache;
	int err;

	p = get_swap_device(entry);
	if (!p)
		return -EINVAL;

	offset = swp_offset(entry);
	ci = lock_cluster_or_swap_info(p, offset);

	count = p->swap_map[offset];

	/*
	 * swapin_readahead() doesn't check if a swap entry is valid, so the
	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
	 */
	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
		err = -ENOENT;
		goto unlock_out;
	}

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;
	err = 0;

	if (usage == SWAP_HAS_CACHE) {

		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
		if (!has_cache && count)
			has_cache = SWAP_HAS_CACHE;
		else if (has_cache)		/* someone else added cache */
			err = -EEXIST;
		else				/* no users remaining */
			err = -ENOENT;

	} else if (count || has_cache) {

		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
			count += usage;
		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
			err = -EINVAL;
		else if (swap_count_continued(p, offset, count))
			count = COUNT_CONTINUED;
		else
			err = -ENOMEM;
	} else
		err = -ENOENT;			/* unused swap entry */

	WRITE_ONCE(p->swap_map[offset], count | has_cache);

unlock_out:
	unlock_cluster_or_swap_info(p, ci);
	if (p)
		put_swap_device(p);
	return err;
}

/*
 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}
  
/*
 * Increase reference count of swap entry by 1.
 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
 * might occur if a page table entry has got corrupted.
 */
int swap_duplicate(swp_entry_t entry)
{
	int err = 0;

	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
		err = add_swap_count_continuation(entry, GFP_ATOMIC);
	return err;
}
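
/*
 * Illustrative caller pattern (editor's sketch; compare the comment above
 * add_swap_count_continuation() below): a caller holding a page table lock
 * must not sleep, so a failed GFP_ATOMIC continuation is retried with
 * GFP_KERNEL once the lock has been dropped, roughly:
 *
 *	if (swap_duplicate(entry) < 0) {
 *		spin_unlock(ptl);		// drop the page table lock
 *		if (add_swap_count_continuation(entry, GFP_KERNEL))
 *			return -ENOMEM;		// really out of memory
 *		spin_lock(ptl);			// retake the lock and retry
 *	}
 */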

/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * This can return error codes. Returns 0 at success.
 * -EEXIST means there is a swap cache.
 * Note: return code is different from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
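
/*
 * Editor's sketch of the expected use (not a verbatim in-tree caller): when
 * adding a page to the swap cache, -EEXIST from swapcache_prepare() means
 * someone else holds the cache reference, so a caller typically backs off
 * and retries rather than failing outright:
 *
 *	for (;;) {
 *		int err = swapcache_prepare(entry);
 *
 *		if (!err)
 *			break;			// we now own SWAP_HAS_CACHE
 *		if (err != -EEXIST)
 *			return NULL;		// entry gone: -ENOENT etc.
 *		cond_resched();			// raced with another task
 *	}
 */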

struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
	return swap_type_to_swap_info(swp_type(entry));
}

struct swap_info_struct *page_swap_info(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };
	return swp_swap_info(entry);
}

/*
 * out-of-line __page_file_ methods to avoid include hell.
 */
struct address_space *__page_file_mapping(struct page *page)
{
	return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);

pgoff_t __page_file_index(struct page *page)
{
	swp_entry_t swap = { .val = page_private(page) };
	return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);

/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;
	int ret = 0;

	/*
	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = get_swap_device(entry);
	if (!si) {
		/*
		 * An acceptable race has occurred since the failing
		 * __swap_duplicate(): the swap device may have been
		 * swapped off.
		 */
		goto outer;
	}
	spin_lock(&si->lock);

	offset = swp_offset(entry);

	ci = lock_cluster(si, offset);

	count = swap_count(si->swap_map[offset]);

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The higher the swap count, the more likely it is that tasks
		 * will race to add swap count continuation: we need to avoid
		 * over-provisioning.
		 */
		goto out;
	}

	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
	 * no architecture is using highmem pages for kernel page tables: so it
	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	spin_lock(&si->cont_lock);
	/*
	 * Page allocation does not initialize the page's lru field,
	 * but it does always reset its private field.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * If the previous map said no continuation, but we've found
		 * a continuation page, free our allocation and use this one.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out_unlock_cont;

		map = kmap_atomic(list_page) + offset;
		count = *map;
		kunmap_atomic(map);

		/*
		 * If this continuation count now has some space in it,
		 * free our allocation and use this one.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out_unlock_cont;
	}

	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now it's attached, don't free it */
out_unlock_cont:
	spin_unlock(&si->cont_lock);
out:
	unlock_cluster(ci);
	spin_unlock(&si->lock);
	put_swap_device(si);
outer:
	if (page)
		__free_page(page);
	return ret;
}
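
/*
 * Editor's worked example ("think of how you add 1 to 999", ignoring
 * SWAP_HAS_CACHE): duplicating an entry whose swap_map byte has reached
 * SWAP_MAP_MAX drives swap_count_continued() below through its init_map
 * path, so the bytes change as
 *
 *	swap_map[offset]:	SWAP_MAP_MAX  ->  COUNT_CONTINUED (low digit 0)
 *	continuation byte:	(uninitialized)  ->  1
 *
 * i.e. the low "digit" restarts at zero and the continuation byte now
 * carries one unit worth SWAP_MAP_MAX + 1 references; later duplications
 * increment the low digit again, and continuation digits themselves carry
 * at SWAP_CONT_MAX.
 */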
  
/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 *
 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
 * lock.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;
	bool ret;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* need to add count continuation */
	}

	spin_lock(&si->cont_lock);
	offset &= ~PAGE_MASK;
	page = list_next_entry(head, lru);
	map = kmap_atomic(page) + offset;

	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
		goto init_map;		/* jump over SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Think of how you add 1 to 999
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map);
			page = list_next_entry(page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			kunmap_atomic(map);
			page = list_next_entry(page, lru);
			if (page == head) {
				ret = false;	/* add count continuation */
				goto out;
			}
			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* we didn't zero the page */
		}
		*map += 1;
		kunmap_atomic(map);
		while ((page = list_prev_entry(page, lru)) != head) {
			map = kmap_atomic(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map);
		}
		ret = true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Think of how you subtract 1 from 1000
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map);
			page = list_next_entry(page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		if (*map == 0)
			count = 0;
		kunmap_atomic(map);
		while ((page = list_prev_entry(page, lru)) != head) {
			map = kmap_atomic(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map);
		}
		ret = count == COUNT_CONTINUED;
	}
out:
	spin_unlock(&si->cont_lock);
	return ret;
}
  
/*
 * free_swap_count_continuations - called at swapoff to free all the
 * continuation pages appended to the swap_map, after swap_map is quiesced,
 * before vfree'ing it.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
	pgoff_t offset;

	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
		struct page *head;
		head = vmalloc_to_page(si->swap_map + offset);
		if (page_private(head)) {
			struct page *page, *next;

			list_for_each_entry_safe(page, next, &head->lru, lru) {
				list_del(&page->lru);
				__free_page(page);
			}
		}
	}
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
	struct swap_info_struct *si, *next;
	int nid = page_to_nid(page);

	if (!(gfp_mask & __GFP_IO))
		return;

	if (!blk_cgroup_congested())
		return;

	/*
	 * We've already scheduled a throttle, avoid taking the global swap
	 * lock.
	 */
	if (current->throttle_queue)
		return;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
				  avail_lists[nid]) {
		if (si->bdev) {
			blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
			break;
		}
	}
	spin_unlock(&swap_avail_lock);
}
#endif
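
/*
 * Editor's note (sketch of intended use; assumes an inline
 * mem_cgroup_disabled()-checking wrapper cgroup_throttle_swaprate() in
 * <linux/swap.h>): allocation paths pass the gfp mask of the current
 * allocation, and the plist walk above only runs when block I/O for this
 * cgroup is actually congested, e.g.:
 *
 *	cgroup_throttle_swaprate(page, GFP_KERNEL);
 */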

static int __init swapfile_init(void)
{
	int nid;

	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
					 GFP_KERNEL);
	if (!swap_avail_heads) {
		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
		return -ENOMEM;
	}

	for_each_node(nid)
		plist_head_init(&swap_avail_heads[nid]);

	return 0;
}
subsys_initcall(swapfile_init);