mm/swapfile.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>

#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by get_swap_page() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_page() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);

static struct swap_info_struct *swap_type_to_swap_info(int type)
{
	if (type >= READ_ONCE(nr_swapfiles))
		return NULL;

	smp_rmb();	/* Pairs with smp_wmb in alloc_swap_info. */
	return READ_ONCE(swap_info[type]);
}
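
/*
 * Illustrative ordering sketch for the barrier above (the publisher side
 * lives in alloc_swap_info(); shown here only to document the pairing,
 * not as a verbatim copy of that code):
 *
 *	swap_info[type] = p;
 *	smp_wmb();
 *	WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
 *
 * A reader that sees the incremented nr_swapfiles is then guaranteed by
 * the smp_rmb() to also see the initialized swap_info[type] pointer.
 */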

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
}
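
/*
 * Worked example of the swap_map encoding (flag values as defined in
 * <linux/swap.h>, quoted here only for illustration): with
 * SWAP_HAS_CACHE == 0x40 and COUNT_CONTINUED == 0x80, a map byte of 0x43
 * describes an entry with three references that is also in the swap
 * cache, so swap_count(0x43) returns 3.
 */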

/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY		0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL		0x4

/* returns 1 if swap entry is freed */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
				 unsigned long offset, unsigned long flags)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(swap_address_space(entry), offset);
	if (!page)
		return 0;
	/*
	 * This function is called from scan_swap_map_slots(), which in turn
	 * may be called by vmscan.c while reclaiming pages, so a page lock
	 * may already be held here. We have to use trylock to avoid
	 * deadlock. This is a special case: in usual operations, use
	 * try_to_free_swap() with an explicit lock_page() instead.
	 */
	if (trylock_page(page)) {
		if ((flags & TTRS_ANYWAY) ||
		    ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
		    ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
			ret = try_to_free_swap(page);
		unlock_page(page);
	}
	put_page(page);
	return ret;
}

static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
	struct rb_node *rb = rb_first(&sis->swap_extent_root);
	return rb_entry(rb, struct swap_extent, rb_node);
}

static inline struct swap_extent *next_se(struct swap_extent *se)
{
	struct rb_node *rb = rb_next(&se->rb_node);
	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
157
  /*
6a6ba8317   Hugh Dickins   swapfile: swapon ...
158
159
160
161
162
163
   * swapon tell device that all the old swap contents can be discarded,
   * to allow the swap device to optimize its wear-levelling.
   */
  static int discard_swap(struct swap_info_struct *si)
  {
  	struct swap_extent *se;
9625a5f28   Hugh Dickins   swap_info: includ...
164
165
  	sector_t start_block;
  	sector_t nr_blocks;
6a6ba8317   Hugh Dickins   swapfile: swapon ...
166
  	int err = 0;
9625a5f28   Hugh Dickins   swap_info: includ...
167
  	/* Do not discard the swap header page! */
4efaceb1c   Aaron Lu   mm, swap: use rbt...
168
  	se = first_se(si);
9625a5f28   Hugh Dickins   swap_info: includ...
169
170
171
172
  	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
  	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
  	if (nr_blocks) {
  		err = blkdev_issue_discard(si->bdev, start_block,
dd3932edd   Christoph Hellwig   block: remove BLK...
173
  				nr_blocks, GFP_KERNEL, 0);
9625a5f28   Hugh Dickins   swap_info: includ...
174
175
176
177
  		if (err)
  			return err;
  		cond_resched();
  	}
6a6ba8317   Hugh Dickins   swapfile: swapon ...
178

4efaceb1c   Aaron Lu   mm, swap: use rbt...
179
  	for (se = next_se(se); se; se = next_se(se)) {
9625a5f28   Hugh Dickins   swap_info: includ...
180
181
  		start_block = se->start_block << (PAGE_SHIFT - 9);
  		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
6a6ba8317   Hugh Dickins   swapfile: swapon ...
182
183
  
  		err = blkdev_issue_discard(si->bdev, start_block,
dd3932edd   Christoph Hellwig   block: remove BLK...
184
  				nr_blocks, GFP_KERNEL, 0);
6a6ba8317   Hugh Dickins   swapfile: swapon ...
185
186
187
188
189
190
191
  		if (err)
  			break;
  
  		cond_resched();
  	}
  	return err;		/* That will often be -EOPNOTSUPP */
  }
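
/*
 * The "<< (PAGE_SHIFT - 9)" shifts above convert page counts to 512-byte
 * sectors, which is what blkdev_issue_discard() expects. For example,
 * with 4 KiB pages (PAGE_SHIFT == 12) each page spans 2^(12 - 9) = 8
 * sectors, so an extent of 16 pages becomes 128 sectors.
 */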

static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
	struct swap_extent *se;
	struct rb_node *rb;

	rb = sis->swap_extent_root.rb_node;
	while (rb) {
		se = rb_entry(rb, struct swap_extent, rb_node);
		if (offset < se->start_page)
			rb = rb->rb_left;
		else if (offset >= se->start_page + se->nr_pages)
			rb = rb->rb_right;
		else
			return se;
	}
	/* It *must* be present */
	BUG();
}

/*
 * swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = offset_to_swap_extent(si, start_page);

	while (nr_pages) {
		pgoff_t offset = start_page - se->start_page;
		sector_t start_block = se->start_block + offset;
		sector_t nr_blocks = se->nr_pages - offset;

		if (nr_blocks > nr_pages)
			nr_blocks = nr_pages;
		start_page += nr_blocks;
		nr_pages -= nr_blocks;

		start_block <<= PAGE_SHIFT - 9;
		nr_blocks <<= PAGE_SHIFT - 9;
		if (blkdev_issue_discard(si->bdev, start_block,
					nr_blocks, GFP_NOIO, 0))
			break;

		se = next_se(se);
	}
}

#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR

#define swap_entry_size(size)	(size)
#else
#define SWAPFILE_CLUSTER	256

/*
 * Define swap_entry_size() as constant to let the compiler optimize
 * out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_size(size)	1
#endif
#define LATENCY_LIMIT		256

static inline void cluster_set_flag(struct swap_cluster_info *info,
	unsigned int flag)
{
	info->flags = flag;
}

static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_count(struct swap_cluster_info *info,
				     unsigned int c)
{
	info->data = c;
}

static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					 unsigned int c, unsigned int f)
{
	info->flags = f;
	info->data = c;
}

static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_next(struct swap_cluster_info *info,
				    unsigned int n)
{
	info->data = n;
}

static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	info->flags = f;
	info->data = n;
}

static inline bool cluster_is_free(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_FREE;
}

static inline bool cluster_is_null(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_NEXT_NULL;
}

static inline void cluster_set_null(struct swap_cluster_info *info)
{
	info->flags = CLUSTER_FLAG_NEXT_NULL;
	info->data = 0;
}
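
/*
 * Note how the accessors above overload swap_cluster_info::data: for a
 * cluster sitting on a list, data holds the index of the next cluster
 * (the free and discard lists are threaded through the cluster_info
 * array by index rather than by pointer), while for an allocated
 * cluster it holds the usage count. The flags field tells the two
 * meanings apart.
 */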

static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
	if (IS_ENABLED(CONFIG_THP_SWAP))
		return info->flags & CLUSTER_FLAG_HUGE;
	return false;
}

static inline void cluster_clear_huge(struct swap_cluster_info *info)
{
	info->flags &= ~CLUSTER_FLAG_HUGE;
}

static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
						     unsigned long offset)
{
	struct swap_cluster_info *ci;

	ci = si->cluster_info;
	if (ci) {
		ci += offset / SWAPFILE_CLUSTER;
		spin_lock(&ci->lock);
	}
	return ci;
}

static inline void unlock_cluster(struct swap_cluster_info *ci)
{
	if (ci)
		spin_unlock(&ci->lock);
}

/*
 * Determine the locking method in use for this device.  Return
 * swap_cluster_info if SSD-style cluster-based locking is in place.
 */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
		struct swap_info_struct *si, unsigned long offset)
{
	struct swap_cluster_info *ci;

	/* Try to use fine-grained SSD-style locking if available: */
	ci = lock_cluster(si, offset);
	/* Otherwise, fall back to traditional, coarse locking: */
	if (!ci)
		spin_lock(&si->lock);

	return ci;
}

static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
					       struct swap_cluster_info *ci)
{
	if (ci)
		unlock_cluster(ci);
	else
		spin_unlock(&si->lock);
}
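
/*
 * Typical calling pattern (a minimal sketch, not a verbatim caller from
 * this file):
 *
 *	struct swap_cluster_info *ci;
 *
 *	ci = lock_cluster_or_swap_info(si, offset);
 *	... examine or update si->swap_map[offset] ...
 *	unlock_cluster_or_swap_info(si, ci);
 *
 * The ci returned by the lock side must be passed back unchanged so the
 * matching lock (cluster lock or si->lock) is the one released.
 */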

static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
	return cluster_is_null(&list->head);
}

static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
	return cluster_next(&list->head);
}

static void cluster_list_init(struct swap_cluster_list *list)
{
	cluster_set_null(&list->head);
	cluster_set_null(&list->tail);
}

static void cluster_list_add_tail(struct swap_cluster_list *list,
				  struct swap_cluster_info *ci,
				  unsigned int idx)
{
	if (cluster_list_empty(list)) {
		cluster_set_next_flag(&list->head, idx, 0);
		cluster_set_next_flag(&list->tail, idx, 0);
	} else {
		struct swap_cluster_info *ci_tail;
		unsigned int tail = cluster_next(&list->tail);

		/*
		 * Nested cluster lock, but both cluster locks are
		 * only acquired while we hold swap_info_struct->lock
		 */
		ci_tail = ci + tail;
		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
		cluster_set_next(ci_tail, idx);
		spin_unlock(&ci_tail->lock);
		cluster_set_next_flag(&list->tail, idx, 0);
	}
}

static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
					   struct swap_cluster_info *ci)
{
	unsigned int idx;

	idx = cluster_next(&list->head);
	if (cluster_next(&list->tail) == idx) {
		cluster_set_null(&list->head);
		cluster_set_null(&list->tail);
	} else
		cluster_set_next_flag(&list->head,
				      cluster_next(&ci[idx]), 0);

	return idx;
}

/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		unsigned int idx)
{
	/*
	 * If scan_swap_map() can't find a free cluster, it will check
	 * si->swap_map directly. To make sure the discarding cluster isn't
	 * taken by scan_swap_map(), mark the swap entries bad (occupied).
	 * They will be cleared after discard.
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
			SWAP_MAP_BAD, SWAPFILE_CLUSTER);

	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

	schedule_work(&si->discard_work);
}

static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
	cluster_list_add_tail(&si->free_clusters, ci, idx);
}

/*
 * Do the discards queued on the discard list. After a cluster discard is
 * finished, the cluster will be added to the free cluster list. Caller
 * should hold si->lock.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *info, *ci;
	unsigned int idx;

	info = si->cluster_info;

	while (!cluster_list_empty(&si->discard_clusters)) {
		idx = cluster_list_del_first(&si->discard_clusters, info);
		spin_unlock(&si->lock);

		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&si->lock);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		__free_cluster(si, idx);
		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
				0, SWAPFILE_CLUSTER);
		unlock_cluster(ci);
	}
}

static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	spin_lock(&si->lock);
	swap_do_scheduled_discard(si);
	spin_unlock(&si->lock);
}
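
/*
 * The unlock/relock around discard_swap_cluster() above is deliberate:
 * blkdev_issue_discard() can sleep, so si->lock must not be held across
 * it. The entries were already marked SWAP_MAP_BAD by
 * swap_cluster_schedule_discard(), which keeps allocators away from the
 * cluster while the lock is dropped.
 */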

static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
	cluster_list_del_first(&si->free_clusters, ci);
	cluster_set_count_flag(ci + idx, 0, 0);
}

static void free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info + idx;

	VM_BUG_ON(cluster_count(ci) != 0);
	/*
	 * If the swap is discardable, prepare to discard the cluster
	 * instead of freeing it immediately. The cluster will be freed
	 * after discard.
	 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, idx);
		return;
	}

	__free_cluster(si, idx);
}

/*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from free cluster list and its usage counter will be increased.
 */
static void inc_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;
	if (cluster_is_free(&cluster_info[idx]))
		alloc_cluster(p, idx);

	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) + 1);
}

/*
 * The cluster corresponding to page_nr decreases one usage. If the usage
 * counter becomes 0, which means no page in the cluster is in use, we can
 * optionally discard the cluster and add it to the free cluster list.
 */
static void dec_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;

	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) - 1);
	if (cluster_count(&cluster_info[idx]) == 0)
		free_cluster(p, idx);
}
  
/*
 * It's possible scan_swap_map() uses a free cluster in the middle of the
 * free cluster list. Avoid such abuse to prevent list corruption.
 */
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
	unsigned long offset)
{
	struct percpu_cluster *percpu_cluster;
	bool conflict;

	offset /= SWAPFILE_CLUSTER;
	conflict = !cluster_list_empty(&si->free_clusters) &&
		offset != cluster_list_first(&si->free_clusters) &&
		cluster_is_free(&si->cluster_info[offset]);

	if (!conflict)
		return false;

	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
	cluster_set_null(&percpu_cluster->index);
	return true;
}
  
/*
 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
 * might involve allocating a new cluster for current CPU too.
 */
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
	unsigned long *offset, unsigned long *scan_base)
{
	struct percpu_cluster *cluster;
	struct swap_cluster_info *ci;
	unsigned long tmp, max;

new_cluster:
	cluster = this_cpu_ptr(si->percpu_cluster);
	if (cluster_is_null(&cluster->index)) {
		if (!cluster_list_empty(&si->free_clusters)) {
			cluster->index = si->free_clusters.head;
			cluster->next = cluster_next(&cluster->index) *
					SWAPFILE_CLUSTER;
		} else if (!cluster_list_empty(&si->discard_clusters)) {
			/*
			 * we don't have free cluster but have some clusters in
			 * discarding, do discard now and reclaim them, then
			 * reread cluster_next_cpu since we dropped si->lock
			 */
			swap_do_scheduled_discard(si);
			*scan_base = this_cpu_read(*si->cluster_next_cpu);
			*offset = *scan_base;
			goto new_cluster;
		} else
			return false;
	}

	/*
	 * Other CPUs can use our cluster if they can't find a free cluster;
	 * check if there is still a free entry in the cluster.
	 */
	tmp = cluster->next;
	max = min_t(unsigned long, si->max,
		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
	if (tmp < max) {
		ci = lock_cluster(si, tmp);
		while (tmp < max) {
			if (!si->swap_map[tmp])
				break;
			tmp++;
		}
		unlock_cluster(ci);
	}
	if (tmp >= max) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	cluster->next = tmp + 1;
	*offset = tmp;
	*scan_base = tmp;
	return true;
}
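
/*
 * Design note: each CPU caches one cluster in si->percpu_cluster and
 * hands out slots from it sequentially, so allocators running on
 * different CPUs write into different SWAPFILE_CLUSTER-sized regions
 * instead of contending on the same offsets; cluster->next records how
 * far into its cached cluster a CPU has advanced.
 */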

static void __del_from_avail_list(struct swap_info_struct *p)
{
	int nid;

	for_each_node(nid)
		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}

static void del_from_avail_list(struct swap_info_struct *p)
{
	spin_lock(&swap_avail_lock);
	__del_from_avail_list(p);
	spin_unlock(&swap_avail_lock);
}

static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
			     unsigned int nr_entries)
{
	unsigned int end = offset + nr_entries - 1;

	if (offset == si->lowest_bit)
		si->lowest_bit += nr_entries;
	if (end == si->highest_bit)
		WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
	si->inuse_pages += nr_entries;
	if (si->inuse_pages == si->pages) {
		si->lowest_bit = si->max;
		si->highest_bit = 0;
		del_from_avail_list(si);
	}
}

static void add_to_avail_list(struct swap_info_struct *p)
{
	int nid;

	spin_lock(&swap_avail_lock);
	for_each_node(nid) {
		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
	}
	spin_unlock(&swap_avail_lock);
}

static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
			    unsigned int nr_entries)
{
	unsigned long begin = offset;
	unsigned long end = offset + nr_entries - 1;
	void (*swap_slot_free_notify)(struct block_device *, unsigned long);

	if (offset < si->lowest_bit)
		si->lowest_bit = offset;
	if (end > si->highest_bit) {
		bool was_full = !si->highest_bit;

		WRITE_ONCE(si->highest_bit, end);
		if (was_full && (si->flags & SWP_WRITEOK))
			add_to_avail_list(si);
	}
	atomic_long_add(nr_entries, &nr_swap_pages);
	si->inuse_pages -= nr_entries;
	if (si->flags & SWP_BLKDEV)
		swap_slot_free_notify =
			si->bdev->bd_disk->fops->swap_slot_free_notify;
	else
		swap_slot_free_notify = NULL;
	while (offset <= end) {
		arch_swap_invalidate_page(si->type, offset);
		frontswap_invalidate_page(si->type, offset);
		if (swap_slot_free_notify)
			swap_slot_free_notify(si->bdev, offset);
		offset++;
	}
	clear_shadow_from_swap_cache(si->type, begin, end);
}

static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
{
	unsigned long prev;

	if (!(si->flags & SWP_SOLIDSTATE)) {
		si->cluster_next = next;
		return;
	}

	prev = this_cpu_read(*si->cluster_next_cpu);
	/*
	 * Cross the swap address space size aligned trunk, choose
	 * another trunk randomly to avoid lock contention on swap
	 * address space if possible.
	 */
	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
	    (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
		/* No free swap slots available */
		if (si->highest_bit <= si->lowest_bit)
			return;
		next = si->lowest_bit +
			prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
		next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
		next = max_t(unsigned int, next, si->lowest_bit);
	}
	this_cpu_write(*si->cluster_next_cpu, next);
}
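
/*
 * Worked example (assuming SWAP_ADDRESS_SPACE_SHIFT == 14, i.e. 16384
 * slots per swap address space trunk, as defined in <linux/swap.h>): a
 * CPU whose allocation pointer moves from slot 16383 to 16384 crosses a
 * trunk boundary and is redirected to a random trunk-aligned position in
 * [lowest_bit, highest_bit], so that CPUs spread across trunks instead
 * of all converging on one swap address space and its lock.
 */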

static int scan_swap_map_slots(struct swap_info_struct *si,
			       unsigned char usage, int nr,
			       swp_entry_t slots[])
{
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int n_ret = 0;
	bool scanned_many = false;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster.  This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages.  -- sct
	 * But we do now try to find an empty cluster.  -Andrea
	 * And we let swap pages go all over an SSD partition.  Hugh
	 */
	si->flags += SWP_SCANNING;
	/*
	 * Use percpu scan base for SSD to reduce lock contention on
	 * cluster and swap cache.  For HDD, sequential access is more
	 * important.
	 */
	if (si->flags & SWP_SOLIDSTATE)
		scan_base = this_cpu_read(*si->cluster_next_cpu);
	else
		scan_base = si->cluster_next;
	offset = scan_base;

	/* SSD algorithm */
	if (si->cluster_info) {
		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto scan;
	} else if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}

		spin_unlock(&si->lock);

		/*
		 * If seek is expensive, start searching for new cluster from
		 * start of partition, to minimize the span of allocated swap.
		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
		 */
		scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&si->lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = scan_base;
		spin_lock(&si->lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
	}

checks:
	if (si->cluster_info) {
		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
			/* take a break if we already got some slots */
			if (n_ret)
				goto done;
			if (!scan_swap_map_try_ssd_cluster(si, &offset,
							&scan_base))
				goto scan;
		}
	}
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	ci = lock_cluster(si, offset);
	/* reuse swap entry of cache-only swap if not busy. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;

		unlock_cluster(ci);
		spin_unlock(&si->lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
		spin_lock(&si->lock);
		/* entry was freed successfully, try to use this again */
		if (swap_was_freed)
			goto checks;
		goto scan; /* check next one */
	}

	if (si->swap_map[offset]) {
		unlock_cluster(ci);
		if (!n_ret)
			goto scan;
		else
			goto done;
	}
	WRITE_ONCE(si->swap_map[offset], usage);
	inc_cluster_info_page(si, si->cluster_info, offset);
	unlock_cluster(ci);

	swap_range_alloc(si, offset, 1);
	slots[n_ret++] = swp_entry(si->type, offset);

	/* got enough slots or reach max slots? */
	if ((n_ret == nr) || (offset >= si->highest_bit))
		goto done;

	/* search for next available slot */

	/* time to take a break? */
	if (unlikely(--latency_ration < 0)) {
		if (n_ret)
			goto done;
		spin_unlock(&si->lock);
		cond_resched();
		spin_lock(&si->lock);
		latency_ration = LATENCY_LIMIT;
	}

	/* try to get more slots in cluster */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
	} else if (si->cluster_nr && !si->swap_map[++offset]) {
		/* non-ssd case, still more slots in cluster? */
		--si->cluster_nr;
		goto checks;
	}

	/*
	 * Even if there's no free clusters available (fragmented),
	 * try to scan a little more quickly with lock held unless we
	 * have scanned too many slots already.
	 */
	if (!scanned_many) {
		unsigned long scan_limit;

		if (offset < scan_base)
			scan_limit = scan_base;
		else
			scan_limit = si->highest_bit;
		for (; offset <= scan_limit && --latency_ration > 0;
		     offset++) {
			if (!si->swap_map[offset])
				goto checks;
		}
	}

done:
	set_cluster_next(si, offset + 1);
	si->flags -= SWP_SCANNING;
	return n_ret;

scan:
	spin_unlock(&si->lock);
	while (++offset <= READ_ONCE(si->highest_bit)) {
		if (data_race(!si->swap_map[offset])) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() &&
		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
			scanned_many = true;
		}
	}
	offset = si->lowest_bit;
	while (offset < scan_base) {
		if (data_race(!si->swap_map[offset])) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() &&
		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
			scanned_many = true;
		}
		offset++;
	}
	spin_lock(&si->lock);

no_page:
	si->flags -= SWP_SCANNING;
	return n_ret;
}
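
/*
 * Control-flow summary for scan_swap_map_slots() (descriptive only):
 * "checks:" validates and claims the slot at the current offset,
 * "scan:" is the slow linear sweep used when cluster-guided allocation
 * finds nothing, and "done:"/"no_page:" store the next scan position and
 * clear SWP_SCANNING before returning how many entries were stored in
 * slots[].
 */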

static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
	unsigned long idx;
	struct swap_cluster_info *ci;
	unsigned long offset, i;
	unsigned char *map;

	/*
	 * Should not even be attempting cluster allocations when huge
	 * page swap is disabled.  Warn and fail the allocation.
	 */
	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
		VM_WARN_ON_ONCE(1);
		return 0;
	}

	if (cluster_list_empty(&si->free_clusters))
		return 0;

	idx = cluster_list_first(&si->free_clusters);
	offset = idx * SWAPFILE_CLUSTER;
	ci = lock_cluster(si, offset);
	alloc_cluster(si, idx);
	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);

	map = si->swap_map + offset;
	for (i = 0; i < SWAPFILE_CLUSTER; i++)
		map[i] = SWAP_HAS_CACHE;
	unlock_cluster(ci);
	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
	*slot = swp_entry(si->type, offset);

	return 1;
}
  
static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	unsigned long offset = idx * SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;

	ci = lock_cluster(si, offset);
	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
	cluster_set_count_flag(ci, 0, 0);
	free_cluster(si, idx);
	unlock_cluster(ci);
	swap_range_free(si, offset, SWAPFILE_CLUSTER);
}

static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	swp_entry_t entry;
	int n_ret;

	n_ret = scan_swap_map_slots(si, usage, 1, &entry);

	if (n_ret)
		return swp_offset(entry);
	else
		return 0;
}

int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
	unsigned long size = swap_entry_size(entry_size);
	struct swap_info_struct *si, *next;
	long avail_pgs;
	int n_ret = 0;
	int node;

	/* Only single cluster request supported */
	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);

	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
	if (avail_pgs <= 0)
		goto noswap;

	n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);

	atomic_long_sub(n_goal * size, &nr_swap_pages);

	spin_lock(&swap_avail_lock);

start_over:
	node = numa_node_id();
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
		/* requeue si to after same-priority siblings */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);
		spin_lock(&si->lock);
		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
			spin_lock(&swap_avail_lock);
			if (plist_node_empty(&si->avail_lists[node])) {
				spin_unlock(&si->lock);
				goto nextsi;
			}
			WARN(!si->highest_bit,
			     "swap_info %d in list but !highest_bit\n",
			     si->type);
			WARN(!(si->flags & SWP_WRITEOK),
			     "swap_info %d in list but !SWP_WRITEOK\n",
			     si->type);
			__del_from_avail_list(si);
			spin_unlock(&si->lock);
			goto nextsi;
		}
		if (size == SWAPFILE_CLUSTER) {
			if (si->flags & SWP_BLKDEV)
				n_ret = swap_alloc_cluster(si, swp_entries);
		} else
			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
						    n_goal, swp_entries);
		spin_unlock(&si->lock);
		if (n_ret || size == SWAPFILE_CLUSTER)
			goto check_out;
		pr_debug("scan_swap_map of si %d failed to find offset\n",
			si->type);

		spin_lock(&swap_avail_lock);
nextsi:
		/*
		 * if we got here, it's likely that si was almost full before,
		 * and since scan_swap_map() can drop the si->lock, multiple
		 * callers probably all tried to get a page from the same si
		 * and it filled up before we could get one; or, the si filled
		 * up between us dropping swap_avail_lock and taking si->lock.
		 * Since we dropped the swap_avail_lock, the swap_avail_head
		 * list may have been modified; so if next is still in the
		 * swap_avail_head list then try it, otherwise start over
		 * if we have not gotten any slots.
		 */
		if (plist_node_empty(&next->avail_lists[node]))
			goto start_over;
	}

	spin_unlock(&swap_avail_lock);

check_out:
	if (n_ret < n_goal)
		atomic_long_add((long)(n_goal - n_ret) * size,
				&nr_swap_pages);
noswap:
	return n_ret;
}
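
/*
 * Usage note (a descriptive sketch; that the swap-slot cache in
 * mm/swap_slots.c is the main batching consumer is an assumption here):
 * n_goal is clamped to SWAP_BATCH and to the nr_swap_pages estimate, the
 * estimate is debited from nr_swap_pages up front, and whatever could
 * not actually be allocated is credited back at check_out.
 */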

/* The only caller of this function is now suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
	struct swap_info_struct *si = swap_type_to_swap_info(type);
	pgoff_t offset;

	if (!si)
		goto fail;

	spin_lock(&si->lock);
	if (si->flags & SWP_WRITEOK) {
		atomic_long_dec(&nr_swap_pages);
		/* This is called for allocating swap entry, not cache */
		offset = scan_swap_map(si, 1);
		if (offset) {
			spin_unlock(&si->lock);
			return swp_entry(type, offset);
		}
		atomic_long_inc(&nr_swap_pages);
	}
	spin_unlock(&si->lock);
fail:
	return (swp_entry_t) {0};
}

static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned long offset;

	if (!entry.val)
		goto out;
	p = swp_swap_info(entry);
	if (!p)
		goto bad_nofile;
	if (data_race(!(p->flags & SWP_USED)))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	return p;

bad_offset:
	pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
	goto out;
bad_device:
	pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
	goto out;
bad_nofile:
	pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
out:
	return NULL;
}

static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = __swap_info_get(entry);
	if (!p)
		goto out;
	if (data_race(!p->swap_map[swp_offset(entry)]))
		goto bad_free;
	return p;

bad_free:
	pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
out:
	return NULL;
}

static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p)
		spin_lock(&p->lock);
	return p;
}
7c00bafee   Tim Chen   mm/swap: free swa...
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
  static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
  					struct swap_info_struct *q)
  {
  	struct swap_info_struct *p;
  
  	p = _swap_info_get(entry);
  
  	if (p != q) {
  		if (q != NULL)
  			spin_unlock(&q->lock);
  		if (p != NULL)
  			spin_lock(&p->lock);
  	}
  	return p;
  }
b32d5f32b   Huang Ying   mm/swapfile.c: ad...
1170
1171
1172
  static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
  					      unsigned long offset,
  					      unsigned char usage)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1173
  {
8d69aaee8   Hugh Dickins   swap_info: swap_m...
1174
1175
  	unsigned char count;
  	unsigned char has_cache;
235b62176   Huang, Ying   mm/swap: add clus...
1176

253d553ba   Hugh Dickins   swap_info: SWAP_H...
1177
  	count = p->swap_map[offset];
235b62176   Huang, Ying   mm/swap: add clus...
1178

253d553ba   Hugh Dickins   swap_info: SWAP_H...
1179
1180
  	has_cache = count & SWAP_HAS_CACHE;
  	count &= ~SWAP_HAS_CACHE;
355cfa73d   KAMEZAWA Hiroyuki   mm: modify swap_m...
1181

253d553ba   Hugh Dickins   swap_info: SWAP_H...
1182
  	if (usage == SWAP_HAS_CACHE) {
355cfa73d   KAMEZAWA Hiroyuki   mm: modify swap_m...
1183
  		VM_BUG_ON(!has_cache);
253d553ba   Hugh Dickins   swap_info: SWAP_H...
1184
  		has_cache = 0;
aaa468653   Hugh Dickins   swap_info: note S...
1185
1186
1187
1188
1189
1190
  	} else if (count == SWAP_MAP_SHMEM) {
  		/*
  		 * Or we could insist on shmem.c using a special
  		 * swap_shmem_free() and free_shmem_swap_and_cache()...
  		 */
  		count = 0;
570a335b8   Hugh Dickins   swap_info: swap c...
1191
1192
1193
1194
1195
1196
1197
1198
1199
  	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
  		if (count == COUNT_CONTINUED) {
  			if (swap_count_continued(p, offset, count))
  				count = SWAP_MAP_MAX | COUNT_CONTINUED;
  			else
  				count = SWAP_MAP_MAX;
  		} else
  			count--;
  	}
253d553ba   Hugh Dickins   swap_info: SWAP_H...
1200

253d553ba   Hugh Dickins   swap_info: SWAP_H...
1201
  	usage = count | has_cache;
a449bf58e   Qian Cai   mm/swapfile: fix ...
1202
1203
1204
1205
  	if (usage)
  		WRITE_ONCE(p->swap_map[offset], usage);
  	else
  		WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
7c00bafee   Tim Chen   mm/swap: free swa...
1206

b32d5f32b   Huang Ying   mm/swapfile.c: ad...
1207
1208
  	return usage;
  }
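
  /*
   * A worked example of the swap_map transitions above (hypothetical
   * map byte; SWAP_HAS_CACHE == 0x40, SWAP_MAP_MAX == 0x3e):
   *
   *	map[offset] == 0x43	(SWAP_HAS_CACHE | count of 3)
   *	__swap_entry_free_locked(p, offset, 1)
   *	map[offset] == 0x42	(SWAP_HAS_CACHE | count of 2)
   *
   * Once both the count and SWAP_HAS_CACHE drop to zero, the slot is
   * written as SWAP_HAS_CACHE so that swap_entry_free(), reached via
   * free_swap_slot(), reclaims it rather than this helper freeing it.
   */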
eb085574a   Huang Ying   mm, swap: fix rac...
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
  /*
   * Check whether swap entry is valid in the swap device.  If so,
   * return pointer to swap_info_struct, and keep the swap entry valid
   * by preventing the swap device from being swapped off, until
   * put_swap_device() is called.  Otherwise return NULL.
   *
   * The entirety of the RCU read critical section must come before the
   * return from or after the call to synchronize_rcu() in
   * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
   * true, the si->map, si->cluster_info, etc. must be valid in the
   * critical section.
   *
   * Notice that swapoff or swapoff+swapon can still happen before the
   * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
   * in put_swap_device() if there isn't any other way to prevent
   * swapoff, such as page lock, page table lock, etc.  The caller must
   * be prepared for that.  For example, the following situation is
   * possible.
   *
   *   CPU1				CPU2
   *   do_swap_page()
   *     ...				swapoff+swapon
   *     __read_swap_cache_async()
   *       swapcache_prepare()
   *         __swap_duplicate()
   *           // check swap_map
   *     // verify PTE not changed
   *
   * In __swap_duplicate(), the swap_map needs to be checked before
   * changing it, partly because the specified swap entry may be for
   * another swap device which has been swapped off.  And in
   * do_swap_page(), after the page is read from the swap device, the
   * PTE is verified not to have changed, with the page table locked,
   * to check whether the swap device has been swapped off or swapped
   * off and then back on.
   */
  struct swap_info_struct *get_swap_device(swp_entry_t entry)
  {
  	struct swap_info_struct *si;
  	unsigned long offset;
  
  	if (!entry.val)
  		goto out;
  	si = swp_swap_info(entry);
  	if (!si)
  		goto bad_nofile;
  
  	rcu_read_lock();
a449bf58e   Qian Cai   mm/swapfile: fix ...
1256
  	if (data_race(!(si->flags & SWP_VALID)))
eb085574a   Huang Ying   mm, swap: fix rac...
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
  		goto unlock_out;
  	offset = swp_offset(entry);
  	if (offset >= si->max)
  		goto unlock_out;
  
  	return si;
  bad_nofile:
  	pr_err("%s: %s%08lx
  ", __func__, Bad_file, entry.val);
  out:
  	return NULL;
  unlock_out:
  	rcu_read_unlock();
  	return NULL;
  }
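
  /*
   * A minimal usage sketch for the pair above (hypothetical caller;
   * __swap_count() below follows the same pattern).  Fields such as
   * si->swap_map stay valid until put_swap_device() drops the RCU
   * read lock:
   *
   *	struct swap_info_struct *si = get_swap_device(entry);
   *	if (si) {
   *		count = swap_count(si->swap_map[swp_offset(entry)]);
   *		put_swap_device(si);
   *	}
   */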
b32d5f32b   Huang Ying   mm/swapfile.c: ad...
1272
  static unsigned char __swap_entry_free(struct swap_info_struct *p,
33e16272f   Wei Yang   mm/swapfile.c: __...
1273
  				       swp_entry_t entry)
b32d5f32b   Huang Ying   mm/swapfile.c: ad...
1274
1275
1276
  {
  	struct swap_cluster_info *ci;
  	unsigned long offset = swp_offset(entry);
33e16272f   Wei Yang   mm/swapfile.c: __...
1277
  	unsigned char usage;
b32d5f32b   Huang Ying   mm/swapfile.c: ad...
1278
1279
  
  	ci = lock_cluster_or_swap_info(p, offset);
33e16272f   Wei Yang   mm/swapfile.c: __...
1280
  	usage = __swap_entry_free_locked(p, offset, 1);
7c00bafee   Tim Chen   mm/swap: free swa...
1281
  	unlock_cluster_or_swap_info(p, ci);
10e364da1   Huang Ying   mm/swapfile.c: ca...
1282
1283
  	if (!usage)
  		free_swap_slot(entry);
7c00bafee   Tim Chen   mm/swap: free swa...
1284
1285
1286
  
  	return usage;
  }
355cfa73d   KAMEZAWA Hiroyuki   mm: modify swap_m...
1287

7c00bafee   Tim Chen   mm/swap: free swa...
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
  static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
  {
  	struct swap_cluster_info *ci;
  	unsigned long offset = swp_offset(entry);
  	unsigned char count;
  
  	ci = lock_cluster(p, offset);
  	count = p->swap_map[offset];
  	VM_BUG_ON(count != SWAP_HAS_CACHE);
  	p->swap_map[offset] = 0;
  	dec_cluster_info_page(p, p->cluster_info, offset);
235b62176   Huang, Ying   mm/swap: add clus...
1299
  	unlock_cluster(ci);
38d8b4e6b   Huang Ying   mm, THP, swap: de...
1300
1301
  	mem_cgroup_uncharge_swap(entry, 1);
  	swap_range_free(p, offset, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1302
1303
1304
  }
  
  /*
2de1a7e40   Seth Jennings   mm/swapfile.c: fi...
1305
   * Caller has made sure that the swap device corresponding to entry
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1306
1307
1308
1309
   * is still around or has not been recycled.
   */
  void swap_free(swp_entry_t entry)
  {
73c34b6ac   Hugh Dickins   swap_info: miscel...
1310
  	struct swap_info_struct *p;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1311

235b62176   Huang, Ying   mm/swap: add clus...
1312
  	p = _swap_info_get(entry);
10e364da1   Huang Ying   mm/swapfile.c: ca...
1313
  	if (p)
33e16272f   Wei Yang   mm/swapfile.c: __...
1314
  		__swap_entry_free(p, entry);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1315
1316
1317
  }
  
  /*
cb4b86ba4   KAMEZAWA Hiroyuki   mm: add swap cach...
1318
1319
   * Called after dropping swapcache to decrease refcnt to swap entries.
   */
a448f2d07   Huang Ying   mm/swapfile.c: un...
1320
  void put_swap_page(struct page *page, swp_entry_t entry)
38d8b4e6b   Huang Ying   mm, THP, swap: de...
1321
1322
1323
1324
1325
1326
  {
  	unsigned long offset = swp_offset(entry);
  	unsigned long idx = offset / SWAPFILE_CLUSTER;
  	struct swap_cluster_info *ci;
  	struct swap_info_struct *si;
  	unsigned char *map;
a3aea839e   Huang Ying   mm, THP, swap: su...
1327
1328
  	unsigned int i, free_entries = 0;
  	unsigned char val;
6c357848b   Matthew Wilcox (Oracle)   mm: replace hpage...
1329
  	int size = swap_entry_size(thp_nr_pages(page));
fe5266d5d   Huang Ying   mm/swapfile.c: re...
1330

a3aea839e   Huang Ying   mm, THP, swap: su...
1331
  	si = _swap_info_get(entry);
38d8b4e6b   Huang Ying   mm, THP, swap: de...
1332
1333
  	if (!si)
  		return;
c2343d276   Huang Ying   mm/swapfile.c: pu...
1334
  	ci = lock_cluster_or_swap_info(si, offset);
a448f2d07   Huang Ying   mm/swapfile.c: un...
1335
  	if (size == SWAPFILE_CLUSTER) {
a448f2d07   Huang Ying   mm/swapfile.c: un...
1336
1337
1338
1339
1340
1341
1342
1343
  		VM_BUG_ON(!cluster_is_huge(ci));
  		map = si->swap_map + offset;
  		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
  			val = map[i];
  			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
  			if (val == SWAP_HAS_CACHE)
  				free_entries++;
  		}
a448f2d07   Huang Ying   mm/swapfile.c: un...
1344
  		cluster_clear_huge(ci);
a448f2d07   Huang Ying   mm/swapfile.c: un...
1345
  		if (free_entries == SWAPFILE_CLUSTER) {
c2343d276   Huang Ying   mm/swapfile.c: pu...
1346
  			unlock_cluster_or_swap_info(si, ci);
a448f2d07   Huang Ying   mm/swapfile.c: un...
1347
  			spin_lock(&si->lock);
a448f2d07   Huang Ying   mm/swapfile.c: un...
1348
1349
1350
1351
1352
1353
  			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
  			swap_free_cluster(si, idx);
  			spin_unlock(&si->lock);
  			return;
  		}
  	}
c2343d276   Huang Ying   mm/swapfile.c: pu...
1354
1355
1356
1357
1358
1359
1360
  	for (i = 0; i < size; i++, entry.val++) {
  		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
  			unlock_cluster_or_swap_info(si, ci);
  			free_swap_slot(entry);
  			if (i == size - 1)
  				return;
  			lock_cluster_or_swap_info(si, offset);
a3aea839e   Huang Ying   mm, THP, swap: su...
1361
1362
  		}
  	}
c2343d276   Huang Ying   mm/swapfile.c: pu...
1363
  	unlock_cluster_or_swap_info(si, ci);
38d8b4e6b   Huang Ying   mm, THP, swap: de...
1364
  }
59807685a   Huang Ying   mm, THP, swap: su...
1365

fe5266d5d   Huang Ying   mm/swapfile.c: re...
1366
  #ifdef CONFIG_THP_SWAP
59807685a   Huang Ying   mm, THP, swap: su...
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
  int split_swap_cluster(swp_entry_t entry)
  {
  	struct swap_info_struct *si;
  	struct swap_cluster_info *ci;
  	unsigned long offset = swp_offset(entry);
  
  	si = _swap_info_get(entry);
  	if (!si)
  		return -EBUSY;
  	ci = lock_cluster(si, offset);
  	cluster_clear_huge(ci);
  	unlock_cluster(ci);
  	return 0;
  }
fe5266d5d   Huang Ying   mm/swapfile.c: re...
1381
  #endif
38d8b4e6b   Huang Ying   mm, THP, swap: de...
1382

155b5f88e   Huang Ying   mm/swapfile.c: so...
1383
1384
1385
1386
1387
1388
  static int swp_entry_cmp(const void *ent1, const void *ent2)
  {
  	const swp_entry_t *e1 = ent1, *e2 = ent2;
  
  	return (int)swp_type(*e1) - (int)swp_type(*e2);
  }
7c00bafee   Tim Chen   mm/swap: free swa...
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
  void swapcache_free_entries(swp_entry_t *entries, int n)
  {
  	struct swap_info_struct *p, *prev;
  	int i;
  
  	if (n <= 0)
  		return;
  
  	prev = NULL;
  	p = NULL;
155b5f88e   Huang Ying   mm/swapfile.c: so...
1399
1400
1401
1402
1403
1404
1405
1406
  
  	/*
  	 * Sort swap entries by swap device, so each lock is only taken once.
  	 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
  	 * so low that it isn't necessary to optimize further.
  	 */
  	if (nr_swapfiles > 1)
  		sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
7c00bafee   Tim Chen   mm/swap: free swa...
1407
1408
1409
1410
  	for (i = 0; i < n; ++i) {
  		p = swap_info_get_cont(entries[i], prev);
  		if (p)
  			swap_entry_free(p, entries[i]);
7c00bafee   Tim Chen   mm/swap: free swa...
1411
1412
  		prev = p;
  	}
235b62176   Huang, Ying   mm/swap: add clus...
1413
  	if (p)
7c00bafee   Tim Chen   mm/swap: free swa...
1414
  		spin_unlock(&p->lock);
cb4b86ba4   KAMEZAWA Hiroyuki   mm: add swap cach...
1415
1416
1417
  }
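
  /*
   * Sketch of the batching effect (hypothetical entries e0..e2 whose
   * swp_type() values are 1, 0 and 1): sorting reorders them to
   * types 0, 1, 1, so swap_info_get_cont() keeps one device lock held
   * across the two same-device frees and each si->lock is taken once:
   *
   *	swp_entry_t batch[] = { e0, e1, e2 };
   *	swapcache_free_entries(batch, ARRAY_SIZE(batch));
   */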
  
  /*
c475a8ab6   Hugh Dickins   [PATCH] can_share...
1418
   * How many references to page are currently swapped out?
570a335b8   Hugh Dickins   swap_info: swap c...
1419
1420
   * This does not give an exact answer when swap count is continued,
   * but does include the high COUNT_CONTINUED flag to allow for that.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1421
   */
bde05d1cc   Hugh Dickins   shmem: replace pa...
1422
  int page_swapcount(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1423
  {
c475a8ab6   Hugh Dickins   [PATCH] can_share...
1424
1425
  	int count = 0;
  	struct swap_info_struct *p;
235b62176   Huang, Ying   mm/swap: add clus...
1426
  	struct swap_cluster_info *ci;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1427
  	swp_entry_t entry;
235b62176   Huang, Ying   mm/swap: add clus...
1428
  	unsigned long offset;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1429

4c21e2f24   Hugh Dickins   [PATCH] mm: split...
1430
  	entry.val = page_private(page);
235b62176   Huang, Ying   mm/swap: add clus...
1431
  	p = _swap_info_get(entry);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1432
  	if (p) {
235b62176   Huang, Ying   mm/swap: add clus...
1433
1434
1435
1436
  		offset = swp_offset(entry);
  		ci = lock_cluster_or_swap_info(p, offset);
  		count = swap_count(p->swap_map[offset]);
  		unlock_cluster_or_swap_info(p, ci);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1437
  	}
c475a8ab6   Hugh Dickins   [PATCH] can_share...
1438
  	return count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1439
  }
eb085574a   Huang Ying   mm, swap: fix rac...
1440
  int __swap_count(swp_entry_t entry)
aa8d22a11   Minchan Kim   mm: swap: SWP_SYN...
1441
  {
eb085574a   Huang Ying   mm, swap: fix rac...
1442
  	struct swap_info_struct *si;
aa8d22a11   Minchan Kim   mm: swap: SWP_SYN...
1443
  	pgoff_t offset = swp_offset(entry);
eb085574a   Huang Ying   mm, swap: fix rac...
1444
  	int count = 0;
aa8d22a11   Minchan Kim   mm: swap: SWP_SYN...
1445

eb085574a   Huang Ying   mm, swap: fix rac...
1446
1447
1448
1449
1450
1451
  	si = get_swap_device(entry);
  	if (si) {
  		count = swap_count(si->swap_map[offset]);
  		put_swap_device(si);
  	}
  	return count;
aa8d22a11   Minchan Kim   mm: swap: SWP_SYN...
1452
  }
322b8afe4   Huang Ying   mm, swap: Fix a r...
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
  static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
  {
  	int count = 0;
  	pgoff_t offset = swp_offset(entry);
  	struct swap_cluster_info *ci;
  
  	ci = lock_cluster_or_swap_info(si, offset);
  	count = swap_count(si->swap_map[offset]);
  	unlock_cluster_or_swap_info(si, ci);
  	return count;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1464
  /*
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1465
   * How many references to @entry are currently swapped out?
e8c26ab60   Tim Chen   mm/swap: skip rea...
1466
1467
1468
1469
1470
1471
   * This does not give an exact answer when swap count is continued,
   * but does include the high COUNT_CONTINUED flag to allow for that.
   */
  int __swp_swapcount(swp_entry_t entry)
  {
  	int count = 0;
e8c26ab60   Tim Chen   mm/swap: skip rea...
1472
  	struct swap_info_struct *si;
e8c26ab60   Tim Chen   mm/swap: skip rea...
1473

eb085574a   Huang Ying   mm, swap: fix rac...
1474
1475
  	si = get_swap_device(entry);
  	if (si) {
322b8afe4   Huang Ying   mm, swap: Fix a r...
1476
  		count = swap_swapcount(si, entry);
eb085574a   Huang Ying   mm, swap: fix rac...
1477
1478
  		put_swap_device(si);
  	}
e8c26ab60   Tim Chen   mm/swap: skip rea...
1479
1480
1481
1482
1483
  	return count;
  }
  
  /*
   * How many references to @entry are currently swapped out?
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1484
1485
1486
1487
1488
1489
   * This considers COUNT_CONTINUED so it returns exact answer.
   */
  int swp_swapcount(swp_entry_t entry)
  {
  	int count, tmp_count, n;
  	struct swap_info_struct *p;
235b62176   Huang, Ying   mm/swap: add clus...
1490
  	struct swap_cluster_info *ci;
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1491
1492
1493
  	struct page *page;
  	pgoff_t offset;
  	unsigned char *map;
235b62176   Huang, Ying   mm/swap: add clus...
1494
  	p = _swap_info_get(entry);
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1495
1496
  	if (!p)
  		return 0;
235b62176   Huang, Ying   mm/swap: add clus...
1497
1498
1499
1500
1501
  	offset = swp_offset(entry);
  
  	ci = lock_cluster_or_swap_info(p, offset);
  
  	count = swap_count(p->swap_map[offset]);
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1502
1503
1504
1505
1506
  	if (!(count & COUNT_CONTINUED))
  		goto out;
  
  	count &= ~COUNT_CONTINUED;
  	n = SWAP_MAP_MAX + 1;
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1507
1508
1509
1510
1511
  	page = vmalloc_to_page(p->swap_map + offset);
  	offset &= ~PAGE_MASK;
  	VM_BUG_ON(page_private(page) != SWP_CONTINUED);
  
  	do {
a8ae49917   Geliang Tang   mm/swapfile.c: us...
1512
  		page = list_next_entry(page, lru);
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1513
1514
1515
1516
1517
1518
1519
1520
  		map = kmap_atomic(page);
  		tmp_count = map[offset];
  		kunmap_atomic(map);
  
  		count += (tmp_count & ~COUNT_CONTINUED) * n;
  		n *= (SWAP_CONT_MAX + 1);
  	} while (tmp_count & COUNT_CONTINUED);
  out:
235b62176   Huang, Ying   mm/swap: add clus...
1521
  	unlock_cluster_or_swap_info(p, ci);
8334b9622   Minchan Kim   mm: /proc/pid/sma...
1522
1523
  	return count;
  }
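
  /*
   * Worked arithmetic for the continuation walk above (hypothetical
   * map bytes; SWAP_MAP_MAX == 0x3e, SWAP_CONT_MAX == 0x7f):
   *
   *	map[offset]       == 0x3e | COUNT_CONTINUED	(count = 62, n = 63)
   *	continuation byte == 0x05			(no COUNT_CONTINUED)
   *	count += 5 * 63					(returns 377)
   *
   * Each further continuation page scales n by SWAP_CONT_MAX + 1.
   */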
e07098294   Huang Ying   mm, THP, swap: su...
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
  static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
  					 swp_entry_t entry)
  {
  	struct swap_cluster_info *ci;
  	unsigned char *map = si->swap_map;
  	unsigned long roffset = swp_offset(entry);
  	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
  	int i;
  	bool ret = false;
  
  	ci = lock_cluster_or_swap_info(si, offset);
  	if (!ci || !cluster_is_huge(ci)) {
afa4711ef   Huang Ying   mm/swapfile.c: us...
1536
  		if (swap_count(map[roffset]))
e07098294   Huang Ying   mm, THP, swap: su...
1537
1538
1539
1540
  			ret = true;
  		goto unlock_out;
  	}
  	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
afa4711ef   Huang Ying   mm/swapfile.c: us...
1541
  		if (swap_count(map[offset + i])) {
e07098294   Huang Ying   mm, THP, swap: su...
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
  			ret = true;
  			break;
  		}
  	}
  unlock_out:
  	unlock_cluster_or_swap_info(si, ci);
  	return ret;
  }
  
  static bool page_swapped(struct page *page)
  {
  	swp_entry_t entry;
  	struct swap_info_struct *si;
fe5266d5d   Huang Ying   mm/swapfile.c: re...
1555
  	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
e07098294   Huang Ying   mm, THP, swap: su...
1556
1557
1558
1559
1560
1561
1562
1563
1564
  		return page_swapcount(page) != 0;
  
  	page = compound_head(page);
  	entry.val = page_private(page);
  	si = _swap_info_get(entry);
  	if (si)
  		return swap_page_trans_huge_swapped(si, entry);
  	return false;
  }
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
  
  static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
  					 int *total_swapcount)
  {
  	int i, map_swapcount, _total_mapcount, _total_swapcount;
  	unsigned long offset = 0;
  	struct swap_info_struct *si;
  	struct swap_cluster_info *ci = NULL;
  	unsigned char *map = NULL;
  	int mapcount, swapcount = 0;
  
  	/* hugetlbfs shouldn't call it */
  	VM_BUG_ON_PAGE(PageHuge(page), page);
fe5266d5d   Huang Ying   mm/swapfile.c: re...
1578
1579
  	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
  		mapcount = page_trans_huge_mapcount(page, total_mapcount);
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
  		if (PageSwapCache(page))
  			swapcount = page_swapcount(page);
  		if (total_swapcount)
  			*total_swapcount = swapcount;
  		return mapcount + swapcount;
  	}
  
  	page = compound_head(page);
  
  	_total_mapcount = _total_swapcount = map_swapcount = 0;
  	if (PageSwapCache(page)) {
  		swp_entry_t entry;
  
  		entry.val = page_private(page);
  		si = _swap_info_get(entry);
  		if (si) {
  			map = si->swap_map;
  			offset = swp_offset(entry);
  		}
  	}
  	if (map)
  		ci = lock_cluster(si, offset);
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		mapcount = atomic_read(&page[i]._mapcount) + 1;
  		_total_mapcount += mapcount;
  		if (map) {
  			swapcount = swap_count(map[offset + i]);
  			_total_swapcount += swapcount;
  		}
  		map_swapcount = max(map_swapcount, mapcount + swapcount);
  	}
  	unlock_cluster(ci);
  	if (PageDoubleMap(page)) {
  		map_swapcount -= 1;
  		_total_mapcount -= HPAGE_PMD_NR;
  	}
  	mapcount = compound_mapcount(page);
  	map_swapcount += mapcount;
  	_total_mapcount += mapcount;
  	if (total_mapcount)
  		*total_mapcount = _total_mapcount;
  	if (total_swapcount)
  		*total_swapcount = _total_swapcount;
  
  	return map_swapcount;
  }
e07098294   Huang Ying   mm, THP, swap: su...
1626

8334b9622   Minchan Kim   mm: /proc/pid/sma...
1627
  /*
7b1fe5979   Hugh Dickins   mm: reuse_swap_pa...
1628
1629
1630
1631
   * We can write to an anon page without COW if there are no other references
   * to it.  And as a side-effect, free up its swap: because the old content
   * on disk will never be read, and seeking back there to write new content
   * later would only waste time away from clustering.
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1632
   *
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1633
   * NOTE: total_map_swapcount should not be relied upon by the caller if
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1634
1635
   * reuse_swap_page() returns false, but it may always be overwritten
   * (see the other implementation for CONFIG_SWAP=n).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1636
   */
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1637
  bool reuse_swap_page(struct page *page, int *total_map_swapcount)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1638
  {
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1639
  	int count, total_mapcount, total_swapcount;
c475a8ab6   Hugh Dickins   [PATCH] can_share...
1640

309381fea   Sasha Levin   mm: dump page whe...
1641
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
5ad646880   Hugh Dickins   ksm: let shared p...
1642
  	if (unlikely(PageKsm(page)))
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1643
  		return false;
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1644
1645
1646
1647
1648
1649
1650
1651
  	count = page_trans_huge_map_swapcount(page, &total_mapcount,
  					      &total_swapcount);
  	if (total_map_swapcount)
  		*total_map_swapcount = total_mapcount + total_swapcount;
  	if (count == 1 && PageSwapCache(page) &&
  	    (likely(!PageTransCompound(page)) ||
  	     /* The remaining swap count will be freed soon */
  	     total_swapcount == page_swapcount(page))) {
f05714293   Minchan Kim   mm: support anony...
1652
  		if (!PageWriteback(page)) {
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1653
  			page = compound_head(page);
7b1fe5979   Hugh Dickins   mm: reuse_swap_pa...
1654
1655
  			delete_from_swap_cache(page);
  			SetPageDirty(page);
f05714293   Minchan Kim   mm: support anony...
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
  		} else {
  			swp_entry_t entry;
  			struct swap_info_struct *p;
  
  			entry.val = page_private(page);
  			p = swap_info_get(entry);
  			if (p->flags & SWP_STABLE_WRITES) {
  				spin_unlock(&p->lock);
  				return false;
  			}
  			spin_unlock(&p->lock);
7b1fe5979   Hugh Dickins   mm: reuse_swap_pa...
1667
1668
  		}
  	}
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1669

5ad646880   Hugh Dickins   ksm: let shared p...
1670
  	return count <= 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1671
1672
1673
  }
  
  /*
a2c43eed8   Hugh Dickins   mm: try_to_free_s...
1674
1675
   * If swap is getting full, or if there are no more mappings of this page,
   * then try_to_free_swap is called to free its swap space.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1676
   */
a2c43eed8   Hugh Dickins   mm: try_to_free_s...
1677
  int try_to_free_swap(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1678
  {
309381fea   Sasha Levin   mm: dump page whe...
1679
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1680
1681
1682
1683
1684
  
  	if (!PageSwapCache(page))
  		return 0;
  	if (PageWriteback(page))
  		return 0;
e07098294   Huang Ying   mm, THP, swap: su...
1685
  	if (page_swapped(page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1686
  		return 0;
b73d7fcec   Hugh Dickins   swap: prevent reu...
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
  	/*
  	 * Once hibernation has begun to create its image of memory,
  	 * there's a danger that one of the calls to try_to_free_swap()
  	 * - most probably a call from __try_to_reclaim_swap() while
  	 * hibernation is allocating its own swap pages for the image,
  	 * but conceivably even a call from memory reclaim - will free
  	 * the swap from a page which has already been recorded in the
  	 * image as a clean swapcache page, and then reuse its swap for
  	 * another page of the image.  On waking from hibernation, the
  	 * original page might be freed under memory pressure, then
  	 * later read back in from swap, now with the wrong data.
  	 *
2de1a7e40   Seth Jennings   mm/swapfile.c: fi...
1699
  	 * Hibernation suspends storage while it is writing the image
f90ac3982   Mel Gorman   mm: avoid liveloc...
1700
  	 * to disk so check that here.
b73d7fcec   Hugh Dickins   swap: prevent reu...
1701
  	 */
f90ac3982   Mel Gorman   mm: avoid liveloc...
1702
  	if (pm_suspended_storage())
b73d7fcec   Hugh Dickins   swap: prevent reu...
1703
  		return 0;
e07098294   Huang Ying   mm, THP, swap: su...
1704
  	page = compound_head(page);
a2c43eed8   Hugh Dickins   mm: try_to_free_s...
1705
1706
1707
  	delete_from_swap_cache(page);
  	SetPageDirty(page);
  	return 1;
68a22394c   Rik van Riel   vmscan: free swap...
1708
1709
1710
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1711
1712
1713
   * Free the swap entry like above, but also try to
   * free the page cache entry if it is the last user.
   */
2509ef26d   Hugh Dickins   badpage: zap prin...
1714
  int free_swap_and_cache(swp_entry_t entry)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1715
  {
2509ef26d   Hugh Dickins   badpage: zap prin...
1716
  	struct swap_info_struct *p;
7c00bafee   Tim Chen   mm/swap: free swa...
1717
  	unsigned char count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1718

a7420aa54   Andi Kleen   HWPOISON: Add sup...
1719
  	if (non_swap_entry(entry))
2509ef26d   Hugh Dickins   badpage: zap prin...
1720
  		return 1;
0697212a4   Christoph Lameter   [PATCH] Swapless ...
1721

7c00bafee   Tim Chen   mm/swap: free swa...
1722
  	p = _swap_info_get(entry);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1723
  	if (p) {
33e16272f   Wei Yang   mm/swapfile.c: __...
1724
  		count = __swap_entry_free(p, entry);
e07098294   Huang Ying   mm, THP, swap: su...
1725
  		if (count == SWAP_HAS_CACHE &&
bcd49e867   Huang Ying   mm/swapfile.c: us...
1726
1727
1728
  		    !swap_page_trans_huge_swapped(p, entry))
  			__try_to_reclaim_swap(p, swp_offset(entry),
  					      TTRS_UNMAPPED | TTRS_FULL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1729
  	}
2509ef26d   Hugh Dickins   badpage: zap prin...
1730
  	return p != NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1731
  }
b0cb1a19d   Rafael J. Wysocki   Replace CONFIG_SO...
1732
  #ifdef CONFIG_HIBERNATION
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1733
  /*
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1734
   * Find the swap type that corresponds to given device (if any).
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1735
   *
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1736
1737
1738
1739
   * @offset - number of the PAGE_SIZE-sized block of the device, starting
   * from 0, in which the swap header is expected to be located.
   *
   * This is needed for the suspend to disk (aka swsusp).
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1740
   */
21bd90057   Christoph Hellwig   mm: split swap_ty...
1741
  int swap_type_of(dev_t device, sector_t offset)
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1742
  {
efa90a981   Hugh Dickins   swap_info: change...
1743
  	int type;
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1744

21bd90057   Christoph Hellwig   mm: split swap_ty...
1745
1746
  	if (!device)
  		return -1;
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1747

f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1748
  	spin_lock(&swap_lock);
efa90a981   Hugh Dickins   swap_info: change...
1749
1750
  	for (type = 0; type < nr_swapfiles; type++) {
  		struct swap_info_struct *sis = swap_info[type];
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1751

915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1752
  		if (!(sis->flags & SWP_WRITEOK))
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1753
  			continue;
b6b5bce35   Rafael J. Wysocki   [PATCH] swsusp: F...
1754

21bd90057   Christoph Hellwig   mm: split swap_ty...
1755
  		if (device == sis->bdev->bd_dev) {
4efaceb1c   Aaron Lu   mm, swap: use rbt...
1756
  			struct swap_extent *se = first_se(sis);
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1757

915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1758
1759
  			if (se->start_block == offset) {
  				spin_unlock(&swap_lock);
efa90a981   Hugh Dickins   swap_info: change...
1760
  				return type;
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1761
  			}
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1762
1763
1764
  		}
  	}
  	spin_unlock(&swap_lock);
21bd90057   Christoph Hellwig   mm: split swap_ty...
1765
1766
  	return -ENODEV;
  }
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1767

21bd90057   Christoph Hellwig   mm: split swap_ty...
1768
1769
1770
  int find_first_swap(dev_t *device)
  {
  	int type;
915bae9eb   Rafael J. Wysocki   [PATCH] swsusp: u...
1771

21bd90057   Christoph Hellwig   mm: split swap_ty...
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
  	spin_lock(&swap_lock);
  	for (type = 0; type < nr_swapfiles; type++) {
  		struct swap_info_struct *sis = swap_info[type];
  
  		if (!(sis->flags & SWP_WRITEOK))
  			continue;
  		*device = sis->bdev->bd_dev;
  		spin_unlock(&swap_lock);
  		return type;
  	}
  	spin_unlock(&swap_lock);
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1783
1784
1785
1786
  	return -ENODEV;
  }
  
  /*
73c34b6ac   Hugh Dickins   swap_info: miscel...
1787
1788
1789
1790
1791
1792
   * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
   * corresponding to given index in swap_info (swap type).
   */
  sector_t swapdev_block(int type, pgoff_t offset)
  {
  	struct block_device *bdev;
c10d38cc8   Daniel Jordan   mm, swap: bounds ...
1793
  	struct swap_info_struct *si = swap_type_to_swap_info(type);
73c34b6ac   Hugh Dickins   swap_info: miscel...
1794

c10d38cc8   Daniel Jordan   mm, swap: bounds ...
1795
  	if (!si || !(si->flags & SWP_WRITEOK))
73c34b6ac   Hugh Dickins   swap_info: miscel...
1796
  		return 0;
d4906e1aa   Lee Schermerhorn   swap: rework map_...
1797
  	return map_swap_entry(swp_entry(type, offset), &bdev);
73c34b6ac   Hugh Dickins   swap_info: miscel...
1798
1799
1800
  }
  
  /*
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1801
1802
1803
1804
1805
1806
1807
1808
   * Return either the total number of swap pages of given type, or the number
   * of free pages of that type (depending on @free)
   *
   * This is needed for software suspend
   */
  unsigned int count_swap_pages(int type, int free)
  {
  	unsigned int n = 0;
efa90a981   Hugh Dickins   swap_info: change...
1809
1810
1811
  	spin_lock(&swap_lock);
  	if ((unsigned int)type < nr_swapfiles) {
  		struct swap_info_struct *sis = swap_info[type];
ec8acf20a   Shaohua Li   swap: add per-par...
1812
  		spin_lock(&sis->lock);
efa90a981   Hugh Dickins   swap_info: change...
1813
1814
  		if (sis->flags & SWP_WRITEOK) {
  			n = sis->pages;
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1815
  			if (free)
efa90a981   Hugh Dickins   swap_info: change...
1816
  				n -= sis->inuse_pages;
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1817
  		}
ec8acf20a   Shaohua Li   swap: add per-par...
1818
  		spin_unlock(&sis->lock);
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1819
  	}
efa90a981   Hugh Dickins   swap_info: change...
1820
  	spin_unlock(&swap_lock);
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1821
1822
  	return n;
  }
73c34b6ac   Hugh Dickins   swap_info: miscel...
1823
  #endif /* CONFIG_HIBERNATION */
f577eb30a   Rafael J. Wysocki   [PATCH] swsusp: l...
1824

9f8bdb3f3   Hugh Dickins   mm: make swapoff ...
1825
  static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
179ef71cb   Cyrill Gorcunov   mm: save soft-dir...
1826
  {
9f8bdb3f3   Hugh Dickins   mm: make swapoff ...
1827
  	return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
179ef71cb   Cyrill Gorcunov   mm: save soft-dir...
1828
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1829
  /*
72866f6f2   Hugh Dickins   [PATCH] mm: anon ...
1830
1831
1832
   * No need to decide whether this PTE shares the swap entry with others,
   * just let do_wp_page work it out if a write is requested later - to
   * force COW, vm_page_prot omits write permission from any private vma.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1833
   */
044d66c1d   Hugh Dickins   memcgroup: reinst...
1834
  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1835
1836
  		unsigned long addr, swp_entry_t entry, struct page *page)
  {
9e16b7fb1   Hugh Dickins   mm,ksm: swapoff m...
1837
  	struct page *swapcache;
044d66c1d   Hugh Dickins   memcgroup: reinst...
1838
1839
1840
  	spinlock_t *ptl;
  	pte_t *pte;
  	int ret = 1;
9e16b7fb1   Hugh Dickins   mm,ksm: swapoff m...
1841
1842
1843
1844
  	swapcache = page;
  	page = ksm_might_need_to_copy(page, vma, addr);
  	if (unlikely(!page))
  		return -ENOMEM;
044d66c1d   Hugh Dickins   memcgroup: reinst...
1845
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
9f8bdb3f3   Hugh Dickins   mm: make swapoff ...
1846
  	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
044d66c1d   Hugh Dickins   memcgroup: reinst...
1847
1848
1849
  		ret = 0;
  		goto out;
  	}
8a9f3ccd2   Balbir Singh   Memory controller...
1850

b084d4353   KAMEZAWA Hiroyuki   mm: count swap usage
1851
  	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
d559db086   KAMEZAWA Hiroyuki   mm: clean up mm_c...
1852
  	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1853
1854
1855
  	get_page(page);
  	set_pte_at(vma->vm_mm, addr, pte,
  		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
00501b531   Johannes Weiner   mm: memcontrol: r...
1856
  	if (page == swapcache) {
be5d0a74c   Johannes Weiner   mm: memcontrol: s...
1857
  		page_add_anon_rmap(page, vma, addr, false);
00501b531   Johannes Weiner   mm: memcontrol: r...
1858
  	} else { /* ksm created a completely new copy */
be5d0a74c   Johannes Weiner   mm: memcontrol: s...
1859
  		page_add_new_anon_rmap(page, vma, addr, false);
b518154e5   Joonsoo Kim   mm/vmscan: protec...
1860
  		lru_cache_add_inactive_or_unevictable(page, vma);
00501b531   Johannes Weiner   mm: memcontrol: r...
1861
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1862
  	swap_free(entry);
044d66c1d   Hugh Dickins   memcgroup: reinst...
1863
1864
  out:
  	pte_unmap_unlock(pte, ptl);
9e16b7fb1   Hugh Dickins   mm,ksm: swapoff m...
1865
1866
1867
1868
  	if (page != swapcache) {
  		unlock_page(page);
  		put_page(page);
  	}
044d66c1d   Hugh Dickins   memcgroup: reinst...
1869
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1870
1871
1872
  }
  
  static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1873
1874
1875
  			unsigned long addr, unsigned long end,
  			unsigned int type, bool frontswap,
  			unsigned long *fs_pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1876
  {
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1877
1878
  	struct page *page;
  	swp_entry_t entry;
705e87c0c   Hugh Dickins   [PATCH] mm: pte_o...
1879
  	pte_t *pte;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1880
1881
  	struct swap_info_struct *si;
  	unsigned long offset;
8a9f3ccd2   Balbir Singh   Memory controller...
1882
  	int ret = 0;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1883
  	volatile unsigned char *swap_map;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1884

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1885
  	si = swap_info[type];
044d66c1d   Hugh Dickins   memcgroup: reinst...
1886
  	pte = pte_offset_map(pmd, addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1887
  	do {
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
  		struct vm_fault vmf;
  
  		if (!is_swap_pte(*pte))
  			continue;
  
  		entry = pte_to_swp_entry(*pte);
  		if (swp_type(entry) != type)
  			continue;
  
  		offset = swp_offset(entry);
  		if (frontswap && !frontswap_test(si, offset))
  			continue;
  
  		pte_unmap(pte);
  		swap_map = &si->swap_map[offset];
ebc5951ee   Andrea Righi   mm: swap: properl...
1903
1904
1905
1906
1907
1908
1909
1910
  		page = lookup_swap_cache(entry, vma, addr);
  		if (!page) {
  			vmf.vma = vma;
  			vmf.address = addr;
  			vmf.pmd = pmd;
  			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  						&vmf);
  		}
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
  		if (!page) {
  			if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
  				goto try_next;
  			return -ENOMEM;
  		}
  
  		lock_page(page);
  		wait_on_page_writeback(page);
  		ret = unuse_pte(vma, pmd, addr, entry, page);
  		if (ret < 0) {
  			unlock_page(page);
  			put_page(page);
  			goto out;
  		}
  
  		try_to_free_swap(page);
  		unlock_page(page);
  		put_page(page);
  
  		if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
  			ret = FRONTSWAP_PAGES_UNUSED;
  			goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1933
  		}
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1934
1935
  try_next:
  		pte = pte_offset_map(pmd, addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1936
  	} while (pte++, addr += PAGE_SIZE, addr != end);
044d66c1d   Hugh Dickins   memcgroup: reinst...
1937
  	pte_unmap(pte - 1);
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1938
1939
  
  	ret = 0;
044d66c1d   Hugh Dickins   memcgroup: reinst...
1940
  out:
8a9f3ccd2   Balbir Singh   Memory controller...
1941
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1942
1943
1944
1945
  }
  
  static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
  				unsigned long addr, unsigned long end,
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1946
1947
  				unsigned int type, bool frontswap,
  				unsigned long *fs_pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1948
1949
1950
  {
  	pmd_t *pmd;
  	unsigned long next;
8a9f3ccd2   Balbir Singh   Memory controller...
1951
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1952
1953
1954
  
  	pmd = pmd_offset(pud, addr);
  	do {
dc644a073   Hugh Dickins   mm: add three mor...
1955
  		cond_resched();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1956
  		next = pmd_addr_end(addr, end);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
1957
  		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1958
  			continue;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1959
1960
  		ret = unuse_pte_range(vma, pmd, addr, next, type,
  				      frontswap, fs_pages_to_unuse);
8a9f3ccd2   Balbir Singh   Memory controller...
1961
1962
  		if (ret)
  			return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1963
1964
1965
  	} while (pmd++, addr = next, addr != end);
  	return 0;
  }
c2febafc6   Kirill A. Shutemov   mm: convert gener...
1966
  static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1967
  				unsigned long addr, unsigned long end,
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1968
1969
  				unsigned int type, bool frontswap,
  				unsigned long *fs_pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1970
1971
1972
  {
  	pud_t *pud;
  	unsigned long next;
8a9f3ccd2   Balbir Singh   Memory controller...
1973
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1974

c2febafc6   Kirill A. Shutemov   mm: convert gener...
1975
  	pud = pud_offset(p4d, addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1976
1977
1978
1979
  	do {
  		next = pud_addr_end(addr, end);
  		if (pud_none_or_clear_bad(pud))
  			continue;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1980
1981
  		ret = unuse_pmd_range(vma, pud, addr, next, type,
  				      frontswap, fs_pages_to_unuse);
8a9f3ccd2   Balbir Singh   Memory controller...
1982
1983
  		if (ret)
  			return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1984
1985
1986
  	} while (pud++, addr = next, addr != end);
  	return 0;
  }
c2febafc6   Kirill A. Shutemov   mm: convert gener...
1987
1988
  static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
  				unsigned long addr, unsigned long end,
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
1989
1990
  				unsigned int type, bool frontswap,
  				unsigned long *fs_pages_to_unuse)
c2febafc6   Kirill A. Shutemov   mm: convert gener...
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
  {
  	p4d_t *p4d;
  	unsigned long next;
  	int ret;
  
  	p4d = p4d_offset(pgd, addr);
  	do {
  		next = p4d_addr_end(addr, end);
  		if (p4d_none_or_clear_bad(p4d))
  			continue;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2001
2002
  		ret = unuse_pud_range(vma, p4d, addr, next, type,
  				      frontswap, fs_pages_to_unuse);
c2febafc6   Kirill A. Shutemov   mm: convert gener...
2003
2004
2005
2006
2007
  		if (ret)
  			return ret;
  	} while (p4d++, addr = next, addr != end);
  	return 0;
  }
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2008
2009
  static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
  		     bool frontswap, unsigned long *fs_pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2010
2011
2012
  {
  	pgd_t *pgd;
  	unsigned long addr, end, next;
8a9f3ccd2   Balbir Singh   Memory controller...
2013
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2014

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2015
2016
  	addr = vma->vm_start;
  	end = vma->vm_end;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2017
2018
2019
2020
2021
2022
  
  	pgd = pgd_offset(vma->vm_mm, addr);
  	do {
  		next = pgd_addr_end(addr, end);
  		if (pgd_none_or_clear_bad(pgd))
  			continue;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2023
2024
  		ret = unuse_p4d_range(vma, pgd, addr, next, type,
  				      frontswap, fs_pages_to_unuse);
8a9f3ccd2   Balbir Singh   Memory controller...
2025
2026
  		if (ret)
  			return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2027
2028
2029
  	} while (pgd++, addr = next, addr != end);
  	return 0;
  }
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2030
2031
  static int unuse_mm(struct mm_struct *mm, unsigned int type,
  		    bool frontswap, unsigned long *fs_pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2032
2033
  {
  	struct vm_area_struct *vma;
8a9f3ccd2   Balbir Singh   Memory controller...
2034
  	int ret = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2035

d8ed45c5d   Michel Lespinasse   mmap locking API:...
2036
  	mmap_read_lock(mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2037
  	for (vma = mm->mmap; vma; vma = vma->vm_next) {
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2038
2039
2040
2041
2042
2043
  		if (vma->anon_vma) {
  			ret = unuse_vma(vma, type, frontswap,
  					fs_pages_to_unuse);
  			if (ret)
  				break;
  		}
dc644a073   Hugh Dickins   mm: add three mor...
2044
  		cond_resched();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2045
  	}
d8ed45c5d   Michel Lespinasse   mmap locking API:...
2046
  	mmap_read_unlock(mm);
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2047
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2048
2049
2050
  }
  
  /*
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
2051
   * Scan swap_map (or frontswap_map if frontswap parameter is true)
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2052
2053
   * from current position to next entry still in use. Return 0
   * if there are no in-use entries after prev until the end of the map.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2054
   */
6eb396dc4   Hugh Dickins   [PATCH] swap: swa...
2055
  static unsigned int find_next_to_unuse(struct swap_info_struct *si,
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
2056
  					unsigned int prev, bool frontswap)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2057
  {
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2058
  	unsigned int i;
8d69aaee8   Hugh Dickins   swap_info: swap_m...
2059
  	unsigned char count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2060
2061
  
  	/*
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2062
  	 * No need for swap_lock here: we're just looking
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2063
2064
  	 * for whether an entry is in use, not modifying it; false
  	 * hits are okay, and sys_swapoff() has already prevented new
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2065
  	 * allocations from this area (while holding swap_lock).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2066
  	 */
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2067
  	for (i = prev + 1; i < si->max; i++) {
4db0c3c29   Jason Low   mm: remove rest o...
2068
  		count = READ_ONCE(si->swap_map[i]);
355cfa73d   KAMEZAWA Hiroyuki   mm: modify swap_m...
2069
  		if (count && swap_count(count) != SWAP_MAP_BAD)
dc644a073   Hugh Dickins   mm: add three mor...
2070
2071
2072
2073
  			if (!frontswap || frontswap_test(si, i))
  				break;
  		if ((i % LATENCY_LIMIT) == 0)
  			cond_resched();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2074
  	}
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2075
2076
2077
  
  	if (i == si->max)
  		i = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2078
2079
2080
2081
  	return i;
  }
  
  /*
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2082
   * If the boolean frontswap is true, only unuse pages_to_unuse pages;
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
2083
   * pages_to_unuse==0 means all pages; ignored if frontswap is false
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2084
   */
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
2085
2086
  int try_to_unuse(unsigned int type, bool frontswap,
  		 unsigned long pages_to_unuse)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2087
  {
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2088
2089
2090
2091
  	struct mm_struct *prev_mm;
  	struct mm_struct *mm;
  	struct list_head *p;
  	int retval = 0;
efa90a981   Hugh Dickins   swap_info: change...
2092
  	struct swap_info_struct *si = swap_info[type];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2093
2094
  	struct page *page;
  	swp_entry_t entry;
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2095
  	unsigned int i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2096

218209487   Qian Cai   mm/swapfile: fix ...
2097
  	if (!READ_ONCE(si->inuse_pages))
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2098
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2099

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
  	if (!frontswap)
  		pages_to_unuse = 0;
  
  retry:
  	retval = shmem_unuse(type, frontswap, &pages_to_unuse);
  	if (retval)
  		goto out;
  
  	prev_mm = &init_mm;
  	mmget(prev_mm);
  
  	spin_lock(&mmlist_lock);
  	p = &init_mm.mmlist;
218209487   Qian Cai   mm/swapfile: fix ...
2113
  	while (READ_ONCE(si->inuse_pages) &&
64165b1af   Hugh Dickins   mm: swapoff: take...
2114
2115
  	       !signal_pending(current) &&
  	       (p = p->next) != &init_mm.mmlist) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2116

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2117
2118
2119
2120
2121
2122
2123
  		mm = list_entry(p, struct mm_struct, mmlist);
  		if (!mmget_not_zero(mm))
  			continue;
  		spin_unlock(&mmlist_lock);
  		mmput(prev_mm);
  		prev_mm = mm;
  		retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2124

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2125
2126
2127
  		if (retval) {
  			mmput(prev_mm);
  			goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2128
2129
2130
  		}
  
  		/*
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2131
2132
  		 * Make sure that we aren't completely killing
  		 * interactive performance.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2133
  		 */
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2134
2135
2136
2137
  		cond_resched();
  		spin_lock(&mmlist_lock);
  	}
  	spin_unlock(&mmlist_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2138

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2139
  	mmput(prev_mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2140

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2141
  	i = 0;
218209487   Qian Cai   mm/swapfile: fix ...
2142
  	while (READ_ONCE(si->inuse_pages) &&
64165b1af   Hugh Dickins   mm: swapoff: take...
2143
2144
  	       !signal_pending(current) &&
  	       (i = find_next_to_unuse(si, i, frontswap)) != 0) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2145

b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2146
2147
2148
2149
  		entry = swp_entry(type, i);
  		page = find_get_page(swap_address_space(entry), i);
  		if (!page)
  			continue;
68bdc8d64   Hugh Dickins   mm: try_to_unuse ...
2150
2151
2152
  
  		/*
  		 * It is conceivable that a racing task removed this page from
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2153
2154
2155
  		 * swap cache just before we acquired the page lock. The page
  		 * might even be back in swap cache on another swap area. But
  		 * that is okay, try_to_free_swap() only removes stale pages.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2156
  		 */
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2157
2158
2159
  		lock_page(page);
  		wait_on_page_writeback(page);
  		try_to_free_swap(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2160
  		unlock_page(page);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2161
  		put_page(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2162
2163
  
  		/*
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2164
2165
2166
  		 * For frontswap, we just need to unuse pages_to_unuse, if
  		 * it was specified. Need not check frontswap again here as
  		 * we already zeroed out pages_to_unuse if not frontswap.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2167
  		 */
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2168
2169
  		if (pages_to_unuse && --pages_to_unuse == 0)
  			goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2170
  	}
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2171
2172
2173
2174
2175
  	/*
  	 * Let's check again to see if there are still swap entries in the map.
  	 * If so, we need to retry the unuse logic.
  	 * Under global memory pressure, swap entries can be reinserted back
  	 * into process space after the mmlist loop above passes over them.
dd862deb1   Hugh Dickins   mm: swapoff: remo...
2176
  	 *
af53d3e9e   Hugh Dickins   mm: swapoff: shme...
2177
2178
2179
2180
2181
  	 * Limit the number of retries? No: when mmget_not_zero() above fails,
  	 * that mm is likely to be freeing swap from exit_mmap(), which proceeds
  	 * at its own independent pace; and even shmem_writepage() could have
  	 * been preempted after get_swap_page(), temporarily hiding that swap.
  	 * It's easy and robust (though cpu-intensive) just to keep retrying.
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2182
  	 */
218209487   Qian Cai   mm/swapfile: fix ...
2183
  	if (READ_ONCE(si->inuse_pages)) {
64165b1af   Hugh Dickins   mm: swapoff: take...
2184
2185
2186
2187
  		if (!signal_pending(current))
  			goto retry;
  		retval = -EINTR;
  	}
b56a2d8af   Vineeth Remanan Pillai   mm: rid swapoff o...
2188
2189
  out:
  	return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2190
2191
2192
  }
  
  /*
5d337b919   Hugh Dickins   [PATCH] swap: swa...
2193
2194
2195
   * After a successful try_to_unuse, if no swap is now in use, we know
   * we can empty the mmlist.  swap_lock must be held on entry and exit.
   * Note that mmlist_lock nests inside swap_lock, and an mm must be
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2196
2197
2198
2199
2200
   * added to the mmlist just after swap_duplicate - before would be racy.
   */
  static void drain_mmlist(void)
  {
  	struct list_head *p, *next;
efa90a981   Hugh Dickins   swap_info: change...
2201
  	unsigned int type;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2202

efa90a981   Hugh Dickins   swap_info: change...
2203
2204
  	for (type = 0; type < nr_swapfiles; type++)
  		if (swap_info[type]->inuse_pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2205
2206
2207
2208
2209
2210
2211
2212
2213
  			return;
  	spin_lock(&mmlist_lock);
  	list_for_each_safe(p, next, &init_mm.mmlist)
  		list_del_init(p);
  	spin_unlock(&mmlist_lock);
  }
  
  /*
   * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
d4906e1aa   Lee Schermerhorn   swap: rework map_...
2214
2215
2216
   * corresponds to page offset for the specified swap entry.
   * Note that the return type of this function is sector_t, but it returns a page offset
   * into the bdev, not sector offset.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2217
   */
d4906e1aa   Lee Schermerhorn   swap: rework map_...
2218
  static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2219
  {
f29ad6a99   Hugh Dickins   swap_info: privat...
2220
  	struct swap_info_struct *sis;
f29ad6a99   Hugh Dickins   swap_info: privat...
2221
2222
  	struct swap_extent *se;
  	pgoff_t offset;
c10d38cc8   Daniel Jordan   mm, swap: bounds ...
2223
  	sis = swp_swap_info(entry);
f29ad6a99   Hugh Dickins   swap_info: privat...
2224
2225
2226
  	*bdev = sis->bdev;
  
  	offset = swp_offset(entry);
4efaceb1c   Aaron Lu   mm, swap: use rbt...
2227
2228
  	se = offset_to_swap_extent(sis, offset);
  	return se->start_block + (offset - se->start_page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2229
2230
2231
  }
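
  /*
   * The offset above is in PAGE_SIZE units; a caller that needs a
   * 512-byte sector number would scale it (hypothetical snippet):
   *
   *	sector_t sector = map_swap_entry(entry, &bdev) << (PAGE_SHIFT - 9);
   */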
  
  /*
d4906e1aa   Lee Schermerhorn   swap: rework map_...
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
   * Returns the page offset into bdev for the specified page's swap entry.
   */
  sector_t map_swap_page(struct page *page, struct block_device **bdev)
  {
  	swp_entry_t entry;
  	entry.val = page_private(page);
  	return map_swap_entry(entry, bdev);
  }
  
/*
 * Free all of a swapdev's extent information
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
		struct rb_node *rb = sis->swap_extent_root.rb_node;
		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);

		rb_erase(rb, &sis->swap_extent_root);
		kfree(se);
	}

	if (sis->flags & SWP_ACTIVATED) {
		struct file *swap_file = sis->swap_file;
		struct address_space *mapping = swap_file->f_mapping;

		sis->flags &= ~SWP_ACTIVATED;
		if (mapping->a_ops->swap_deactivate)
			mapping->a_ops->swap_deactivate(swap_file);
	}
}
  
/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent tree.
 *
 * This function rather assumes that it is called in ascending page order.
 */
int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
	struct swap_extent *se;
	struct swap_extent *new_se;

	/*
	 * Place the new node at the rightmost position, since the
	 * function is called in ascending page order.
	 */
	while (*link) {
		parent = *link;
		link = &parent->rb_right;
	}

	if (parent) {
		se = rb_entry(parent, struct swap_extent, rb_node);
		BUG_ON(se->start_page + se->nr_pages != start_page);
		if (se->start_block + se->nr_pages == start_block) {
			/* Merge it */
			se->nr_pages += nr_pages;
			return 0;
		}
	}

	/* No merge, insert a new extent. */
	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
	if (new_se == NULL)
		return -ENOMEM;
	new_se->start_page = start_page;
	new_se->nr_pages = nr_pages;
	new_se->start_block = start_block;

	rb_link_node(&new_se->rb_node, parent, link);
	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
	return 1;
}
EXPORT_SYMBOL_GPL(add_swap_extent);
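
/*
 * Editor's note: a small sketch (userspace, excluded from the build) of the
 * merge rule in add_swap_extent() above -- a new run is folded into the
 * rightmost extent only when it continues it on disk as well as in page
 * space.  Names and values are illustrative.
 */
#if 0
#include <assert.h>

struct toy_extent {
	unsigned long start_page, nr_pages;
	unsigned long long start_block;
};

static int toy_can_merge(const struct toy_extent *last,
			 unsigned long start_page,
			 unsigned long long start_block)
{
	/* page contiguity is what the BUG_ON above asserts for callers */
	return last->start_page + last->nr_pages == start_page &&
	       last->start_block + last->nr_pages == start_block;
}

int main(void)
{
	struct toy_extent last = { .start_page = 0, .nr_pages = 256,
				   .start_block = 1000 };

	assert(toy_can_merge(&last, 256, 1256));	/* contiguous: merge */
	assert(!toy_can_merge(&last, 256, 9000));	/* disk gap: new node */
	return 0;
}
#endif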

/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An ordered list of swap extents
 * is built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For all swap devices we set S_SWAPFILE across the life of the swapon.  This
 * prevents users from writing to the swap device, which will corrupt memory.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the list.  To avoid much list walking, we cache the previous
 * search location in `curr_swap_extent', and start new searches from there.
 * This is extremely effective.  The average number of iterations in
 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct file *swap_file = sis->swap_file;
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	int ret;

	if (S_ISBLK(inode->i_mode)) {
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		return ret;
	}

	if (mapping->a_ops->swap_activate) {
		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
		if (ret >= 0)
			sis->flags |= SWP_ACTIVATED;
		if (!ret) {
			sis->flags |= SWP_FS_OPS;
			ret = add_swap_extent(sis, 0, sis->max, 0);
			*span = sis->pages;
		}
		return ret;
	}

	return generic_swapfile_activate(sis, swap_file, span);
}

static int swap_node(struct swap_info_struct *p)
{
	struct block_device *bdev;

	if (p->bdev)
		bdev = p->bdev;
	else
		bdev = p->swap_file->f_inode->i_sb->s_bdev;

	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}

static void setup_swap_info(struct swap_info_struct *p, int prio,
			    unsigned char *swap_map,
			    struct swap_cluster_info *cluster_info)
{
	int i;

	if (prio >= 0)
		p->prio = prio;
	else
		p->prio = --least_priority;
	/*
	 * the plist prio is negated because plist ordering is
	 * low-to-high, while swap ordering is high-to-low
	 */
	p->list.prio = -p->prio;
	for_each_node(i) {
		if (p->prio >= 0)
			p->avail_lists[i].prio = -p->prio;
		else {
			if (swap_node(p) == i)
				p->avail_lists[i].prio = 1;
			else
				p->avail_lists[i].prio = -p->prio;
		}
	}
	p->swap_map = swap_map;
	p->cluster_info = cluster_info;
}
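
/*
 * Editor's note -- a worked example of the negation above, with made-up
 * priorities: a device swapped on with prio 10 gets list.prio -10 and one
 * with prio 5 gets list.prio -5, so the plist (which sorts low-to-high)
 * visits the prio-10 device first, matching swap's high-to-low semantics.
 * For an auto-assigned (negative) priority, the device's own NUMA node gets
 * avail_lists prio 1, so allocations local to that node prefer it.
 */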
  
static void _enable_swap_info(struct swap_info_struct *p)
{
	p->flags |= SWP_WRITEOK | SWP_VALID;
	atomic_long_add(p->pages, &nr_swap_pages);
	total_swap_pages += p->pages;

	assert_spin_locked(&swap_lock);
	/*
	 * both lists are plists, and thus priority ordered.
	 * swap_active_head needs to be priority ordered for swapoff(),
	 * which on removal of any swap_info_struct with an auto-assigned
	 * (i.e. negative) priority increments the auto-assigned priority
	 * of any lower-priority swap_info_structs.
	 * swap_avail_head needs to be priority ordered for get_swap_page(),
	 * which allocates swap pages from the highest available priority
	 * swap_info_struct.
	 */
	plist_add(&p->list, &swap_active_head);
	add_to_avail_list(p);
}
  
static void enable_swap_info(struct swap_info_struct *p, int prio,
				unsigned char *swap_map,
				struct swap_cluster_info *cluster_info,
				unsigned long *frontswap_map)
{
	frontswap_init(p->type, frontswap_map);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	setup_swap_info(p, prio, swap_map, cluster_info);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	/*
	 * Guarantee swap_map, cluster_info, etc. fields are valid
	 * between get/put_swap_device() if SWP_VALID bit is set
	 */
	synchronize_rcu();
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	_enable_swap_info(p);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
}
  
static void reinsert_swap_info(struct swap_info_struct *p)
{
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
	_enable_swap_info(p);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
}

bool has_usable_swap(void)
{
	bool ret = true;

	spin_lock(&swap_lock);
	if (plist_head_empty(&swap_active_head))
		ret = false;
	spin_unlock(&swap_lock);
	return ret;
}

SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	struct swap_cluster_info *cluster_info;
	unsigned long *frontswap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	struct filename *pathname;
	int err, found = 0;
	unsigned int old_block_size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(!current->mm);

	pathname = getname(specialfile);
	if (IS_ERR(pathname))
		return PTR_ERR(pathname);

	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	mapping = victim->f_mapping;
	spin_lock(&swap_lock);
	plist_for_each_entry(p, &swap_active_head, list) {
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping) {
				found = 1;
				break;
			}
		}
	}
	if (!found) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (!security_vm_enough_memory_mm(current->mm, p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	del_from_avail_list(p);
	spin_lock(&p->lock);
	if (p->prio < 0) {
		struct swap_info_struct *si = p;
		int nid;

		plist_for_each_entry_continue(si, &swap_active_head, list) {
			si->prio++;
			si->list.prio--;
			for_each_node(nid) {
				if (si->avail_lists[nid].prio != 1)
					si->avail_lists[nid].prio--;
			}
		}
		least_priority++;
	}
	plist_del(&p->list, &swap_active_head);
	atomic_long_sub(p->pages, &nr_swap_pages);
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	disable_swap_slots_cache_lock();

	set_current_oom_origin();
	err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
	clear_current_oom_origin();

	if (err) {
		/* re-insert swap space back into swap_list */
		reinsert_swap_info(p);
		reenable_swap_slots_cache_unlock();
		goto out_dput;
	}

	reenable_swap_slots_cache_unlock();

	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	/*
	 * wait for swap operations protected by get/put_swap_device()
	 * to complete
	 */
	synchronize_rcu();

	flush_work(&p->discard_work);

	destroy_swap_extents(p);
	if (p->flags & SWP_CONTINUED)
		free_swap_count_continuations(p);

	if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
		atomic_dec(&nr_rotate_swap);

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	drain_mmlist();

	/* wait for anyone still in scan_swap_map */
	p->highest_bit = 0;		/* cuts scans short */
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&p->lock);
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
		spin_lock(&p->lock);
	}

	swap_file = p->swap_file;
	old_block_size = p->old_block_size;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	cluster_info = p->cluster_info;
	p->cluster_info = NULL;
	frontswap_map = frontswap_map_get(p);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	arch_swap_invalidate_area(p->type);
	frontswap_invalidate_area(p->type);
	frontswap_map_set(p, NULL);
	mutex_unlock(&swapon_mutex);
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;
	free_percpu(p->cluster_next_cpu);
	p->cluster_next_cpu = NULL;
	vfree(swap_map);
	kvfree(cluster_info);
	kvfree(frontswap_map);
	/* Destroy swap account information */
	swap_cgroup_swapoff(p->type);
	exit_swap_address_space(p->type);

	inode = mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		struct block_device *bdev = I_BDEV(inode);

		set_blocksize(bdev, old_block_size);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}

	inode_lock(inode);
	inode->i_flags &= ~S_SWAPFILE;
	inode_unlock(inode);
	filp_close(swap_file, NULL);

	/*
	 * Clear the SWP_USED flag after all resources are freed so that swapon
	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
	 * not hold p->lock after we cleared its SWP_WRITEOK.
	 */
	spin_lock(&swap_lock);
	p->flags = 0;
	spin_unlock(&swap_lock);

	err = 0;
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
out:
	putname(pathname);
	return err;
}
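
/*
 * Editor's note: a minimal userspace caller of this syscall through the
 * glibc wrapper (excluded from the build); "/dev/sdb2" is a hypothetical
 * swap partition.
 */
#if 0
#include <stdio.h>
#include <sys/swap.h>

int main(void)
{
	if (swapoff("/dev/sdb2") != 0) {
		perror("swapoff");	/* e.g. EPERM without CAP_SYS_ADMIN */
		return 1;
	}
	return 0;
}
#endif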
  
#ifdef CONFIG_PROC_FS
static __poll_t swaps_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;

	poll_wait(file, &proc_poll_wait, wait);

	if (seq->poll_event != atomic_read(&proc_poll_event)) {
		seq->poll_event = atomic_read(&proc_poll_event);
		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
	}

	return EPOLLIN | EPOLLRDNORM;
}

/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
	struct swap_info_struct *si;
	int type;
	loff_t l = *pos;

	mutex_lock(&swapon_mutex);

	if (!l)
		return SEQ_START_TOKEN;

	for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		if (!--l)
			return si;
	}

	return NULL;
}
  
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
	struct swap_info_struct *si = v;
	int type;

	if (v == SEQ_START_TOKEN)
		type = 0;
	else
		type = si->type + 1;

	++(*pos);
	for (; (si = swap_type_to_swap_info(type)); type++) {
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		return si;
	}

	return NULL;
}
  
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
  
static int swap_show(struct seq_file *swap, void *v)
{
	struct swap_info_struct *si = v;
	struct file *file;
	int len;
	unsigned int bytes, inuse;

	if (si == SEQ_START_TOKEN) {
		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
		return 0;
	}

	bytes = si->pages << (PAGE_SHIFT - 10);
	inuse = si->inuse_pages << (PAGE_SHIFT - 10);

	file = si->swap_file;
	len = seq_file_path(swap, file, " \t\n\\");
	seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
			len < 40 ? 40 - len : 1, " ",
			S_ISBLK(file_inode(file)->i_mode) ?
				"partition" : "file\t",
			bytes, bytes < 10000000 ? "\t" : "",
			inuse, inuse < 10000000 ? "\t" : "",
			si->prio);
	return 0;
}

static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};

static int swaps_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int ret;

	ret = seq_open(file, &swaps_op);
	if (ret)
		return ret;

	seq = file->private_data;
	seq->poll_event = atomic_read(&proc_poll_event);
	return 0;
}

static const struct proc_ops swaps_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,
	.proc_open	= swaps_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
	.proc_poll	= swaps_poll,
};

static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &swaps_proc_ops);
	return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */
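
/*
 * Editor's note: a sketch (userspace, excluded from the build) of consuming
 * this interface.  Because swaps_poll() reports EPOLLERR | EPOLLPRI when
 * proc_poll_event changes, a reader can also poll(2) the open file to learn
 * about swapon/swapoff; here we simply dump the current table.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/swaps", "r");

	if (!f)
		return 1;
	/* first line is the header from swap_show(); the rest are devices */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
#endif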

#ifdef MAX_SWAPFILES_CHECK
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
#endif

static struct swap_info_struct *alloc_swap_info(void)
{
	struct swap_info_struct *p;
	struct swap_info_struct *defer = NULL;
	unsigned int type;
	int i;

	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		if (!(swap_info[type]->flags & SWP_USED))
			break;
	}
	if (type >= MAX_SWAPFILES) {
		spin_unlock(&swap_lock);
		kvfree(p);
		return ERR_PTR(-EPERM);
	}
	if (type >= nr_swapfiles) {
		p->type = type;
		WRITE_ONCE(swap_info[type], p);
		/*
		 * Write swap_info[type] before nr_swapfiles, in case a
		 * racing procfs swap_start() or swap_next() is reading them.
		 * (We never shrink nr_swapfiles, we never free this entry.)
		 */
		smp_wmb();
		WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
	} else {
		defer = p;
		p = swap_info[type];
		/*
		 * Do not memset this entry: a racing procfs swap_next()
		 * would be relying on p->type to remain valid.
		 */
	}
	p->swap_extent_root = RB_ROOT;
	plist_node_init(&p->list, 0);
	for_each_node(i)
		plist_node_init(&p->avail_lists[i], 0);
	p->flags = SWP_USED;
	spin_unlock(&swap_lock);
	kvfree(defer);
	spin_lock_init(&p->lock);
	spin_lock_init(&p->cont_lock);

	return p;
}
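
/*
 * Editor's note: a loose userspace analogue (excluded from the build) of the
 * publish ordering above, written with C11 atomics rather than the kernel's
 * smp_wmb()/READ_ONCE() primitives; all names here are hypothetical.
 */
#if 0
#include <stdatomic.h>
#include <stddef.h>

struct item { int type; };

static struct item *slots[32];
static atomic_uint nr_slots;

/* Writer: fill the slot first, then publish the count with release order. */
static void publish(unsigned int idx, struct item *p)
{
	slots[idx] = p;			/* like WRITE_ONCE(swap_info[type], p) */
	atomic_store_explicit(&nr_slots, idx + 1,
			      memory_order_release);	/* like smp_wmb() + write */
}

/* Reader: an acquire load of the count makes the slot's contents visible. */
static struct item *lookup(unsigned int idx)
{
	if (idx >= atomic_load_explicit(&nr_slots, memory_order_acquire))
		return NULL;
	return slots[idx];
}
#endif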

static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
{
	int error;

	if (S_ISBLK(inode->i_mode)) {
		p->bdev = blkdev_get_by_dev(inode->i_rdev,
				   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
		if (IS_ERR(p->bdev)) {
			error = PTR_ERR(p->bdev);
			p->bdev = NULL;
			return error;
		}
		p->old_block_size = block_size(p->bdev);
		error = set_blocksize(p->bdev, PAGE_SIZE);
		if (error < 0)
			return error;
		/*
		 * Zoned block devices contain zones that have a sequential
		 * write only restriction.  Hence zoned block devices are not
		 * suitable for swapping.  Disallow them here.
		 */
		if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
			return -EINVAL;
		p->flags |= SWP_BLKDEV;
	} else if (S_ISREG(inode->i_mode)) {
		p->bdev = inode->i_sb->s_bdev;
	}

	return 0;
}

/*
 * Find out how many pages are allowed for a single swap device. There
 * are two limiting factors:
 * 1) the number of bits for the swap offset in the swp_entry_t type, and
 * 2) the number of bits in the swap pte, as defined by the different
 * architectures.
 *
 * In order to find the largest possible bit mask, a swap entry with
 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
 * decoded to a swp_entry_t again, and finally the swap offset is
 * extracted.
 *
 * This will mask all the bits from the initial ~0UL mask that can't
 * be encoded in either the swp_entry_t or the architecture definition
 * of a swap pte.
 */
unsigned long generic_max_swapfile_size(void)
{
	return swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
}

/* Can be overridden by an architecture for additional checks. */
__weak unsigned long max_swapfile_size(void)
{
	return generic_max_swapfile_size();
}
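
/*
 * Editor's note: the round-trip trick above, demonstrated with a toy
 * encoding (userspace, excluded from the build).  The 28-bit width is an
 * arbitrary assumption standing in for an architecture's swap pte format.
 */
#if 0
#include <assert.h>

#define TOY_OFFSET_BITS	28
#define TOY_OFFSET_MASK	((1UL << TOY_OFFSET_BITS) - 1)

/* stands in for swp_entry_to_pte(): bits that don't fit are simply lost */
static unsigned long toy_encode(unsigned long offset)
{
	return offset & TOY_OFFSET_MASK;
}

int main(void)
{
	/* encoding ~0UL and decoding it back reveals the usable width... */
	unsigned long max_offset = toy_encode(~0UL);

	/* ...so the page limit is the largest surviving offset plus one */
	assert(max_offset + 1 == 1UL << TOY_OFFSET_BITS);
	return 0;
}
#endif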

static unsigned long read_swap_header(struct swap_info_struct *p,
					union swap_header *swap_header,
					struct inode *inode)
{
	int i;
	unsigned long maxpages;
	unsigned long swapfilepages;
	unsigned long last_page;

	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
		pr_err("Unable to find swap-space signature\n");
		return 0;
	}

	/* swap partition endianness hack... */
	if (swab32(swap_header->info.version) == 1) {
		swab32s(&swap_header->info.version);
		swab32s(&swap_header->info.last_page);
		swab32s(&swap_header->info.nr_badpages);
		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
			return 0;
		for (i = 0; i < swap_header->info.nr_badpages; i++)
			swab32s(&swap_header->info.badpages[i]);
	}
	/* Check the swap header's sub-version */
	if (swap_header->info.version != 1) {
		pr_warn("Unable to handle swap header version %d\n",
			swap_header->info.version);
		return 0;
	}

	p->lowest_bit  = 1;
	p->cluster_next = 1;
	p->cluster_nr = 0;

	maxpages = max_swapfile_size();
	last_page = swap_header->info.last_page;
	if (!last_page) {
		pr_warn("Empty swap-file\n");
		return 0;
	}
	if (last_page > maxpages) {
		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
			maxpages << (PAGE_SHIFT - 10),
			last_page << (PAGE_SHIFT - 10));
	}
	if (maxpages > last_page) {
		maxpages = last_page + 1;
		/* p->max is an unsigned int: don't overflow it */
		if ((unsigned int)maxpages == 0)
			maxpages = UINT_MAX;
	}
	p->highest_bit = maxpages - 1;

	if (!maxpages)
		return 0;
	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
	if (swapfilepages && maxpages > swapfilepages) {
		pr_warn("Swap area shorter than signature indicates\n");
		return 0;
	}
	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
		return 0;
	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
		return 0;

	return maxpages;
}
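
/*
 * Editor's note: a sketch (userspace, excluded from the build) of reading
 * the same on-disk header.  The offsets assume the classic mkswap layout --
 * a 1 KiB boot block followed by the info struct, with the signature in the
 * last 10 bytes of the first page -- and native byte order, mirroring the
 * swab check above.  The path and page size are assumptions.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define TOY_PAGE_SIZE	4096	/* must match the page size mkswap used */

int main(int argc, char **argv)
{
	unsigned char page[TOY_PAGE_SIZE];
	uint32_t version, last_page;
	FILE *f = fopen(argc > 1 ? argv[1] : "/dev/sdb2", "rb");

	if (!f || fread(page, 1, sizeof(page), f) != sizeof(page))
		return 1;
	if (memcmp(page + TOY_PAGE_SIZE - 10, "SWAPSPACE2", 10) != 0)
		return 1;

	memcpy(&version, page + 1024, 4);	/* info.version */
	memcpy(&last_page, page + 1028, 4);	/* info.last_page */
	printf("version %u, last_page %u\n", version, last_page);
	fclose(f);
	return 0;
}
#endif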

#define SWAP_CLUSTER_INFO_COLS						\
	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
#define SWAP_CLUSTER_SPACE_COLS						\
	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
#define SWAP_CLUSTER_COLS						\
	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
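
/*
 * Editor's note -- a worked example with assumed sizes: with 64-byte cache
 * lines and an 8-byte struct swap_cluster_info, SWAP_CLUSTER_INFO_COLS is
 * 64/8 = 8; with SWAP_ADDRESS_SPACE_PAGES = 16384 and SWAPFILE_CLUSTER =
 * 256, SWAP_CLUSTER_SPACE_COLS is 16384/256 = 64; SWAP_CLUSTER_COLS is then
 * max(8, 64) = 64 columns for the interleaved initialization below.
 */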

static int setup_swap_map_and_extents(struct swap_info_struct *p,
					union swap_header *swap_header,
					unsigned char *swap_map,
					struct swap_cluster_info *cluster_info,
					unsigned long maxpages,
					sector_t *span)
{
	unsigned int j, k;
	unsigned int nr_good_pages;
	int nr_extents;
	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
	unsigned long i, idx;

	nr_good_pages = maxpages - 1;	/* omit header page */

	cluster_list_init(&p->free_clusters);
	cluster_list_init(&p->discard_clusters);

	for (i = 0; i < swap_header->info.nr_badpages; i++) {
		unsigned int page_nr = swap_header->info.badpages[i];
		if (page_nr == 0 || page_nr > swap_header->info.last_page)
			return -EINVAL;
		if (page_nr < maxpages) {
			swap_map[page_nr] = SWAP_MAP_BAD;
			nr_good_pages--;
			/*
			 * Haven't marked the cluster free yet, no list
			 * operation involved
			 */
			inc_cluster_info_page(p, cluster_info, page_nr);
		}
	}

	/* Haven't marked the cluster free yet, no list operation involved */
	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
		inc_cluster_info_page(p, cluster_info, i);

	if (nr_good_pages) {
		swap_map[0] = SWAP_MAP_BAD;
		/*
		 * Haven't marked the cluster free yet, no list
		 * operation involved
		 */
		inc_cluster_info_page(p, cluster_info, 0);
		p->max = maxpages;
		p->pages = nr_good_pages;
		nr_extents = setup_swap_extents(p, span);
		if (nr_extents < 0)
			return nr_extents;
		nr_good_pages = p->pages;
	}
	if (!nr_good_pages) {
		pr_warn("Empty swap-file\n");
		return -EINVAL;
	}

	if (!cluster_info)
		return nr_extents;

	/*
	 * Reduce false cache line sharing between cluster_info entries
	 * belonging to the same swap address space.
	 */
	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
		j = (k + col) % SWAP_CLUSTER_COLS;
		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
			idx = i * SWAP_CLUSTER_COLS + j;
			if (idx >= nr_clusters)
				continue;
			if (cluster_count(&cluster_info[idx]))
				continue;
			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
			cluster_list_add_tail(&p->free_clusters, cluster_info,
					      idx);
		}
	}
	return nr_extents;
}
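
/*
 * Editor's note: a sketch (userspace, excluded from the build) of the
 * column-interleaved visiting order used by the loop above, with toy sizes
 * standing in for SWAP_CLUSTER_COLS and nr_clusters.
 */
#if 0
#include <stdio.h>

#define TOY_COLS	4
#define TOY_CLUSTERS	10

int main(void)
{
	unsigned long i, j, k, idx, col = 1;	/* col as derived from cluster_next */

	/* prints "1 5 9 2 6 3 7 0 4 8": neighbours land in different columns */
	for (k = 0; k < TOY_COLS; k++) {
		j = (k + col) % TOY_COLS;
		for (i = 0; i < (TOY_CLUSTERS + TOY_COLS - 1) / TOY_COLS; i++) {
			idx = i * TOY_COLS + j;
			if (idx >= TOY_CLUSTERS)
				continue;
			printf("%lu ", idx);
		}
	}
	printf("\n");
	return 0;
}
#endif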

/*
 * Helper to sys_swapon determining if a given swap
 * backing device queue supports DISCARD operations.
 */
static bool swap_discardable(struct swap_info_struct *si)
{
	struct request_queue *q = bdev_get_queue(si->bdev);

	if (!q || !blk_queue_discard(q))
		return false;

	return true;
}
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3038
3039
3040
  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
  {
  	struct swap_info_struct *p;
91a27b2a7   Jeff Layton   vfs: define struc...
3041
  	struct filename *name;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3042
3043
  	struct file *swap_file = NULL;
  	struct address_space *mapping;
40531542e   Cesar Eduardo Barros   sys_swapon: separ...
3044
  	int prio;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3045
3046
  	int error;
  	union swap_header *swap_header;
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
3047
  	int nr_extents;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3048
3049
  	sector_t span;
  	unsigned long maxpages;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3050
  	unsigned char *swap_map = NULL;
2a8f94493   Shaohua Li   swap: change bloc...
3051
  	struct swap_cluster_info *cluster_info = NULL;
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
3052
  	unsigned long *frontswap_map = NULL;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3053
3054
  	struct page *page = NULL;
  	struct inode *inode = NULL;
7cbf31923   Omar Sandoval   mm: fix nr_rotate...
3055
  	bool inced_nr_rotate_swap = false;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3056

d15cab975   Hugh Dickins   swapon: check val...
3057
3058
  	if (swap_flags & ~SWAP_FLAGS_VALID)
  		return -EINVAL;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3059
3060
  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;
a2468cc9b   Aaron Lu   swap: choose swap...
3061
3062
  	if (!swap_avail_heads)
  		return -ENOMEM;
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3063
  	p = alloc_swap_info();
2542e5134   Cesar Eduardo Barros   sys_swapon: simpl...
3064
3065
  	if (IS_ERR(p))
  		return PTR_ERR(p);
53cbb2435   Cesar Eduardo Barros   sys_swapon: separ...
3066

815c2c543   Shaohua Li   swap: make swap d...
3067
  	INIT_WORK(&p->discard_work, swap_discard_work);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3068
  	name = getname(specialfile);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3069
  	if (IS_ERR(name)) {
7de7fb6b3   Cesar Eduardo Barros   sys_swapon: move ...
3070
  		error = PTR_ERR(name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3071
  		name = NULL;
bd69010b0   Cesar Eduardo Barros   sys_swapon: use a...
3072
  		goto bad_swap;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3073
  	}
669abf4e5   Jeff Layton   vfs: make path_op...
3074
  	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3075
  	if (IS_ERR(swap_file)) {
7de7fb6b3   Cesar Eduardo Barros   sys_swapon: move ...
3076
  		error = PTR_ERR(swap_file);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3077
  		swap_file = NULL;
bd69010b0   Cesar Eduardo Barros   sys_swapon: use a...
3078
  		goto bad_swap;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3079
3080
3081
3082
  	}
  
  	p->swap_file = swap_file;
  	mapping = swap_file->f_mapping;
2130781e2   Cesar Eduardo Barros   sys_swapon: fix i...
3083
  	inode = mapping->host;
6f179af88   Hugh Dickins   mm: fix potential...
3084

4d0e1e107   Cesar Eduardo Barros   sys_swapon: separ...
3085
3086
  	error = claim_swapfile(p, inode);
  	if (unlikely(error))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3087
  		goto bad_swap;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3088

d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3089
3090
3091
3092
3093
  	inode_lock(inode);
  	if (IS_SWAPFILE(inode)) {
  		error = -EBUSY;
  		goto bad_swap_unlock_inode;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3094
3095
3096
3097
3098
  	/*
  	 * Read the swap header.
  	 */
  	if (!mapping->a_ops->readpage) {
  		error = -EINVAL;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3099
  		goto bad_swap_unlock_inode;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3100
  	}
090d2b185   Pekka Enberg   [PATCH] read_mapp...
3101
  	page = read_mapping_page(mapping, 0, swap_file);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3102
3103
  	if (IS_ERR(page)) {
  		error = PTR_ERR(page);
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3104
  		goto bad_swap_unlock_inode;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3105
  	}
81e339712   Hugh Dickins   swapfile: remove ...
3106
  	swap_header = kmap(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3107

ca8bd38bf   Cesar Eduardo Barros   sys_swapon: separ...
3108
3109
  	maxpages = read_swap_header(p, swap_header, inode);
  	if (unlikely(!maxpages)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3110
  		error = -EINVAL;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3111
  		goto bad_swap_unlock_inode;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3112
  	}
886bb7e9c   Hugh Dickins   swapfile: remove ...
3113

81e339712   Hugh Dickins   swapfile: remove ...
3114
  	/* OK, set up the swap map and apply the bad block list */
803d0c835   Cesar Eduardo Barros   sys_swapon: use v...
3115
  	swap_map = vzalloc(maxpages);
81e339712   Hugh Dickins   swapfile: remove ...
3116
3117
  	if (!swap_map) {
  		error = -ENOMEM;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3118
  		goto bad_swap_unlock_inode;
81e339712   Hugh Dickins   swapfile: remove ...
3119
  	}
f05714293   Minchan Kim   mm: support anony...
3120

1cb039f3d   Christoph Hellwig   bdi: replace BDI_...
3121
  	if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
f05714293   Minchan Kim   mm: support anony...
3122
  		p->flags |= SWP_STABLE_WRITES;
a8b456d01   Christoph Hellwig   bdi: remove BDI_C...
3123
  	if (p->bdev && p->bdev->bd_disk->fops->rw_page)
539a6fea7   Minchan Kim   mm, swap: introdu...
3124
  		p->flags |= SWP_SYNCHRONOUS_IO;
2a8f94493   Shaohua Li   swap: change bloc...
3125
  	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
6f179af88   Hugh Dickins   mm: fix potential...
3126
  		int cpu;
235b62176   Huang, Ying   mm/swap: add clus...
3127
  		unsigned long ci, nr_cluster;
6f179af88   Hugh Dickins   mm: fix potential...
3128

2a8f94493   Shaohua Li   swap: change bloc...
3129
  		p->flags |= SWP_SOLIDSTATE;
490705888   Huang Ying   swap: reduce lock...
3130
3131
3132
3133
3134
  		p->cluster_next_cpu = alloc_percpu(unsigned int);
  		if (!p->cluster_next_cpu) {
  			error = -ENOMEM;
  			goto bad_swap_unlock_inode;
  		}
2a8f94493   Shaohua Li   swap: change bloc...
3135
3136
3137
3138
  		/*
  		 * select a random position to start with to help wear leveling
  		 * SSD
  		 */
490705888   Huang Ying   swap: reduce lock...
3139
3140
3141
3142
  		for_each_possible_cpu(cpu) {
  			per_cpu(*p->cluster_next_cpu, cpu) =
  				1 + prandom_u32_max(p->highest_bit);
  		}
235b62176   Huang, Ying   mm/swap: add clus...
3143
  		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2a8f94493   Shaohua Li   swap: change bloc...
3144

778e1cdd8   Kees Cook   treewide: kvzallo...
3145
  		cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
54f180d3c   Huang Ying   mm, swap: use kvz...
3146
  					GFP_KERNEL);
2a8f94493   Shaohua Li   swap: change bloc...
3147
3148
  		if (!cluster_info) {
  			error = -ENOMEM;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3149
  			goto bad_swap_unlock_inode;
2a8f94493   Shaohua Li   swap: change bloc...
3150
  		}
235b62176   Huang, Ying   mm/swap: add clus...
3151
3152
3153
  
  		for (ci = 0; ci < nr_cluster; ci++)
  			spin_lock_init(&((cluster_info + ci)->lock));
ebc2a1a69   Shaohua Li   swap: make cluste...
3154
3155
3156
  		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
  		if (!p->percpu_cluster) {
  			error = -ENOMEM;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3157
  			goto bad_swap_unlock_inode;
ebc2a1a69   Shaohua Li   swap: make cluste...
3158
  		}
6f179af88   Hugh Dickins   mm: fix potential...
3159
  		for_each_possible_cpu(cpu) {
ebc2a1a69   Shaohua Li   swap: make cluste...
3160
  			struct percpu_cluster *cluster;
6f179af88   Hugh Dickins   mm: fix potential...
3161
  			cluster = per_cpu_ptr(p->percpu_cluster, cpu);
ebc2a1a69   Shaohua Li   swap: make cluste...
3162
3163
  			cluster_set_null(&cluster->index);
  		}
7cbf31923   Omar Sandoval   mm: fix nr_rotate...
3164
  	} else {
81a0298bd   Huang Ying   mm, swap: don't u...
3165
  		atomic_inc(&nr_rotate_swap);
7cbf31923   Omar Sandoval   mm: fix nr_rotate...
3166
3167
  		inced_nr_rotate_swap = true;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3168

1421ef3cd   Cesar Eduardo Barros   sys_swapon: call ...
3169
3170
  	error = swap_cgroup_swapon(p->type, maxpages);
  	if (error)
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3171
  		goto bad_swap_unlock_inode;
1421ef3cd   Cesar Eduardo Barros   sys_swapon: call ...
3172

915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
3173
  	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2a8f94493   Shaohua Li   swap: change bloc...
3174
  		cluster_info, maxpages, &span);
915d4d7bc   Cesar Eduardo Barros   sys_swapon: separ...
3175
3176
  	if (unlikely(nr_extents < 0)) {
  		error = nr_extents;
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3177
  		goto bad_swap_unlock_inode;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3178
  	}
38b5faf4b   Dan Magenheimer   mm: frontswap: co...
3179
  	/* frontswap enabled? set up bit-per-page map for frontswap */
8ea1d2a19   Vlastimil Babka   mm, frontswap: co...
3180
  	if (IS_ENABLED(CONFIG_FRONTSWAP))
778e1cdd8   Kees Cook   treewide: kvzallo...
3181
3182
  		frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
  					 sizeof(long),
54f180d3c   Huang Ying   mm, swap: use kvz...
3183
  					 GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3184

2a8f94493   Shaohua Li   swap: change bloc...
3185
3186
3187
3188
3189
3190
3191
3192
3193
  	if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
  		/*
  		 * When discard is enabled for swap with no particular
  		 * policy flagged, we set all swap discard flags here in
  		 * order to sustain backward compatibility with older
  		 * swapon(8) releases.
  		 */
  		p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
  			     SWP_PAGE_DISCARD);
dcf6b7ddd   Rafael Aquini   swap: discard whi...
3194

2a8f94493   Shaohua Li   swap: change bloc...
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
  		/*
  		 * By flagging sys_swapon, a sysadmin can tell us to
  		 * either do single-time area discards only, or to just
  		 * perform discards for released swap page-clusters.
  		 * Now it's time to adjust the p->flags accordingly.
  		 */
  		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
  			p->flags &= ~SWP_PAGE_DISCARD;
  		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
  			p->flags &= ~SWP_AREA_DISCARD;
  
  		/* issue a swapon-time discard if it's still required */
  		if (p->flags & SWP_AREA_DISCARD) {
  			int err = discard_swap(p);
  			if (unlikely(err))
  				pr_err("swapon: discard_swap(%p): %d
  ",
  					p, err);
dcf6b7ddd   Rafael Aquini   swap: discard whi...
3213
  		}
20137a490   Hugh Dickins   swapfile: swapon ...
3214
  	}
6a6ba8317   Hugh Dickins   swapfile: swapon ...
3215

4b3ef9daa   Huang, Ying   mm/swap: split sw...
3216
3217
  	error = init_swap_address_space(p->type, maxpages);
  	if (error)
d795a90e2   Naohiro Aota   mm/swapfile.c: mo...
3218
  		goto bad_swap_unlock_inode;
4b3ef9daa   Huang, Ying   mm/swap: split sw...
3219

	/*
	 * Flush any pending IO and dirty mappings before we start using this
	 * swap device.
	 */
	inode->i_flags |= S_SWAPFILE;
	error = inode_drain_writes(inode);
	if (error) {
		inode->i_flags &= ~S_SWAPFILE;
		goto free_swap_address_space;
	}
	mutex_lock(&swapon_mutex);
	prio = -1;
	if (swap_flags & SWAP_FLAG_PREFER)
		prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);

	pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "",
		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
		(frontswap_map) ? "FS" : "");
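
	/*
	 * For orientation, on a hypothetical SSD partition with cluster
	 * discard enabled the line above would read roughly (values
	 * illustrative only):
	 *
	 *   Adding 8388604k swap on /dev/nvme0n1p3.  Priority:-2 extents:1 across:8388604k SSDc
	 */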

	mutex_unlock(&swapon_mutex);
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	error = 0;
	goto out;
free_swap_address_space:
	exit_swap_address_space(p->type);
bad_swap_unlock_inode:
	inode_unlock(inode);
bad_swap:
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;
	free_percpu(p->cluster_next_cpu);
	p->cluster_next_cpu = NULL;
	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
		set_blocksize(p->bdev, p->old_block_size);
		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}
	inode = NULL;
	destroy_swap_extents(p);
	swap_cgroup_swapoff(p->type);
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	kvfree(cluster_info);
	kvfree(frontswap_map);
	if (inced_nr_rotate_swap)
		atomic_dec(&nr_rotate_swap);
	if (swap_file)
		filp_close(swap_file, NULL);
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		put_page(page);
	}
	if (name)
		putname(name);
	if (inode)
		inode_unlock(inode);
	if (!error)
		enable_swap_slots_cache();
	return error;
}
  
void si_swapinfo(struct sysinfo *val)
{
	unsigned int type;
	unsigned long nr_to_be_unused = 0;

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *si = swap_info[type];

		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
			nr_to_be_unused += si->inuse_pages;
	}
	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	spin_unlock(&swap_lock);
}
EXPORT_SYMBOL_GPL(si_swapinfo);
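
/*
 * Minimal illustrative caller (hypothetical module code, not part of this
 * file): sampling the page counts that si_swapinfo() publishes, i.e. the
 * numbers behind SwapTotal/SwapFree in /proc/meminfo.
 *
 *	struct sysinfo val;
 *
 *	si_swapinfo(&val);
 *	pr_info("swap: %lu of %lu pages free\n",
 *		val.freeswap, val.totalswap);
 */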

/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Returns an error code in the following cases:
 * - success -> 0
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is migration entry -> EINVAL
 * - swap-cache reference is requested but there is already one. -> EEXIST
 * - swap-cache reference is requested but the entry is not used. -> ENOENT
 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned char count;
	unsigned char has_cache;
	int err = -EINVAL;

	p = get_swap_device(entry);
	if (!p)
		goto out;
	offset = swp_offset(entry);
	ci = lock_cluster_or_swap_info(p, offset);

	count = p->swap_map[offset];

	/*
	 * swapin_readahead() doesn't check if a swap entry is valid, so the
	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
	 */
	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
		err = -ENOENT;
		goto unlock_out;
	}

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;
	err = 0;

	if (usage == SWAP_HAS_CACHE) {

		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
		if (!has_cache && count)
			has_cache = SWAP_HAS_CACHE;
		else if (has_cache)		/* someone else added cache */
			err = -EEXIST;
		else				/* no users remaining */
			err = -ENOENT;

	} else if (count || has_cache) {

		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
			count += usage;
		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
			err = -EINVAL;
		else if (swap_count_continued(p, offset, count))
			count = COUNT_CONTINUED;
		else
			err = -ENOMEM;
	} else
		err = -ENOENT;			/* unused swap entry */

	WRITE_ONCE(p->swap_map[offset], count | has_cache);

unlock_out:
	unlock_cluster_or_swap_info(p, ci);
out:
	if (p)
		put_swap_device(p);
	return err;
}
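
/*
 * Reading aid (informal, reconstructed from the checks above): each swap_map
 * byte combines the SWAP_HAS_CACHE flag (a swap cache page references the
 * entry), the COUNT_CONTINUED flag (the count has overflowed into
 * continuation pages), and a small reference count bounded by SWAP_MAP_MAX,
 * with SWAP_MAP_BAD and SWAP_MAP_SHMEM reserved as special values.
 */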

/*
 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}
  
/*
 * Increase reference count of swap entry by 1.
 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
 * might occur if a page table entry has got corrupted.
 */
int swap_duplicate(swp_entry_t entry)
{
	int err = 0;

	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
		err = add_swap_count_continuation(entry, GFP_ATOMIC);
	return err;
}
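
/*
 * Illustrative caller pattern (a sketch, with hypothetical lock names): a
 * duplicator that cannot sleep retries the continuation allocation with
 * GFP_KERNEL once locks are dropped, as described at
 * add_swap_count_continuation() below.
 *
 *	again:
 *		...with the page table lock held...
 *		if (swap_duplicate(entry) == -ENOMEM) {
 *			spin_unlock(ptl);
 *			if (add_swap_count_continuation(entry, GFP_KERNEL))
 *				return -ENOMEM;
 *			goto again;
 *		}
 */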

/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * This can return error codes. Returns 0 on success.
 * -EEXIST means there is already a swap cache.
 * Note: return code is different from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
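
/*
 * Contrast with swap_duplicate(), informally: a swapin path calling
 * swapcache_prepare() typically treats -EEXIST as "another task is already
 * installing this page's swap cache entry" and retries its lookup, while
 * swap_duplicate() never reports -EEXIST and only surfaces -ENOMEM.
 */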

struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
	return swap_type_to_swap_info(swp_type(entry));
}

struct swap_info_struct *page_swap_info(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };
	return swp_swap_info(entry);
}

/*
 * out-of-line __page_file_ methods to avoid include hell.
 */
struct address_space *__page_file_mapping(struct page *page)
{
	return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);

pgoff_t __page_file_index(struct page *page)
{
	swp_entry_t swap = { .val = page_private(page) };
	return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
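
/*
 * Informal sketch of the swp_entry_t round trip used above (field widths
 * are architecture-specific; orientation only, not a definition):
 *
 *	swp_entry_t entry = swp_entry(type, offset);
 *	swp_type(entry);	// recovers the swap device index
 *	swp_offset(entry);	// recovers the page slot within that device
 *
 * For a page in the swap cache, page_private(page) holds entry.val.
 */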

/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;
	int ret = 0;

	/*
	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = get_swap_device(entry);
	if (!si) {
		/*
		 * An acceptable race has occurred since the failing
		 * __swap_duplicate(): the swap device may have been swapped off.
		 */
		goto outer;
	}
	spin_lock(&si->lock);

	offset = swp_offset(entry);

	ci = lock_cluster(si, offset);

	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The higher the swap count, the more likely it is that tasks
		 * will race to add swap count continuation: we need to avoid
		 * over-provisioning.
		 */
		goto out;
	}

	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
	 * no architecture is using highmem pages for kernel page tables: so it
	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	spin_lock(&si->cont_lock);
	/*
	 * Page allocation does not initialize the page's lru field,
	 * but it does always reset its private field.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * If the previous map said no continuation, but we've found
		 * a continuation page, free our allocation and use this one.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out_unlock_cont;

		map = kmap_atomic(list_page) + offset;
		count = *map;
		kunmap_atomic(map);

		/*
		 * If this continuation count now has some space in it,
		 * free our allocation and use this one.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out_unlock_cont;
	}

	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now it's attached, don't free it */
out_unlock_cont:
	spin_unlock(&si->cont_lock);
out:
	unlock_cluster(ci);
	spin_unlock(&si->lock);
	put_swap_device(si);
outer:
	if (page)
		__free_page(page);
	return ret;
}
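
/*
 * Informal picture of what add_swap_count_continuation() builds (a reading
 * aid, not a definition): once page_private() of the vmalloc'ed swap_map
 * page is set to SWP_CONTINUED, that page heads a list of continuation
 * pages, each holding one extra count byte per entry at the same in-page
 * offset:
 *
 *	swap_map page (head, private == SWP_CONTINUED)
 *	    head->lru -> continuation page 1 -> continuation page 2 -> ...
 *	    same offset: low "digit"    higher "digit"     higher "digit"
 */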
  
/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 *
 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
 * lock.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;
	bool ret;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* need to add count continuation */
	}

	spin_lock(&si->cont_lock);
	offset &= ~PAGE_MASK;
	page = list_next_entry(head, lru);
	map = kmap_atomic(page) + offset;

	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
		goto init_map;		/* jump over SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Think of how you add 1 to 999
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map);
			page = list_next_entry(page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			kunmap_atomic(map);
			page = list_next_entry(page, lru);
			if (page == head) {
				ret = false;	/* add count continuation */
				goto out;
			}
			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* we didn't zero the page */
		}
		*map += 1;
		kunmap_atomic(map);
		while ((page = list_prev_entry(page, lru)) != head) {
			map = kmap_atomic(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map);
		}
		ret = true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Think of how you subtract 1 from 1000
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map);
			page = list_next_entry(page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		if (*map == 0)
			count = 0;
		kunmap_atomic(map);
		while ((page = list_prev_entry(page, lru)) != head) {
			map = kmap_atomic(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map);
		}
		ret = count == COUNT_CONTINUED;
	}
out:
	spin_unlock(&si->cont_lock);
	return ret;
}
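
/*
 * Worked example of the carry/borrow above, informally: the swap_map byte
 * is the low "digit" and each continuation page holds the next "digit" for
 * the same entry. Incrementing past SWAP_MAP_MAX is like adding 1 to 999:
 * every continuation byte already at SWAP_CONT_MAX resets while the carry
 * moves one page further along head->lru. Decrementing from a bare
 * COUNT_CONTINUED borrows back in the opposite direction, like subtracting
 * 1 from 1000.
 */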
  
/*
 * free_swap_count_continuations - called at swapoff time to free all the
 * continuation pages appended to the swap_map, after swap_map is quiesced
 * and before vfree'ing it.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
	pgoff_t offset;

	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
		struct page *head;
		head = vmalloc_to_page(si->swap_map + offset);
		if (page_private(head)) {
			struct page *page, *next;

			list_for_each_entry_safe(page, next, &head->lru, lru) {
				list_del(&page->lru);
				__free_page(page);
			}
		}
	}
}
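
/*
 * Note on the stride above: swap_map holds one byte per swap entry, so each
 * vmalloc'ed swap_map page (each potential continuation-list head) covers
 * PAGE_SIZE entries; stepping offset by PAGE_SIZE therefore visits every
 * head page exactly once.
 */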

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
	struct swap_info_struct *si, *next;
	int nid = page_to_nid(page);

	if (!(gfp_mask & __GFP_IO))
		return;

	if (!blk_cgroup_congested())
		return;

	/*
	 * We've already scheduled a throttle, avoid taking the global swap
	 * lock.
	 */
	if (current->throttle_queue)
		return;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
				  avail_lists[nid]) {
		if (si->bdev) {
			blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
			break;
		}
	}
	spin_unlock(&swap_avail_lock);
}
#endif

static int __init swapfile_init(void)
{
	int nid;

	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
					 GFP_KERNEL);
	if (!swap_avail_heads) {
		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
		return -ENOMEM;
	}

	for_each_node(nid)
		plist_head_init(&swap_avail_heads[nid]);

	return 0;
}
subsys_initcall(swapfile_init);