  // SPDX-License-Identifier: GPL-2.0
  /*
   *  linux/mm/swap_state.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   *  Swap reorganised 29.12.95, Stephen Tweedie
   *
   *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
   */
  #include <linux/mm.h>
  #include <linux/gfp.h>
  #include <linux/kernel_stat.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/init.h>
  #include <linux/pagemap.h>
  #include <linux/backing-dev.h>
  #include <linux/blkdev.h>
  #include <linux/pagevec.h>
  #include <linux/migrate.h>
  #include <linux/vmalloc.h>
  #include <linux/swap_slots.h>
  #include <linux/huge_mm.h>
  
  #include <asm/pgtable.h>
  
  /*
   * swapper_space is a fiction, retained to simplify the path through
   * vmscan's shrink_page_list.
   */
  static const struct address_space_operations swap_aops = {
  	.writepage	= swap_writepage,
  	.set_page_dirty	= swap_set_page_dirty,
  #ifdef CONFIG_MIGRATION
  	.migratepage	= migrate_page,
  #endif
  };
  struct address_space *swapper_spaces[MAX_SWAPFILES];
  static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
  bool swap_vma_readahead = true;
  #define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
  #define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
  #define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
  #define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)
  
  #define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
  #define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
  #define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)
  
  #define SWAP_RA_VAL(addr, win, hits)				\
  	(((addr) & PAGE_MASK) |					\
  	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
  	 ((hits) & SWAP_RA_HITS_MASK))
  
  /* Initial readahead hits is 4 to start up with a small window */
  #define GET_SWAP_RA_VAL(vma)					\
  	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
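
  /*
   * Illustrative sketch (annotation, not part of the original file): the
   * per-VMA readahead state packs the fault address, the window size and
   * the hit count into one word.  With 4KB pages (PAGE_SHIFT == 12, so
   * SWAP_RA_WIN_SHIFT == 6) the low 6 bits hold the hits, bits 6..11 hold
   * the window, and the page-aligned address takes the rest, e.g.:
   *
   *	val = SWAP_RA_VAL(0x7f0000001000, 8, 3);
   *	SWAP_RA_ADDR(val) == 0x7f0000001000
   *	SWAP_RA_WIN(val)  == 8
   *	SWAP_RA_HITS(val) == 3
   */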
  
  #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
  #define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)
  
  static struct {
  	unsigned long add_total;
  	unsigned long del_total;
  	unsigned long find_success;
  	unsigned long find_total;
  } swap_cache_info;
  unsigned long total_swapcache_pages(void)
  {
  	unsigned int i, j, nr;
  	unsigned long ret = 0;
  	struct address_space *spaces;
  	rcu_read_lock();
  	for (i = 0; i < MAX_SWAPFILES; i++) {
  		/*
  		 * The corresponding entries in nr_swapper_spaces and
  		 * swapper_spaces will be reused only after at least
  		 * one grace period.  So it is impossible for them to
  		 * belong to different usages.
  		 */
  		nr = nr_swapper_spaces[i];
  		spaces = rcu_dereference(swapper_spaces[i]);
  		if (!nr || !spaces)
  			continue;
  		for (j = 0; j < nr; j++)
  			ret += spaces[j].nrpages;
  	}
  	rcu_read_unlock();
  	return ret;
  }
  static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
  void show_swap_cache_info(void)
  {
  	printk("%lu pages in swap cache\n", total_swapcache_pages());
  	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
  		swap_cache_info.add_total, swap_cache_info.del_total,
  		swap_cache_info.find_success, swap_cache_info.find_total);
  	printk("Free swap  = %ldkB\n",
  		get_nr_swap_pages() << (PAGE_SHIFT - 10));
  	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
  }
  
  /*
   * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
   * but sets SwapCache flag and private instead of mapping and index.
   */
  int __add_to_swap_cache(struct page *page, swp_entry_t entry)
  {
  	int error, i, nr = hpage_nr_pages(page);
  	struct address_space *address_space;
  	pgoff_t idx = swp_offset(entry);
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  	VM_BUG_ON_PAGE(PageSwapCache(page), page);
  	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
  	page_ref_add(page, nr);
  	SetPageSwapCache(page);
  	address_space = swap_address_space(entry);
  	spin_lock_irq(&address_space->tree_lock);
  	for (i = 0; i < nr; i++) {
  		set_page_private(page + i, entry.val + i);
  		error = radix_tree_insert(&address_space->page_tree,
  					  idx + i, page + i);
  		if (unlikely(error))
  			break;
  	}
  	if (likely(!error)) {
  		address_space->nrpages += nr;
  		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
  		ADD_CACHE_INFO(add_total, nr);
  	} else {
  		/*
  		 * Only the context which has set the SWAP_HAS_CACHE flag
  		 * would call add_to_swap_cache().
  		 * So add_to_swap_cache() doesn't return -EEXIST.
  		 */
  		VM_BUG_ON(error == -EEXIST);
  		set_page_private(page + i, 0UL);
  		while (i--) {
  			radix_tree_delete(&address_space->page_tree, idx + i);
  			set_page_private(page + i, 0UL);
  		}
  		ClearPageSwapCache(page);
  		page_ref_sub(page, nr);
  	}
  	spin_unlock_irq(&address_space->tree_lock);
  
  	return error;
  }
  
  
  int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
  {
  	int error;
  	error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
  	if (!error) {
  		error = __add_to_swap_cache(page, entry);
  		radix_tree_preload_end();
  	}
  	return error;
  }
  /*
   * This must be called only on pages that have
   * been verified to be in the swap cache.
   */
  void __delete_from_swap_cache(struct page *page)
  {
  	struct address_space *address_space;
  	int i, nr = hpage_nr_pages(page);
  	swp_entry_t entry;
  	pgoff_t idx;
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
  	VM_BUG_ON_PAGE(PageWriteback(page), page);
  	entry.val = page_private(page);
  	address_space = swap_address_space(entry);
  	idx = swp_offset(entry);
  	for (i = 0; i < nr; i++) {
  		radix_tree_delete(&address_space->page_tree, idx + i);
  		set_page_private(page + i, 0);
  	}
  	ClearPageSwapCache(page);
  	address_space->nrpages -= nr;
  	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
  	ADD_CACHE_INFO(del_total, nr);
  }
  
  /**
   * add_to_swap - allocate swap space for a page
   * @page: page we want to move to swap
   *
   * Allocate swap space for the page and add the page to the
   * swap cache.  Caller needs to hold the page lock. 
   */
  int add_to_swap(struct page *page)
  {
  	swp_entry_t entry;
  	int err;
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  	VM_BUG_ON_PAGE(!PageUptodate(page), page);
  	entry = get_swap_page(page);
  	if (!entry.val)
  		return 0;
  	if (mem_cgroup_try_charge_swap(page, entry))
  		goto fail;
  	/*
  	 * Radix-tree node allocations from PF_MEMALLOC contexts could
  	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
  	 * stops emergency reserves from being allocated.
  	 *
  	 * TODO: this could cause a theoretical memory reclaim
  	 * deadlock in the swap out path.
  	 */
  	/*
  	 * Add it to the swap cache.
  	 */
  	err = add_to_swap_cache(page, entry,
  			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
  	/* -ENOMEM radix-tree allocation failure */
  	if (err)
  		/*
  		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
  		 * clear SWAP_HAS_CACHE flag.
  		 */
  		goto fail;
  	/*
  	 * Normally the page will be dirtied in unmap because its pte should be
  	 * dirty. A special case is MADV_FREE page. The page's pte could have
  	 * the dirty bit cleared but the page's SwapBacked bit is still set
  	 * because clearing the dirty bit and the SwapBacked bit is not
  	 * protected by a lock. For such a page, unmap will not set the dirty
  	 * bit for it, so page reclaim will not write the page out. This can
  	 * cause data corruption when the page is swapped in later. Always
  	 * setting the dirty bit for the page solves the problem.
  	 */
  	set_page_dirty(page);
  
  	return 1;
  fail:
  	put_swap_page(page, entry);
  	return 0;
  }
  
  /*
   * This must be called only on pages that have
   * been verified to be in the swap cache and locked.
   * It will never put the page into the free list,
   * the caller has a reference on the page.
   */
  void delete_from_swap_cache(struct page *page)
  {
  	swp_entry_t entry;
  	struct address_space *address_space;
  	entry.val = page_private(page);
  	address_space = swap_address_space(entry);
  	spin_lock_irq(&address_space->tree_lock);
  	__delete_from_swap_cache(page);
  	spin_unlock_irq(&address_space->tree_lock);
  	put_swap_page(page, entry);
  	page_ref_sub(page, hpage_nr_pages(page));
  }
  /*
   * If we are the only user, then try to free up the swap cache.
   *
   * It's ok to check for PageSwapCache without the page lock
   * here because we are going to recheck again inside
   * try_to_free_swap() _with_ the lock.
   * 					- Marcelo
   */
  static inline void free_swap_cache(struct page *page)
  {
  	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
  		try_to_free_swap(page);
  		unlock_page(page);
  	}
  }
  
  /* 
   * Perform a free_page(), also freeing any swap cache associated with
   * this page if it is the last user of the page.
   */
  void free_page_and_swap_cache(struct page *page)
  {
  	free_swap_cache(page);
  	if (!is_huge_zero_page(page))
  		put_page(page);
  }
  
  /*
   * Passed an array of pages, drop them all from swapcache and then release
   * them.  They are removed from the LRU and freed if this is their last use.
   */
  void free_pages_and_swap_cache(struct page **pages, int nr)
  {
  	struct page **pagep = pages;
  	int i;
  
  	lru_add_drain();
  	for (i = 0; i < nr; i++)
  		free_swap_cache(pagep[i]);
  	release_pages(pagep, nr, false);
  }
  
  /*
   * Lookup a swap entry in the swap cache. A found page will be returned
   * unlocked and with its refcount incremented - we rely on the kernel
   * lock getting page table operations atomic even if we drop the page
   * lock before returning.
   */
  struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
  			       unsigned long addr)
  {
  	struct page *page;
  	unsigned long ra_info;
  	int win, hits, readahead;
  	page = find_get_page(swap_address_space(entry), swp_offset(entry));
  	INC_CACHE_INFO(find_total);
  	if (page) {
  		INC_CACHE_INFO(find_success);
  		if (unlikely(PageTransCompound(page)))
  			return page;
  		readahead = TestClearPageReadahead(page);
  		if (vma) {
  			ra_info = GET_SWAP_RA_VAL(vma);
  			win = SWAP_RA_WIN(ra_info);
  			hits = SWAP_RA_HITS(ra_info);
  			if (readahead)
  				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
  			atomic_long_set(&vma->swap_readahead_info,
  					SWAP_RA_VAL(addr, win, hits));
  		}
  		if (readahead) {
  			count_vm_event(SWAP_RA_HIT);
  			if (!vma)
  				atomic_inc(&swapin_readahead_hits);
  		}
  	}
  	return page;
  }
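
  /*
   * Annotation (not in the original file): on a swap cache hit the code
   * above feeds the result back into the readahead heuristic.  If the page
   * still had PG_readahead set, the hit count stored in
   * vma->swap_readahead_info is bumped (capped at SWAP_RA_HITS_MAX), or
   * the global swapin_readahead_hits is incremented when there is no vma,
   * so the next __swapin_nr_pages()/swapin_nr_pages() call picks a larger
   * window.
   */
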
  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
  			struct vm_area_struct *vma, unsigned long addr,
  			bool *new_page_allocated)
  {
  	struct page *found_page, *new_page = NULL;
  	struct address_space *swapper_space = swap_address_space(entry);
  	int err;
  	*new_page_allocated = false;
  
  	do {
  		/*
  		 * First check the swap cache.  Since this is normally
  		 * called after lookup_swap_cache() failed, re-calling
  		 * that would confuse statistics.
  		 */
  		found_page = find_get_page(swapper_space, swp_offset(entry));
  		if (found_page)
  			break;
  		/*
  		 * Just skip readahead for an unused swap slot.
  		 * During swap_off, when swap_slot_cache is disabled,
  		 * we have to handle the race between putting
  		 * a swap entry in the swap cache and marking the swap slot
  		 * as SWAP_HAS_CACHE.  That's done in a later part of the code,
  		 * or else swap_off will be aborted if we return NULL.
  		 */
  		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
  			break;
  		/*
  		 * Get a new page to read into from swap.
  		 */
  		if (!new_page) {
  			new_page = alloc_page_vma(gfp_mask, vma, addr);
  			if (!new_page)
  				break;		/* Out of memory */
  		}
  
  		/*
  		 * call radix_tree_preload() while we can wait.
  		 */
  		err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
  		if (err)
  			break;
  
  		/*
  		 * Swap entry may have been freed since our caller observed it.
  		 */
  		err = swapcache_prepare(entry);
  		if (err == -EEXIST) {
  			radix_tree_preload_end();
  			/*
  			 * We might race against get_swap_page() and stumble
  			 * across a SWAP_HAS_CACHE swap_map entry whose page
  			 * has not been brought into the swapcache yet.
  			 */
  			cond_resched();
  			continue;
  		}
  		if (err) {		/* swp entry is obsolete ? */
  			radix_tree_preload_end();
  			break;
  		}
  		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
  		__SetPageLocked(new_page);
  		__SetPageSwapBacked(new_page);
  		err = __add_to_swap_cache(new_page, entry);
  		if (likely(!err)) {
  			radix_tree_preload_end();
  			/*
  			 * Initiate read into locked page and return.
  			 */
  			lru_cache_add_anon(new_page);
  			*new_page_allocated = true;
  			return new_page;
  		}
  		radix_tree_preload_end();
  		__ClearPageLocked(new_page);
  		/*
  		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
  		 * clear SWAP_HAS_CACHE flag.
  		 */
  		put_swap_page(new_page, entry);
  	} while (err != -ENOMEM);
  
  	if (new_page)
  		put_page(new_page);
  	return found_page;
  }
  /*
   * Locate a page of swap in physical memory, reserving swap cache space
   * and reading the disk if it is not already cached.
   * A failure return means that either the page allocation failed or that
   * the swap entry is no longer in use.
   */
  struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
  		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
  {
  	bool page_was_allocated;
  	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
  			vma, addr, &page_was_allocated);
  
  	if (page_was_allocated)
  		swap_readpage(retpage, do_poll);
  
  	return retpage;
  }
  static unsigned int __swapin_nr_pages(unsigned long prev_offset,
  				      unsigned long offset,
  				      int hits,
  				      int max_pages,
  				      int prev_win)
  {
  	unsigned int pages, last_ra;
  
  	/*
  	 * This heuristic has been found to work well on both sequential and
  	 * random loads, swapping to hard disk or to SSD: please don't ask
  	 * what the "+ 2" means, it just happens to work well, that's all.
  	 */
  	pages = hits + 2;
  	if (pages == 2) {
  		/*
  		 * We can have no readahead hits to judge by: but must not get
  		 * stuck here forever, so check for an adjacent offset instead
  		 * (and don't even bother to check whether swap type is same).
  		 */
  		if (offset != prev_offset + 1 && offset != prev_offset - 1)
  			pages = 1;
  	} else {
  		unsigned int roundup = 4;
  		while (roundup < pages)
  			roundup <<= 1;
  		pages = roundup;
  	}
  
  	if (pages > max_pages)
  		pages = max_pages;
  
  	/* Don't shrink readahead too fast */
  	last_ra = prev_win / 2;
  	if (pages < last_ra)
  		pages = last_ra;
  
  	return pages;
  }
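
  /*
   * Worked example for the heuristic above (illustrative sketch, not part
   * of the original file): with hits == 5 the initial estimate is 7,
   * rounded up to the next power of two, 8, and then clamped to max_pages;
   * with hits == 0 and a previous window of 8, the "don't shrink readahead
   * too fast" rule keeps the result at 4.
   */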
  
  static unsigned long swapin_nr_pages(unsigned long offset)
  {
  	static unsigned long prev_offset;
  	unsigned int hits, pages, max_pages;
  	static atomic_t last_readahead_pages;
  
  	max_pages = 1 << READ_ONCE(page_cluster);
  	if (max_pages <= 1)
  		return 1;
  
  	hits = atomic_xchg(&swapin_readahead_hits, 0);
  	pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
  				  atomic_read(&last_readahead_pages));
  	if (!hits)
  		prev_offset = offset;
  	atomic_set(&last_readahead_pages, pages);
  
  	return pages;
  }
  /**
   * swapin_readahead - swap in pages in hope we need them soon
   * @entry: swap entry of this memory
   * @gfp_mask: memory allocation flags
   * @vma: user vma this address belongs to
   * @addr: target address for mempolicy
   *
   * Returns the struct page for entry and addr, after queueing swapin.
   *
   * Primitive swap readahead code. We simply read an aligned block of
   * (1 << page_cluster) entries in the swap area. This method is chosen
   * because it doesn't cost us any seek time.  We also make sure to queue
   * the 'original' request together with the readahead ones...
   *
   * This has been extended to use the NUMA policies from the mm triggering
   * the readahead.
   *
   * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
   */
  struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
  			struct vm_area_struct *vma, unsigned long addr)
  {
  	struct page *page;
  	unsigned long entry_offset = swp_offset(entry);
  	unsigned long offset = entry_offset;
  	unsigned long start_offset, end_offset;
  	unsigned long mask;
  	struct blk_plug plug;
  	bool do_poll = true, page_allocated;
  	mask = swapin_nr_pages(offset) - 1;
  	if (!mask)
  		goto skip;
  	do_poll = false;
  	/* Read a page_cluster sized and aligned cluster around offset. */
  	start_offset = offset & ~mask;
  	end_offset = offset | mask;
  	if (!start_offset)	/* First page is swap header. */
  		start_offset++;
  	blk_start_plug(&plug);
  	for (offset = start_offset; offset <= end_offset ; offset++) {
  		/* Ok, do the async read-ahead now */
  		page = __read_swap_cache_async(
  			swp_entry(swp_type(entry), offset),
  			gfp_mask, vma, addr, &page_allocated);
  		if (!page)
  			continue;
  		if (page_allocated) {
  			swap_readpage(page, false);
  			if (offset != entry_offset &&
  			    likely(!PageTransCompound(page))) {
  				SetPageReadahead(page);
  				count_vm_event(SWAP_RA);
  			}
  		}
  		put_page(page);
  	}
  	blk_finish_plug(&plug);
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  skip:
  	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
  }
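
  /*
   * Worked example of the cluster math above (illustrative sketch, not
   * part of the original file): if swapin_nr_pages() returns 8 for offset
   * 1234, mask is 7, so the loop reads the aligned cluster of offsets
   * 1232..1239, which also covers the faulting entry; only offset 0 (the
   * swap header) is ever skipped.
   */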
  
  int init_swap_address_space(unsigned int type, unsigned long nr_pages)
  {
  	struct address_space *spaces, *space;
  	unsigned int i, nr;
  
  	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
  	spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
  	if (!spaces)
  		return -ENOMEM;
  	for (i = 0; i < nr; i++) {
  		space = spaces + i;
  		INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
  		atomic_set(&space->i_mmap_writable, 0);
  		space->a_ops = &swap_aops;
  		/* swap cache doesn't use writeback related tags */
  		mapping_set_no_writeback_tags(space);
  		spin_lock_init(&space->tree_lock);
  	}
  	nr_swapper_spaces[type] = nr;
  	rcu_assign_pointer(swapper_spaces[type], spaces);
  
  	return 0;
  }
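
  /*
   * Annotation (not in the original file): SWAP_ADDRESS_SPACE_PAGES is
   * defined in swap.h (1 << 14 at the time of writing), so e.g. a swap
   * device of 2^20 pages (4GB with 4KB pages) gets 64 address spaces of
   * 16384 slots each.
   */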
  
  void exit_swap_address_space(unsigned int type)
  {
  	struct address_space *spaces;
  
  	spaces = swapper_spaces[type];
  	nr_swapper_spaces[type] = 0;
  	rcu_assign_pointer(swapper_spaces[type], NULL);
  	synchronize_rcu();
  	kvfree(spaces);
  }
  
  static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
  				     unsigned long faddr,
  				     unsigned long lpfn,
  				     unsigned long rpfn,
  				     unsigned long *start,
  				     unsigned long *end)
  {
  	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
  		      PFN_DOWN(faddr & PMD_MASK));
  	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
  		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
  }
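
  /*
   * Illustrative sketch of the clamping above (annotation, not part of the
   * original file): the readahead window is limited both to the VMA and to
   * the PMD-sized region around the fault address.  With 4KB pages and 2MB
   * PMDs, a fault at 0x201000 always keeps the window within PFNs
   * 0x200..0x3ff, whatever lpfn and rpfn request.
   */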
  
  struct page *swap_readahead_detect(struct vm_fault *vmf,
  				   struct vma_swap_readahead *swap_ra)
  {
  	struct vm_area_struct *vma = vmf->vma;
  	unsigned long swap_ra_info;
  	struct page *page;
  	swp_entry_t entry;
  	unsigned long faddr, pfn, fpfn;
  	unsigned long start, end;
  	pte_t *pte;
  	unsigned int max_win, hits, prev_win, win, left;
  #ifndef CONFIG_64BIT
  	pte_t *tpte;
  #endif
  	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
  			     SWAP_RA_ORDER_CEILING);
  	if (max_win == 1) {
  		swap_ra->win = 1;
  		return NULL;
  	}
  	faddr = vmf->address;
  	entry = pte_to_swp_entry(vmf->orig_pte);
  	if ((unlikely(non_swap_entry(entry))))
  		return NULL;
  	page = lookup_swap_cache(entry, vma, faddr);
  	if (page)
  		return page;
  	fpfn = PFN_DOWN(faddr);
  	swap_ra_info = GET_SWAP_RA_VAL(vma);
  	pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
  	prev_win = SWAP_RA_WIN(swap_ra_info);
  	hits = SWAP_RA_HITS(swap_ra_info);
  	swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
  					       max_win, prev_win);
  	atomic_long_set(&vma->swap_readahead_info,
  			SWAP_RA_VAL(faddr, win, 0));
  
  	if (win == 1)
  		return NULL;
  
  	/* Copy the PTEs because the page table may be unmapped */
  	if (fpfn == pfn + 1)
  		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
  	else if (pfn == fpfn + 1)
  		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
  				  &start, &end);
  	else {
  		left = (win - 1) / 2;
  		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
  				  &start, &end);
  	}
  	swap_ra->nr_pte = end - start;
  	swap_ra->offset = fpfn - start;
  	pte = vmf->pte - swap_ra->offset;
  #ifdef CONFIG_64BIT
  	swap_ra->ptes = pte;
  #else
  	tpte = swap_ra->ptes;
  	for (pfn = start; pfn != end; pfn++)
  		*tpte++ = *pte++;
  #endif
  
  	return NULL;
  }
  
  struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
  				    struct vm_fault *vmf,
  				    struct vma_swap_readahead *swap_ra)
  {
  	struct blk_plug plug;
  	struct vm_area_struct *vma = vmf->vma;
  	struct page *page;
  	pte_t *pte, pentry;
  	swp_entry_t entry;
  	unsigned int i;
  	bool page_allocated;
  
  	if (swap_ra->win == 1)
  		goto skip;
  
  	blk_start_plug(&plug);
  	for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
  	     i++, pte++) {
  		pentry = *pte;
  		if (pte_none(pentry))
  			continue;
  		if (pte_present(pentry))
  			continue;
  		entry = pte_to_swp_entry(pentry);
  		if (unlikely(non_swap_entry(entry)))
  			continue;
  		page = __read_swap_cache_async(entry, gfp_mask, vma,
  					       vmf->address, &page_allocated);
  		if (!page)
  			continue;
  		if (page_allocated) {
  			swap_readpage(page, false);
  			if (i != swap_ra->offset &&
  			    likely(!PageTransCompound(page))) {
  				SetPageReadahead(page);
  				count_vm_event(SWAP_RA);
  			}
  		}
  		put_page(page);
  	}
  	blk_finish_plug(&plug);
  	lru_add_drain();
  skip:
  	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
  				     swap_ra->win == 1);
  }
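
  /*
   * Annotation (not in the original file): do_swap_page_readahead()
   * consumes the PTE window that swap_readahead_detect() prepared in
   * struct vma_swap_readahead: every still-swapped-out PTE in that window
   * is read asynchronously, and the faulting entry itself is finally read
   * via read_swap_cache_async(), synchronously (do_poll) only when the
   * window is a single page.
   */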
  
  #ifdef CONFIG_SYSFS
  static ssize_t vma_ra_enabled_show(struct kobject *kobj,
  				     struct kobj_attribute *attr, char *buf)
  {
  	return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false");
  }
  static ssize_t vma_ra_enabled_store(struct kobject *kobj,
  				      struct kobj_attribute *attr,
  				      const char *buf, size_t count)
  {
  	if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
  		swap_vma_readahead = true;
  	else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
  		swap_vma_readahead = false;
  	else
  		return -EINVAL;
  
  	return count;
  }
  static struct kobj_attribute vma_ra_enabled_attr =
  	__ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
  	       vma_ra_enabled_store);
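
  /*
   * Annotation (not in the original file): with the "swap" kobject created
   * under mm_kobj below, this knob is expected to appear as
   * /sys/kernel/mm/swap/vma_ra_enabled, so VMA based readahead can be
   * toggled at run time, e.g.:
   *
   *	echo false > /sys/kernel/mm/swap/vma_ra_enabled
   */
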
  static struct attribute *swap_attrs[] = {
  	&vma_ra_enabled_attr.attr,
  	NULL,
  };
  
  static struct attribute_group swap_attr_group = {
  	.attrs = swap_attrs,
  };
  
  static int __init swap_init_sysfs(void)
  {
  	int err;
  	struct kobject *swap_kobj;
  
  	swap_kobj = kobject_create_and_add("swap", mm_kobj);
  	if (!swap_kobj) {
  		pr_err("failed to create swap kobject\n");
  		return -ENOMEM;
  	}
  	err = sysfs_create_group(swap_kobj, &swap_attr_group);
  	if (err) {
  		pr_err("failed to register swap group\n");
  		goto delete_obj;
  	}
  	return 0;
  
  delete_obj:
  	kobject_put(swap_kobj);
  	return err;
  }
  subsys_initcall(swap_init_sysfs);
  #endif