Commit 33806f06da654092182410d974b6d3c5396ea3eb

Authored by Shaohua Li
Committed by Linus Torvalds
1 parent 9800339b5e

swap: make each swap partition have one address_space

When I use several fast SSD to do swap, swapper_space.tree_lock is
heavily contended.  This makes each swap partition have one
address_space to reduce the lock contention.  There is an array of
address_space for swap.  The swap entry type is the index to the array.

In my test with 3 SSDs, this increases swapout throughput by 20%.

[akpm@linux-foundation.org: revert unneeded change to __add_to_swap_cache]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 8 changed files with 67 additions and 34 deletions Side-by-side Diff

... ... @@ -40,7 +40,7 @@
40 40 * sysctl_overcommit_ratio / 100) + total_swap_pages;
41 41  
42 42 cached = global_page_state(NR_FILE_PAGES) -
43   - total_swapcache_pages - i.bufferram;
  43 + total_swapcache_pages() - i.bufferram;
44 44 if (cached < 0)
45 45 cached = 0;
46 46  
... ... @@ -109,7 +109,7 @@
109 109 K(i.freeram),
110 110 K(i.bufferram),
111 111 K(cached),
112   - K(total_swapcache_pages),
  112 + K(total_swapcache_pages()),
113 113 K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
114 114 K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
115 115 K(pages[LRU_ACTIVE_ANON]),
include/linux/swap.h
... ... @@ -8,7 +8,7 @@
8 8 #include <linux/memcontrol.h>
9 9 #include <linux/sched.h>
10 10 #include <linux/node.h>
11   -
  11 +#include <linux/fs.h>
12 12 #include <linux/atomic.h>
13 13 #include <asm/page.h>
14 14  
... ... @@ -330,8 +330,9 @@
330 330 sector_t *);
331 331  
332 332 /* linux/mm/swap_state.c */
333   -extern struct address_space swapper_space;
334   -#define total_swapcache_pages swapper_space.nrpages
  333 +extern struct address_space swapper_spaces[];
  334 +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
  335 +extern unsigned long total_swapcache_pages(void);
335 336 extern void show_swap_cache_info(void);
336 337 extern int add_to_swap(struct page *);
337 338 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
... ... @@ -382,7 +383,7 @@
382 383  
383 384 #define nr_swap_pages 0L
384 385 #define total_swap_pages 0L
385   -#define total_swapcache_pages 0UL
  386 +#define total_swapcache_pages() 0UL
386 387  
387 388 #define si_swapinfo(val) \
388 389 do { (val)->freeswap = (val)->totalswap = 0; } while (0)
... ... @@ -6307,7 +6307,7 @@
6307 6307 * Because lookup_swap_cache() updates some statistics counter,
6308 6308 * we call find_get_page() with swapper_space directly.
6309 6309 */
6310   - page = find_get_page(&swapper_space, ent.val);
  6310 + page = find_get_page(swap_address_space(ent), ent.val);
6311 6311 if (do_swap_account)
6312 6312 entry->val = ent.val;
6313 6313  
... ... @@ -6348,7 +6348,7 @@
6348 6348 swp_entry_t swap = radix_to_swp_entry(page);
6349 6349 if (do_swap_account)
6350 6350 *entry = swap;
6351   - page = find_get_page(&swapper_space, swap.val);
  6351 + page = find_get_page(swap_address_space(swap), swap.val);
6352 6352 }
6353 6353 #endif
6354 6354 return page;
... ... @@ -75,7 +75,7 @@
75 75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 76 if (radix_tree_exceptional_entry(page)) {
77 77 swp_entry_t swap = radix_to_swp_entry(page);
78   - page = find_get_page(&swapper_space, swap.val);
  78 + page = find_get_page(swap_address_space(swap), swap.val);
79 79 }
80 80 #endif
81 81 if (page) {
... ... @@ -135,7 +135,8 @@
135 135 } else {
136 136 #ifdef CONFIG_SWAP
137 137 pgoff = entry.val;
138   - *vec = mincore_page(&swapper_space, pgoff);
  138 + *vec = mincore_page(swap_address_space(entry),
  139 + pgoff);
139 140 #else
140 141 WARN_ON(1);
141 142 *vec = 1;
... ... @@ -855,9 +855,14 @@
855 855 void __init swap_setup(void)
856 856 {
857 857 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
858   -
859 858 #ifdef CONFIG_SWAP
860   - bdi_init(swapper_space.backing_dev_info);
  859 + int i;
  860 +
  861 + bdi_init(swapper_spaces[0].backing_dev_info);
  862 + for (i = 0; i < MAX_SWAPFILES; i++) {
  863 + spin_lock_init(&swapper_spaces[i].tree_lock);
  864 + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
  865 + }
861 866 #endif
862 867  
863 868 /* Use a smaller cluster for small-memory machines */
... ... @@ -36,12 +36,12 @@
36 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37 37 };
38 38  
39   -struct address_space swapper_space = {
40   - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
41   - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
42   - .a_ops = &swap_aops,
43   - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
44   - .backing_dev_info = &swap_backing_dev_info,
  39 +struct address_space swapper_spaces[MAX_SWAPFILES] = {
  40 + [0 ... MAX_SWAPFILES - 1] = {
  41 + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
  42 + .a_ops = &swap_aops,
  43 + .backing_dev_info = &swap_backing_dev_info,
  44 + }
45 45 };
46 46  
47 47 #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
48 48  
... ... @@ -53,9 +53,19 @@
53 53 unsigned long find_total;
54 54 } swap_cache_info;
55 55  
  56 +unsigned long total_swapcache_pages(void)
  57 +{
  58 + int i;
  59 + unsigned long ret = 0;
  60 +
  61 + for (i = 0; i < MAX_SWAPFILES; i++)
  62 + ret += swapper_spaces[i].nrpages;
  63 + return ret;
  64 +}
  65 +
56 66 void show_swap_cache_info(void)
57 67 {
58   - printk("%lu pages in swap cache\n", total_swapcache_pages);
  68 + printk("%lu pages in swap cache\n", total_swapcache_pages());
59 69 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 70 swap_cache_info.add_total, swap_cache_info.del_total,
61 71 swap_cache_info.find_success, swap_cache_info.find_total);
... ... @@ -70,6 +80,7 @@
70 80 static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
71 81 {
72 82 int error;
  83 + struct address_space *address_space;
73 84  
74 85 VM_BUG_ON(!PageLocked(page));
75 86 VM_BUG_ON(PageSwapCache(page));
76 87  
77 88  
... ... @@ -79,14 +90,16 @@
79 90 SetPageSwapCache(page);
80 91 set_page_private(page, entry.val);
81 92  
82   - spin_lock_irq(&swapper_space.tree_lock);
83   - error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
  93 + address_space = swap_address_space(entry);
  94 + spin_lock_irq(&address_space->tree_lock);
  95 + error = radix_tree_insert(&address_space->page_tree,
  96 + entry.val, page);
84 97 if (likely(!error)) {
85   - total_swapcache_pages++;
  98 + address_space->nrpages++;
86 99 __inc_zone_page_state(page, NR_FILE_PAGES);
87 100 INC_CACHE_INFO(add_total);
88 101 }
89   - spin_unlock_irq(&swapper_space.tree_lock);
  102 + spin_unlock_irq(&address_space->tree_lock);
90 103  
91 104 if (unlikely(error)) {
92 105 /*
93 106  
94 107  
... ... @@ -122,14 +135,19 @@
122 135 */
123 136 void __delete_from_swap_cache(struct page *page)
124 137 {
  138 + swp_entry_t entry;
  139 + struct address_space *address_space;
  140 +
125 141 VM_BUG_ON(!PageLocked(page));
126 142 VM_BUG_ON(!PageSwapCache(page));
127 143 VM_BUG_ON(PageWriteback(page));
128 144  
129   - radix_tree_delete(&swapper_space.page_tree, page_private(page));
  145 + entry.val = page_private(page);
  146 + address_space = swap_address_space(entry);
  147 + radix_tree_delete(&address_space->page_tree, page_private(page));
130 148 set_page_private(page, 0);
131 149 ClearPageSwapCache(page);
132   - total_swapcache_pages--;
  150 + address_space->nrpages--;
133 151 __dec_zone_page_state(page, NR_FILE_PAGES);
134 152 INC_CACHE_INFO(del_total);
135 153 }
136 154  
137 155  
... ... @@ -195,12 +213,14 @@
195 213 void delete_from_swap_cache(struct page *page)
196 214 {
197 215 swp_entry_t entry;
  216 + struct address_space *address_space;
198 217  
199 218 entry.val = page_private(page);
200 219  
201   - spin_lock_irq(&swapper_space.tree_lock);
  220 + address_space = swap_address_space(entry);
  221 + spin_lock_irq(&address_space->tree_lock);
202 222 __delete_from_swap_cache(page);
203   - spin_unlock_irq(&swapper_space.tree_lock);
  223 + spin_unlock_irq(&address_space->tree_lock);
204 224  
205 225 swapcache_free(entry, page);
206 226 page_cache_release(page);
... ... @@ -263,7 +283,7 @@
263 283 {
264 284 struct page *page;
265 285  
266   - page = find_get_page(&swapper_space, entry.val);
  286 + page = find_get_page(swap_address_space(entry), entry.val);
267 287  
268 288 if (page)
269 289 INC_CACHE_INFO(find_success);
... ... @@ -290,7 +310,8 @@
290 310 * called after lookup_swap_cache() failed, re-calling
291 311 * that would confuse statistics.
292 312 */
293   - found_page = find_get_page(&swapper_space, entry.val);
  313 + found_page = find_get_page(swap_address_space(entry),
  314 + entry.val);
294 315 if (found_page)
295 316 break;
296 317  
... ... @@ -79,7 +79,7 @@
79 79 struct page *page;
80 80 int ret = 0;
81 81  
82   - page = find_get_page(&swapper_space, entry.val);
  82 + page = find_get_page(swap_address_space(entry), entry.val);
83 83 if (!page)
84 84 return 0;
85 85 /*
... ... @@ -699,7 +699,8 @@
699 699 p = swap_info_get(entry);
700 700 if (p) {
701 701 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
702   - page = find_get_page(&swapper_space, entry.val);
  702 + page = find_get_page(swap_address_space(entry),
  703 + entry.val);
703 704 if (page && !trylock_page(page)) {
704 705 page_cache_release(page);
705 706 page = NULL;
... ... @@ -6,6 +6,7 @@
6 6 #include <linux/sched.h>
7 7 #include <linux/security.h>
8 8 #include <linux/swap.h>
  9 +#include <linux/swapops.h>
9 10 #include <asm/uaccess.h>
10 11  
11 12 #include "internal.h"
... ... @@ -389,9 +390,12 @@
389 390  
390 391 VM_BUG_ON(PageSlab(page));
391 392 #ifdef CONFIG_SWAP
392   - if (unlikely(PageSwapCache(page)))
393   - mapping = &swapper_space;
394   - else
  393 + if (unlikely(PageSwapCache(page))) {
  394 + swp_entry_t entry;
  395 +
  396 + entry.val = page_private(page);
  397 + mapping = swap_address_space(entry);
  398 + } else
395 399 #endif
396 400 if ((unsigned long)mapping & PAGE_MAPPING_ANON)
397 401 mapping = NULL;