Commit 33806f06da654092182410d974b6d3c5396ea3eb
Committed by
Linus Torvalds
1 parent
9800339b5e
Exists in
master
and in
20 other branches
swap: make each swap partition have one address_space
When I use several fast SSDs for swap, swapper_space.tree_lock is heavily contended. This patch gives each swap partition its own address_space to reduce the lock contention. There is an array of address_space structures for swap, and the swap entry type is the index into that array. In my test with 3 SSDs, this increases swapout throughput by 20%. [akpm@linux-foundation.org: revert unneeded change to __add_to_swap_cache] Signed-off-by: Shaohua Li <shli@fusionio.com> Cc: Hugh Dickins <hughd@google.com> Acked-by: Rik van Riel <riel@redhat.com> Acked-by: Minchan Kim <minchan@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 8 changed files with 67 additions and 34 deletions Side-by-side Diff
fs/proc/meminfo.c
... | ... | @@ -40,7 +40,7 @@ |
40 | 40 | * sysctl_overcommit_ratio / 100) + total_swap_pages; |
41 | 41 | |
42 | 42 | cached = global_page_state(NR_FILE_PAGES) - |
43 | - total_swapcache_pages - i.bufferram; | |
43 | + total_swapcache_pages() - i.bufferram; | |
44 | 44 | if (cached < 0) |
45 | 45 | cached = 0; |
46 | 46 | |
... | ... | @@ -109,7 +109,7 @@ |
109 | 109 | K(i.freeram), |
110 | 110 | K(i.bufferram), |
111 | 111 | K(cached), |
112 | - K(total_swapcache_pages), | |
112 | + K(total_swapcache_pages()), | |
113 | 113 | K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), |
114 | 114 | K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), |
115 | 115 | K(pages[LRU_ACTIVE_ANON]), |
include/linux/swap.h
... | ... | @@ -8,7 +8,7 @@ |
8 | 8 | #include <linux/memcontrol.h> |
9 | 9 | #include <linux/sched.h> |
10 | 10 | #include <linux/node.h> |
11 | - | |
11 | +#include <linux/fs.h> | |
12 | 12 | #include <linux/atomic.h> |
13 | 13 | #include <asm/page.h> |
14 | 14 | |
... | ... | @@ -330,8 +330,9 @@ |
330 | 330 | sector_t *); |
331 | 331 | |
332 | 332 | /* linux/mm/swap_state.c */ |
333 | -extern struct address_space swapper_space; | |
334 | -#define total_swapcache_pages swapper_space.nrpages | |
333 | +extern struct address_space swapper_spaces[]; | |
334 | +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) | |
335 | +extern unsigned long total_swapcache_pages(void); | |
335 | 336 | extern void show_swap_cache_info(void); |
336 | 337 | extern int add_to_swap(struct page *); |
337 | 338 | extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); |
... | ... | @@ -382,7 +383,7 @@ |
382 | 383 | |
383 | 384 | #define nr_swap_pages 0L |
384 | 385 | #define total_swap_pages 0L |
385 | -#define total_swapcache_pages 0UL | |
386 | +#define total_swapcache_pages() 0UL | |
386 | 387 | |
387 | 388 | #define si_swapinfo(val) \ |
388 | 389 | do { (val)->freeswap = (val)->totalswap = 0; } while (0) |
mm/memcontrol.c
... | ... | @@ -6307,7 +6307,7 @@ |
6307 | 6307 | * Because lookup_swap_cache() updates some statistics counter, |
6308 | 6308 | * we call find_get_page() with swapper_space directly. |
6309 | 6309 | */ |
6310 | - page = find_get_page(&swapper_space, ent.val); | |
6310 | + page = find_get_page(swap_address_space(ent), ent.val); | |
6311 | 6311 | if (do_swap_account) |
6312 | 6312 | entry->val = ent.val; |
6313 | 6313 | |
... | ... | @@ -6348,7 +6348,7 @@ |
6348 | 6348 | swp_entry_t swap = radix_to_swp_entry(page); |
6349 | 6349 | if (do_swap_account) |
6350 | 6350 | *entry = swap; |
6351 | - page = find_get_page(&swapper_space, swap.val); | |
6351 | + page = find_get_page(swap_address_space(swap), swap.val); | |
6352 | 6352 | } |
6353 | 6353 | #endif |
6354 | 6354 | return page; |
mm/mincore.c
... | ... | @@ -75,7 +75,7 @@ |
75 | 75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ |
76 | 76 | if (radix_tree_exceptional_entry(page)) { |
77 | 77 | swp_entry_t swap = radix_to_swp_entry(page); |
78 | - page = find_get_page(&swapper_space, swap.val); | |
78 | + page = find_get_page(swap_address_space(swap), swap.val); | |
79 | 79 | } |
80 | 80 | #endif |
81 | 81 | if (page) { |
... | ... | @@ -135,7 +135,8 @@ |
135 | 135 | } else { |
136 | 136 | #ifdef CONFIG_SWAP |
137 | 137 | pgoff = entry.val; |
138 | - *vec = mincore_page(&swapper_space, pgoff); | |
138 | + *vec = mincore_page(swap_address_space(entry), | |
139 | + pgoff); | |
139 | 140 | #else |
140 | 141 | WARN_ON(1); |
141 | 142 | *vec = 1; |
mm/swap.c
... | ... | @@ -855,9 +855,14 @@ |
855 | 855 | void __init swap_setup(void) |
856 | 856 | { |
857 | 857 | unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); |
858 | - | |
859 | 858 | #ifdef CONFIG_SWAP |
860 | - bdi_init(swapper_space.backing_dev_info); | |
859 | + int i; | |
860 | + | |
861 | + bdi_init(swapper_spaces[0].backing_dev_info); | |
862 | + for (i = 0; i < MAX_SWAPFILES; i++) { | |
863 | + spin_lock_init(&swapper_spaces[i].tree_lock); | |
864 | + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); | |
865 | + } | |
861 | 866 | #endif |
862 | 867 | |
863 | 868 | /* Use a smaller cluster for small-memory machines */ |
mm/swap_state.c
... | ... | @@ -36,12 +36,12 @@ |
36 | 36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
37 | 37 | }; |
38 | 38 | |
39 | -struct address_space swapper_space = { | |
40 | - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | |
41 | - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), | |
42 | - .a_ops = &swap_aops, | |
43 | - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), | |
44 | - .backing_dev_info = &swap_backing_dev_info, | |
39 | +struct address_space swapper_spaces[MAX_SWAPFILES] = { | |
40 | + [0 ... MAX_SWAPFILES - 1] = { | |
41 | + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | |
42 | + .a_ops = &swap_aops, | |
43 | + .backing_dev_info = &swap_backing_dev_info, | |
44 | + } | |
45 | 45 | }; |
46 | 46 | |
47 | 47 | #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) |
48 | 48 | |
... | ... | @@ -53,9 +53,19 @@ |
53 | 53 | unsigned long find_total; |
54 | 54 | } swap_cache_info; |
55 | 55 | |
56 | +unsigned long total_swapcache_pages(void) | |
57 | +{ | |
58 | + int i; | |
59 | + unsigned long ret = 0; | |
60 | + | |
61 | + for (i = 0; i < MAX_SWAPFILES; i++) | |
62 | + ret += swapper_spaces[i].nrpages; | |
63 | + return ret; | |
64 | +} | |
65 | + | |
56 | 66 | void show_swap_cache_info(void) |
57 | 67 | { |
58 | - printk("%lu pages in swap cache\n", total_swapcache_pages); | |
68 | + printk("%lu pages in swap cache\n", total_swapcache_pages()); | |
59 | 69 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", |
60 | 70 | swap_cache_info.add_total, swap_cache_info.del_total, |
61 | 71 | swap_cache_info.find_success, swap_cache_info.find_total); |
... | ... | @@ -70,6 +80,7 @@ |
70 | 80 | static int __add_to_swap_cache(struct page *page, swp_entry_t entry) |
71 | 81 | { |
72 | 82 | int error; |
83 | + struct address_space *address_space; | |
73 | 84 | |
74 | 85 | VM_BUG_ON(!PageLocked(page)); |
75 | 86 | VM_BUG_ON(PageSwapCache(page)); |
76 | 87 | |
77 | 88 | |
... | ... | @@ -79,14 +90,16 @@ |
79 | 90 | SetPageSwapCache(page); |
80 | 91 | set_page_private(page, entry.val); |
81 | 92 | |
82 | - spin_lock_irq(&swapper_space.tree_lock); | |
83 | - error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); | |
93 | + address_space = swap_address_space(entry); | |
94 | + spin_lock_irq(&address_space->tree_lock); | |
95 | + error = radix_tree_insert(&address_space->page_tree, | |
96 | + entry.val, page); | |
84 | 97 | if (likely(!error)) { |
85 | - total_swapcache_pages++; | |
98 | + address_space->nrpages++; | |
86 | 99 | __inc_zone_page_state(page, NR_FILE_PAGES); |
87 | 100 | INC_CACHE_INFO(add_total); |
88 | 101 | } |
89 | - spin_unlock_irq(&swapper_space.tree_lock); | |
102 | + spin_unlock_irq(&address_space->tree_lock); | |
90 | 103 | |
91 | 104 | if (unlikely(error)) { |
92 | 105 | /* |
93 | 106 | |
94 | 107 | |
... | ... | @@ -122,14 +135,19 @@ |
122 | 135 | */ |
123 | 136 | void __delete_from_swap_cache(struct page *page) |
124 | 137 | { |
138 | + swp_entry_t entry; | |
139 | + struct address_space *address_space; | |
140 | + | |
125 | 141 | VM_BUG_ON(!PageLocked(page)); |
126 | 142 | VM_BUG_ON(!PageSwapCache(page)); |
127 | 143 | VM_BUG_ON(PageWriteback(page)); |
128 | 144 | |
129 | - radix_tree_delete(&swapper_space.page_tree, page_private(page)); | |
145 | + entry.val = page_private(page); | |
146 | + address_space = swap_address_space(entry); | |
147 | + radix_tree_delete(&address_space->page_tree, page_private(page)); | |
130 | 148 | set_page_private(page, 0); |
131 | 149 | ClearPageSwapCache(page); |
132 | - total_swapcache_pages--; | |
150 | + address_space->nrpages--; | |
133 | 151 | __dec_zone_page_state(page, NR_FILE_PAGES); |
134 | 152 | INC_CACHE_INFO(del_total); |
135 | 153 | } |
136 | 154 | |
137 | 155 | |
... | ... | @@ -195,12 +213,14 @@ |
195 | 213 | void delete_from_swap_cache(struct page *page) |
196 | 214 | { |
197 | 215 | swp_entry_t entry; |
216 | + struct address_space *address_space; | |
198 | 217 | |
199 | 218 | entry.val = page_private(page); |
200 | 219 | |
201 | - spin_lock_irq(&swapper_space.tree_lock); | |
220 | + address_space = swap_address_space(entry); | |
221 | + spin_lock_irq(&address_space->tree_lock); | |
202 | 222 | __delete_from_swap_cache(page); |
203 | - spin_unlock_irq(&swapper_space.tree_lock); | |
223 | + spin_unlock_irq(&address_space->tree_lock); | |
204 | 224 | |
205 | 225 | swapcache_free(entry, page); |
206 | 226 | page_cache_release(page); |
... | ... | @@ -263,7 +283,7 @@ |
263 | 283 | { |
264 | 284 | struct page *page; |
265 | 285 | |
266 | - page = find_get_page(&swapper_space, entry.val); | |
286 | + page = find_get_page(swap_address_space(entry), entry.val); | |
267 | 287 | |
268 | 288 | if (page) |
269 | 289 | INC_CACHE_INFO(find_success); |
... | ... | @@ -290,7 +310,8 @@ |
290 | 310 | * called after lookup_swap_cache() failed, re-calling |
291 | 311 | * that would confuse statistics. |
292 | 312 | */ |
293 | - found_page = find_get_page(&swapper_space, entry.val); | |
313 | + found_page = find_get_page(swap_address_space(entry), | |
314 | + entry.val); | |
294 | 315 | if (found_page) |
295 | 316 | break; |
296 | 317 |
mm/swapfile.c
... | ... | @@ -79,7 +79,7 @@ |
79 | 79 | struct page *page; |
80 | 80 | int ret = 0; |
81 | 81 | |
82 | - page = find_get_page(&swapper_space, entry.val); | |
82 | + page = find_get_page(swap_address_space(entry), entry.val); | |
83 | 83 | if (!page) |
84 | 84 | return 0; |
85 | 85 | /* |
... | ... | @@ -699,7 +699,8 @@ |
699 | 699 | p = swap_info_get(entry); |
700 | 700 | if (p) { |
701 | 701 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { |
702 | - page = find_get_page(&swapper_space, entry.val); | |
702 | + page = find_get_page(swap_address_space(entry), | |
703 | + entry.val); | |
703 | 704 | if (page && !trylock_page(page)) { |
704 | 705 | page_cache_release(page); |
705 | 706 | page = NULL; |
mm/util.c
... | ... | @@ -6,6 +6,7 @@ |
6 | 6 | #include <linux/sched.h> |
7 | 7 | #include <linux/security.h> |
8 | 8 | #include <linux/swap.h> |
9 | +#include <linux/swapops.h> | |
9 | 10 | #include <asm/uaccess.h> |
10 | 11 | |
11 | 12 | #include "internal.h" |
... | ... | @@ -389,9 +390,12 @@ |
389 | 390 | |
390 | 391 | VM_BUG_ON(PageSlab(page)); |
391 | 392 | #ifdef CONFIG_SWAP |
392 | - if (unlikely(PageSwapCache(page))) | |
393 | - mapping = &swapper_space; | |
394 | - else | |
393 | + if (unlikely(PageSwapCache(page))) { | |
394 | + swp_entry_t entry; | |
395 | + | |
396 | + entry.val = page_private(page); | |
397 | + mapping = swap_address_space(entry); | |
398 | + } else | |
395 | 399 | #endif |
396 | 400 | if ((unsigned long)mapping & PAGE_MAPPING_ANON) |
397 | 401 | mapping = NULL; |