Commit 51726b1222863852c46ca21ed0115b85d1edfd89

Authored by Hugh Dickins
Committed by Linus Torvalds
Parent: 6d91add09f

mm: replace some BUG_ONs by VM_BUG_ONs

The swap code is over-provisioned with BUG_ONs on assorted page flags,
mostly dating back to 2.3.  They're good documentation, and guard against
developer error, but a waste of space on most systems: change them to
VM_BUG_ONs, conditional on CONFIG_DEBUG_VM.  Just delete the PagePrivate
ones: they're later, from 2.5.69, but even less interesting now.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 14 additions and 17 deletions
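The space saving the log message refers to comes from VM_BUG_ON compiling to nothing unless CONFIG_DEBUG_VM is selected. As a rough sketch of the definition this patch relies on (the actual macro lives in include/linux/mmdebug.h; its exact wording may differ in this tree):

    #ifdef CONFIG_DEBUG_VM
    #define VM_BUG_ON(cond)    BUG_ON(cond)        /* debug build: keep the full check and trap */
    #else
    #define VM_BUG_ON(cond)    do { } while (0)    /* production build: the test and branch vanish */
    #endif

So a kernel built without CONFIG_DEBUG_VM=y pays neither the page-flag test nor the trap instruction for each converted assertion, while developers who enable the option keep the original BUG_ON behaviour.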

1 /* 1 /*
2 * linux/mm/page_io.c 2 * linux/mm/page_io.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * 5 *
6 * Swap reorganised 29.12.95, 6 * Swap reorganised 29.12.95,
7 * Asynchronous swapping added 30.12.95. Stephen Tweedie 7 * Asynchronous swapping added 30.12.95. Stephen Tweedie
8 * Removed race in async swapping. 14.4.1996. Bruno Haible 8 * Removed race in async swapping. 14.4.1996. Bruno Haible
9 * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie 9 * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
10 * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman 10 * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
11 */ 11 */
12 12
13 #include <linux/mm.h> 13 #include <linux/mm.h>
14 #include <linux/kernel_stat.h> 14 #include <linux/kernel_stat.h>
15 #include <linux/pagemap.h> 15 #include <linux/pagemap.h>
16 #include <linux/swap.h> 16 #include <linux/swap.h>
17 #include <linux/bio.h> 17 #include <linux/bio.h>
18 #include <linux/swapops.h> 18 #include <linux/swapops.h>
19 #include <linux/writeback.h> 19 #include <linux/writeback.h>
20 #include <asm/pgtable.h> 20 #include <asm/pgtable.h>
21 21
22 static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, 22 static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
23 struct page *page, bio_end_io_t end_io) 23 struct page *page, bio_end_io_t end_io)
24 { 24 {
25 struct bio *bio; 25 struct bio *bio;
26 26
27 bio = bio_alloc(gfp_flags, 1); 27 bio = bio_alloc(gfp_flags, 1);
28 if (bio) { 28 if (bio) {
29 struct swap_info_struct *sis; 29 struct swap_info_struct *sis;
30 swp_entry_t entry = { .val = index, }; 30 swp_entry_t entry = { .val = index, };
31 31
32 sis = get_swap_info_struct(swp_type(entry)); 32 sis = get_swap_info_struct(swp_type(entry));
33 bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * 33 bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
34 (PAGE_SIZE >> 9); 34 (PAGE_SIZE >> 9);
35 bio->bi_bdev = sis->bdev; 35 bio->bi_bdev = sis->bdev;
36 bio->bi_io_vec[0].bv_page = page; 36 bio->bi_io_vec[0].bv_page = page;
37 bio->bi_io_vec[0].bv_len = PAGE_SIZE; 37 bio->bi_io_vec[0].bv_len = PAGE_SIZE;
38 bio->bi_io_vec[0].bv_offset = 0; 38 bio->bi_io_vec[0].bv_offset = 0;
39 bio->bi_vcnt = 1; 39 bio->bi_vcnt = 1;
40 bio->bi_idx = 0; 40 bio->bi_idx = 0;
41 bio->bi_size = PAGE_SIZE; 41 bio->bi_size = PAGE_SIZE;
42 bio->bi_end_io = end_io; 42 bio->bi_end_io = end_io;
43 } 43 }
44 return bio; 44 return bio;
45 } 45 }
46 46
47 static void end_swap_bio_write(struct bio *bio, int err) 47 static void end_swap_bio_write(struct bio *bio, int err)
48 { 48 {
49 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 49 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
50 struct page *page = bio->bi_io_vec[0].bv_page; 50 struct page *page = bio->bi_io_vec[0].bv_page;
51 51
52 if (!uptodate) { 52 if (!uptodate) {
53 SetPageError(page); 53 SetPageError(page);
54 /* 54 /*
55 * We failed to write the page out to swap-space. 55 * We failed to write the page out to swap-space.
56 * Re-dirty the page in order to avoid it being reclaimed. 56 * Re-dirty the page in order to avoid it being reclaimed.
57 * Also print a dire warning that things will go BAD (tm) 57 * Also print a dire warning that things will go BAD (tm)
58 * very quickly. 58 * very quickly.
59 * 59 *
60 * Also clear PG_reclaim to avoid rotate_reclaimable_page() 60 * Also clear PG_reclaim to avoid rotate_reclaimable_page()
61 */ 61 */
62 set_page_dirty(page); 62 set_page_dirty(page);
63 printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", 63 printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
64 imajor(bio->bi_bdev->bd_inode), 64 imajor(bio->bi_bdev->bd_inode),
65 iminor(bio->bi_bdev->bd_inode), 65 iminor(bio->bi_bdev->bd_inode),
66 (unsigned long long)bio->bi_sector); 66 (unsigned long long)bio->bi_sector);
67 ClearPageReclaim(page); 67 ClearPageReclaim(page);
68 } 68 }
69 end_page_writeback(page); 69 end_page_writeback(page);
70 bio_put(bio); 70 bio_put(bio);
71 } 71 }
72 72
73 void end_swap_bio_read(struct bio *bio, int err) 73 void end_swap_bio_read(struct bio *bio, int err)
74 { 74 {
75 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 75 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
76 struct page *page = bio->bi_io_vec[0].bv_page; 76 struct page *page = bio->bi_io_vec[0].bv_page;
77 77
78 if (!uptodate) { 78 if (!uptodate) {
79 SetPageError(page); 79 SetPageError(page);
80 ClearPageUptodate(page); 80 ClearPageUptodate(page);
81 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", 81 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
82 imajor(bio->bi_bdev->bd_inode), 82 imajor(bio->bi_bdev->bd_inode),
83 iminor(bio->bi_bdev->bd_inode), 83 iminor(bio->bi_bdev->bd_inode),
84 (unsigned long long)bio->bi_sector); 84 (unsigned long long)bio->bi_sector);
85 } else { 85 } else {
86 SetPageUptodate(page); 86 SetPageUptodate(page);
87 } 87 }
88 unlock_page(page); 88 unlock_page(page);
89 bio_put(bio); 89 bio_put(bio);
90 } 90 }
91 91
92 /* 92 /*
93 * We may have stale swap cache pages in memory: notice 93 * We may have stale swap cache pages in memory: notice
94 * them here and get rid of the unnecessary final write. 94 * them here and get rid of the unnecessary final write.
95 */ 95 */
96 int swap_writepage(struct page *page, struct writeback_control *wbc) 96 int swap_writepage(struct page *page, struct writeback_control *wbc)
97 { 97 {
98 struct bio *bio; 98 struct bio *bio;
99 int ret = 0, rw = WRITE; 99 int ret = 0, rw = WRITE;
100 100
101 if (remove_exclusive_swap_page(page)) { 101 if (remove_exclusive_swap_page(page)) {
102 unlock_page(page); 102 unlock_page(page);
103 goto out; 103 goto out;
104 } 104 }
105 bio = get_swap_bio(GFP_NOIO, page_private(page), page, 105 bio = get_swap_bio(GFP_NOIO, page_private(page), page,
106 end_swap_bio_write); 106 end_swap_bio_write);
107 if (bio == NULL) { 107 if (bio == NULL) {
108 set_page_dirty(page); 108 set_page_dirty(page);
109 unlock_page(page); 109 unlock_page(page);
110 ret = -ENOMEM; 110 ret = -ENOMEM;
111 goto out; 111 goto out;
112 } 112 }
113 if (wbc->sync_mode == WB_SYNC_ALL) 113 if (wbc->sync_mode == WB_SYNC_ALL)
114 rw |= (1 << BIO_RW_SYNC); 114 rw |= (1 << BIO_RW_SYNC);
115 count_vm_event(PSWPOUT); 115 count_vm_event(PSWPOUT);
116 set_page_writeback(page); 116 set_page_writeback(page);
117 unlock_page(page); 117 unlock_page(page);
118 submit_bio(rw, bio); 118 submit_bio(rw, bio);
119 out: 119 out:
120 return ret; 120 return ret;
121 } 121 }
122 122
123 int swap_readpage(struct file *file, struct page *page) 123 int swap_readpage(struct file *file, struct page *page)
124 { 124 {
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
127 127
128 BUG_ON(!PageLocked(page)); 128 VM_BUG_ON(!PageLocked(page));
129 BUG_ON(PageUptodate(page)); 129 VM_BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
131 end_swap_bio_read); 131 end_swap_bio_read);
132 if (bio == NULL) { 132 if (bio == NULL) {
133 unlock_page(page); 133 unlock_page(page);
134 ret = -ENOMEM; 134 ret = -ENOMEM;
135 goto out; 135 goto out;
136 } 136 }
137 count_vm_event(PSWPIN); 137 count_vm_event(PSWPIN);
138 submit_bio(READ, bio); 138 submit_bio(READ, bio);
139 out: 139 out:
140 return ret; 140 return ret;
141 } 141 }
142 142
1 /* 1 /*
2 * linux/mm/swap_state.c 2 * linux/mm/swap_state.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie 5 * Swap reorganised 29.12.95, Stephen Tweedie
6 * 6 *
7 * Rewritten to use page cache, (C) 1998 Stephen Tweedie 7 * Rewritten to use page cache, (C) 1998 Stephen Tweedie
8 */ 8 */
9 #include <linux/module.h> 9 #include <linux/module.h>
10 #include <linux/mm.h> 10 #include <linux/mm.h>
11 #include <linux/kernel_stat.h> 11 #include <linux/kernel_stat.h>
12 #include <linux/swap.h> 12 #include <linux/swap.h>
13 #include <linux/swapops.h> 13 #include <linux/swapops.h>
14 #include <linux/init.h> 14 #include <linux/init.h>
15 #include <linux/pagemap.h> 15 #include <linux/pagemap.h>
16 #include <linux/buffer_head.h> 16 #include <linux/buffer_head.h>
17 #include <linux/backing-dev.h> 17 #include <linux/backing-dev.h>
18 #include <linux/pagevec.h> 18 #include <linux/pagevec.h>
19 #include <linux/migrate.h> 19 #include <linux/migrate.h>
20 20
21 #include <asm/pgtable.h> 21 #include <asm/pgtable.h>
22 22
23 /* 23 /*
24 * swapper_space is a fiction, retained to simplify the path through 24 * swapper_space is a fiction, retained to simplify the path through
25 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow 25 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
26 * future use of radix_tree tags in the swap cache. 26 * future use of radix_tree tags in the swap cache.
27 */ 27 */
28 static const struct address_space_operations swap_aops = { 28 static const struct address_space_operations swap_aops = {
29 .writepage = swap_writepage, 29 .writepage = swap_writepage,
30 .sync_page = block_sync_page, 30 .sync_page = block_sync_page,
31 .set_page_dirty = __set_page_dirty_nobuffers, 31 .set_page_dirty = __set_page_dirty_nobuffers,
32 .migratepage = migrate_page, 32 .migratepage = migrate_page,
33 }; 33 };
34 34
35 static struct backing_dev_info swap_backing_dev_info = { 35 static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37 .unplug_io_fn = swap_unplug_io_fn, 37 .unplug_io_fn = swap_unplug_io_fn,
38 }; 38 };
39 39
40 struct address_space swapper_space = { 40 struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), 42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
43 .a_ops = &swap_aops, 43 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_info, 45 .backing_dev_info = &swap_backing_dev_info,
46 }; 46 };
47 47
48 #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) 48 #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
49 49
50 static struct { 50 static struct {
51 unsigned long add_total; 51 unsigned long add_total;
52 unsigned long del_total; 52 unsigned long del_total;
53 unsigned long find_success; 53 unsigned long find_success;
54 unsigned long find_total; 54 unsigned long find_total;
55 } swap_cache_info; 55 } swap_cache_info;
56 56
57 void show_swap_cache_info(void) 57 void show_swap_cache_info(void)
58 { 58 {
59 printk("%lu pages in swap cache\n", total_swapcache_pages); 59 printk("%lu pages in swap cache\n", total_swapcache_pages);
60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", 60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
61 swap_cache_info.add_total, swap_cache_info.del_total, 61 swap_cache_info.add_total, swap_cache_info.del_total,
62 swap_cache_info.find_success, swap_cache_info.find_total); 62 swap_cache_info.find_success, swap_cache_info.find_total);
63 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
64 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 64 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
65 } 65 }
66 66
67 /* 67 /*
68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, 68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
69 * but sets SwapCache flag and private instead of mapping and index. 69 * but sets SwapCache flag and private instead of mapping and index.
70 */ 70 */
71 int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 71 int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
72 { 72 {
73 int error; 73 int error;
74 74
75 BUG_ON(!PageLocked(page)); 75 VM_BUG_ON(!PageLocked(page));
76 BUG_ON(PageSwapCache(page)); 76 VM_BUG_ON(PageSwapCache(page));
77 BUG_ON(PagePrivate(page)); 77 VM_BUG_ON(!PageSwapBacked(page));
78 BUG_ON(!PageSwapBacked(page)); 78
79 error = radix_tree_preload(gfp_mask); 79 error = radix_tree_preload(gfp_mask);
80 if (!error) { 80 if (!error) {
81 page_cache_get(page); 81 page_cache_get(page);
82 SetPageSwapCache(page); 82 SetPageSwapCache(page);
83 set_page_private(page, entry.val); 83 set_page_private(page, entry.val);
84 84
85 spin_lock_irq(&swapper_space.tree_lock); 85 spin_lock_irq(&swapper_space.tree_lock);
86 error = radix_tree_insert(&swapper_space.page_tree, 86 error = radix_tree_insert(&swapper_space.page_tree,
87 entry.val, page); 87 entry.val, page);
88 if (likely(!error)) { 88 if (likely(!error)) {
89 total_swapcache_pages++; 89 total_swapcache_pages++;
90 __inc_zone_page_state(page, NR_FILE_PAGES); 90 __inc_zone_page_state(page, NR_FILE_PAGES);
91 INC_CACHE_INFO(add_total); 91 INC_CACHE_INFO(add_total);
92 } 92 }
93 spin_unlock_irq(&swapper_space.tree_lock); 93 spin_unlock_irq(&swapper_space.tree_lock);
94 radix_tree_preload_end(); 94 radix_tree_preload_end();
95 95
96 if (unlikely(error)) { 96 if (unlikely(error)) {
97 set_page_private(page, 0UL); 97 set_page_private(page, 0UL);
98 ClearPageSwapCache(page); 98 ClearPageSwapCache(page);
99 page_cache_release(page); 99 page_cache_release(page);
100 } 100 }
101 } 101 }
102 return error; 102 return error;
103 } 103 }
104 104
105 /* 105 /*
106 * This must be called only on pages that have 106 * This must be called only on pages that have
107 * been verified to be in the swap cache. 107 * been verified to be in the swap cache.
108 */ 108 */
109 void __delete_from_swap_cache(struct page *page) 109 void __delete_from_swap_cache(struct page *page)
110 { 110 {
111 BUG_ON(!PageLocked(page)); 111 VM_BUG_ON(!PageLocked(page));
112 BUG_ON(!PageSwapCache(page)); 112 VM_BUG_ON(!PageSwapCache(page));
113 BUG_ON(PageWriteback(page)); 113 VM_BUG_ON(PageWriteback(page));
114 BUG_ON(PagePrivate(page));
115 114
116 radix_tree_delete(&swapper_space.page_tree, page_private(page)); 115 radix_tree_delete(&swapper_space.page_tree, page_private(page));
117 set_page_private(page, 0); 116 set_page_private(page, 0);
118 ClearPageSwapCache(page); 117 ClearPageSwapCache(page);
119 total_swapcache_pages--; 118 total_swapcache_pages--;
120 __dec_zone_page_state(page, NR_FILE_PAGES); 119 __dec_zone_page_state(page, NR_FILE_PAGES);
121 INC_CACHE_INFO(del_total); 120 INC_CACHE_INFO(del_total);
122 } 121 }
123 122
124 /** 123 /**
125 * add_to_swap - allocate swap space for a page 124 * add_to_swap - allocate swap space for a page
126 * @page: page we want to move to swap 125 * @page: page we want to move to swap
127 * @gfp_mask: memory allocation flags 126 * @gfp_mask: memory allocation flags
128 * 127 *
129 * Allocate swap space for the page and add the page to the 128 * Allocate swap space for the page and add the page to the
130 * swap cache. Caller needs to hold the page lock. 129 * swap cache. Caller needs to hold the page lock.
131 */ 130 */
132 int add_to_swap(struct page * page, gfp_t gfp_mask) 131 int add_to_swap(struct page * page, gfp_t gfp_mask)
133 { 132 {
134 swp_entry_t entry; 133 swp_entry_t entry;
135 int err; 134 int err;
136 135
137 BUG_ON(!PageLocked(page)); 136 VM_BUG_ON(!PageLocked(page));
138 BUG_ON(!PageUptodate(page)); 137 VM_BUG_ON(!PageUptodate(page));
139 138
140 for (;;) { 139 for (;;) {
141 entry = get_swap_page(); 140 entry = get_swap_page();
142 if (!entry.val) 141 if (!entry.val)
143 return 0; 142 return 0;
144 143
145 /* 144 /*
146 * Radix-tree node allocations from PF_MEMALLOC contexts could 145 * Radix-tree node allocations from PF_MEMALLOC contexts could
147 * completely exhaust the page allocator. __GFP_NOMEMALLOC 146 * completely exhaust the page allocator. __GFP_NOMEMALLOC
148 * stops emergency reserves from being allocated. 147 * stops emergency reserves from being allocated.
149 * 148 *
150 * TODO: this could cause a theoretical memory reclaim 149 * TODO: this could cause a theoretical memory reclaim
151 * deadlock in the swap out path. 150 * deadlock in the swap out path.
152 */ 151 */
153 /* 152 /*
154 * Add it to the swap cache and mark it dirty 153 * Add it to the swap cache and mark it dirty
155 */ 154 */
156 err = add_to_swap_cache(page, entry, 155 err = add_to_swap_cache(page, entry,
157 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); 156 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
158 157
159 switch (err) { 158 switch (err) {
160 case 0: /* Success */ 159 case 0: /* Success */
161 SetPageDirty(page); 160 SetPageDirty(page);
162 return 1; 161 return 1;
163 case -EEXIST: 162 case -EEXIST:
164 /* Raced with "speculative" read_swap_cache_async */ 163 /* Raced with "speculative" read_swap_cache_async */
165 swap_free(entry); 164 swap_free(entry);
166 continue; 165 continue;
167 default: 166 default:
168 /* -ENOMEM radix-tree allocation failure */ 167 /* -ENOMEM radix-tree allocation failure */
169 swap_free(entry); 168 swap_free(entry);
170 return 0; 169 return 0;
171 } 170 }
172 } 171 }
173 } 172 }
174 173
175 /* 174 /*
176 * This must be called only on pages that have 175 * This must be called only on pages that have
177 * been verified to be in the swap cache and locked. 176 * been verified to be in the swap cache and locked.
178 * It will never put the page into the free list, 177 * It will never put the page into the free list,
179 * the caller has a reference on the page. 178 * the caller has a reference on the page.
180 */ 179 */
181 void delete_from_swap_cache(struct page *page) 180 void delete_from_swap_cache(struct page *page)
182 { 181 {
183 swp_entry_t entry; 182 swp_entry_t entry;
184 183
185 entry.val = page_private(page); 184 entry.val = page_private(page);
186 185
187 spin_lock_irq(&swapper_space.tree_lock); 186 spin_lock_irq(&swapper_space.tree_lock);
188 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
189 spin_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
190 189
191 swap_free(entry); 190 swap_free(entry);
192 page_cache_release(page); 191 page_cache_release(page);
193 } 192 }
194 193
195 /* 194 /*
196 * If we are the only user, then try to free up the swap cache. 195 * If we are the only user, then try to free up the swap cache.
197 * 196 *
198 * Its ok to check for PageSwapCache without the page lock 197 * Its ok to check for PageSwapCache without the page lock
199 * here because we are going to recheck again inside 198 * here because we are going to recheck again inside
200 * exclusive_swap_page() _with_ the lock. 199 * exclusive_swap_page() _with_ the lock.
201 * - Marcelo 200 * - Marcelo
202 */ 201 */
203 static inline void free_swap_cache(struct page *page) 202 static inline void free_swap_cache(struct page *page)
204 { 203 {
205 if (PageSwapCache(page) && trylock_page(page)) { 204 if (PageSwapCache(page) && trylock_page(page)) {
206 remove_exclusive_swap_page(page); 205 remove_exclusive_swap_page(page);
207 unlock_page(page); 206 unlock_page(page);
208 } 207 }
209 } 208 }
210 209
211 /* 210 /*
212 * Perform a free_page(), also freeing any swap cache associated with 211 * Perform a free_page(), also freeing any swap cache associated with
213 * this page if it is the last user of the page. 212 * this page if it is the last user of the page.
214 */ 213 */
215 void free_page_and_swap_cache(struct page *page) 214 void free_page_and_swap_cache(struct page *page)
216 { 215 {
217 free_swap_cache(page); 216 free_swap_cache(page);
218 page_cache_release(page); 217 page_cache_release(page);
219 } 218 }
220 219
221 /* 220 /*
222 * Passed an array of pages, drop them all from swapcache and then release 221 * Passed an array of pages, drop them all from swapcache and then release
223 * them. They are removed from the LRU and freed if this is their last use. 222 * them. They are removed from the LRU and freed if this is their last use.
224 */ 223 */
225 void free_pages_and_swap_cache(struct page **pages, int nr) 224 void free_pages_and_swap_cache(struct page **pages, int nr)
226 { 225 {
227 struct page **pagep = pages; 226 struct page **pagep = pages;
228 227
229 lru_add_drain(); 228 lru_add_drain();
230 while (nr) { 229 while (nr) {
231 int todo = min(nr, PAGEVEC_SIZE); 230 int todo = min(nr, PAGEVEC_SIZE);
232 int i; 231 int i;
233 232
234 for (i = 0; i < todo; i++) 233 for (i = 0; i < todo; i++)
235 free_swap_cache(pagep[i]); 234 free_swap_cache(pagep[i]);
236 release_pages(pagep, todo, 0); 235 release_pages(pagep, todo, 0);
237 pagep += todo; 236 pagep += todo;
238 nr -= todo; 237 nr -= todo;
239 } 238 }
240 } 239 }
241 240
242 /* 241 /*
243 * Lookup a swap entry in the swap cache. A found page will be returned 242 * Lookup a swap entry in the swap cache. A found page will be returned
244 * unlocked and with its refcount incremented - we rely on the kernel 243 * unlocked and with its refcount incremented - we rely on the kernel
245 * lock getting page table operations atomic even if we drop the page 244 * lock getting page table operations atomic even if we drop the page
246 * lock before returning. 245 * lock before returning.
247 */ 246 */
248 struct page * lookup_swap_cache(swp_entry_t entry) 247 struct page * lookup_swap_cache(swp_entry_t entry)
249 { 248 {
250 struct page *page; 249 struct page *page;
251 250
252 page = find_get_page(&swapper_space, entry.val); 251 page = find_get_page(&swapper_space, entry.val);
253 252
254 if (page) 253 if (page)
255 INC_CACHE_INFO(find_success); 254 INC_CACHE_INFO(find_success);
256 255
257 INC_CACHE_INFO(find_total); 256 INC_CACHE_INFO(find_total);
258 return page; 257 return page;
259 } 258 }
260 259
261 /* 260 /*
262 * Locate a page of swap in physical memory, reserving swap cache space 261 * Locate a page of swap in physical memory, reserving swap cache space
263 * and reading the disk if it is not already cached. 262 * and reading the disk if it is not already cached.
264 * A failure return means that either the page allocation failed or that 263 * A failure return means that either the page allocation failed or that
265 * the swap entry is no longer in use. 264 * the swap entry is no longer in use.
266 */ 265 */
267 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, 266 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
268 struct vm_area_struct *vma, unsigned long addr) 267 struct vm_area_struct *vma, unsigned long addr)
269 { 268 {
270 struct page *found_page, *new_page = NULL; 269 struct page *found_page, *new_page = NULL;
271 int err; 270 int err;
272 271
273 do { 272 do {
274 /* 273 /*
275 * First check the swap cache. Since this is normally 274 * First check the swap cache. Since this is normally
276 * called after lookup_swap_cache() failed, re-calling 275 * called after lookup_swap_cache() failed, re-calling
277 * that would confuse statistics. 276 * that would confuse statistics.
278 */ 277 */
279 found_page = find_get_page(&swapper_space, entry.val); 278 found_page = find_get_page(&swapper_space, entry.val);
280 if (found_page) 279 if (found_page)
281 break; 280 break;
282 281
283 /* 282 /*
284 * Get a new page to read into from swap. 283 * Get a new page to read into from swap.
285 */ 284 */
286 if (!new_page) { 285 if (!new_page) {
287 new_page = alloc_page_vma(gfp_mask, vma, addr); 286 new_page = alloc_page_vma(gfp_mask, vma, addr);
288 if (!new_page) 287 if (!new_page)
289 break; /* Out of memory */ 288 break; /* Out of memory */
290 } 289 }
291 290
292 /* 291 /*
293 * Swap entry may have been freed since our caller observed it. 292 * Swap entry may have been freed since our caller observed it.
294 */ 293 */
295 if (!swap_duplicate(entry)) 294 if (!swap_duplicate(entry))
296 break; 295 break;
297 296
298 /* 297 /*
299 * Associate the page with swap entry in the swap cache. 298 * Associate the page with swap entry in the swap cache.
300 * May fail (-EEXIST) if there is already a page associated 299 * May fail (-EEXIST) if there is already a page associated
301 * with this entry in the swap cache: added by a racing 300 * with this entry in the swap cache: added by a racing
302 * read_swap_cache_async, or add_to_swap or shmem_writepage 301 * read_swap_cache_async, or add_to_swap or shmem_writepage
303 * re-using the just freed swap entry for an existing page. 302 * re-using the just freed swap entry for an existing page.
304 * May fail (-ENOMEM) if radix-tree node allocation failed. 303 * May fail (-ENOMEM) if radix-tree node allocation failed.
305 */ 304 */
306 __set_page_locked(new_page); 305 __set_page_locked(new_page);
307 SetPageSwapBacked(new_page); 306 SetPageSwapBacked(new_page);
308 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 307 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
309 if (likely(!err)) { 308 if (likely(!err)) {
310 /* 309 /*
311 * Initiate read into locked page and return. 310 * Initiate read into locked page and return.
312 */ 311 */
313 lru_cache_add_anon(new_page); 312 lru_cache_add_anon(new_page);
314 swap_readpage(NULL, new_page); 313 swap_readpage(NULL, new_page);
315 return new_page; 314 return new_page;
316 } 315 }
317 ClearPageSwapBacked(new_page); 316 ClearPageSwapBacked(new_page);
318 __clear_page_locked(new_page); 317 __clear_page_locked(new_page);
319 swap_free(entry); 318 swap_free(entry);
320 } while (err != -ENOMEM); 319 } while (err != -ENOMEM);
321 320
322 if (new_page) 321 if (new_page)
323 page_cache_release(new_page); 322 page_cache_release(new_page);
324 return found_page; 323 return found_page;
325 } 324 }
326 325
327 /** 326 /**
328 * swapin_readahead - swap in pages in hope we need them soon 327 * swapin_readahead - swap in pages in hope we need them soon
329 * @entry: swap entry of this memory 328 * @entry: swap entry of this memory
330 * @gfp_mask: memory allocation flags 329 * @gfp_mask: memory allocation flags
331 * @vma: user vma this address belongs to 330 * @vma: user vma this address belongs to
332 * @addr: target address for mempolicy 331 * @addr: target address for mempolicy
333 * 332 *
334 * Returns the struct page for entry and addr, after queueing swapin. 333 * Returns the struct page for entry and addr, after queueing swapin.
335 * 334 *
336 * Primitive swap readahead code. We simply read an aligned block of 335 * Primitive swap readahead code. We simply read an aligned block of
337 * (1 << page_cluster) entries in the swap area. This method is chosen 336 * (1 << page_cluster) entries in the swap area. This method is chosen
338 * because it doesn't cost us any seek time. We also make sure to queue 337 * because it doesn't cost us any seek time. We also make sure to queue
339 * the 'original' request together with the readahead ones... 338 * the 'original' request together with the readahead ones...
340 * 339 *
341 * This has been extended to use the NUMA policies from the mm triggering 340 * This has been extended to use the NUMA policies from the mm triggering
342 * the readahead. 341 * the readahead.
343 * 342 *
344 * Caller must hold down_read on the vma->vm_mm if vma is not NULL. 343 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
345 */ 344 */
346 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 345 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
347 struct vm_area_struct *vma, unsigned long addr) 346 struct vm_area_struct *vma, unsigned long addr)
348 { 347 {
349 int nr_pages; 348 int nr_pages;
350 struct page *page; 349 struct page *page;
351 unsigned long offset; 350 unsigned long offset;
352 unsigned long end_offset; 351 unsigned long end_offset;
353 352
354 /* 353 /*
355 * Get starting offset for readaround, and number of pages to read. 354 * Get starting offset for readaround, and number of pages to read.
356 * Adjust starting address by readbehind (for NUMA interleave case)? 355 * Adjust starting address by readbehind (for NUMA interleave case)?
357 * No, it's very unlikely that swap layout would follow vma layout, 356 * No, it's very unlikely that swap layout would follow vma layout,
358 * more likely that neighbouring swap pages came from the same node: 357 * more likely that neighbouring swap pages came from the same node:
359 * so use the same "addr" to choose the same node for each swap read. 358 * so use the same "addr" to choose the same node for each swap read.
360 */ 359 */
361 nr_pages = valid_swaphandles(entry, &offset); 360 nr_pages = valid_swaphandles(entry, &offset);
362 for (end_offset = offset + nr_pages; offset < end_offset; offset++) { 361 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
363 /* Ok, do the async read-ahead now */ 362 /* Ok, do the async read-ahead now */
364 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 363 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
365 gfp_mask, vma, addr); 364 gfp_mask, vma, addr);
366 if (!page) 365 if (!page)
367 break; 366 break;
368 page_cache_release(page); 367 page_cache_release(page);
369 } 368 }
370 lru_add_drain(); /* Push any new pages onto the LRU now */ 369 lru_add_drain(); /* Push any new pages onto the LRU now */
371 return read_swap_cache_async(entry, gfp_mask, vma, addr); 370 return read_swap_cache_async(entry, gfp_mask, vma, addr);
372 } 371 }
373 372
1 /* 1 /*
2 * linux/mm/swapfile.c 2 * linux/mm/swapfile.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie 5 * Swap reorganised 29.12.95, Stephen Tweedie
6 */ 6 */
7 7
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/hugetlb.h> 9 #include <linux/hugetlb.h>
10 #include <linux/mman.h> 10 #include <linux/mman.h>
11 #include <linux/slab.h> 11 #include <linux/slab.h>
12 #include <linux/kernel_stat.h> 12 #include <linux/kernel_stat.h>
13 #include <linux/swap.h> 13 #include <linux/swap.h>
14 #include <linux/vmalloc.h> 14 #include <linux/vmalloc.h>
15 #include <linux/pagemap.h> 15 #include <linux/pagemap.h>
16 #include <linux/namei.h> 16 #include <linux/namei.h>
17 #include <linux/shm.h> 17 #include <linux/shm.h>
18 #include <linux/blkdev.h> 18 #include <linux/blkdev.h>
19 #include <linux/writeback.h> 19 #include <linux/writeback.h>
20 #include <linux/proc_fs.h> 20 #include <linux/proc_fs.h>
21 #include <linux/seq_file.h> 21 #include <linux/seq_file.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/rmap.h> 24 #include <linux/rmap.h>
25 #include <linux/security.h> 25 #include <linux/security.h>
26 #include <linux/backing-dev.h> 26 #include <linux/backing-dev.h>
27 #include <linux/mutex.h> 27 #include <linux/mutex.h>
28 #include <linux/capability.h> 28 #include <linux/capability.h>
29 #include <linux/syscalls.h> 29 #include <linux/syscalls.h>
30 #include <linux/memcontrol.h> 30 #include <linux/memcontrol.h>
31 31
32 #include <asm/pgtable.h> 32 #include <asm/pgtable.h>
33 #include <asm/tlbflush.h> 33 #include <asm/tlbflush.h>
34 #include <linux/swapops.h> 34 #include <linux/swapops.h>
35 35
36 static DEFINE_SPINLOCK(swap_lock); 36 static DEFINE_SPINLOCK(swap_lock);
37 static unsigned int nr_swapfiles; 37 static unsigned int nr_swapfiles;
38 long total_swap_pages; 38 long total_swap_pages;
39 static int swap_overflow; 39 static int swap_overflow;
40 static int least_priority; 40 static int least_priority;
41 41
42 static const char Bad_file[] = "Bad swap file entry "; 42 static const char Bad_file[] = "Bad swap file entry ";
43 static const char Unused_file[] = "Unused swap file entry "; 43 static const char Unused_file[] = "Unused swap file entry ";
44 static const char Bad_offset[] = "Bad swap offset entry "; 44 static const char Bad_offset[] = "Bad swap offset entry ";
45 static const char Unused_offset[] = "Unused swap offset entry "; 45 static const char Unused_offset[] = "Unused swap offset entry ";
46 46
47 static struct swap_list_t swap_list = {-1, -1}; 47 static struct swap_list_t swap_list = {-1, -1};
48 48
49 static struct swap_info_struct swap_info[MAX_SWAPFILES]; 49 static struct swap_info_struct swap_info[MAX_SWAPFILES];
50 50
51 static DEFINE_MUTEX(swapon_mutex); 51 static DEFINE_MUTEX(swapon_mutex);
52 52
53 /* 53 /*
54 * We need this because the bdev->unplug_fn can sleep and we cannot 54 * We need this because the bdev->unplug_fn can sleep and we cannot
55 * hold swap_lock while calling the unplug_fn. And swap_lock 55 * hold swap_lock while calling the unplug_fn. And swap_lock
56 * cannot be turned into a mutex. 56 * cannot be turned into a mutex.
57 */ 57 */
58 static DECLARE_RWSEM(swap_unplug_sem); 58 static DECLARE_RWSEM(swap_unplug_sem);
59 59
60 void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) 60 void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
61 { 61 {
62 swp_entry_t entry; 62 swp_entry_t entry;
63 63
64 down_read(&swap_unplug_sem); 64 down_read(&swap_unplug_sem);
65 entry.val = page_private(page); 65 entry.val = page_private(page);
66 if (PageSwapCache(page)) { 66 if (PageSwapCache(page)) {
67 struct block_device *bdev = swap_info[swp_type(entry)].bdev; 67 struct block_device *bdev = swap_info[swp_type(entry)].bdev;
68 struct backing_dev_info *bdi; 68 struct backing_dev_info *bdi;
69 69
70 /* 70 /*
71 * If the page is removed from swapcache from under us (with a 71 * If the page is removed from swapcache from under us (with a
72 * racy try_to_unuse/swapoff) we need an additional reference 72 * racy try_to_unuse/swapoff) we need an additional reference
73 * count to avoid reading garbage from page_private(page) above. 73 * count to avoid reading garbage from page_private(page) above.
74 * If the WARN_ON triggers during a swapoff it maybe the race 74 * If the WARN_ON triggers during a swapoff it maybe the race
75 * condition and it's harmless. However if it triggers without 75 * condition and it's harmless. However if it triggers without
76 * swapoff it signals a problem. 76 * swapoff it signals a problem.
77 */ 77 */
78 WARN_ON(page_count(page) <= 1); 78 WARN_ON(page_count(page) <= 1);
79 79
80 bdi = bdev->bd_inode->i_mapping->backing_dev_info; 80 bdi = bdev->bd_inode->i_mapping->backing_dev_info;
81 blk_run_backing_dev(bdi, page); 81 blk_run_backing_dev(bdi, page);
82 } 82 }
83 up_read(&swap_unplug_sem); 83 up_read(&swap_unplug_sem);
84 } 84 }
85 85
86 #define SWAPFILE_CLUSTER 256 86 #define SWAPFILE_CLUSTER 256
87 #define LATENCY_LIMIT 256 87 #define LATENCY_LIMIT 256
88 88
89 static inline unsigned long scan_swap_map(struct swap_info_struct *si) 89 static inline unsigned long scan_swap_map(struct swap_info_struct *si)
90 { 90 {
91 unsigned long offset, last_in_cluster; 91 unsigned long offset, last_in_cluster;
92 int latency_ration = LATENCY_LIMIT; 92 int latency_ration = LATENCY_LIMIT;
93 93
94 /* 94 /*
95 * We try to cluster swap pages by allocating them sequentially 95 * We try to cluster swap pages by allocating them sequentially
96 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 96 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
97 * way, however, we resort to first-free allocation, starting 97 * way, however, we resort to first-free allocation, starting
98 * a new cluster. This prevents us from scattering swap pages 98 * a new cluster. This prevents us from scattering swap pages
99 * all over the entire swap partition, so that we reduce 99 * all over the entire swap partition, so that we reduce
100 * overall disk seek times between swap pages. -- sct 100 * overall disk seek times between swap pages. -- sct
101 * But we do now try to find an empty cluster. -Andrea 101 * But we do now try to find an empty cluster. -Andrea
102 */ 102 */
103 103
104 si->flags += SWP_SCANNING; 104 si->flags += SWP_SCANNING;
105 if (unlikely(!si->cluster_nr)) { 105 if (unlikely(!si->cluster_nr)) {
106 si->cluster_nr = SWAPFILE_CLUSTER - 1; 106 si->cluster_nr = SWAPFILE_CLUSTER - 1;
107 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) 107 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER)
108 goto lowest; 108 goto lowest;
109 spin_unlock(&swap_lock); 109 spin_unlock(&swap_lock);
110 110
111 offset = si->lowest_bit; 111 offset = si->lowest_bit;
112 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 112 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
113 113
114 /* Locate the first empty (unaligned) cluster */ 114 /* Locate the first empty (unaligned) cluster */
115 for (; last_in_cluster <= si->highest_bit; offset++) { 115 for (; last_in_cluster <= si->highest_bit; offset++) {
116 if (si->swap_map[offset]) 116 if (si->swap_map[offset])
117 last_in_cluster = offset + SWAPFILE_CLUSTER; 117 last_in_cluster = offset + SWAPFILE_CLUSTER;
118 else if (offset == last_in_cluster) { 118 else if (offset == last_in_cluster) {
119 spin_lock(&swap_lock); 119 spin_lock(&swap_lock);
120 si->cluster_next = offset-SWAPFILE_CLUSTER+1; 120 si->cluster_next = offset-SWAPFILE_CLUSTER+1;
121 goto cluster; 121 goto cluster;
122 } 122 }
123 if (unlikely(--latency_ration < 0)) { 123 if (unlikely(--latency_ration < 0)) {
124 cond_resched(); 124 cond_resched();
125 latency_ration = LATENCY_LIMIT; 125 latency_ration = LATENCY_LIMIT;
126 } 126 }
127 } 127 }
128 spin_lock(&swap_lock); 128 spin_lock(&swap_lock);
129 goto lowest; 129 goto lowest;
130 } 130 }
131 131
132 si->cluster_nr--; 132 si->cluster_nr--;
133 cluster: 133 cluster:
134 offset = si->cluster_next; 134 offset = si->cluster_next;
135 if (offset > si->highest_bit) 135 if (offset > si->highest_bit)
136 lowest: offset = si->lowest_bit; 136 lowest: offset = si->lowest_bit;
137 checks: if (!(si->flags & SWP_WRITEOK)) 137 checks: if (!(si->flags & SWP_WRITEOK))
138 goto no_page; 138 goto no_page;
139 if (!si->highest_bit) 139 if (!si->highest_bit)
140 goto no_page; 140 goto no_page;
141 if (!si->swap_map[offset]) { 141 if (!si->swap_map[offset]) {
142 if (offset == si->lowest_bit) 142 if (offset == si->lowest_bit)
143 si->lowest_bit++; 143 si->lowest_bit++;
144 if (offset == si->highest_bit) 144 if (offset == si->highest_bit)
145 si->highest_bit--; 145 si->highest_bit--;
146 si->inuse_pages++; 146 si->inuse_pages++;
147 if (si->inuse_pages == si->pages) { 147 if (si->inuse_pages == si->pages) {
148 si->lowest_bit = si->max; 148 si->lowest_bit = si->max;
149 si->highest_bit = 0; 149 si->highest_bit = 0;
150 } 150 }
151 si->swap_map[offset] = 1; 151 si->swap_map[offset] = 1;
152 si->cluster_next = offset + 1; 152 si->cluster_next = offset + 1;
153 si->flags -= SWP_SCANNING; 153 si->flags -= SWP_SCANNING;
154 return offset; 154 return offset;
155 } 155 }
156 156
157 spin_unlock(&swap_lock); 157 spin_unlock(&swap_lock);
158 while (++offset <= si->highest_bit) { 158 while (++offset <= si->highest_bit) {
159 if (!si->swap_map[offset]) { 159 if (!si->swap_map[offset]) {
160 spin_lock(&swap_lock); 160 spin_lock(&swap_lock);
161 goto checks; 161 goto checks;
162 } 162 }
163 if (unlikely(--latency_ration < 0)) { 163 if (unlikely(--latency_ration < 0)) {
164 cond_resched(); 164 cond_resched();
165 latency_ration = LATENCY_LIMIT; 165 latency_ration = LATENCY_LIMIT;
166 } 166 }
167 } 167 }
168 spin_lock(&swap_lock); 168 spin_lock(&swap_lock);
169 goto lowest; 169 goto lowest;
170 170
171 no_page: 171 no_page:
172 si->flags -= SWP_SCANNING; 172 si->flags -= SWP_SCANNING;
173 return 0; 173 return 0;
174 } 174 }
175 175
176 swp_entry_t get_swap_page(void) 176 swp_entry_t get_swap_page(void)
177 { 177 {
178 struct swap_info_struct *si; 178 struct swap_info_struct *si;
179 pgoff_t offset; 179 pgoff_t offset;
180 int type, next; 180 int type, next;
181 int wrapped = 0; 181 int wrapped = 0;
182 182
183 spin_lock(&swap_lock); 183 spin_lock(&swap_lock);
184 if (nr_swap_pages <= 0) 184 if (nr_swap_pages <= 0)
185 goto noswap; 185 goto noswap;
186 nr_swap_pages--; 186 nr_swap_pages--;
187 187
188 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 188 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
189 si = swap_info + type; 189 si = swap_info + type;
190 next = si->next; 190 next = si->next;
191 if (next < 0 || 191 if (next < 0 ||
192 (!wrapped && si->prio != swap_info[next].prio)) { 192 (!wrapped && si->prio != swap_info[next].prio)) {
193 next = swap_list.head; 193 next = swap_list.head;
194 wrapped++; 194 wrapped++;
195 } 195 }
196 196
197 if (!si->highest_bit) 197 if (!si->highest_bit)
198 continue; 198 continue;
199 if (!(si->flags & SWP_WRITEOK)) 199 if (!(si->flags & SWP_WRITEOK))
200 continue; 200 continue;
201 201
202 swap_list.next = next; 202 swap_list.next = next;
203 offset = scan_swap_map(si); 203 offset = scan_swap_map(si);
204 if (offset) { 204 if (offset) {
205 spin_unlock(&swap_lock); 205 spin_unlock(&swap_lock);
206 return swp_entry(type, offset); 206 return swp_entry(type, offset);
207 } 207 }
208 next = swap_list.next; 208 next = swap_list.next;
209 } 209 }
210 210
211 nr_swap_pages++; 211 nr_swap_pages++;
212 noswap: 212 noswap:
213 spin_unlock(&swap_lock); 213 spin_unlock(&swap_lock);
214 return (swp_entry_t) {0}; 214 return (swp_entry_t) {0};
215 } 215 }
216 216
217 swp_entry_t get_swap_page_of_type(int type) 217 swp_entry_t get_swap_page_of_type(int type)
218 { 218 {
219 struct swap_info_struct *si; 219 struct swap_info_struct *si;
220 pgoff_t offset; 220 pgoff_t offset;
221 221
222 spin_lock(&swap_lock); 222 spin_lock(&swap_lock);
223 si = swap_info + type; 223 si = swap_info + type;
224 if (si->flags & SWP_WRITEOK) { 224 if (si->flags & SWP_WRITEOK) {
225 nr_swap_pages--; 225 nr_swap_pages--;
226 offset = scan_swap_map(si); 226 offset = scan_swap_map(si);
227 if (offset) { 227 if (offset) {
228 spin_unlock(&swap_lock); 228 spin_unlock(&swap_lock);
229 return swp_entry(type, offset); 229 return swp_entry(type, offset);
230 } 230 }
231 nr_swap_pages++; 231 nr_swap_pages++;
232 } 232 }
233 spin_unlock(&swap_lock); 233 spin_unlock(&swap_lock);
234 return (swp_entry_t) {0}; 234 return (swp_entry_t) {0};
235 } 235 }
236 236
237 static struct swap_info_struct * swap_info_get(swp_entry_t entry) 237 static struct swap_info_struct * swap_info_get(swp_entry_t entry)
238 { 238 {
239 struct swap_info_struct * p; 239 struct swap_info_struct * p;
240 unsigned long offset, type; 240 unsigned long offset, type;
241 241
242 if (!entry.val) 242 if (!entry.val)
243 goto out; 243 goto out;
244 type = swp_type(entry); 244 type = swp_type(entry);
245 if (type >= nr_swapfiles) 245 if (type >= nr_swapfiles)
246 goto bad_nofile; 246 goto bad_nofile;
247 p = & swap_info[type]; 247 p = & swap_info[type];
248 if (!(p->flags & SWP_USED)) 248 if (!(p->flags & SWP_USED))
249 goto bad_device; 249 goto bad_device;
250 offset = swp_offset(entry); 250 offset = swp_offset(entry);
251 if (offset >= p->max) 251 if (offset >= p->max)
252 goto bad_offset; 252 goto bad_offset;
253 if (!p->swap_map[offset]) 253 if (!p->swap_map[offset])
254 goto bad_free; 254 goto bad_free;
255 spin_lock(&swap_lock); 255 spin_lock(&swap_lock);
256 return p; 256 return p;
257 257
258 bad_free: 258 bad_free:
259 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); 259 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
260 goto out; 260 goto out;
261 bad_offset: 261 bad_offset:
262 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); 262 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
263 goto out; 263 goto out;
264 bad_device: 264 bad_device:
265 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); 265 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
266 goto out; 266 goto out;
267 bad_nofile: 267 bad_nofile:
268 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 268 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
269 out: 269 out:
270 return NULL; 270 return NULL;
271 } 271 }
272 272
273 static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) 273 static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
274 { 274 {
275 int count = p->swap_map[offset]; 275 int count = p->swap_map[offset];
276 276
277 if (count < SWAP_MAP_MAX) { 277 if (count < SWAP_MAP_MAX) {
278 count--; 278 count--;
279 p->swap_map[offset] = count; 279 p->swap_map[offset] = count;
280 if (!count) { 280 if (!count) {
281 if (offset < p->lowest_bit) 281 if (offset < p->lowest_bit)
282 p->lowest_bit = offset; 282 p->lowest_bit = offset;
283 if (offset > p->highest_bit) 283 if (offset > p->highest_bit)
284 p->highest_bit = offset; 284 p->highest_bit = offset;
285 if (p->prio > swap_info[swap_list.next].prio) 285 if (p->prio > swap_info[swap_list.next].prio)
286 swap_list.next = p - swap_info; 286 swap_list.next = p - swap_info;
287 nr_swap_pages++; 287 nr_swap_pages++;
288 p->inuse_pages--; 288 p->inuse_pages--;
289 } 289 }
290 } 290 }
291 return count; 291 return count;
292 } 292 }
293 293
294 /* 294 /*
295 * Caller has made sure that the swapdevice corresponding to entry 295 * Caller has made sure that the swapdevice corresponding to entry
296 * is still around or has not been recycled. 296 * is still around or has not been recycled.
297 */ 297 */
298 void swap_free(swp_entry_t entry) 298 void swap_free(swp_entry_t entry)
299 { 299 {
300 struct swap_info_struct * p; 300 struct swap_info_struct * p;
301 301
302 p = swap_info_get(entry); 302 p = swap_info_get(entry);
303 if (p) { 303 if (p) {
304 swap_entry_free(p, swp_offset(entry)); 304 swap_entry_free(p, swp_offset(entry));
305 spin_unlock(&swap_lock); 305 spin_unlock(&swap_lock);
306 } 306 }
307 } 307 }
308 308
309 /* 309 /*
310 * How many references to page are currently swapped out? 310 * How many references to page are currently swapped out?
311 */ 311 */
312 static inline int page_swapcount(struct page *page) 312 static inline int page_swapcount(struct page *page)
313 { 313 {
314 int count = 0; 314 int count = 0;
315 struct swap_info_struct *p; 315 struct swap_info_struct *p;
316 swp_entry_t entry; 316 swp_entry_t entry;
317 317
318 entry.val = page_private(page); 318 entry.val = page_private(page);
319 p = swap_info_get(entry); 319 p = swap_info_get(entry);
320 if (p) { 320 if (p) {
321 /* Subtract the 1 for the swap cache itself */ 321 /* Subtract the 1 for the swap cache itself */
322 count = p->swap_map[swp_offset(entry)] - 1; 322 count = p->swap_map[swp_offset(entry)] - 1;
323 spin_unlock(&swap_lock); 323 spin_unlock(&swap_lock);
324 } 324 }
325 return count; 325 return count;
326 } 326 }
327 327
328 /* 328 /*
329 * We can use this swap cache entry directly 329 * We can use this swap cache entry directly
330 * if there are no other references to it. 330 * if there are no other references to it.
331 */ 331 */
332 int can_share_swap_page(struct page *page) 332 int can_share_swap_page(struct page *page)
333 { 333 {
334 int count; 334 int count;
335 335
336 BUG_ON(!PageLocked(page)); 336 VM_BUG_ON(!PageLocked(page));
337 count = page_mapcount(page); 337 count = page_mapcount(page);
338 if (count <= 1 && PageSwapCache(page)) 338 if (count <= 1 && PageSwapCache(page))
339 count += page_swapcount(page); 339 count += page_swapcount(page);
340 return count == 1; 340 return count == 1;
341 } 341 }
342 342
343 /* 343 /*
344 * Work out if there are any other processes sharing this 344 * Work out if there are any other processes sharing this
345 * swap cache page. Free it if you can. Return success. 345 * swap cache page. Free it if you can. Return success.
346 */ 346 */
347 static int remove_exclusive_swap_page_count(struct page *page, int count) 347 static int remove_exclusive_swap_page_count(struct page *page, int count)
348 { 348 {
349 int retval; 349 int retval;
350 struct swap_info_struct * p; 350 struct swap_info_struct * p;
351 swp_entry_t entry; 351 swp_entry_t entry;
352 352
353 BUG_ON(PagePrivate(page)); 353 VM_BUG_ON(!PageLocked(page));
354 BUG_ON(!PageLocked(page));
355 354
356 if (!PageSwapCache(page)) 355 if (!PageSwapCache(page))
357 return 0; 356 return 0;
358 if (PageWriteback(page)) 357 if (PageWriteback(page))
359 return 0; 358 return 0;
360 if (page_count(page) != count) /* us + cache + ptes */ 359 if (page_count(page) != count) /* us + cache + ptes */
361 return 0; 360 return 0;
362 361
363 entry.val = page_private(page); 362 entry.val = page_private(page);
364 p = swap_info_get(entry); 363 p = swap_info_get(entry);
365 if (!p) 364 if (!p)
366 return 0; 365 return 0;
367 366
368 /* Is the only swap cache user the cache itself? */ 367 /* Is the only swap cache user the cache itself? */
369 retval = 0; 368 retval = 0;
370 if (p->swap_map[swp_offset(entry)] == 1) { 369 if (p->swap_map[swp_offset(entry)] == 1) {
371 /* Recheck the page count with the swapcache lock held.. */ 370 /* Recheck the page count with the swapcache lock held.. */
372 spin_lock_irq(&swapper_space.tree_lock); 371 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == count) && !PageWriteback(page)) { 372 if ((page_count(page) == count) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page); 373 __delete_from_swap_cache(page);
375 SetPageDirty(page); 374 SetPageDirty(page);
376 retval = 1; 375 retval = 1;
377 } 376 }
378 spin_unlock_irq(&swapper_space.tree_lock); 377 spin_unlock_irq(&swapper_space.tree_lock);
379 } 378 }
380 spin_unlock(&swap_lock); 379 spin_unlock(&swap_lock);
381 380
382 if (retval) { 381 if (retval) {
383 swap_free(entry); 382 swap_free(entry);
384 page_cache_release(page); 383 page_cache_release(page);
385 } 384 }
386 385
387 return retval; 386 return retval;
388 } 387 }
389 388
390 /* 389 /*
391 * Most of the time the page should have two references: one for the 390 * Most of the time the page should have two references: one for the
392 * process and one for the swap cache. 391 * process and one for the swap cache.
393 */ 392 */
394 int remove_exclusive_swap_page(struct page *page) 393 int remove_exclusive_swap_page(struct page *page)
395 { 394 {
396 return remove_exclusive_swap_page_count(page, 2); 395 return remove_exclusive_swap_page_count(page, 2);
397 } 396 }
398 397
399 /* 398 /*
400 * The pageout code holds an extra reference to the page. That raises 399 * The pageout code holds an extra reference to the page. That raises
401 * the reference count to test for to 2 for a page that is only in the 400 * the reference count to test for to 2 for a page that is only in the
402 * swap cache plus 1 for each process that maps the page. 401 * swap cache plus 1 for each process that maps the page.
403 */ 402 */
404 int remove_exclusive_swap_page_ref(struct page *page) 403 int remove_exclusive_swap_page_ref(struct page *page)
405 { 404 {
406 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); 405 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
407 } 406 }
408 407
409 /* 408 /*
410 * Free the swap entry like above, but also try to 409 * Free the swap entry like above, but also try to
411 * free the page cache entry if it is the last user. 410 * free the page cache entry if it is the last user.
412 */ 411 */
413 void free_swap_and_cache(swp_entry_t entry) 412 void free_swap_and_cache(swp_entry_t entry)
414 { 413 {
415 struct swap_info_struct * p; 414 struct swap_info_struct * p;
416 struct page *page = NULL; 415 struct page *page = NULL;
417 416
418 if (is_migration_entry(entry)) 417 if (is_migration_entry(entry))
419 return; 418 return;
420 419
421 p = swap_info_get(entry); 420 p = swap_info_get(entry);
422 if (p) { 421 if (p) {
423 if (swap_entry_free(p, swp_offset(entry)) == 1) { 422 if (swap_entry_free(p, swp_offset(entry)) == 1) {
424 page = find_get_page(&swapper_space, entry.val); 423 page = find_get_page(&swapper_space, entry.val);
425 if (page && !trylock_page(page)) { 424 if (page && !trylock_page(page)) {
426 page_cache_release(page); 425 page_cache_release(page);
427 page = NULL; 426 page = NULL;
428 } 427 }
429 } 428 }
430 spin_unlock(&swap_lock); 429 spin_unlock(&swap_lock);
431 } 430 }
432 if (page) { 431 if (page) {
433 int one_user; 432 int one_user;
434 433
435 BUG_ON(PagePrivate(page));
436 one_user = (page_count(page) == 2); 434 one_user = (page_count(page) == 2);
437 /* Only cache user (+us), or swap space full? Free it! */ 435 /* Only cache user (+us), or swap space full? Free it! */
438 /* Also recheck PageSwapCache after page is locked (above) */ 436 /* Also recheck PageSwapCache after page is locked (above) */
439 if (PageSwapCache(page) && !PageWriteback(page) && 437 if (PageSwapCache(page) && !PageWriteback(page) &&
440 (one_user || vm_swap_full())) { 438 (one_user || vm_swap_full())) {
441 delete_from_swap_cache(page); 439 delete_from_swap_cache(page);
442 SetPageDirty(page); 440 SetPageDirty(page);
443 } 441 }
444 unlock_page(page); 442 unlock_page(page);
445 page_cache_release(page); 443 page_cache_release(page);
446 } 444 }
447 } 445 }
448 446
449 #ifdef CONFIG_HIBERNATION 447 #ifdef CONFIG_HIBERNATION
450 /* 448 /*
451 * Find the swap type that corresponds to given device (if any). 449 * Find the swap type that corresponds to given device (if any).
452 * 450 *
453 * @offset - number of the PAGE_SIZE-sized block of the device, starting 451 * @offset - number of the PAGE_SIZE-sized block of the device, starting
454 * from 0, in which the swap header is expected to be located. 452 * from 0, in which the swap header is expected to be located.
455 * 453 *
456 * This is needed for the suspend to disk (aka swsusp). 454 * This is needed for the suspend to disk (aka swsusp).
457 */ 455 */
458 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 456 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
459 { 457 {
460 struct block_device *bdev = NULL; 458 struct block_device *bdev = NULL;
461 int i; 459 int i;
462 460
463 if (device) 461 if (device)
464 bdev = bdget(device); 462 bdev = bdget(device);
465 463
466 spin_lock(&swap_lock); 464 spin_lock(&swap_lock);
467 for (i = 0; i < nr_swapfiles; i++) { 465 for (i = 0; i < nr_swapfiles; i++) {
468 struct swap_info_struct *sis = swap_info + i; 466 struct swap_info_struct *sis = swap_info + i;
469 467
470 if (!(sis->flags & SWP_WRITEOK)) 468 if (!(sis->flags & SWP_WRITEOK))
471 continue; 469 continue;
472 470
473 if (!bdev) { 471 if (!bdev) {
474 if (bdev_p) 472 if (bdev_p)
475 *bdev_p = sis->bdev; 473 *bdev_p = sis->bdev;
476 474
477 spin_unlock(&swap_lock); 475 spin_unlock(&swap_lock);
478 return i; 476 return i;
479 } 477 }
480 if (bdev == sis->bdev) { 478 if (bdev == sis->bdev) {
481 struct swap_extent *se; 479 struct swap_extent *se;
482 480
483 se = list_entry(sis->extent_list.next, 481 se = list_entry(sis->extent_list.next,
484 struct swap_extent, list); 482 struct swap_extent, list);
485 if (se->start_block == offset) { 483 if (se->start_block == offset) {
486 if (bdev_p) 484 if (bdev_p)
487 *bdev_p = sis->bdev; 485 *bdev_p = sis->bdev;
488 486
489 spin_unlock(&swap_lock); 487 spin_unlock(&swap_lock);
490 bdput(bdev); 488 bdput(bdev);
491 return i; 489 return i;
492 } 490 }
493 } 491 }
494 } 492 }
495 spin_unlock(&swap_lock); 493 spin_unlock(&swap_lock);
496 if (bdev) 494 if (bdev)
497 bdput(bdev); 495 bdput(bdev);
498 496
499 return -ENODEV; 497 return -ENODEV;
500 } 498 }
501 499
502 /* 500 /*
503 * Return either the total number of swap pages of given type, or the number 501 * Return either the total number of swap pages of given type, or the number
504 * of free pages of that type (depending on @free) 502 * of free pages of that type (depending on @free)
505 * 503 *
506 * This is needed for software suspend 504 * This is needed for software suspend
507 */ 505 */
508 unsigned int count_swap_pages(int type, int free) 506 unsigned int count_swap_pages(int type, int free)
509 { 507 {
510 unsigned int n = 0; 508 unsigned int n = 0;
511 509
512 if (type < nr_swapfiles) { 510 if (type < nr_swapfiles) {
513 spin_lock(&swap_lock); 511 spin_lock(&swap_lock);
514 if (swap_info[type].flags & SWP_WRITEOK) { 512 if (swap_info[type].flags & SWP_WRITEOK) {
515 n = swap_info[type].pages; 513 n = swap_info[type].pages;
516 if (free) 514 if (free)
517 n -= swap_info[type].inuse_pages; 515 n -= swap_info[type].inuse_pages;
518 } 516 }
519 spin_unlock(&swap_lock); 517 spin_unlock(&swap_lock);
520 } 518 }
521 return n; 519 return n;
522 } 520 }
523 #endif 521 #endif
524 522
525 /* 523 /*
526 * No need to decide whether this PTE shares the swap entry with others, 524 * No need to decide whether this PTE shares the swap entry with others,
527 * just let do_wp_page work it out if a write is requested later - to 525 * just let do_wp_page work it out if a write is requested later - to
528 * force COW, vm_page_prot omits write permission from any private vma. 526 * force COW, vm_page_prot omits write permission from any private vma.
529 */ 527 */
530 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 528 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
531 unsigned long addr, swp_entry_t entry, struct page *page) 529 unsigned long addr, swp_entry_t entry, struct page *page)
532 { 530 {
533 spinlock_t *ptl; 531 spinlock_t *ptl;
534 pte_t *pte; 532 pte_t *pte;
535 int ret = 1; 533 int ret = 1;
536 534
537 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) 535 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
538 ret = -ENOMEM; 536 ret = -ENOMEM;
539 537
540 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 538 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
541 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 539 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
542 if (ret > 0) 540 if (ret > 0)
543 mem_cgroup_uncharge_page(page); 541 mem_cgroup_uncharge_page(page);
544 ret = 0; 542 ret = 0;
545 goto out; 543 goto out;
546 } 544 }
547 545
548 inc_mm_counter(vma->vm_mm, anon_rss); 546 inc_mm_counter(vma->vm_mm, anon_rss);
549 get_page(page); 547 get_page(page);
550 set_pte_at(vma->vm_mm, addr, pte, 548 set_pte_at(vma->vm_mm, addr, pte,
551 pte_mkold(mk_pte(page, vma->vm_page_prot))); 549 pte_mkold(mk_pte(page, vma->vm_page_prot)));
552 page_add_anon_rmap(page, vma, addr); 550 page_add_anon_rmap(page, vma, addr);
553 swap_free(entry); 551 swap_free(entry);
554 /* 552 /*
555 * Move the page to the active list so it is not 553 * Move the page to the active list so it is not
556 * immediately swapped out again after swapon. 554 * immediately swapped out again after swapon.
557 */ 555 */
558 activate_page(page); 556 activate_page(page);
559 out: 557 out:
560 pte_unmap_unlock(pte, ptl); 558 pte_unmap_unlock(pte, ptl);
561 return ret; 559 return ret;
562 } 560 }
563 561
564 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 562 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
565 unsigned long addr, unsigned long end, 563 unsigned long addr, unsigned long end,
566 swp_entry_t entry, struct page *page) 564 swp_entry_t entry, struct page *page)
567 { 565 {
568 pte_t swp_pte = swp_entry_to_pte(entry); 566 pte_t swp_pte = swp_entry_to_pte(entry);
569 pte_t *pte; 567 pte_t *pte;
570 int ret = 0; 568 int ret = 0;
571 569
572 /* 570 /*
573 * We don't actually need pte lock while scanning for swp_pte: since 571 * We don't actually need pte lock while scanning for swp_pte: since
574 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the 572 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
575 * page table while we're scanning; though it could get zapped, and on 573 * page table while we're scanning; though it could get zapped, and on
576 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse 574 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
577 * of unmatched parts which look like swp_pte, so unuse_pte must 575 * of unmatched parts which look like swp_pte, so unuse_pte must
578 * recheck under pte lock. Scanning without pte lock lets it be 576 * recheck under pte lock. Scanning without pte lock lets it be
579 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 577 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
580 */ 578 */
581 pte = pte_offset_map(pmd, addr); 579 pte = pte_offset_map(pmd, addr);
582 do { 580 do {
583 /* 581 /*
584 * swapoff spends a _lot_ of time in this loop! 582 * swapoff spends a _lot_ of time in this loop!
585 * Test inline before going to call unuse_pte. 583 * Test inline before going to call unuse_pte.
586 */ 584 */
587 if (unlikely(pte_same(*pte, swp_pte))) { 585 if (unlikely(pte_same(*pte, swp_pte))) {
588 pte_unmap(pte); 586 pte_unmap(pte);
589 ret = unuse_pte(vma, pmd, addr, entry, page); 587 ret = unuse_pte(vma, pmd, addr, entry, page);
590 if (ret) 588 if (ret)
591 goto out; 589 goto out;
592 pte = pte_offset_map(pmd, addr); 590 pte = pte_offset_map(pmd, addr);
593 } 591 }
594 } while (pte++, addr += PAGE_SIZE, addr != end); 592 } while (pte++, addr += PAGE_SIZE, addr != end);
595 pte_unmap(pte - 1); 593 pte_unmap(pte - 1);
596 out: 594 out:
597 return ret; 595 return ret;
598 } 596 }
599 597
600 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 598 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
601 unsigned long addr, unsigned long end, 599 unsigned long addr, unsigned long end,
602 swp_entry_t entry, struct page *page) 600 swp_entry_t entry, struct page *page)
603 { 601 {
604 pmd_t *pmd; 602 pmd_t *pmd;
605 unsigned long next; 603 unsigned long next;
606 int ret; 604 int ret;
607 605
608 pmd = pmd_offset(pud, addr); 606 pmd = pmd_offset(pud, addr);
609 do { 607 do {
610 next = pmd_addr_end(addr, end); 608 next = pmd_addr_end(addr, end);
611 if (pmd_none_or_clear_bad(pmd)) 609 if (pmd_none_or_clear_bad(pmd))
612 continue; 610 continue;
613 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 611 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
614 if (ret) 612 if (ret)
615 return ret; 613 return ret;
616 } while (pmd++, addr = next, addr != end); 614 } while (pmd++, addr = next, addr != end);
617 return 0; 615 return 0;
618 } 616 }
619 617
620 static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 618 static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
621 unsigned long addr, unsigned long end, 619 unsigned long addr, unsigned long end,
622 swp_entry_t entry, struct page *page) 620 swp_entry_t entry, struct page *page)
623 { 621 {
624 pud_t *pud; 622 pud_t *pud;
625 unsigned long next; 623 unsigned long next;
626 int ret; 624 int ret;
627 625
628 pud = pud_offset(pgd, addr); 626 pud = pud_offset(pgd, addr);
629 do { 627 do {
630 next = pud_addr_end(addr, end); 628 next = pud_addr_end(addr, end);
631 if (pud_none_or_clear_bad(pud)) 629 if (pud_none_or_clear_bad(pud))
632 continue; 630 continue;
633 ret = unuse_pmd_range(vma, pud, addr, next, entry, page); 631 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
634 if (ret) 632 if (ret)
635 return ret; 633 return ret;
636 } while (pud++, addr = next, addr != end); 634 } while (pud++, addr = next, addr != end);
637 return 0; 635 return 0;
638 } 636 }
639 637
640 static int unuse_vma(struct vm_area_struct *vma, 638 static int unuse_vma(struct vm_area_struct *vma,
641 swp_entry_t entry, struct page *page) 639 swp_entry_t entry, struct page *page)
642 { 640 {
643 pgd_t *pgd; 641 pgd_t *pgd;
644 unsigned long addr, end, next; 642 unsigned long addr, end, next;
645 int ret; 643 int ret;
646 644
647 if (page->mapping) { 645 if (page->mapping) {
648 addr = page_address_in_vma(page, vma); 646 addr = page_address_in_vma(page, vma);
649 if (addr == -EFAULT) 647 if (addr == -EFAULT)
650 return 0; 648 return 0;
651 else 649 else
652 end = addr + PAGE_SIZE; 650 end = addr + PAGE_SIZE;
653 } else { 651 } else {
654 addr = vma->vm_start; 652 addr = vma->vm_start;
655 end = vma->vm_end; 653 end = vma->vm_end;
656 } 654 }
657 655
658 pgd = pgd_offset(vma->vm_mm, addr); 656 pgd = pgd_offset(vma->vm_mm, addr);
659 do { 657 do {
660 next = pgd_addr_end(addr, end); 658 next = pgd_addr_end(addr, end);
661 if (pgd_none_or_clear_bad(pgd)) 659 if (pgd_none_or_clear_bad(pgd))
662 continue; 660 continue;
663 ret = unuse_pud_range(vma, pgd, addr, next, entry, page); 661 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
664 if (ret) 662 if (ret)
665 return ret; 663 return ret;
666 } while (pgd++, addr = next, addr != end); 664 } while (pgd++, addr = next, addr != end);
667 return 0; 665 return 0;
668 } 666 }
669 667
670 static int unuse_mm(struct mm_struct *mm, 668 static int unuse_mm(struct mm_struct *mm,
671 swp_entry_t entry, struct page *page) 669 swp_entry_t entry, struct page *page)
672 { 670 {
673 struct vm_area_struct *vma; 671 struct vm_area_struct *vma;
674 int ret = 0; 672 int ret = 0;
675 673
676 if (!down_read_trylock(&mm->mmap_sem)) { 674 if (!down_read_trylock(&mm->mmap_sem)) {
677 /* 675 /*
678 * Activate page so shrink_inactive_list is unlikely to unmap 676 * Activate page so shrink_inactive_list is unlikely to unmap
679 * its ptes while lock is dropped, so swapoff can make progress. 677 * its ptes while lock is dropped, so swapoff can make progress.
680 */ 678 */
681 activate_page(page); 679 activate_page(page);
682 unlock_page(page); 680 unlock_page(page);
683 down_read(&mm->mmap_sem); 681 down_read(&mm->mmap_sem);
684 lock_page(page); 682 lock_page(page);
685 } 683 }
686 for (vma = mm->mmap; vma; vma = vma->vm_next) { 684 for (vma = mm->mmap; vma; vma = vma->vm_next) {
687 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) 685 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
688 break; 686 break;
689 } 687 }
690 up_read(&mm->mmap_sem); 688 up_read(&mm->mmap_sem);
691 return (ret < 0)? ret: 0; 689 return (ret < 0)? ret: 0;
692 } 690 }
693 691
694 /* 692 /*
695 * Scan swap_map from current position to next entry still in use. 693 * Scan swap_map from current position to next entry still in use.
696 * Recycle to start on reaching the end, returning 0 when empty. 694 * Recycle to start on reaching the end, returning 0 when empty.
697 */ 695 */
698 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 696 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
699 unsigned int prev) 697 unsigned int prev)
700 { 698 {
701 unsigned int max = si->max; 699 unsigned int max = si->max;
702 unsigned int i = prev; 700 unsigned int i = prev;
703 int count; 701 int count;
704 702
705 /* 703 /*
706 * No need for swap_lock here: we're just looking 704 * No need for swap_lock here: we're just looking
707 * for whether an entry is in use, not modifying it; false 705 * for whether an entry is in use, not modifying it; false
708 * hits are okay, and sys_swapoff() has already prevented new 706 * hits are okay, and sys_swapoff() has already prevented new
709 * allocations from this area (while holding swap_lock). 707 * allocations from this area (while holding swap_lock).
710 */ 708 */
711 for (;;) { 709 for (;;) {
712 if (++i >= max) { 710 if (++i >= max) {
713 if (!prev) { 711 if (!prev) {
714 i = 0; 712 i = 0;
715 break; 713 break;
716 } 714 }
717 /* 715 /*
718 * No entries in use at top of swap_map, 716 * No entries in use at top of swap_map,
719 * loop back to start and recheck there. 717 * loop back to start and recheck there.
720 */ 718 */
721 max = prev + 1; 719 max = prev + 1;
722 prev = 0; 720 prev = 0;
723 i = 1; 721 i = 1;
724 } 722 }
725 count = si->swap_map[i]; 723 count = si->swap_map[i];
726 if (count && count != SWAP_MAP_BAD) 724 if (count && count != SWAP_MAP_BAD)
727 break; 725 break;
728 } 726 }
729 return i; 727 return i;
730 } 728 }
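
As a minimal user-space sketch of the wrap-around scan above (the array size, its contents and the SWAP_MAP_BAD stand-in are made up for illustration): when the index runs off the top and prev is non-zero, the scan restarts at slot 1 but only rechecks up to the previous position, so one logical pass over the map always terminates. Slot 0 is never returned, mirroring the header page.

#include <stdio.h>

#define SWAP_MAP_BAD  0x8000            /* simplified stand-in */

/* Model of find_next_to_unuse(): swap_map[] holds use counts. */
static unsigned int next_to_unuse(const unsigned short *swap_map,
                                  unsigned int max, unsigned int prev)
{
        unsigned int i = prev;
        unsigned short count;

        for (;;) {
                if (++i >= max) {
                        if (!prev)
                                return 0;       /* nothing left in use */
                        /* wrap: recheck only the slots below prev */
                        max = prev + 1;
                        prev = 0;
                        i = 1;
                }
                count = swap_map[i];
                if (count && count != SWAP_MAP_BAD)
                        return i;
        }
}

int main(void)
{
        unsigned short map[8] = { 0, 0, 0, 2, 0, 1, 0, 0 };
        unsigned int i = 0;

        while ((i = next_to_unuse(map, 8, i)) != 0) {
                printf("entry %u in use (count %u)\n", i, (unsigned)map[i]);
                map[i] = 0;                     /* pretend we freed it */
        }
        return 0;
}
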
731 729
732 /* 730 /*
733 * We completely avoid races by reading each swap page in advance, 731 * We completely avoid races by reading each swap page in advance,
734 * and then search for the process using it. All the necessary 732 * and then search for the process using it. All the necessary
735 * page table adjustments can then be made atomically. 733 * page table adjustments can then be made atomically.
736 */ 734 */
737 static int try_to_unuse(unsigned int type) 735 static int try_to_unuse(unsigned int type)
738 { 736 {
739 struct swap_info_struct * si = &swap_info[type]; 737 struct swap_info_struct * si = &swap_info[type];
740 struct mm_struct *start_mm; 738 struct mm_struct *start_mm;
741 unsigned short *swap_map; 739 unsigned short *swap_map;
742 unsigned short swcount; 740 unsigned short swcount;
743 struct page *page; 741 struct page *page;
744 swp_entry_t entry; 742 swp_entry_t entry;
745 unsigned int i = 0; 743 unsigned int i = 0;
746 int retval = 0; 744 int retval = 0;
747 int reset_overflow = 0; 745 int reset_overflow = 0;
748 int shmem; 746 int shmem;
749 747
750 /* 748 /*
751 * When searching mms for an entry, a good strategy is to 749 * When searching mms for an entry, a good strategy is to
752 * start at the first mm we freed the previous entry from 750 * start at the first mm we freed the previous entry from
753 * (though actually we don't notice whether we or coincidence 751 * (though actually we don't notice whether we or coincidence
754 * freed the entry). Initialize this start_mm with a hold. 752 * freed the entry). Initialize this start_mm with a hold.
755 * 753 *
756 * A simpler strategy would be to start at the last mm we 754 * A simpler strategy would be to start at the last mm we
757 * freed the previous entry from; but that would take less 755 * freed the previous entry from; but that would take less
758 * advantage of mmlist ordering, which clusters forked mms 756 * advantage of mmlist ordering, which clusters forked mms
759 * together, child after parent. If we race with dup_mmap(), we 757 * together, child after parent. If we race with dup_mmap(), we
760 * prefer to resolve parent before child, lest we miss entries 758 * prefer to resolve parent before child, lest we miss entries
761 * duplicated after we scanned child: using last mm would invert 759 * duplicated after we scanned child: using last mm would invert
762 * that. Though it's only a serious concern when an overflowed 760 * that. Though it's only a serious concern when an overflowed
763 * swap count is reset from SWAP_MAP_MAX, preventing a rescan. 761 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
764 */ 762 */
765 start_mm = &init_mm; 763 start_mm = &init_mm;
766 atomic_inc(&init_mm.mm_users); 764 atomic_inc(&init_mm.mm_users);
767 765
768 /* 766 /*
769 * Keep on scanning until all entries have gone. Usually, 767 * Keep on scanning until all entries have gone. Usually,
770 * one pass through swap_map is enough, but not necessarily: 768 * one pass through swap_map is enough, but not necessarily:
771 * there are races when an instance of an entry might be missed. 769 * there are races when an instance of an entry might be missed.
772 */ 770 */
773 while ((i = find_next_to_unuse(si, i)) != 0) { 771 while ((i = find_next_to_unuse(si, i)) != 0) {
774 if (signal_pending(current)) { 772 if (signal_pending(current)) {
775 retval = -EINTR; 773 retval = -EINTR;
776 break; 774 break;
777 } 775 }
778 776
779 /* 777 /*
780 * Get a page for the entry, using the existing swap 778 * Get a page for the entry, using the existing swap
781 * cache page if there is one. Otherwise, get a clean 779 * cache page if there is one. Otherwise, get a clean
782 * page and read the swap into it. 780 * page and read the swap into it.
783 */ 781 */
784 swap_map = &si->swap_map[i]; 782 swap_map = &si->swap_map[i];
785 entry = swp_entry(type, i); 783 entry = swp_entry(type, i);
786 page = read_swap_cache_async(entry, 784 page = read_swap_cache_async(entry,
787 GFP_HIGHUSER_MOVABLE, NULL, 0); 785 GFP_HIGHUSER_MOVABLE, NULL, 0);
788 if (!page) { 786 if (!page) {
789 /* 787 /*
790 * Either swap_duplicate() failed because entry 788 * Either swap_duplicate() failed because entry
791 * has been freed independently, and will not be 789 * has been freed independently, and will not be
792 * reused since sys_swapoff() already disabled 790 * reused since sys_swapoff() already disabled
793 * allocation from here, or alloc_page() failed. 791 * allocation from here, or alloc_page() failed.
794 */ 792 */
795 if (!*swap_map) 793 if (!*swap_map)
796 continue; 794 continue;
797 retval = -ENOMEM; 795 retval = -ENOMEM;
798 break; 796 break;
799 } 797 }
800 798
801 /* 799 /*
802 * Don't hold on to start_mm if it looks like exiting. 800 * Don't hold on to start_mm if it looks like exiting.
803 */ 801 */
804 if (atomic_read(&start_mm->mm_users) == 1) { 802 if (atomic_read(&start_mm->mm_users) == 1) {
805 mmput(start_mm); 803 mmput(start_mm);
806 start_mm = &init_mm; 804 start_mm = &init_mm;
807 atomic_inc(&init_mm.mm_users); 805 atomic_inc(&init_mm.mm_users);
808 } 806 }
809 807
810 /* 808 /*
811 * Wait for and lock page. When do_swap_page races with 809 * Wait for and lock page. When do_swap_page races with
812 * try_to_unuse, do_swap_page can handle the fault much 810 * try_to_unuse, do_swap_page can handle the fault much
813 * faster than try_to_unuse can locate the entry. This 811 * faster than try_to_unuse can locate the entry. This
814 * apparently redundant "wait_on_page_locked" lets try_to_unuse 812 * apparently redundant "wait_on_page_locked" lets try_to_unuse
815 * defer to do_swap_page in such a case - in some tests, 813 * defer to do_swap_page in such a case - in some tests,
816 * do_swap_page and try_to_unuse repeatedly compete. 814 * do_swap_page and try_to_unuse repeatedly compete.
817 */ 815 */
818 wait_on_page_locked(page); 816 wait_on_page_locked(page);
819 wait_on_page_writeback(page); 817 wait_on_page_writeback(page);
820 lock_page(page); 818 lock_page(page);
821 wait_on_page_writeback(page); 819 wait_on_page_writeback(page);
822 820
823 /* 821 /*
824 * Remove all references to entry. 822 * Remove all references to entry.
825 * Whenever we reach init_mm, there's no address space 823 * Whenever we reach init_mm, there's no address space
826 * to search, but use it as a reminder to search shmem. 824 * to search, but use it as a reminder to search shmem.
827 */ 825 */
828 shmem = 0; 826 shmem = 0;
829 swcount = *swap_map; 827 swcount = *swap_map;
830 if (swcount > 1) { 828 if (swcount > 1) {
831 if (start_mm == &init_mm) 829 if (start_mm == &init_mm)
832 shmem = shmem_unuse(entry, page); 830 shmem = shmem_unuse(entry, page);
833 else 831 else
834 retval = unuse_mm(start_mm, entry, page); 832 retval = unuse_mm(start_mm, entry, page);
835 } 833 }
836 if (*swap_map > 1) { 834 if (*swap_map > 1) {
837 int set_start_mm = (*swap_map >= swcount); 835 int set_start_mm = (*swap_map >= swcount);
838 struct list_head *p = &start_mm->mmlist; 836 struct list_head *p = &start_mm->mmlist;
839 struct mm_struct *new_start_mm = start_mm; 837 struct mm_struct *new_start_mm = start_mm;
840 struct mm_struct *prev_mm = start_mm; 838 struct mm_struct *prev_mm = start_mm;
841 struct mm_struct *mm; 839 struct mm_struct *mm;
842 840
843 atomic_inc(&new_start_mm->mm_users); 841 atomic_inc(&new_start_mm->mm_users);
844 atomic_inc(&prev_mm->mm_users); 842 atomic_inc(&prev_mm->mm_users);
845 spin_lock(&mmlist_lock); 843 spin_lock(&mmlist_lock);
846 while (*swap_map > 1 && !retval && !shmem && 844 while (*swap_map > 1 && !retval && !shmem &&
847 (p = p->next) != &start_mm->mmlist) { 845 (p = p->next) != &start_mm->mmlist) {
848 mm = list_entry(p, struct mm_struct, mmlist); 846 mm = list_entry(p, struct mm_struct, mmlist);
849 if (!atomic_inc_not_zero(&mm->mm_users)) 847 if (!atomic_inc_not_zero(&mm->mm_users))
850 continue; 848 continue;
851 spin_unlock(&mmlist_lock); 849 spin_unlock(&mmlist_lock);
852 mmput(prev_mm); 850 mmput(prev_mm);
853 prev_mm = mm; 851 prev_mm = mm;
854 852
855 cond_resched(); 853 cond_resched();
856 854
857 swcount = *swap_map; 855 swcount = *swap_map;
858 if (swcount <= 1) 856 if (swcount <= 1)
859 ; 857 ;
860 else if (mm == &init_mm) { 858 else if (mm == &init_mm) {
861 set_start_mm = 1; 859 set_start_mm = 1;
862 shmem = shmem_unuse(entry, page); 860 shmem = shmem_unuse(entry, page);
863 } else 861 } else
864 retval = unuse_mm(mm, entry, page); 862 retval = unuse_mm(mm, entry, page);
865 if (set_start_mm && *swap_map < swcount) { 863 if (set_start_mm && *swap_map < swcount) {
866 mmput(new_start_mm); 864 mmput(new_start_mm);
867 atomic_inc(&mm->mm_users); 865 atomic_inc(&mm->mm_users);
868 new_start_mm = mm; 866 new_start_mm = mm;
869 set_start_mm = 0; 867 set_start_mm = 0;
870 } 868 }
871 spin_lock(&mmlist_lock); 869 spin_lock(&mmlist_lock);
872 } 870 }
873 spin_unlock(&mmlist_lock); 871 spin_unlock(&mmlist_lock);
874 mmput(prev_mm); 872 mmput(prev_mm);
875 mmput(start_mm); 873 mmput(start_mm);
876 start_mm = new_start_mm; 874 start_mm = new_start_mm;
877 } 875 }
878 if (shmem) { 876 if (shmem) {
879 /* page has already been unlocked and released */ 877 /* page has already been unlocked and released */
880 if (shmem > 0) 878 if (shmem > 0)
881 continue; 879 continue;
882 retval = shmem; 880 retval = shmem;
883 break; 881 break;
884 } 882 }
885 if (retval) { 883 if (retval) {
886 unlock_page(page); 884 unlock_page(page);
887 page_cache_release(page); 885 page_cache_release(page);
888 break; 886 break;
889 } 887 }
890 888
891 /* 889 /*
892 * How could swap count reach 0x7fff when the maximum 890 * How could swap count reach 0x7fff when the maximum
893 * pid is 0x7fff, and there's no way to repeat a swap 891 * pid is 0x7fff, and there's no way to repeat a swap
894 * page within an mm (except in shmem, where it's the 892 * page within an mm (except in shmem, where it's the
895 * shared object which takes the reference count)? 893 * shared object which takes the reference count)?
896 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. 894 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
897 * 895 *
898 * If that's wrong, then we should worry more about 896 * If that's wrong, then we should worry more about
899 * exit_mmap() and do_munmap() cases described above: 897 * exit_mmap() and do_munmap() cases described above:
900 * we might be resetting SWAP_MAP_MAX too early here. 898 * we might be resetting SWAP_MAP_MAX too early here.
901 * We know "Undead"s can happen, they're okay, so don't 899 * We know "Undead"s can happen, they're okay, so don't
902 * report them; but do report if we reset SWAP_MAP_MAX. 900 * report them; but do report if we reset SWAP_MAP_MAX.
903 */ 901 */
904 if (*swap_map == SWAP_MAP_MAX) { 902 if (*swap_map == SWAP_MAP_MAX) {
905 spin_lock(&swap_lock); 903 spin_lock(&swap_lock);
906 *swap_map = 1; 904 *swap_map = 1;
907 spin_unlock(&swap_lock); 905 spin_unlock(&swap_lock);
908 reset_overflow = 1; 906 reset_overflow = 1;
909 } 907 }
910 908
911 /* 909 /*
912 * If a reference remains (rare), we would like to leave 910 * If a reference remains (rare), we would like to leave
913 * the page in the swap cache; but try_to_unmap could 911 * the page in the swap cache; but try_to_unmap could
914 * then re-duplicate the entry once we drop page lock, 912 * then re-duplicate the entry once we drop page lock,
915 * so we might loop indefinitely; also, that page could 913 * so we might loop indefinitely; also, that page could
916 * not be swapped out to other storage meanwhile. So: 914 * not be swapped out to other storage meanwhile. So:
917 * delete from cache even if there's another reference, 915 * delete from cache even if there's another reference,
918 * after ensuring that the data has been saved to disk - 916 * after ensuring that the data has been saved to disk -
919 * since if the reference remains (rarer), it will be 917 * since if the reference remains (rarer), it will be
920 * read from disk into another page. Splitting into two 918 * read from disk into another page. Splitting into two
921 * pages would be incorrect if swap supported "shared 919 * pages would be incorrect if swap supported "shared
922 * private" pages, but they are handled by tmpfs files. 920 * private" pages, but they are handled by tmpfs files.
923 */ 921 */
924 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 922 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
925 struct writeback_control wbc = { 923 struct writeback_control wbc = {
926 .sync_mode = WB_SYNC_NONE, 924 .sync_mode = WB_SYNC_NONE,
927 }; 925 };
928 926
929 swap_writepage(page, &wbc); 927 swap_writepage(page, &wbc);
930 lock_page(page); 928 lock_page(page);
931 wait_on_page_writeback(page); 929 wait_on_page_writeback(page);
932 } 930 }
933 if (PageSwapCache(page)) 931 if (PageSwapCache(page))
934 delete_from_swap_cache(page); 932 delete_from_swap_cache(page);
935 933
936 /* 934 /*
937 * So we could skip searching mms once swap count went 935 * So we could skip searching mms once swap count went
938 * to 1, we did not mark any present ptes as dirty: must 936 * to 1, we did not mark any present ptes as dirty: must
939 * mark page dirty so shrink_page_list will preserve it. 937 * mark page dirty so shrink_page_list will preserve it.
940 */ 938 */
941 SetPageDirty(page); 939 SetPageDirty(page);
942 unlock_page(page); 940 unlock_page(page);
943 page_cache_release(page); 941 page_cache_release(page);
944 942
945 /* 943 /*
946 * Make sure that we aren't completely killing 944 * Make sure that we aren't completely killing
947 * interactive performance. 945 * interactive performance.
948 */ 946 */
949 cond_resched(); 947 cond_resched();
950 } 948 }
951 949
952 mmput(start_mm); 950 mmput(start_mm);
953 if (reset_overflow) { 951 if (reset_overflow) {
954 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); 952 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
955 swap_overflow = 0; 953 swap_overflow = 0;
956 } 954 }
957 return retval; 955 return retval;
958 } 956 }
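
The start_mm heuristic described in the comment at the head of try_to_unuse() can be illustrated with a toy user-space model: remember the first mm we freed the previous entry from and begin the next entry's search there, so parents tend to be resolved before the children cloned from them. Everything below (the refs[][] table, the names, the ring walk) is invented for illustration and is not kernel code.

#include <stdio.h>

#define NR_MM     4
#define NR_ENTRY  3

/* refs[e][m] != 0 means swap entry e is still referenced by mm m. */
static int refs[NR_ENTRY][NR_MM] = {
        { 0, 1, 1, 0 },
        { 0, 1, 0, 1 },
        { 1, 0, 0, 1 },
};

static int entry_count(int e)
{
        int m, n = 0;
        for (m = 0; m < NR_MM; m++)
                n += refs[e][m];
        return n;
}

int main(void)
{
        int start_mm = 0;               /* like start_mm = &init_mm */
        int e;

        for (e = 0; e < NR_ENTRY; e++) {
                int new_start_mm = start_mm;
                int set_start_mm = 1;
                int i;

                /* walk the "mmlist" ring once, starting at start_mm */
                for (i = 0; i < NR_MM && entry_count(e); i++) {
                        int m = (start_mm + i) % NR_MM;

                        if (!refs[e][m])
                                continue;
                        refs[e][m] = 0;         /* like unuse_mm() */
                        printf("entry %d: freed from mm %d\n", e, m);
                        if (set_start_mm) {
                                /* remember the first mm we freed from */
                                new_start_mm = m;
                                set_start_mm = 0;
                        }
                }
                start_mm = new_start_mm;
                printf("entry %d done, next search starts at mm %d\n",
                       e, start_mm);
        }
        return 0;
}
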
959 957
960 /* 958 /*
961 * After a successful try_to_unuse, if no swap is now in use, we know 959 * After a successful try_to_unuse, if no swap is now in use, we know
962 * we can empty the mmlist. swap_lock must be held on entry and exit. 960 * we can empty the mmlist. swap_lock must be held on entry and exit.
963 * Note that mmlist_lock nests inside swap_lock, and an mm must be 961 * Note that mmlist_lock nests inside swap_lock, and an mm must be
964 * added to the mmlist just after page_duplicate - before would be racy. 962 * added to the mmlist just after page_duplicate - before would be racy.
965 */ 963 */
966 static void drain_mmlist(void) 964 static void drain_mmlist(void)
967 { 965 {
968 struct list_head *p, *next; 966 struct list_head *p, *next;
969 unsigned int i; 967 unsigned int i;
970 968
971 for (i = 0; i < nr_swapfiles; i++) 969 for (i = 0; i < nr_swapfiles; i++)
972 if (swap_info[i].inuse_pages) 970 if (swap_info[i].inuse_pages)
973 return; 971 return;
974 spin_lock(&mmlist_lock); 972 spin_lock(&mmlist_lock);
975 list_for_each_safe(p, next, &init_mm.mmlist) 973 list_for_each_safe(p, next, &init_mm.mmlist)
976 list_del_init(p); 974 list_del_init(p);
977 spin_unlock(&mmlist_lock); 975 spin_unlock(&mmlist_lock);
978 } 976 }
979 977
980 /* 978 /*
981 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which 979 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
982 * corresponds to page offset `offset'. 980 * corresponds to page offset `offset'.
983 */ 981 */
984 sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) 982 sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
985 { 983 {
986 struct swap_extent *se = sis->curr_swap_extent; 984 struct swap_extent *se = sis->curr_swap_extent;
987 struct swap_extent *start_se = se; 985 struct swap_extent *start_se = se;
988 986
989 for ( ; ; ) { 987 for ( ; ; ) {
990 struct list_head *lh; 988 struct list_head *lh;
991 989
992 if (se->start_page <= offset && 990 if (se->start_page <= offset &&
993 offset < (se->start_page + se->nr_pages)) { 991 offset < (se->start_page + se->nr_pages)) {
994 return se->start_block + (offset - se->start_page); 992 return se->start_block + (offset - se->start_page);
995 } 993 }
996 lh = se->list.next; 994 lh = se->list.next;
997 if (lh == &sis->extent_list) 995 if (lh == &sis->extent_list)
998 lh = lh->next; 996 lh = lh->next;
999 se = list_entry(lh, struct swap_extent, list); 997 se = list_entry(lh, struct swap_extent, list);
1000 sis->curr_swap_extent = se; 998 sis->curr_swap_extent = se;
1001 BUG_ON(se == start_se); /* It *must* be present */ 999 BUG_ON(se == start_se); /* It *must* be present */
1002 } 1000 }
1003 } 1001 }
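
The lookup above is simple offset arithmetic once the right extent is found: for an extent with start_page 100, nr_pages 50 and start_block 7000, page offset 123 maps to block 7000 + (123 - 100) = 7023. A small user-space sketch of that arithmetic, without the curr_swap_extent cache or the circular-list wraparound (the numbers are made up):

#include <stdio.h>

/* Simplified swap extent: a run of pages mapped to contiguous blocks. */
struct extent {
        unsigned long start_page;
        unsigned long nr_pages;
        unsigned long start_block;
};

/* Linear-scan version of the lookup done by map_swap_page(). */
static unsigned long map_offset(const struct extent *ext, int n,
                                unsigned long offset)
{
        int i;

        for (i = 0; i < n; i++) {
                if (ext[i].start_page <= offset &&
                    offset < ext[i].start_page + ext[i].nr_pages)
                        return ext[i].start_block +
                               (offset - ext[i].start_page);
        }
        return 0;       /* not mapped: the kernel BUG()s instead */
}

int main(void)
{
        struct extent ext[] = {
                { .start_page =   0, .nr_pages = 100, .start_block = 5000 },
                { .start_page = 100, .nr_pages =  50, .start_block = 7000 },
        };

        /* page offset 123 falls in the second extent: 7000 + 23 = 7023 */
        printf("offset 123 -> block %lu\n", map_offset(ext, 2, 123));
        return 0;
}
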
1004 1002
1005 #ifdef CONFIG_HIBERNATION 1003 #ifdef CONFIG_HIBERNATION
1006 /* 1004 /*
1007 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 1005 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
1008 * corresponding to given index in swap_info (swap type). 1006 * corresponding to given index in swap_info (swap type).
1009 */ 1007 */
1010 sector_t swapdev_block(int swap_type, pgoff_t offset) 1008 sector_t swapdev_block(int swap_type, pgoff_t offset)
1011 { 1009 {
1012 struct swap_info_struct *sis; 1010 struct swap_info_struct *sis;
1013 1011
1014 if (swap_type >= nr_swapfiles) 1012 if (swap_type >= nr_swapfiles)
1015 return 0; 1013 return 0;
1016 1014
1017 sis = swap_info + swap_type; 1015 sis = swap_info + swap_type;
1018 return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; 1016 return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
1019 } 1017 }
1020 #endif /* CONFIG_HIBERNATION */ 1018 #endif /* CONFIG_HIBERNATION */
1021 1019
1022 /* 1020 /*
1023 * Free all of a swapdev's extent information 1021 * Free all of a swapdev's extent information
1024 */ 1022 */
1025 static void destroy_swap_extents(struct swap_info_struct *sis) 1023 static void destroy_swap_extents(struct swap_info_struct *sis)
1026 { 1024 {
1027 while (!list_empty(&sis->extent_list)) { 1025 while (!list_empty(&sis->extent_list)) {
1028 struct swap_extent *se; 1026 struct swap_extent *se;
1029 1027
1030 se = list_entry(sis->extent_list.next, 1028 se = list_entry(sis->extent_list.next,
1031 struct swap_extent, list); 1029 struct swap_extent, list);
1032 list_del(&se->list); 1030 list_del(&se->list);
1033 kfree(se); 1031 kfree(se);
1034 } 1032 }
1035 } 1033 }
1036 1034
1037 /* 1035 /*
1038 * Add a block range (and the corresponding page range) into this swapdev's 1036 * Add a block range (and the corresponding page range) into this swapdev's
1039 * extent list. The extent list is kept sorted in page order. 1037 * extent list. The extent list is kept sorted in page order.
1040 * 1038 *
1041 * This function rather assumes that it is called in ascending page order. 1039 * This function rather assumes that it is called in ascending page order.
1042 */ 1040 */
1043 static int 1041 static int
1044 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1042 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1045 unsigned long nr_pages, sector_t start_block) 1043 unsigned long nr_pages, sector_t start_block)
1046 { 1044 {
1047 struct swap_extent *se; 1045 struct swap_extent *se;
1048 struct swap_extent *new_se; 1046 struct swap_extent *new_se;
1049 struct list_head *lh; 1047 struct list_head *lh;
1050 1048
1051 lh = sis->extent_list.prev; /* The highest page extent */ 1049 lh = sis->extent_list.prev; /* The highest page extent */
1052 if (lh != &sis->extent_list) { 1050 if (lh != &sis->extent_list) {
1053 se = list_entry(lh, struct swap_extent, list); 1051 se = list_entry(lh, struct swap_extent, list);
1054 BUG_ON(se->start_page + se->nr_pages != start_page); 1052 BUG_ON(se->start_page + se->nr_pages != start_page);
1055 if (se->start_block + se->nr_pages == start_block) { 1053 if (se->start_block + se->nr_pages == start_block) {
1056 /* Merge it */ 1054 /* Merge it */
1057 se->nr_pages += nr_pages; 1055 se->nr_pages += nr_pages;
1058 return 0; 1056 return 0;
1059 } 1057 }
1060 } 1058 }
1061 1059
1062 /* 1060 /*
1063 * No merge. Insert a new extent, preserving ordering. 1061 * No merge. Insert a new extent, preserving ordering.
1064 */ 1062 */
1065 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 1063 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1066 if (new_se == NULL) 1064 if (new_se == NULL)
1067 return -ENOMEM; 1065 return -ENOMEM;
1068 new_se->start_page = start_page; 1066 new_se->start_page = start_page;
1069 new_se->nr_pages = nr_pages; 1067 new_se->nr_pages = nr_pages;
1070 new_se->start_block = start_block; 1068 new_se->start_block = start_block;
1071 1069
1072 list_add_tail(&new_se->list, &sis->extent_list); 1070 list_add_tail(&new_se->list, &sis->extent_list);
1073 return 1; 1071 return 1;
1074 } 1072 }
1075 1073
1076 /* 1074 /*
1077 * A `swap extent' is a simple thing which maps a contiguous range of pages 1075 * A `swap extent' is a simple thing which maps a contiguous range of pages
1078 * onto a contiguous range of disk blocks. An ordered list of swap extents 1076 * onto a contiguous range of disk blocks. An ordered list of swap extents
1079 * is built at swapon time and is then used at swap_writepage/swap_readpage 1077 * is built at swapon time and is then used at swap_writepage/swap_readpage
1080 * time for locating where on disk a page belongs. 1078 * time for locating where on disk a page belongs.
1081 * 1079 *
1082 * If the swapfile is an S_ISBLK block device, a single extent is installed. 1080 * If the swapfile is an S_ISBLK block device, a single extent is installed.
1083 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 1081 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
1084 * swap files identically. 1082 * swap files identically.
1085 * 1083 *
1086 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 1084 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
1087 * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 1085 * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
1088 * swapfiles are handled *identically* after swapon time. 1086 * swapfiles are handled *identically* after swapon time.
1089 * 1087 *
1090 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 1088 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
1091 * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If 1089 * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
1092 * some stray blocks are found which do not fall within the PAGE_SIZE alignment 1090 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
1093 * requirements, they are simply tossed out - we will never use those blocks 1091 * requirements, they are simply tossed out - we will never use those blocks
1094 * for swapping. 1092 * for swapping.
1095 * 1093 *
1096 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This 1094 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This
1097 * prevents root from shooting her foot off by ftruncating an in-use swapfile, 1095 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
1098 * which will scribble on the fs. 1096 * which will scribble on the fs.
1099 * 1097 *
1100 * The amount of disk space which a single swap extent represents varies. 1098 * The amount of disk space which a single swap extent represents varies.
1101 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 1099 * Typically it is in the 1-4 megabyte range. So we can have hundreds of
1102 * extents in the list. To avoid much list walking, we cache the previous 1100 * extents in the list. To avoid much list walking, we cache the previous
1103 * search location in `curr_swap_extent', and start new searches from there. 1101 * search location in `curr_swap_extent', and start new searches from there.
1104 * This is extremely effective. The average number of iterations in 1102 * This is extremely effective. The average number of iterations in
1105 * map_swap_page() has been measured at about 0.3 per page. - akpm. 1103 * map_swap_page() has been measured at about 0.3 per page. - akpm.
1106 */ 1104 */
1107 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1105 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1108 { 1106 {
1109 struct inode *inode; 1107 struct inode *inode;
1110 unsigned blocks_per_page; 1108 unsigned blocks_per_page;
1111 unsigned long page_no; 1109 unsigned long page_no;
1112 unsigned blkbits; 1110 unsigned blkbits;
1113 sector_t probe_block; 1111 sector_t probe_block;
1114 sector_t last_block; 1112 sector_t last_block;
1115 sector_t lowest_block = -1; 1113 sector_t lowest_block = -1;
1116 sector_t highest_block = 0; 1114 sector_t highest_block = 0;
1117 int nr_extents = 0; 1115 int nr_extents = 0;
1118 int ret; 1116 int ret;
1119 1117
1120 inode = sis->swap_file->f_mapping->host; 1118 inode = sis->swap_file->f_mapping->host;
1121 if (S_ISBLK(inode->i_mode)) { 1119 if (S_ISBLK(inode->i_mode)) {
1122 ret = add_swap_extent(sis, 0, sis->max, 0); 1120 ret = add_swap_extent(sis, 0, sis->max, 0);
1123 *span = sis->pages; 1121 *span = sis->pages;
1124 goto done; 1122 goto done;
1125 } 1123 }
1126 1124
1127 blkbits = inode->i_blkbits; 1125 blkbits = inode->i_blkbits;
1128 blocks_per_page = PAGE_SIZE >> blkbits; 1126 blocks_per_page = PAGE_SIZE >> blkbits;
1129 1127
1130 /* 1128 /*
1131 * Map all the blocks into the extent list. This code doesn't try 1129 * Map all the blocks into the extent list. This code doesn't try
1132 * to be very smart. 1130 * to be very smart.
1133 */ 1131 */
1134 probe_block = 0; 1132 probe_block = 0;
1135 page_no = 0; 1133 page_no = 0;
1136 last_block = i_size_read(inode) >> blkbits; 1134 last_block = i_size_read(inode) >> blkbits;
1137 while ((probe_block + blocks_per_page) <= last_block && 1135 while ((probe_block + blocks_per_page) <= last_block &&
1138 page_no < sis->max) { 1136 page_no < sis->max) {
1139 unsigned block_in_page; 1137 unsigned block_in_page;
1140 sector_t first_block; 1138 sector_t first_block;
1141 1139
1142 first_block = bmap(inode, probe_block); 1140 first_block = bmap(inode, probe_block);
1143 if (first_block == 0) 1141 if (first_block == 0)
1144 goto bad_bmap; 1142 goto bad_bmap;
1145 1143
1146 /* 1144 /*
1147 * It must be PAGE_SIZE aligned on-disk 1145 * It must be PAGE_SIZE aligned on-disk
1148 */ 1146 */
1149 if (first_block & (blocks_per_page - 1)) { 1147 if (first_block & (blocks_per_page - 1)) {
1150 probe_block++; 1148 probe_block++;
1151 goto reprobe; 1149 goto reprobe;
1152 } 1150 }
1153 1151
1154 for (block_in_page = 1; block_in_page < blocks_per_page; 1152 for (block_in_page = 1; block_in_page < blocks_per_page;
1155 block_in_page++) { 1153 block_in_page++) {
1156 sector_t block; 1154 sector_t block;
1157 1155
1158 block = bmap(inode, probe_block + block_in_page); 1156 block = bmap(inode, probe_block + block_in_page);
1159 if (block == 0) 1157 if (block == 0)
1160 goto bad_bmap; 1158 goto bad_bmap;
1161 if (block != first_block + block_in_page) { 1159 if (block != first_block + block_in_page) {
1162 /* Discontiguity */ 1160 /* Discontiguity */
1163 probe_block++; 1161 probe_block++;
1164 goto reprobe; 1162 goto reprobe;
1165 } 1163 }
1166 } 1164 }
1167 1165
1168 first_block >>= (PAGE_SHIFT - blkbits); 1166 first_block >>= (PAGE_SHIFT - blkbits);
1169 if (page_no) { /* exclude the header page */ 1167 if (page_no) { /* exclude the header page */
1170 if (first_block < lowest_block) 1168 if (first_block < lowest_block)
1171 lowest_block = first_block; 1169 lowest_block = first_block;
1172 if (first_block > highest_block) 1170 if (first_block > highest_block)
1173 highest_block = first_block; 1171 highest_block = first_block;
1174 } 1172 }
1175 1173
1176 /* 1174 /*
1177 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks 1175 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1178 */ 1176 */
1179 ret = add_swap_extent(sis, page_no, 1, first_block); 1177 ret = add_swap_extent(sis, page_no, 1, first_block);
1180 if (ret < 0) 1178 if (ret < 0)
1181 goto out; 1179 goto out;
1182 nr_extents += ret; 1180 nr_extents += ret;
1183 page_no++; 1181 page_no++;
1184 probe_block += blocks_per_page; 1182 probe_block += blocks_per_page;
1185 reprobe: 1183 reprobe:
1186 continue; 1184 continue;
1187 } 1185 }
1188 ret = nr_extents; 1186 ret = nr_extents;
1189 *span = 1 + highest_block - lowest_block; 1187 *span = 1 + highest_block - lowest_block;
1190 if (page_no == 0) 1188 if (page_no == 0)
1191 page_no = 1; /* force Empty message */ 1189 page_no = 1; /* force Empty message */
1192 sis->max = page_no; 1190 sis->max = page_no;
1193 sis->pages = page_no - 1; 1191 sis->pages = page_no - 1;
1194 sis->highest_bit = page_no - 1; 1192 sis->highest_bit = page_no - 1;
1195 done: 1193 done:
1196 sis->curr_swap_extent = list_entry(sis->extent_list.prev, 1194 sis->curr_swap_extent = list_entry(sis->extent_list.prev,
1197 struct swap_extent, list); 1195 struct swap_extent, list);
1198 goto out; 1196 goto out;
1199 bad_bmap: 1197 bad_bmap:
1200 printk(KERN_ERR "swapon: swapfile has holes\n"); 1198 printk(KERN_ERR "swapon: swapfile has holes\n");
1201 ret = -EINVAL; 1199 ret = -EINVAL;
1202 out: 1200 out:
1203 return ret; 1201 return ret;
1204 } 1202 }
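
To see how the probe loop and add_swap_extent() cooperate for an S_ISREG swapfile, here is a toy user-space sketch: per-page block numbers (standing in for bmap() results that already passed the alignment and contiguity checks) are fed in ascending page order, contiguous runs merge into one extent, and a discontiguity starts a new one. The block numbers below are invented.

#include <stdio.h>

struct extent {
        unsigned long start_page;
        unsigned long nr_pages;
        unsigned long start_block;
};

/* Merge rule modelled on add_swap_extent(): extend the last extent when
 * the new block continues it, otherwise start a new extent. */
static int add_extent(struct extent *ext, int n,
                      unsigned long page, unsigned long block)
{
        if (n && ext[n - 1].start_page + ext[n - 1].nr_pages == page &&
            ext[n - 1].start_block + ext[n - 1].nr_pages == block) {
                ext[n - 1].nr_pages++;          /* merge */
                return n;
        }
        ext[n].start_page = page;               /* new extent */
        ext[n].nr_pages = 1;
        ext[n].start_block = block;
        return n + 1;
}

int main(void)
{
        /* Invented per-page block numbers, a stand-in for bmap():
         * pages 0-2 are contiguous on disk, page 3 jumps elsewhere. */
        unsigned long block_of_page[] = { 200, 201, 202, 900, 901 };
        struct extent ext[8];
        int n = 0, i;

        for (i = 0; i < 5; i++)
                n = add_extent(ext, n, i, block_of_page[i]);

        for (i = 0; i < n; i++)
                printf("extent %d: pages %lu..%lu -> blocks %lu..%lu\n",
                       i, ext[i].start_page,
                       ext[i].start_page + ext[i].nr_pages - 1,
                       ext[i].start_block,
                       ext[i].start_block + ext[i].nr_pages - 1);
        return 0;
}
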
1205 1203
1206 #if 0 /* We don't need this yet */ 1204 #if 0 /* We don't need this yet */
1207 #include <linux/backing-dev.h> 1205 #include <linux/backing-dev.h>
1208 int page_queue_congested(struct page *page) 1206 int page_queue_congested(struct page *page)
1209 { 1207 {
1210 struct backing_dev_info *bdi; 1208 struct backing_dev_info *bdi;
1211 1209
1212 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ 1210 VM_BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
1213 1211
1214 if (PageSwapCache(page)) { 1212 if (PageSwapCache(page)) {
1215 swp_entry_t entry = { .val = page_private(page) }; 1213 swp_entry_t entry = { .val = page_private(page) };
1216 struct swap_info_struct *sis; 1214 struct swap_info_struct *sis;
1217 1215
1218 sis = get_swap_info_struct(swp_type(entry)); 1216 sis = get_swap_info_struct(swp_type(entry));
1219 bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; 1217 bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
1220 } else 1218 } else
1221 bdi = page->mapping->backing_dev_info; 1219 bdi = page->mapping->backing_dev_info;
1222 return bdi_write_congested(bdi); 1220 return bdi_write_congested(bdi);
1223 } 1221 }
1224 #endif 1222 #endif
1225 1223
1226 asmlinkage long sys_swapoff(const char __user * specialfile) 1224 asmlinkage long sys_swapoff(const char __user * specialfile)
1227 { 1225 {
1228 struct swap_info_struct * p = NULL; 1226 struct swap_info_struct * p = NULL;
1229 unsigned short *swap_map; 1227 unsigned short *swap_map;
1230 struct file *swap_file, *victim; 1228 struct file *swap_file, *victim;
1231 struct address_space *mapping; 1229 struct address_space *mapping;
1232 struct inode *inode; 1230 struct inode *inode;
1233 char * pathname; 1231 char * pathname;
1234 int i, type, prev; 1232 int i, type, prev;
1235 int err; 1233 int err;
1236 1234
1237 if (!capable(CAP_SYS_ADMIN)) 1235 if (!capable(CAP_SYS_ADMIN))
1238 return -EPERM; 1236 return -EPERM;
1239 1237
1240 pathname = getname(specialfile); 1238 pathname = getname(specialfile);
1241 err = PTR_ERR(pathname); 1239 err = PTR_ERR(pathname);
1242 if (IS_ERR(pathname)) 1240 if (IS_ERR(pathname))
1243 goto out; 1241 goto out;
1244 1242
1245 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); 1243 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
1246 putname(pathname); 1244 putname(pathname);
1247 err = PTR_ERR(victim); 1245 err = PTR_ERR(victim);
1248 if (IS_ERR(victim)) 1246 if (IS_ERR(victim))
1249 goto out; 1247 goto out;
1250 1248
1251 mapping = victim->f_mapping; 1249 mapping = victim->f_mapping;
1252 prev = -1; 1250 prev = -1;
1253 spin_lock(&swap_lock); 1251 spin_lock(&swap_lock);
1254 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1252 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
1255 p = swap_info + type; 1253 p = swap_info + type;
1256 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { 1254 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
1257 if (p->swap_file->f_mapping == mapping) 1255 if (p->swap_file->f_mapping == mapping)
1258 break; 1256 break;
1259 } 1257 }
1260 prev = type; 1258 prev = type;
1261 } 1259 }
1262 if (type < 0) { 1260 if (type < 0) {
1263 err = -EINVAL; 1261 err = -EINVAL;
1264 spin_unlock(&swap_lock); 1262 spin_unlock(&swap_lock);
1265 goto out_dput; 1263 goto out_dput;
1266 } 1264 }
1267 if (!security_vm_enough_memory(p->pages)) 1265 if (!security_vm_enough_memory(p->pages))
1268 vm_unacct_memory(p->pages); 1266 vm_unacct_memory(p->pages);
1269 else { 1267 else {
1270 err = -ENOMEM; 1268 err = -ENOMEM;
1271 spin_unlock(&swap_lock); 1269 spin_unlock(&swap_lock);
1272 goto out_dput; 1270 goto out_dput;
1273 } 1271 }
1274 if (prev < 0) { 1272 if (prev < 0) {
1275 swap_list.head = p->next; 1273 swap_list.head = p->next;
1276 } else { 1274 } else {
1277 swap_info[prev].next = p->next; 1275 swap_info[prev].next = p->next;
1278 } 1276 }
1279 if (type == swap_list.next) { 1277 if (type == swap_list.next) {
1280 /* just pick something that's safe... */ 1278 /* just pick something that's safe... */
1281 swap_list.next = swap_list.head; 1279 swap_list.next = swap_list.head;
1282 } 1280 }
1283 if (p->prio < 0) { 1281 if (p->prio < 0) {
1284 for (i = p->next; i >= 0; i = swap_info[i].next) 1282 for (i = p->next; i >= 0; i = swap_info[i].next)
1285 swap_info[i].prio = p->prio--; 1283 swap_info[i].prio = p->prio--;
1286 least_priority++; 1284 least_priority++;
1287 } 1285 }
1288 nr_swap_pages -= p->pages; 1286 nr_swap_pages -= p->pages;
1289 total_swap_pages -= p->pages; 1287 total_swap_pages -= p->pages;
1290 p->flags &= ~SWP_WRITEOK; 1288 p->flags &= ~SWP_WRITEOK;
1291 spin_unlock(&swap_lock); 1289 spin_unlock(&swap_lock);
1292 1290
1293 current->flags |= PF_SWAPOFF; 1291 current->flags |= PF_SWAPOFF;
1294 err = try_to_unuse(type); 1292 err = try_to_unuse(type);
1295 current->flags &= ~PF_SWAPOFF; 1293 current->flags &= ~PF_SWAPOFF;
1296 1294
1297 if (err) { 1295 if (err) {
1298 /* re-insert swap space back into swap_list */ 1296 /* re-insert swap space back into swap_list */
1299 spin_lock(&swap_lock); 1297 spin_lock(&swap_lock);
1300 if (p->prio < 0) 1298 if (p->prio < 0)
1301 p->prio = --least_priority; 1299 p->prio = --least_priority;
1302 prev = -1; 1300 prev = -1;
1303 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 1301 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1304 if (p->prio >= swap_info[i].prio) 1302 if (p->prio >= swap_info[i].prio)
1305 break; 1303 break;
1306 prev = i; 1304 prev = i;
1307 } 1305 }
1308 p->next = i; 1306 p->next = i;
1309 if (prev < 0) 1307 if (prev < 0)
1310 swap_list.head = swap_list.next = p - swap_info; 1308 swap_list.head = swap_list.next = p - swap_info;
1311 else 1309 else
1312 swap_info[prev].next = p - swap_info; 1310 swap_info[prev].next = p - swap_info;
1313 nr_swap_pages += p->pages; 1311 nr_swap_pages += p->pages;
1314 total_swap_pages += p->pages; 1312 total_swap_pages += p->pages;
1315 p->flags |= SWP_WRITEOK; 1313 p->flags |= SWP_WRITEOK;
1316 spin_unlock(&swap_lock); 1314 spin_unlock(&swap_lock);
1317 goto out_dput; 1315 goto out_dput;
1318 } 1316 }
1319 1317
1320 /* wait for any unplug function to finish */ 1318 /* wait for any unplug function to finish */
1321 down_write(&swap_unplug_sem); 1319 down_write(&swap_unplug_sem);
1322 up_write(&swap_unplug_sem); 1320 up_write(&swap_unplug_sem);
1323 1321
1324 destroy_swap_extents(p); 1322 destroy_swap_extents(p);
1325 mutex_lock(&swapon_mutex); 1323 mutex_lock(&swapon_mutex);
1326 spin_lock(&swap_lock); 1324 spin_lock(&swap_lock);
1327 drain_mmlist(); 1325 drain_mmlist();
1328 1326
1329 /* wait for anyone still in scan_swap_map */ 1327 /* wait for anyone still in scan_swap_map */
1330 p->highest_bit = 0; /* cuts scans short */ 1328 p->highest_bit = 0; /* cuts scans short */
1331 while (p->flags >= SWP_SCANNING) { 1329 while (p->flags >= SWP_SCANNING) {
1332 spin_unlock(&swap_lock); 1330 spin_unlock(&swap_lock);
1333 schedule_timeout_uninterruptible(1); 1331 schedule_timeout_uninterruptible(1);
1334 spin_lock(&swap_lock); 1332 spin_lock(&swap_lock);
1335 } 1333 }
1336 1334
1337 swap_file = p->swap_file; 1335 swap_file = p->swap_file;
1338 p->swap_file = NULL; 1336 p->swap_file = NULL;
1339 p->max = 0; 1337 p->max = 0;
1340 swap_map = p->swap_map; 1338 swap_map = p->swap_map;
1341 p->swap_map = NULL; 1339 p->swap_map = NULL;
1342 p->flags = 0; 1340 p->flags = 0;
1343 spin_unlock(&swap_lock); 1341 spin_unlock(&swap_lock);
1344 mutex_unlock(&swapon_mutex); 1342 mutex_unlock(&swapon_mutex);
1345 vfree(swap_map); 1343 vfree(swap_map);
1346 inode = mapping->host; 1344 inode = mapping->host;
1347 if (S_ISBLK(inode->i_mode)) { 1345 if (S_ISBLK(inode->i_mode)) {
1348 struct block_device *bdev = I_BDEV(inode); 1346 struct block_device *bdev = I_BDEV(inode);
1349 set_blocksize(bdev, p->old_block_size); 1347 set_blocksize(bdev, p->old_block_size);
1350 bd_release(bdev); 1348 bd_release(bdev);
1351 } else { 1349 } else {
1352 mutex_lock(&inode->i_mutex); 1350 mutex_lock(&inode->i_mutex);
1353 inode->i_flags &= ~S_SWAPFILE; 1351 inode->i_flags &= ~S_SWAPFILE;
1354 mutex_unlock(&inode->i_mutex); 1352 mutex_unlock(&inode->i_mutex);
1355 } 1353 }
1356 filp_close(swap_file, NULL); 1354 filp_close(swap_file, NULL);
1357 err = 0; 1355 err = 0;
1358 1356
1359 out_dput: 1357 out_dput:
1360 filp_close(victim, NULL); 1358 filp_close(victim, NULL);
1361 out: 1359 out:
1362 return err; 1360 return err;
1363 } 1361 }
1364 1362
1365 #ifdef CONFIG_PROC_FS 1363 #ifdef CONFIG_PROC_FS
1366 /* iterator */ 1364 /* iterator */
1367 static void *swap_start(struct seq_file *swap, loff_t *pos) 1365 static void *swap_start(struct seq_file *swap, loff_t *pos)
1368 { 1366 {
1369 struct swap_info_struct *ptr = swap_info; 1367 struct swap_info_struct *ptr = swap_info;
1370 int i; 1368 int i;
1371 loff_t l = *pos; 1369 loff_t l = *pos;
1372 1370
1373 mutex_lock(&swapon_mutex); 1371 mutex_lock(&swapon_mutex);
1374 1372
1375 if (!l) 1373 if (!l)
1376 return SEQ_START_TOKEN; 1374 return SEQ_START_TOKEN;
1377 1375
1378 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1376 for (i = 0; i < nr_swapfiles; i++, ptr++) {
1379 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1377 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
1380 continue; 1378 continue;
1381 if (!--l) 1379 if (!--l)
1382 return ptr; 1380 return ptr;
1383 } 1381 }
1384 1382
1385 return NULL; 1383 return NULL;
1386 } 1384 }
1387 1385
1388 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 1386 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1389 { 1387 {
1390 struct swap_info_struct *ptr; 1388 struct swap_info_struct *ptr;
1391 struct swap_info_struct *endptr = swap_info + nr_swapfiles; 1389 struct swap_info_struct *endptr = swap_info + nr_swapfiles;
1392 1390
1393 if (v == SEQ_START_TOKEN) 1391 if (v == SEQ_START_TOKEN)
1394 ptr = swap_info; 1392 ptr = swap_info;
1395 else { 1393 else {
1396 ptr = v; 1394 ptr = v;
1397 ptr++; 1395 ptr++;
1398 } 1396 }
1399 1397
1400 for (; ptr < endptr; ptr++) { 1398 for (; ptr < endptr; ptr++) {
1401 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1399 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
1402 continue; 1400 continue;
1403 ++*pos; 1401 ++*pos;
1404 return ptr; 1402 return ptr;
1405 } 1403 }
1406 1404
1407 return NULL; 1405 return NULL;
1408 } 1406 }
1409 1407
1410 static void swap_stop(struct seq_file *swap, void *v) 1408 static void swap_stop(struct seq_file *swap, void *v)
1411 { 1409 {
1412 mutex_unlock(&swapon_mutex); 1410 mutex_unlock(&swapon_mutex);
1413 } 1411 }
1414 1412
1415 static int swap_show(struct seq_file *swap, void *v) 1413 static int swap_show(struct seq_file *swap, void *v)
1416 { 1414 {
1417 struct swap_info_struct *ptr = v; 1415 struct swap_info_struct *ptr = v;
1418 struct file *file; 1416 struct file *file;
1419 int len; 1417 int len;
1420 1418
1421 if (ptr == SEQ_START_TOKEN) { 1419 if (ptr == SEQ_START_TOKEN) {
1422 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1420 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1423 return 0; 1421 return 0;
1424 } 1422 }
1425 1423
1426 file = ptr->swap_file; 1424 file = ptr->swap_file;
1427 len = seq_path(swap, &file->f_path, " \t\n\\"); 1425 len = seq_path(swap, &file->f_path, " \t\n\\");
1428 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1426 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1429 len < 40 ? 40 - len : 1, " ", 1427 len < 40 ? 40 - len : 1, " ",
1430 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1428 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1431 "partition" : "file\t", 1429 "partition" : "file\t",
1432 ptr->pages << (PAGE_SHIFT - 10), 1430 ptr->pages << (PAGE_SHIFT - 10),
1433 ptr->inuse_pages << (PAGE_SHIFT - 10), 1431 ptr->inuse_pages << (PAGE_SHIFT - 10),
1434 ptr->prio); 1432 ptr->prio);
1435 return 0; 1433 return 0;
1436 } 1434 }
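
With this show routine, reading /proc/swaps produces output of roughly the following shape; the device names, sizes (in kilobytes) and priorities below are made-up examples:

Filename                                Type            Size    Used    Priority
/dev/sda2                               partition       1048572 2612    -1
/var/swapfile                           file            262140  0       -2
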
1437 1435
1438 static const struct seq_operations swaps_op = { 1436 static const struct seq_operations swaps_op = {
1439 .start = swap_start, 1437 .start = swap_start,
1440 .next = swap_next, 1438 .next = swap_next,
1441 .stop = swap_stop, 1439 .stop = swap_stop,
1442 .show = swap_show 1440 .show = swap_show
1443 }; 1441 };
1444 1442
1445 static int swaps_open(struct inode *inode, struct file *file) 1443 static int swaps_open(struct inode *inode, struct file *file)
1446 { 1444 {
1447 return seq_open(file, &swaps_op); 1445 return seq_open(file, &swaps_op);
1448 } 1446 }
1449 1447
1450 static const struct file_operations proc_swaps_operations = { 1448 static const struct file_operations proc_swaps_operations = {
1451 .open = swaps_open, 1449 .open = swaps_open,
1452 .read = seq_read, 1450 .read = seq_read,
1453 .llseek = seq_lseek, 1451 .llseek = seq_lseek,
1454 .release = seq_release, 1452 .release = seq_release,
1455 }; 1453 };
1456 1454
1457 static int __init procswaps_init(void) 1455 static int __init procswaps_init(void)
1458 { 1456 {
1459 proc_create("swaps", 0, NULL, &proc_swaps_operations); 1457 proc_create("swaps", 0, NULL, &proc_swaps_operations);
1460 return 0; 1458 return 0;
1461 } 1459 }
1462 __initcall(procswaps_init); 1460 __initcall(procswaps_init);
1463 #endif /* CONFIG_PROC_FS */ 1461 #endif /* CONFIG_PROC_FS */
1464 1462
1465 #ifdef MAX_SWAPFILES_CHECK 1463 #ifdef MAX_SWAPFILES_CHECK
1466 static int __init max_swapfiles_check(void) 1464 static int __init max_swapfiles_check(void)
1467 { 1465 {
1468 MAX_SWAPFILES_CHECK(); 1466 MAX_SWAPFILES_CHECK();
1469 return 0; 1467 return 0;
1470 } 1468 }
1471 late_initcall(max_swapfiles_check); 1469 late_initcall(max_swapfiles_check);
1472 #endif 1470 #endif
1473 1471
1474 /* 1472 /*
1475 * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 1473 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1476 * 1474 *
1477 * The swapon system call 1475 * The swapon system call
1478 */ 1476 */
1479 asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) 1477 asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1480 { 1478 {
1481 struct swap_info_struct * p; 1479 struct swap_info_struct * p;
1482 char *name = NULL; 1480 char *name = NULL;
1483 struct block_device *bdev = NULL; 1481 struct block_device *bdev = NULL;
1484 struct file *swap_file = NULL; 1482 struct file *swap_file = NULL;
1485 struct address_space *mapping; 1483 struct address_space *mapping;
1486 unsigned int type; 1484 unsigned int type;
1487 int i, prev; 1485 int i, prev;
1488 int error; 1486 int error;
1489 union swap_header *swap_header = NULL; 1487 union swap_header *swap_header = NULL;
1490 int swap_header_version; 1488 int swap_header_version;
1491 unsigned int nr_good_pages = 0; 1489 unsigned int nr_good_pages = 0;
1492 int nr_extents = 0; 1490 int nr_extents = 0;
1493 sector_t span; 1491 sector_t span;
1494 unsigned long maxpages = 1; 1492 unsigned long maxpages = 1;
1495 int swapfilesize; 1493 int swapfilesize;
1496 unsigned short *swap_map = NULL; 1494 unsigned short *swap_map = NULL;
1497 struct page *page = NULL; 1495 struct page *page = NULL;
1498 struct inode *inode = NULL; 1496 struct inode *inode = NULL;
1499 int did_down = 0; 1497 int did_down = 0;
1500 1498
1501 if (!capable(CAP_SYS_ADMIN)) 1499 if (!capable(CAP_SYS_ADMIN))
1502 return -EPERM; 1500 return -EPERM;
1503 spin_lock(&swap_lock); 1501 spin_lock(&swap_lock);
1504 p = swap_info; 1502 p = swap_info;
1505 for (type = 0 ; type < nr_swapfiles ; type++,p++) 1503 for (type = 0 ; type < nr_swapfiles ; type++,p++)
1506 if (!(p->flags & SWP_USED)) 1504 if (!(p->flags & SWP_USED))
1507 break; 1505 break;
1508 error = -EPERM; 1506 error = -EPERM;
1509 if (type >= MAX_SWAPFILES) { 1507 if (type >= MAX_SWAPFILES) {
1510 spin_unlock(&swap_lock); 1508 spin_unlock(&swap_lock);
1511 goto out; 1509 goto out;
1512 } 1510 }
1513 if (type >= nr_swapfiles) 1511 if (type >= nr_swapfiles)
1514 nr_swapfiles = type+1; 1512 nr_swapfiles = type+1;
1515 memset(p, 0, sizeof(*p)); 1513 memset(p, 0, sizeof(*p));
1516 INIT_LIST_HEAD(&p->extent_list); 1514 INIT_LIST_HEAD(&p->extent_list);
1517 p->flags = SWP_USED; 1515 p->flags = SWP_USED;
1518 p->next = -1; 1516 p->next = -1;
1519 spin_unlock(&swap_lock); 1517 spin_unlock(&swap_lock);
1520 name = getname(specialfile); 1518 name = getname(specialfile);
1521 error = PTR_ERR(name); 1519 error = PTR_ERR(name);
1522 if (IS_ERR(name)) { 1520 if (IS_ERR(name)) {
1523 name = NULL; 1521 name = NULL;
1524 goto bad_swap_2; 1522 goto bad_swap_2;
1525 } 1523 }
1526 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); 1524 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
1527 error = PTR_ERR(swap_file); 1525 error = PTR_ERR(swap_file);
1528 if (IS_ERR(swap_file)) { 1526 if (IS_ERR(swap_file)) {
1529 swap_file = NULL; 1527 swap_file = NULL;
1530 goto bad_swap_2; 1528 goto bad_swap_2;
1531 } 1529 }
1532 1530
1533 p->swap_file = swap_file; 1531 p->swap_file = swap_file;
1534 mapping = swap_file->f_mapping; 1532 mapping = swap_file->f_mapping;
1535 inode = mapping->host; 1533 inode = mapping->host;
1536 1534
1537 error = -EBUSY; 1535 error = -EBUSY;
1538 for (i = 0; i < nr_swapfiles; i++) { 1536 for (i = 0; i < nr_swapfiles; i++) {
1539 struct swap_info_struct *q = &swap_info[i]; 1537 struct swap_info_struct *q = &swap_info[i];
1540 1538
1541 if (i == type || !q->swap_file) 1539 if (i == type || !q->swap_file)
1542 continue; 1540 continue;
1543 if (mapping == q->swap_file->f_mapping) 1541 if (mapping == q->swap_file->f_mapping)
1544 goto bad_swap; 1542 goto bad_swap;
1545 } 1543 }
1546 1544
1547 error = -EINVAL; 1545 error = -EINVAL;
1548 if (S_ISBLK(inode->i_mode)) { 1546 if (S_ISBLK(inode->i_mode)) {
1549 bdev = I_BDEV(inode); 1547 bdev = I_BDEV(inode);
1550 error = bd_claim(bdev, sys_swapon); 1548 error = bd_claim(bdev, sys_swapon);
1551 if (error < 0) { 1549 if (error < 0) {
1552 bdev = NULL; 1550 bdev = NULL;
1553 error = -EINVAL; 1551 error = -EINVAL;
1554 goto bad_swap; 1552 goto bad_swap;
1555 } 1553 }
1556 p->old_block_size = block_size(bdev); 1554 p->old_block_size = block_size(bdev);
1557 error = set_blocksize(bdev, PAGE_SIZE); 1555 error = set_blocksize(bdev, PAGE_SIZE);
1558 if (error < 0) 1556 if (error < 0)
1559 goto bad_swap; 1557 goto bad_swap;
1560 p->bdev = bdev; 1558 p->bdev = bdev;
1561 } else if (S_ISREG(inode->i_mode)) { 1559 } else if (S_ISREG(inode->i_mode)) {
1562 p->bdev = inode->i_sb->s_bdev; 1560 p->bdev = inode->i_sb->s_bdev;
1563 mutex_lock(&inode->i_mutex); 1561 mutex_lock(&inode->i_mutex);
1564 did_down = 1; 1562 did_down = 1;
1565 if (IS_SWAPFILE(inode)) { 1563 if (IS_SWAPFILE(inode)) {
1566 error = -EBUSY; 1564 error = -EBUSY;
1567 goto bad_swap; 1565 goto bad_swap;
1568 } 1566 }
1569 } else { 1567 } else {
1570 goto bad_swap; 1568 goto bad_swap;
1571 } 1569 }
1572 1570
1573 swapfilesize = i_size_read(inode) >> PAGE_SHIFT; 1571 swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
1574 1572
1575 /* 1573 /*
1576 * Read the swap header. 1574 * Read the swap header.
1577 */ 1575 */
1578 if (!mapping->a_ops->readpage) { 1576 if (!mapping->a_ops->readpage) {
1579 error = -EINVAL; 1577 error = -EINVAL;
1580 goto bad_swap; 1578 goto bad_swap;
1581 } 1579 }
1582 page = read_mapping_page(mapping, 0, swap_file); 1580 page = read_mapping_page(mapping, 0, swap_file);
1583 if (IS_ERR(page)) { 1581 if (IS_ERR(page)) {
1584 error = PTR_ERR(page); 1582 error = PTR_ERR(page);
1585 goto bad_swap; 1583 goto bad_swap;
1586 } 1584 }
1587 kmap(page); 1585 kmap(page);
1588 swap_header = page_address(page); 1586 swap_header = page_address(page);
1589 1587
1590 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) 1588 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
1591 swap_header_version = 1; 1589 swap_header_version = 1;
1592 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) 1590 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1593 swap_header_version = 2; 1591 swap_header_version = 2;
1594 else { 1592 else {
1595 printk(KERN_ERR "Unable to find swap-space signature\n"); 1593 printk(KERN_ERR "Unable to find swap-space signature\n");
1596 error = -EINVAL; 1594 error = -EINVAL;
1597 goto bad_swap; 1595 goto bad_swap;
1598 } 1596 }
1599 1597
1600 switch (swap_header_version) { 1598 switch (swap_header_version) {
1601 case 1: 1599 case 1:
1602 printk(KERN_ERR "version 0 swap is no longer supported. " 1600 printk(KERN_ERR "version 0 swap is no longer supported. "
1603 "Use mkswap -v1 %s\n", name); 1601 "Use mkswap -v1 %s\n", name);
1604 error = -EINVAL; 1602 error = -EINVAL;
1605 goto bad_swap; 1603 goto bad_swap;
1606 case 2: 1604 case 2:
1607 		/* swap partition endianness hack... */ 1605 		/* swap partition endianness hack... */
1608 if (swab32(swap_header->info.version) == 1) { 1606 if (swab32(swap_header->info.version) == 1) {
1609 swab32s(&swap_header->info.version); 1607 swab32s(&swap_header->info.version);
1610 swab32s(&swap_header->info.last_page); 1608 swab32s(&swap_header->info.last_page);
1611 swab32s(&swap_header->info.nr_badpages); 1609 swab32s(&swap_header->info.nr_badpages);
1612 for (i = 0; i < swap_header->info.nr_badpages; i++) 1610 for (i = 0; i < swap_header->info.nr_badpages; i++)
1613 swab32s(&swap_header->info.badpages[i]); 1611 swab32s(&swap_header->info.badpages[i]);
1614 } 1612 }
1615 /* Check the swap header's sub-version and the size of 1613 /* Check the swap header's sub-version and the size of
1616 the swap file and bad block lists */ 1614 the swap file and bad block lists */
1617 if (swap_header->info.version != 1) { 1615 if (swap_header->info.version != 1) {
1618 printk(KERN_WARNING 1616 printk(KERN_WARNING
1619 "Unable to handle swap header version %d\n", 1617 "Unable to handle swap header version %d\n",
1620 swap_header->info.version); 1618 swap_header->info.version);
1621 error = -EINVAL; 1619 error = -EINVAL;
1622 goto bad_swap; 1620 goto bad_swap;
1623 } 1621 }
1624 1622
1625 p->lowest_bit = 1; 1623 p->lowest_bit = 1;
1626 p->cluster_next = 1; 1624 p->cluster_next = 1;
1627 1625
1628 /* 1626 /*
1629 * Find out how many pages are allowed for a single swap 1627 * Find out how many pages are allowed for a single swap
1630 * device. There are two limiting factors: 1) the number of 1628 * device. There are two limiting factors: 1) the number of
1631 * bits for the swap offset in the swp_entry_t type and 1629 * bits for the swap offset in the swp_entry_t type and
1632 		 * 2) the number of bits in a swap pte as defined by 1630 		 * 2) the number of bits in a swap pte as defined by
1633 * the different architectures. In order to find the 1631 * the different architectures. In order to find the
1634 * largest possible bit mask a swap entry with swap type 0 1632 * largest possible bit mask a swap entry with swap type 0
1635 * and swap offset ~0UL is created, encoded to a swap pte, 1633 * and swap offset ~0UL is created, encoded to a swap pte,
1636 * decoded to a swp_entry_t again and finally the swap 1634 * decoded to a swp_entry_t again and finally the swap
1637 * offset is extracted. This will mask all the bits from 1635 * offset is extracted. This will mask all the bits from
1638 * the initial ~0UL mask that can't be encoded in either 1636 * the initial ~0UL mask that can't be encoded in either
1639 * the swp_entry_t or the architecture definition of a 1637 * the swp_entry_t or the architecture definition of a
1640 * swap pte. 1638 * swap pte.
1641 */ 1639 */
1642 maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; 1640 maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
1643 if (maxpages > swap_header->info.last_page) 1641 if (maxpages > swap_header->info.last_page)
1644 maxpages = swap_header->info.last_page; 1642 maxpages = swap_header->info.last_page;
1645 p->highest_bit = maxpages - 1; 1643 p->highest_bit = maxpages - 1;
1646 1644
1647 error = -EINVAL; 1645 error = -EINVAL;
1648 if (!maxpages) 1646 if (!maxpages)
1649 goto bad_swap; 1647 goto bad_swap;
1650 if (swapfilesize && maxpages > swapfilesize) { 1648 if (swapfilesize && maxpages > swapfilesize) {
1651 printk(KERN_WARNING 1649 printk(KERN_WARNING
1652 "Swap area shorter than signature indicates\n"); 1650 "Swap area shorter than signature indicates\n");
1653 goto bad_swap; 1651 goto bad_swap;
1654 } 1652 }
1655 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1653 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1656 goto bad_swap; 1654 goto bad_swap;
1657 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1655 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1658 goto bad_swap; 1656 goto bad_swap;
1659 1657
1660 /* OK, set up the swap map and apply the bad block list */ 1658 /* OK, set up the swap map and apply the bad block list */
1661 swap_map = vmalloc(maxpages * sizeof(short)); 1659 swap_map = vmalloc(maxpages * sizeof(short));
1662 if (!swap_map) { 1660 if (!swap_map) {
1663 error = -ENOMEM; 1661 error = -ENOMEM;
1664 goto bad_swap; 1662 goto bad_swap;
1665 } 1663 }
1666 1664
1667 error = 0; 1665 error = 0;
1668 memset(swap_map, 0, maxpages * sizeof(short)); 1666 memset(swap_map, 0, maxpages * sizeof(short));
1669 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1667 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1670 int page_nr = swap_header->info.badpages[i]; 1668 int page_nr = swap_header->info.badpages[i];
1671 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1669 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1672 error = -EINVAL; 1670 error = -EINVAL;
1673 else 1671 else
1674 swap_map[page_nr] = SWAP_MAP_BAD; 1672 swap_map[page_nr] = SWAP_MAP_BAD;
1675 } 1673 }
1676 nr_good_pages = swap_header->info.last_page - 1674 nr_good_pages = swap_header->info.last_page -
1677 swap_header->info.nr_badpages - 1675 swap_header->info.nr_badpages -
1678 1 /* header page */; 1676 1 /* header page */;
1679 if (error) 1677 if (error)
1680 goto bad_swap; 1678 goto bad_swap;
1681 } 1679 }
1682 1680
1683 if (nr_good_pages) { 1681 if (nr_good_pages) {
1684 swap_map[0] = SWAP_MAP_BAD; 1682 swap_map[0] = SWAP_MAP_BAD;
1685 p->max = maxpages; 1683 p->max = maxpages;
1686 p->pages = nr_good_pages; 1684 p->pages = nr_good_pages;
1687 nr_extents = setup_swap_extents(p, &span); 1685 nr_extents = setup_swap_extents(p, &span);
1688 if (nr_extents < 0) { 1686 if (nr_extents < 0) {
1689 error = nr_extents; 1687 error = nr_extents;
1690 goto bad_swap; 1688 goto bad_swap;
1691 } 1689 }
1692 nr_good_pages = p->pages; 1690 nr_good_pages = p->pages;
1693 } 1691 }
1694 if (!nr_good_pages) { 1692 if (!nr_good_pages) {
1695 printk(KERN_WARNING "Empty swap-file\n"); 1693 printk(KERN_WARNING "Empty swap-file\n");
1696 error = -EINVAL; 1694 error = -EINVAL;
1697 goto bad_swap; 1695 goto bad_swap;
1698 } 1696 }
1699 1697
1700 mutex_lock(&swapon_mutex); 1698 mutex_lock(&swapon_mutex);
1701 spin_lock(&swap_lock); 1699 spin_lock(&swap_lock);
1702 if (swap_flags & SWAP_FLAG_PREFER) 1700 if (swap_flags & SWAP_FLAG_PREFER)
1703 p->prio = 1701 p->prio =
1704 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 1702 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
1705 else 1703 else
1706 p->prio = --least_priority; 1704 p->prio = --least_priority;
1707 p->swap_map = swap_map; 1705 p->swap_map = swap_map;
1708 p->flags = SWP_ACTIVE; 1706 p->flags = SWP_ACTIVE;
1709 nr_swap_pages += nr_good_pages; 1707 nr_swap_pages += nr_good_pages;
1710 total_swap_pages += nr_good_pages; 1708 total_swap_pages += nr_good_pages;
1711 1709
1712 printk(KERN_INFO "Adding %uk swap on %s. " 1710 printk(KERN_INFO "Adding %uk swap on %s. "
1713 "Priority:%d extents:%d across:%lluk\n", 1711 "Priority:%d extents:%d across:%lluk\n",
1714 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 1712 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
1715 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); 1713 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10));
1716 1714
1717 /* insert swap space into swap_list: */ 1715 /* insert swap space into swap_list: */
1718 prev = -1; 1716 prev = -1;
1719 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 1717 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1720 if (p->prio >= swap_info[i].prio) { 1718 if (p->prio >= swap_info[i].prio) {
1721 break; 1719 break;
1722 } 1720 }
1723 prev = i; 1721 prev = i;
1724 } 1722 }
1725 p->next = i; 1723 p->next = i;
1726 if (prev < 0) { 1724 if (prev < 0) {
1727 swap_list.head = swap_list.next = p - swap_info; 1725 swap_list.head = swap_list.next = p - swap_info;
1728 } else { 1726 } else {
1729 swap_info[prev].next = p - swap_info; 1727 swap_info[prev].next = p - swap_info;
1730 } 1728 }
1731 spin_unlock(&swap_lock); 1729 spin_unlock(&swap_lock);
1732 mutex_unlock(&swapon_mutex); 1730 mutex_unlock(&swapon_mutex);
1733 error = 0; 1731 error = 0;
1734 goto out; 1732 goto out;
1735 bad_swap: 1733 bad_swap:
1736 if (bdev) { 1734 if (bdev) {
1737 set_blocksize(bdev, p->old_block_size); 1735 set_blocksize(bdev, p->old_block_size);
1738 bd_release(bdev); 1736 bd_release(bdev);
1739 } 1737 }
1740 destroy_swap_extents(p); 1738 destroy_swap_extents(p);
1741 bad_swap_2: 1739 bad_swap_2:
1742 spin_lock(&swap_lock); 1740 spin_lock(&swap_lock);
1743 p->swap_file = NULL; 1741 p->swap_file = NULL;
1744 p->flags = 0; 1742 p->flags = 0;
1745 spin_unlock(&swap_lock); 1743 spin_unlock(&swap_lock);
1746 vfree(swap_map); 1744 vfree(swap_map);
1747 if (swap_file) 1745 if (swap_file)
1748 filp_close(swap_file, NULL); 1746 filp_close(swap_file, NULL);
1749 out: 1747 out:
1750 if (page && !IS_ERR(page)) { 1748 if (page && !IS_ERR(page)) {
1751 kunmap(page); 1749 kunmap(page);
1752 page_cache_release(page); 1750 page_cache_release(page);
1753 } 1751 }
1754 if (name) 1752 if (name)
1755 putname(name); 1753 putname(name);
1756 if (did_down) { 1754 if (did_down) {
1757 if (!error) 1755 if (!error)
1758 inode->i_flags |= S_SWAPFILE; 1756 inode->i_flags |= S_SWAPFILE;
1759 mutex_unlock(&inode->i_mutex); 1757 mutex_unlock(&inode->i_mutex);
1760 } 1758 }
1761 return error; 1759 return error;
1762 } 1760 }
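
The priority handling above is driven entirely by the swap_flags word passed in from userspace: when SWAP_FLAG_PREFER is set, sys_swapon() extracts the priority from SWAP_FLAG_PRIO_MASK, otherwise it falls back to --least_priority. A minimal userspace sketch of invoking the syscall through the glibc wrapper follows; the device path and priority value are illustrative only, not taken from this commit.

/* Hedged sketch: requesting an explicit swap priority from userspace.
 * Requires root; path and priority are made-up example values. */
#include <stdio.h>
#include <sys/swap.h>	/* swapon(), SWAP_FLAG_PREFER, SWAP_FLAG_PRIO_MASK, SWAP_FLAG_PRIO_SHIFT */

int main(void)
{
	int prio = 5;	/* desired swap priority */
	int flags = SWAP_FLAG_PREFER |
		    ((prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

	/* sys_swapon() above reads prio out of these bits only when
	 * SWAP_FLAG_PREFER is present. */
	if (swapon("/dev/sdb2", flags) != 0) {
		perror("swapon");
		return 1;
	}
	return 0;
}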
1763 1761
1764 void si_swapinfo(struct sysinfo *val) 1762 void si_swapinfo(struct sysinfo *val)
1765 { 1763 {
1766 unsigned int i; 1764 unsigned int i;
1767 unsigned long nr_to_be_unused = 0; 1765 unsigned long nr_to_be_unused = 0;
1768 1766
1769 spin_lock(&swap_lock); 1767 spin_lock(&swap_lock);
1770 for (i = 0; i < nr_swapfiles; i++) { 1768 for (i = 0; i < nr_swapfiles; i++) {
1771 if (!(swap_info[i].flags & SWP_USED) || 1769 if (!(swap_info[i].flags & SWP_USED) ||
1772 (swap_info[i].flags & SWP_WRITEOK)) 1770 (swap_info[i].flags & SWP_WRITEOK))
1773 continue; 1771 continue;
1774 nr_to_be_unused += swap_info[i].inuse_pages; 1772 nr_to_be_unused += swap_info[i].inuse_pages;
1775 } 1773 }
1776 val->freeswap = nr_swap_pages + nr_to_be_unused; 1774 val->freeswap = nr_swap_pages + nr_to_be_unused;
1777 val->totalswap = total_swap_pages + nr_to_be_unused; 1775 val->totalswap = total_swap_pages + nr_to_be_unused;
1778 spin_unlock(&swap_lock); 1776 spin_unlock(&swap_lock);
1779 } 1777 }
1780 1778
1781 /* 1779 /*
1782 * Verify that a swap entry is valid and increment its swap map count. 1780 * Verify that a swap entry is valid and increment its swap map count.
1783 * 1781 *
1784 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 1782 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
1785 * "permanent", but will be reclaimed by the next swapoff. 1783 * "permanent", but will be reclaimed by the next swapoff.
1786 */ 1784 */
1787 int swap_duplicate(swp_entry_t entry) 1785 int swap_duplicate(swp_entry_t entry)
1788 { 1786 {
1789 struct swap_info_struct * p; 1787 struct swap_info_struct * p;
1790 unsigned long offset, type; 1788 unsigned long offset, type;
1791 int result = 0; 1789 int result = 0;
1792 1790
1793 if (is_migration_entry(entry)) 1791 if (is_migration_entry(entry))
1794 return 1; 1792 return 1;
1795 1793
1796 type = swp_type(entry); 1794 type = swp_type(entry);
1797 if (type >= nr_swapfiles) 1795 if (type >= nr_swapfiles)
1798 goto bad_file; 1796 goto bad_file;
1799 p = type + swap_info; 1797 p = type + swap_info;
1800 offset = swp_offset(entry); 1798 offset = swp_offset(entry);
1801 1799
1802 spin_lock(&swap_lock); 1800 spin_lock(&swap_lock);
1803 if (offset < p->max && p->swap_map[offset]) { 1801 if (offset < p->max && p->swap_map[offset]) {
1804 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 1802 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
1805 p->swap_map[offset]++; 1803 p->swap_map[offset]++;
1806 result = 1; 1804 result = 1;
1807 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { 1805 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
1808 if (swap_overflow++ < 5) 1806 if (swap_overflow++ < 5)
1809 printk(KERN_WARNING "swap_dup: swap entry overflow\n"); 1807 printk(KERN_WARNING "swap_dup: swap entry overflow\n");
1810 p->swap_map[offset] = SWAP_MAP_MAX; 1808 p->swap_map[offset] = SWAP_MAP_MAX;
1811 result = 1; 1809 result = 1;
1812 } 1810 }
1813 } 1811 }
1814 spin_unlock(&swap_lock); 1812 spin_unlock(&swap_lock);
1815 out: 1813 out:
1816 return result; 1814 return result;
1817 1815
1818 bad_file: 1816 bad_file:
1819 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 1817 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1820 goto out; 1818 goto out;
1821 } 1819 }
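
Each swap_map slot is a small per-entry use count: swap_duplicate() bumps it until it saturates at SWAP_MAP_MAX (after which the entry is treated as "permanent" until the next swapoff), and SWAP_MAP_BAD marks slots that must never be used. A rough userspace model of that saturation rule is sketched below; the constant values are assumptions for illustration, not taken from the kernel headers in this tree.

/* Hedged model of the swap_map counting rule; the constants are
 * assumed values, not the kernel's definitions. */
#include <stdio.h>

#define SWAP_MAP_MAX	0x7fff	/* assumed saturation value */
#define SWAP_MAP_BAD	0x8000	/* assumed bad-slot marker */

/* Mimics the increment done by swap_duplicate(): returns 1 on success,
 * 0 for a free or bad slot. */
static int dup_slot(unsigned short *slot)
{
	if (*slot == 0 || *slot == SWAP_MAP_BAD)
		return 0;			/* free or bad: nothing to pin */
	if (*slot < SWAP_MAP_MAX - 1)
		(*slot)++;			/* normal reference-count bump */
	else
		*slot = SWAP_MAP_MAX;		/* saturate: reclaimed only by swapoff */
	return 1;
}

int main(void)
{
	unsigned short slot = 1;		/* one existing user of this swap entry */
	dup_slot(&slot);
	printf("count after duplicate: %d\n", slot);	/* prints 2 */
	return 0;
}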
1822 1820
1823 struct swap_info_struct * 1821 struct swap_info_struct *
1824 get_swap_info_struct(unsigned type) 1822 get_swap_info_struct(unsigned type)
1825 { 1823 {
1826 return &swap_info[type]; 1824 return &swap_info[type];
1827 } 1825 }
1828 1826
1829 /* 1827 /*
1830 * swap_lock prevents swap_map being freed. Don't grab an extra 1828 * swap_lock prevents swap_map being freed. Don't grab an extra
1831 * reference on the swaphandle, it doesn't matter if it becomes unused. 1829 * reference on the swaphandle, it doesn't matter if it becomes unused.
1832 */ 1830 */
1833 int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 1831 int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1834 { 1832 {
1835 struct swap_info_struct *si; 1833 struct swap_info_struct *si;
1836 int our_page_cluster = page_cluster; 1834 int our_page_cluster = page_cluster;
1837 pgoff_t target, toff; 1835 pgoff_t target, toff;
1838 pgoff_t base, end; 1836 pgoff_t base, end;
1839 int nr_pages = 0; 1837 int nr_pages = 0;
1840 1838
1841 if (!our_page_cluster) /* no readahead */ 1839 if (!our_page_cluster) /* no readahead */
1842 return 0; 1840 return 0;
1843 1841
1844 si = &swap_info[swp_type(entry)]; 1842 si = &swap_info[swp_type(entry)];
1845 target = swp_offset(entry); 1843 target = swp_offset(entry);
1846 base = (target >> our_page_cluster) << our_page_cluster; 1844 base = (target >> our_page_cluster) << our_page_cluster;
1847 end = base + (1 << our_page_cluster); 1845 end = base + (1 << our_page_cluster);
1848 if (!base) /* first page is swap header */ 1846 if (!base) /* first page is swap header */
1849 base++; 1847 base++;
1850 1848
1851 spin_lock(&swap_lock); 1849 spin_lock(&swap_lock);
1852 if (end > si->max) /* don't go beyond end of map */ 1850 if (end > si->max) /* don't go beyond end of map */
1853 end = si->max; 1851 end = si->max;
1854 1852
1855 /* Count contiguous allocated slots above our target */ 1853 /* Count contiguous allocated slots above our target */
1856 for (toff = target; ++toff < end; nr_pages++) { 1854 for (toff = target; ++toff < end; nr_pages++) {
1857 /* Don't read in free or bad pages */ 1855 /* Don't read in free or bad pages */
1858 if (!si->swap_map[toff]) 1856 if (!si->swap_map[toff])
1859 break; 1857 break;
1860 if (si->swap_map[toff] == SWAP_MAP_BAD) 1858 if (si->swap_map[toff] == SWAP_MAP_BAD)
1861 break; 1859 break;
1862 } 1860 }
1863 /* Count contiguous allocated slots below our target */ 1861 /* Count contiguous allocated slots below our target */
1864 for (toff = target; --toff >= base; nr_pages++) { 1862 for (toff = target; --toff >= base; nr_pages++) {
1865 /* Don't read in free or bad pages */ 1863 /* Don't read in free or bad pages */
1866 if (!si->swap_map[toff]) 1864 if (!si->swap_map[toff])
1867 break; 1865 break;
1868 if (si->swap_map[toff] == SWAP_MAP_BAD) 1866 if (si->swap_map[toff] == SWAP_MAP_BAD)
1869 break; 1867 break;
1870 } 1868 }
1871 spin_unlock(&swap_lock); 1869 spin_unlock(&swap_lock);
1872 1870
1873 /* 1871 /*
1874 * Indicate starting offset, and return number of pages to get: 1872 * Indicate starting offset, and return number of pages to get:
1875 * if only 1, say 0, since there's then no readahead to be done. 1873 * if only 1, say 0, since there's then no readahead to be done.
1876 */ 1874 */
1877 *offset = ++toff; 1875 *offset = ++toff;
1878 return nr_pages? ++nr_pages: 0; 1876 return nr_pages? ++nr_pages: 0;
1879 } 1877 }
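
valid_swaphandles() derives its readahead window purely from page_cluster: the target offset is rounded down to a 2^page_cluster boundary to get base, end sits one full cluster above base, and the scan then stops at the first free or bad slot in each direction. A small sketch of just the window arithmetic, with example values standing in for a live system's /proc/sys/vm/page-cluster setting and fault offset:

/* Hedged sketch of the window computation used above; page_cluster and
 * the target offset are example values, not real measurements. */
#include <stdio.h>

int main(void)
{
	unsigned long page_cluster = 3;		/* e.g. the vm.page-cluster sysctl */
	unsigned long target = 37;		/* swap offset being faulted in */

	unsigned long base = (target >> page_cluster) << page_cluster;	/* 32 */
	unsigned long end  = base + (1UL << page_cluster);		/* 40 */
	if (!base)
		base++;			/* slot 0 holds the swap header */

	printf("readahead window: [%lu, %lu)\n", base, end);
	return 0;
}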
1880 1878