Commit 87e99511ea54510ffb60b98001d108794d5037f8
Committed by Al Viro
1 parent dad5eb6daa
Exists in master and in 4 other branches
kill BH_Ordered flag
Instead of abusing a buffer_head flag just add a variant of sync_dirty_buffer which allows passing the exact type of write flag required.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
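The hunks that actually add the new helper fall outside the truncated listing below, so the following is only a hedged sketch of what the commit message describes: a two-argument variant of sync_dirty_buffer() that takes the write flag explicitly, with the existing entry point kept as a thin wrapper. The name __sync_dirty_buffer, the WRITE_SYNC default and the exact error handling are assumptions, not text quoted from the patch.

/* Hedged sketch, not the verbatim patch: the caller picks the write flag. */
int __sync_dirty_buffer(struct buffer_head *bh, int rw)
{
        int ret = 0;

        WARN_ON(atomic_read(&bh->b_count) < 1);
        lock_buffer(bh);
        if (test_clear_buffer_dirty(bh)) {
                get_bh(bh);
                bh->b_end_io = end_buffer_write_sync;
                /* rw is whatever the filesystem needs: WRITE, WRITE_SYNC, WRITE_BARRIER, ... */
                ret = submit_bh(rw, bh);
                wait_on_buffer(bh);
                if (buffer_eopnotsupp(bh)) {
                        clear_buffer_eopnotsupp(bh);
                        ret = -EOPNOTSUPP;
                }
                if (!ret && !buffer_uptodate(bh))
                        ret = -EIO;
        } else {
                unlock_buffer(bh);
        }
        return ret;
}

/* The old entry point then just forwards a sensible default flag. */
int sync_dirty_buffer(struct buffer_head *bh)
{
        return __sync_dirty_buffer(bh, WRITE_SYNC);
}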
Showing 5 changed files with 63 additions and 73 deletions
fs/buffer.c
1 | /* | 1 | /* |
2 | * linux/fs/buffer.c | 2 | * linux/fs/buffer.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992, 2002 Linus Torvalds | 4 | * Copyright (C) 1991, 1992, 2002 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | /* | 7 | /* |
8 | * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 | 8 | * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 |
9 | * | 9 | * |
10 | * Removed a lot of unnecessary code and simplified things now that | 10 | * Removed a lot of unnecessary code and simplified things now that |
11 | * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 | 11 | * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 |
12 | * | 12 | * |
13 | * Speed up hash, lru, and free list operations. Use gfp() for allocating | 13 | * Speed up hash, lru, and free list operations. Use gfp() for allocating |
14 | * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM | 14 | * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM |
15 | * | 15 | * |
16 | * Added 32k buffer block sizes - these are required older ARM systems. - RMK | 16 | * Added 32k buffer block sizes - these are required older ARM systems. - RMK |
17 | * | 17 | * |
18 | * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> | 18 | * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/kernel.h> | 21 | #include <linux/kernel.h> |
22 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
23 | #include <linux/fs.h> | 23 | #include <linux/fs.h> |
24 | #include <linux/mm.h> | 24 | #include <linux/mm.h> |
25 | #include <linux/percpu.h> | 25 | #include <linux/percpu.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/capability.h> | 27 | #include <linux/capability.h> |
28 | #include <linux/blkdev.h> | 28 | #include <linux/blkdev.h> |
29 | #include <linux/file.h> | 29 | #include <linux/file.h> |
30 | #include <linux/quotaops.h> | 30 | #include <linux/quotaops.h> |
31 | #include <linux/highmem.h> | 31 | #include <linux/highmem.h> |
32 | #include <linux/module.h> | 32 | #include <linux/module.h> |
33 | #include <linux/writeback.h> | 33 | #include <linux/writeback.h> |
34 | #include <linux/hash.h> | 34 | #include <linux/hash.h> |
35 | #include <linux/suspend.h> | 35 | #include <linux/suspend.h> |
36 | #include <linux/buffer_head.h> | 36 | #include <linux/buffer_head.h> |
37 | #include <linux/task_io_accounting_ops.h> | 37 | #include <linux/task_io_accounting_ops.h> |
38 | #include <linux/bio.h> | 38 | #include <linux/bio.h> |
39 | #include <linux/notifier.h> | 39 | #include <linux/notifier.h> |
40 | #include <linux/cpu.h> | 40 | #include <linux/cpu.h> |
41 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
42 | #include <linux/mpage.h> | 42 | #include <linux/mpage.h> |
43 | #include <linux/bit_spinlock.h> | 43 | #include <linux/bit_spinlock.h> |
44 | 44 | ||
45 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); | 45 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); |
46 | 46 | ||
47 | #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) | 47 | #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) |
48 | 48 | ||
49 | inline void | 49 | inline void |
50 | init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) | 50 | init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) |
51 | { | 51 | { |
52 | bh->b_end_io = handler; | 52 | bh->b_end_io = handler; |
53 | bh->b_private = private; | 53 | bh->b_private = private; |
54 | } | 54 | } |
55 | EXPORT_SYMBOL(init_buffer); | 55 | EXPORT_SYMBOL(init_buffer); |
56 | 56 | ||
57 | static int sync_buffer(void *word) | 57 | static int sync_buffer(void *word) |
58 | { | 58 | { |
59 | struct block_device *bd; | 59 | struct block_device *bd; |
60 | struct buffer_head *bh | 60 | struct buffer_head *bh |
61 | = container_of(word, struct buffer_head, b_state); | 61 | = container_of(word, struct buffer_head, b_state); |
62 | 62 | ||
63 | smp_mb(); | 63 | smp_mb(); |
64 | bd = bh->b_bdev; | 64 | bd = bh->b_bdev; |
65 | if (bd) | 65 | if (bd) |
66 | blk_run_address_space(bd->bd_inode->i_mapping); | 66 | blk_run_address_space(bd->bd_inode->i_mapping); |
67 | io_schedule(); | 67 | io_schedule(); |
68 | return 0; | 68 | return 0; |
69 | } | 69 | } |
70 | 70 | ||
71 | void __lock_buffer(struct buffer_head *bh) | 71 | void __lock_buffer(struct buffer_head *bh) |
72 | { | 72 | { |
73 | wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, | 73 | wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, |
74 | TASK_UNINTERRUPTIBLE); | 74 | TASK_UNINTERRUPTIBLE); |
75 | } | 75 | } |
76 | EXPORT_SYMBOL(__lock_buffer); | 76 | EXPORT_SYMBOL(__lock_buffer); |
77 | 77 | ||
78 | void unlock_buffer(struct buffer_head *bh) | 78 | void unlock_buffer(struct buffer_head *bh) |
79 | { | 79 | { |
80 | clear_bit_unlock(BH_Lock, &bh->b_state); | 80 | clear_bit_unlock(BH_Lock, &bh->b_state); |
81 | smp_mb__after_clear_bit(); | 81 | smp_mb__after_clear_bit(); |
82 | wake_up_bit(&bh->b_state, BH_Lock); | 82 | wake_up_bit(&bh->b_state, BH_Lock); |
83 | } | 83 | } |
84 | EXPORT_SYMBOL(unlock_buffer); | 84 | EXPORT_SYMBOL(unlock_buffer); |
85 | 85 | ||
86 | /* | 86 | /* |
87 | * Block until a buffer comes unlocked. This doesn't stop it | 87 | * Block until a buffer comes unlocked. This doesn't stop it |
88 | * from becoming locked again - you have to lock it yourself | 88 | * from becoming locked again - you have to lock it yourself |
89 | * if you want to preserve its state. | 89 | * if you want to preserve its state. |
90 | */ | 90 | */ |
91 | void __wait_on_buffer(struct buffer_head * bh) | 91 | void __wait_on_buffer(struct buffer_head * bh) |
92 | { | 92 | { |
93 | wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); | 93 | wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); |
94 | } | 94 | } |
95 | EXPORT_SYMBOL(__wait_on_buffer); | 95 | EXPORT_SYMBOL(__wait_on_buffer); |
96 | 96 | ||
97 | static void | 97 | static void |
98 | __clear_page_buffers(struct page *page) | 98 | __clear_page_buffers(struct page *page) |
99 | { | 99 | { |
100 | ClearPagePrivate(page); | 100 | ClearPagePrivate(page); |
101 | set_page_private(page, 0); | 101 | set_page_private(page, 0); |
102 | page_cache_release(page); | 102 | page_cache_release(page); |
103 | } | 103 | } |
104 | 104 | ||
105 | 105 | ||
106 | static int quiet_error(struct buffer_head *bh) | 106 | static int quiet_error(struct buffer_head *bh) |
107 | { | 107 | { |
108 | if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) | 108 | if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) |
109 | return 0; | 109 | return 0; |
110 | return 1; | 110 | return 1; |
111 | } | 111 | } |
112 | 112 | ||
113 | 113 | ||
114 | static void buffer_io_error(struct buffer_head *bh) | 114 | static void buffer_io_error(struct buffer_head *bh) |
115 | { | 115 | { |
116 | char b[BDEVNAME_SIZE]; | 116 | char b[BDEVNAME_SIZE]; |
117 | printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", | 117 | printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", |
118 | bdevname(bh->b_bdev, b), | 118 | bdevname(bh->b_bdev, b), |
119 | (unsigned long long)bh->b_blocknr); | 119 | (unsigned long long)bh->b_blocknr); |
120 | } | 120 | } |
121 | 121 | ||
122 | /* | 122 | /* |
123 | * End-of-IO handler helper function which does not touch the bh after | 123 | * End-of-IO handler helper function which does not touch the bh after |
124 | * unlocking it. | 124 | * unlocking it. |
125 | * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but | 125 | * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but |
126 | * a race there is benign: unlock_buffer() only use the bh's address for | 126 | * a race there is benign: unlock_buffer() only use the bh's address for |
127 | * hashing after unlocking the buffer, so it doesn't actually touch the bh | 127 | * hashing after unlocking the buffer, so it doesn't actually touch the bh |
128 | * itself. | 128 | * itself. |
129 | */ | 129 | */ |
130 | static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) | 130 | static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) |
131 | { | 131 | { |
132 | if (uptodate) { | 132 | if (uptodate) { |
133 | set_buffer_uptodate(bh); | 133 | set_buffer_uptodate(bh); |
134 | } else { | 134 | } else { |
135 | /* This happens, due to failed READA attempts. */ | 135 | /* This happens, due to failed READA attempts. */ |
136 | clear_buffer_uptodate(bh); | 136 | clear_buffer_uptodate(bh); |
137 | } | 137 | } |
138 | unlock_buffer(bh); | 138 | unlock_buffer(bh); |
139 | } | 139 | } |
140 | 140 | ||
141 | /* | 141 | /* |
142 | * Default synchronous end-of-IO handler.. Just mark it up-to-date and | 142 | * Default synchronous end-of-IO handler.. Just mark it up-to-date and |
143 | * unlock the buffer. This is what ll_rw_block uses too. | 143 | * unlock the buffer. This is what ll_rw_block uses too. |
144 | */ | 144 | */ |
145 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate) | 145 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate) |
146 | { | 146 | { |
147 | __end_buffer_read_notouch(bh, uptodate); | 147 | __end_buffer_read_notouch(bh, uptodate); |
148 | put_bh(bh); | 148 | put_bh(bh); |
149 | } | 149 | } |
150 | EXPORT_SYMBOL(end_buffer_read_sync); | 150 | EXPORT_SYMBOL(end_buffer_read_sync); |
151 | 151 | ||
152 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate) | 152 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate) |
153 | { | 153 | { |
154 | char b[BDEVNAME_SIZE]; | 154 | char b[BDEVNAME_SIZE]; |
155 | 155 | ||
156 | if (uptodate) { | 156 | if (uptodate) { |
157 | set_buffer_uptodate(bh); | 157 | set_buffer_uptodate(bh); |
158 | } else { | 158 | } else { |
159 | if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { | 159 | if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { |
160 | buffer_io_error(bh); | 160 | buffer_io_error(bh); |
161 | printk(KERN_WARNING "lost page write due to " | 161 | printk(KERN_WARNING "lost page write due to " |
162 | "I/O error on %s\n", | 162 | "I/O error on %s\n", |
163 | bdevname(bh->b_bdev, b)); | 163 | bdevname(bh->b_bdev, b)); |
164 | } | 164 | } |
165 | set_buffer_write_io_error(bh); | 165 | set_buffer_write_io_error(bh); |
166 | clear_buffer_uptodate(bh); | 166 | clear_buffer_uptodate(bh); |
167 | } | 167 | } |
168 | unlock_buffer(bh); | 168 | unlock_buffer(bh); |
169 | put_bh(bh); | 169 | put_bh(bh); |
170 | } | 170 | } |
171 | EXPORT_SYMBOL(end_buffer_write_sync); | 171 | EXPORT_SYMBOL(end_buffer_write_sync); |
172 | 172 | ||
173 | /* | 173 | /* |
174 | * Various filesystems appear to want __find_get_block to be non-blocking. | 174 | * Various filesystems appear to want __find_get_block to be non-blocking. |
175 | * But it's the page lock which protects the buffers. To get around this, | 175 | * But it's the page lock which protects the buffers. To get around this, |
176 | * we get exclusion from try_to_free_buffers with the blockdev mapping's | 176 | * we get exclusion from try_to_free_buffers with the blockdev mapping's |
177 | * private_lock. | 177 | * private_lock. |
178 | * | 178 | * |
179 | * Hack idea: for the blockdev mapping, i_bufferlist_lock contention | 179 | * Hack idea: for the blockdev mapping, i_bufferlist_lock contention |
180 | * may be quite high. This code could TryLock the page, and if that | 180 | * may be quite high. This code could TryLock the page, and if that |
181 | * succeeds, there is no need to take private_lock. (But if | 181 | * succeeds, there is no need to take private_lock. (But if |
182 | * private_lock is contended then so is mapping->tree_lock). | 182 | * private_lock is contended then so is mapping->tree_lock). |
183 | */ | 183 | */ |
184 | static struct buffer_head * | 184 | static struct buffer_head * |
185 | __find_get_block_slow(struct block_device *bdev, sector_t block) | 185 | __find_get_block_slow(struct block_device *bdev, sector_t block) |
186 | { | 186 | { |
187 | struct inode *bd_inode = bdev->bd_inode; | 187 | struct inode *bd_inode = bdev->bd_inode; |
188 | struct address_space *bd_mapping = bd_inode->i_mapping; | 188 | struct address_space *bd_mapping = bd_inode->i_mapping; |
189 | struct buffer_head *ret = NULL; | 189 | struct buffer_head *ret = NULL; |
190 | pgoff_t index; | 190 | pgoff_t index; |
191 | struct buffer_head *bh; | 191 | struct buffer_head *bh; |
192 | struct buffer_head *head; | 192 | struct buffer_head *head; |
193 | struct page *page; | 193 | struct page *page; |
194 | int all_mapped = 1; | 194 | int all_mapped = 1; |
195 | 195 | ||
196 | index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); | 196 | index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); |
197 | page = find_get_page(bd_mapping, index); | 197 | page = find_get_page(bd_mapping, index); |
198 | if (!page) | 198 | if (!page) |
199 | goto out; | 199 | goto out; |
200 | 200 | ||
201 | spin_lock(&bd_mapping->private_lock); | 201 | spin_lock(&bd_mapping->private_lock); |
202 | if (!page_has_buffers(page)) | 202 | if (!page_has_buffers(page)) |
203 | goto out_unlock; | 203 | goto out_unlock; |
204 | head = page_buffers(page); | 204 | head = page_buffers(page); |
205 | bh = head; | 205 | bh = head; |
206 | do { | 206 | do { |
207 | if (!buffer_mapped(bh)) | 207 | if (!buffer_mapped(bh)) |
208 | all_mapped = 0; | 208 | all_mapped = 0; |
209 | else if (bh->b_blocknr == block) { | 209 | else if (bh->b_blocknr == block) { |
210 | ret = bh; | 210 | ret = bh; |
211 | get_bh(bh); | 211 | get_bh(bh); |
212 | goto out_unlock; | 212 | goto out_unlock; |
213 | } | 213 | } |
214 | bh = bh->b_this_page; | 214 | bh = bh->b_this_page; |
215 | } while (bh != head); | 215 | } while (bh != head); |
216 | 216 | ||
217 | /* we might be here because some of the buffers on this page are | 217 | /* we might be here because some of the buffers on this page are |
218 | * not mapped. This is due to various races between | 218 | * not mapped. This is due to various races between |
219 | * file io on the block device and getblk. It gets dealt with | 219 | * file io on the block device and getblk. It gets dealt with |
220 | * elsewhere, don't buffer_error if we had some unmapped buffers | 220 | * elsewhere, don't buffer_error if we had some unmapped buffers |
221 | */ | 221 | */ |
222 | if (all_mapped) { | 222 | if (all_mapped) { |
223 | printk("__find_get_block_slow() failed. " | 223 | printk("__find_get_block_slow() failed. " |
224 | "block=%llu, b_blocknr=%llu\n", | 224 | "block=%llu, b_blocknr=%llu\n", |
225 | (unsigned long long)block, | 225 | (unsigned long long)block, |
226 | (unsigned long long)bh->b_blocknr); | 226 | (unsigned long long)bh->b_blocknr); |
227 | printk("b_state=0x%08lx, b_size=%zu\n", | 227 | printk("b_state=0x%08lx, b_size=%zu\n", |
228 | bh->b_state, bh->b_size); | 228 | bh->b_state, bh->b_size); |
229 | printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); | 229 | printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); |
230 | } | 230 | } |
231 | out_unlock: | 231 | out_unlock: |
232 | spin_unlock(&bd_mapping->private_lock); | 232 | spin_unlock(&bd_mapping->private_lock); |
233 | page_cache_release(page); | 233 | page_cache_release(page); |
234 | out: | 234 | out: |
235 | return ret; | 235 | return ret; |
236 | } | 236 | } |
237 | 237 | ||
238 | /* If invalidate_buffers() will trash dirty buffers, it means some kind | 238 | /* If invalidate_buffers() will trash dirty buffers, it means some kind |
239 | of fs corruption is going on. Trashing dirty data always imply losing | 239 | of fs corruption is going on. Trashing dirty data always imply losing |
240 | information that was supposed to be just stored on the physical layer | 240 | information that was supposed to be just stored on the physical layer |
241 | by the user. | 241 | by the user. |
242 | 242 | ||
243 | Thus invalidate_buffers in general usage is not allwowed to trash | 243 | Thus invalidate_buffers in general usage is not allwowed to trash |
244 | dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to | 244 | dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to |
245 | be preserved. These buffers are simply skipped. | 245 | be preserved. These buffers are simply skipped. |
246 | 246 | ||
247 | We also skip buffers which are still in use. For example this can | 247 | We also skip buffers which are still in use. For example this can |
248 | happen if a userspace program is reading the block device. | 248 | happen if a userspace program is reading the block device. |
249 | 249 | ||
250 | NOTE: In the case where the user removed a removable-media-disk even if | 250 | NOTE: In the case where the user removed a removable-media-disk even if |
251 | there's still dirty data not synced on disk (due a bug in the device driver | 251 | there's still dirty data not synced on disk (due a bug in the device driver |
252 | or due an error of the user), by not destroying the dirty buffers we could | 252 | or due an error of the user), by not destroying the dirty buffers we could |
253 | generate corruption also on the next media inserted, thus a parameter is | 253 | generate corruption also on the next media inserted, thus a parameter is |
254 | necessary to handle this case in the most safe way possible (trying | 254 | necessary to handle this case in the most safe way possible (trying |
255 | to not corrupt also the new disk inserted with the data belonging to | 255 | to not corrupt also the new disk inserted with the data belonging to |
256 | the old now corrupted disk). Also for the ramdisk the natural thing | 256 | the old now corrupted disk). Also for the ramdisk the natural thing |
257 | to do in order to release the ramdisk memory is to destroy dirty buffers. | 257 | to do in order to release the ramdisk memory is to destroy dirty buffers. |
258 | 258 | ||
259 | These are two special cases. Normal usage imply the device driver | 259 | These are two special cases. Normal usage imply the device driver |
260 | to issue a sync on the device (without waiting I/O completion) and | 260 | to issue a sync on the device (without waiting I/O completion) and |
261 | then an invalidate_buffers call that doesn't trash dirty buffers. | 261 | then an invalidate_buffers call that doesn't trash dirty buffers. |
262 | 262 | ||
263 | For handling cache coherency with the blkdev pagecache the 'update' case | 263 | For handling cache coherency with the blkdev pagecache the 'update' case |
264 | is been introduced. It is needed to re-read from disk any pinned | 264 | is been introduced. It is needed to re-read from disk any pinned |
265 | buffer. NOTE: re-reading from disk is destructive so we can do it only | 265 | buffer. NOTE: re-reading from disk is destructive so we can do it only |
266 | when we assume nobody is changing the buffercache under our I/O and when | 266 | when we assume nobody is changing the buffercache under our I/O and when |
267 | we think the disk contains more recent information than the buffercache. | 267 | we think the disk contains more recent information than the buffercache. |
268 | The update == 1 pass marks the buffers we need to update, the update == 2 | 268 | The update == 1 pass marks the buffers we need to update, the update == 2 |
269 | pass does the actual I/O. */ | 269 | pass does the actual I/O. */ |
270 | void invalidate_bdev(struct block_device *bdev) | 270 | void invalidate_bdev(struct block_device *bdev) |
271 | { | 271 | { |
272 | struct address_space *mapping = bdev->bd_inode->i_mapping; | 272 | struct address_space *mapping = bdev->bd_inode->i_mapping; |
273 | 273 | ||
274 | if (mapping->nrpages == 0) | 274 | if (mapping->nrpages == 0) |
275 | return; | 275 | return; |
276 | 276 | ||
277 | invalidate_bh_lrus(); | 277 | invalidate_bh_lrus(); |
278 | lru_add_drain_all(); /* make sure all lru add caches are flushed */ | 278 | lru_add_drain_all(); /* make sure all lru add caches are flushed */ |
279 | invalidate_mapping_pages(mapping, 0, -1); | 279 | invalidate_mapping_pages(mapping, 0, -1); |
280 | } | 280 | } |
281 | EXPORT_SYMBOL(invalidate_bdev); | 281 | EXPORT_SYMBOL(invalidate_bdev); |
282 | 282 | ||
283 | /* | 283 | /* |
284 | * Kick the writeback threads then try to free up some ZONE_NORMAL memory. | 284 | * Kick the writeback threads then try to free up some ZONE_NORMAL memory. |
285 | */ | 285 | */ |
286 | static void free_more_memory(void) | 286 | static void free_more_memory(void) |
287 | { | 287 | { |
288 | struct zone *zone; | 288 | struct zone *zone; |
289 | int nid; | 289 | int nid; |
290 | 290 | ||
291 | wakeup_flusher_threads(1024); | 291 | wakeup_flusher_threads(1024); |
292 | yield(); | 292 | yield(); |
293 | 293 | ||
294 | for_each_online_node(nid) { | 294 | for_each_online_node(nid) { |
295 | (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), | 295 | (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), |
296 | gfp_zone(GFP_NOFS), NULL, | 296 | gfp_zone(GFP_NOFS), NULL, |
297 | &zone); | 297 | &zone); |
298 | if (zone) | 298 | if (zone) |
299 | try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, | 299 | try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, |
300 | GFP_NOFS, NULL); | 300 | GFP_NOFS, NULL); |
301 | } | 301 | } |
302 | } | 302 | } |
303 | 303 | ||
304 | /* | 304 | /* |
305 | * I/O completion handler for block_read_full_page() - pages | 305 | * I/O completion handler for block_read_full_page() - pages |
306 | * which come unlocked at the end of I/O. | 306 | * which come unlocked at the end of I/O. |
307 | */ | 307 | */ |
308 | static void end_buffer_async_read(struct buffer_head *bh, int uptodate) | 308 | static void end_buffer_async_read(struct buffer_head *bh, int uptodate) |
309 | { | 309 | { |
310 | unsigned long flags; | 310 | unsigned long flags; |
311 | struct buffer_head *first; | 311 | struct buffer_head *first; |
312 | struct buffer_head *tmp; | 312 | struct buffer_head *tmp; |
313 | struct page *page; | 313 | struct page *page; |
314 | int page_uptodate = 1; | 314 | int page_uptodate = 1; |
315 | 315 | ||
316 | BUG_ON(!buffer_async_read(bh)); | 316 | BUG_ON(!buffer_async_read(bh)); |
317 | 317 | ||
318 | page = bh->b_page; | 318 | page = bh->b_page; |
319 | if (uptodate) { | 319 | if (uptodate) { |
320 | set_buffer_uptodate(bh); | 320 | set_buffer_uptodate(bh); |
321 | } else { | 321 | } else { |
322 | clear_buffer_uptodate(bh); | 322 | clear_buffer_uptodate(bh); |
323 | if (!quiet_error(bh)) | 323 | if (!quiet_error(bh)) |
324 | buffer_io_error(bh); | 324 | buffer_io_error(bh); |
325 | SetPageError(page); | 325 | SetPageError(page); |
326 | } | 326 | } |
327 | 327 | ||
328 | /* | 328 | /* |
329 | * Be _very_ careful from here on. Bad things can happen if | 329 | * Be _very_ careful from here on. Bad things can happen if |
330 | * two buffer heads end IO at almost the same time and both | 330 | * two buffer heads end IO at almost the same time and both |
331 | * decide that the page is now completely done. | 331 | * decide that the page is now completely done. |
332 | */ | 332 | */ |
333 | first = page_buffers(page); | 333 | first = page_buffers(page); |
334 | local_irq_save(flags); | 334 | local_irq_save(flags); |
335 | bit_spin_lock(BH_Uptodate_Lock, &first->b_state); | 335 | bit_spin_lock(BH_Uptodate_Lock, &first->b_state); |
336 | clear_buffer_async_read(bh); | 336 | clear_buffer_async_read(bh); |
337 | unlock_buffer(bh); | 337 | unlock_buffer(bh); |
338 | tmp = bh; | 338 | tmp = bh; |
339 | do { | 339 | do { |
340 | if (!buffer_uptodate(tmp)) | 340 | if (!buffer_uptodate(tmp)) |
341 | page_uptodate = 0; | 341 | page_uptodate = 0; |
342 | if (buffer_async_read(tmp)) { | 342 | if (buffer_async_read(tmp)) { |
343 | BUG_ON(!buffer_locked(tmp)); | 343 | BUG_ON(!buffer_locked(tmp)); |
344 | goto still_busy; | 344 | goto still_busy; |
345 | } | 345 | } |
346 | tmp = tmp->b_this_page; | 346 | tmp = tmp->b_this_page; |
347 | } while (tmp != bh); | 347 | } while (tmp != bh); |
348 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | 348 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); |
349 | local_irq_restore(flags); | 349 | local_irq_restore(flags); |
350 | 350 | ||
351 | /* | 351 | /* |
352 | * If none of the buffers had errors and they are all | 352 | * If none of the buffers had errors and they are all |
353 | * uptodate then we can set the page uptodate. | 353 | * uptodate then we can set the page uptodate. |
354 | */ | 354 | */ |
355 | if (page_uptodate && !PageError(page)) | 355 | if (page_uptodate && !PageError(page)) |
356 | SetPageUptodate(page); | 356 | SetPageUptodate(page); |
357 | unlock_page(page); | 357 | unlock_page(page); |
358 | return; | 358 | return; |
359 | 359 | ||
360 | still_busy: | 360 | still_busy: |
361 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | 361 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); |
362 | local_irq_restore(flags); | 362 | local_irq_restore(flags); |
363 | return; | 363 | return; |
364 | } | 364 | } |
365 | 365 | ||
366 | /* | 366 | /* |
367 | * Completion handler for block_write_full_page() - pages which are unlocked | 367 | * Completion handler for block_write_full_page() - pages which are unlocked |
368 | * during I/O, and which have PageWriteback cleared upon I/O completion. | 368 | * during I/O, and which have PageWriteback cleared upon I/O completion. |
369 | */ | 369 | */ |
370 | void end_buffer_async_write(struct buffer_head *bh, int uptodate) | 370 | void end_buffer_async_write(struct buffer_head *bh, int uptodate) |
371 | { | 371 | { |
372 | char b[BDEVNAME_SIZE]; | 372 | char b[BDEVNAME_SIZE]; |
373 | unsigned long flags; | 373 | unsigned long flags; |
374 | struct buffer_head *first; | 374 | struct buffer_head *first; |
375 | struct buffer_head *tmp; | 375 | struct buffer_head *tmp; |
376 | struct page *page; | 376 | struct page *page; |
377 | 377 | ||
378 | BUG_ON(!buffer_async_write(bh)); | 378 | BUG_ON(!buffer_async_write(bh)); |
379 | 379 | ||
380 | page = bh->b_page; | 380 | page = bh->b_page; |
381 | if (uptodate) { | 381 | if (uptodate) { |
382 | set_buffer_uptodate(bh); | 382 | set_buffer_uptodate(bh); |
383 | } else { | 383 | } else { |
384 | if (!quiet_error(bh)) { | 384 | if (!quiet_error(bh)) { |
385 | buffer_io_error(bh); | 385 | buffer_io_error(bh); |
386 | printk(KERN_WARNING "lost page write due to " | 386 | printk(KERN_WARNING "lost page write due to " |
387 | "I/O error on %s\n", | 387 | "I/O error on %s\n", |
388 | bdevname(bh->b_bdev, b)); | 388 | bdevname(bh->b_bdev, b)); |
389 | } | 389 | } |
390 | set_bit(AS_EIO, &page->mapping->flags); | 390 | set_bit(AS_EIO, &page->mapping->flags); |
391 | set_buffer_write_io_error(bh); | 391 | set_buffer_write_io_error(bh); |
392 | clear_buffer_uptodate(bh); | 392 | clear_buffer_uptodate(bh); |
393 | SetPageError(page); | 393 | SetPageError(page); |
394 | } | 394 | } |
395 | 395 | ||
396 | first = page_buffers(page); | 396 | first = page_buffers(page); |
397 | local_irq_save(flags); | 397 | local_irq_save(flags); |
398 | bit_spin_lock(BH_Uptodate_Lock, &first->b_state); | 398 | bit_spin_lock(BH_Uptodate_Lock, &first->b_state); |
399 | 399 | ||
400 | clear_buffer_async_write(bh); | 400 | clear_buffer_async_write(bh); |
401 | unlock_buffer(bh); | 401 | unlock_buffer(bh); |
402 | tmp = bh->b_this_page; | 402 | tmp = bh->b_this_page; |
403 | while (tmp != bh) { | 403 | while (tmp != bh) { |
404 | if (buffer_async_write(tmp)) { | 404 | if (buffer_async_write(tmp)) { |
405 | BUG_ON(!buffer_locked(tmp)); | 405 | BUG_ON(!buffer_locked(tmp)); |
406 | goto still_busy; | 406 | goto still_busy; |
407 | } | 407 | } |
408 | tmp = tmp->b_this_page; | 408 | tmp = tmp->b_this_page; |
409 | } | 409 | } |
410 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | 410 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); |
411 | local_irq_restore(flags); | 411 | local_irq_restore(flags); |
412 | end_page_writeback(page); | 412 | end_page_writeback(page); |
413 | return; | 413 | return; |
414 | 414 | ||
415 | still_busy: | 415 | still_busy: |
416 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | 416 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); |
417 | local_irq_restore(flags); | 417 | local_irq_restore(flags); |
418 | return; | 418 | return; |
419 | } | 419 | } |
420 | EXPORT_SYMBOL(end_buffer_async_write); | 420 | EXPORT_SYMBOL(end_buffer_async_write); |
421 | 421 | ||
422 | /* | 422 | /* |
423 | * If a page's buffers are under async readin (end_buffer_async_read | 423 | * If a page's buffers are under async readin (end_buffer_async_read |
424 | * completion) then there is a possibility that another thread of | 424 | * completion) then there is a possibility that another thread of |
425 | * control could lock one of the buffers after it has completed | 425 | * control could lock one of the buffers after it has completed |
426 | * but while some of the other buffers have not completed. This | 426 | * but while some of the other buffers have not completed. This |
427 | * locked buffer would confuse end_buffer_async_read() into not unlocking | 427 | * locked buffer would confuse end_buffer_async_read() into not unlocking |
428 | * the page. So the absence of BH_Async_Read tells end_buffer_async_read() | 428 | * the page. So the absence of BH_Async_Read tells end_buffer_async_read() |
429 | * that this buffer is not under async I/O. | 429 | * that this buffer is not under async I/O. |
430 | * | 430 | * |
431 | * The page comes unlocked when it has no locked buffer_async buffers | 431 | * The page comes unlocked when it has no locked buffer_async buffers |
432 | * left. | 432 | * left. |
433 | * | 433 | * |
434 | * PageLocked prevents anyone starting new async I/O reads any of | 434 | * PageLocked prevents anyone starting new async I/O reads any of |
435 | * the buffers. | 435 | * the buffers. |
436 | * | 436 | * |
437 | * PageWriteback is used to prevent simultaneous writeout of the same | 437 | * PageWriteback is used to prevent simultaneous writeout of the same |
438 | * page. | 438 | * page. |
439 | * | 439 | * |
440 | * PageLocked prevents anyone from starting writeback of a page which is | 440 | * PageLocked prevents anyone from starting writeback of a page which is |
441 | * under read I/O (PageWriteback is only ever set against a locked page). | 441 | * under read I/O (PageWriteback is only ever set against a locked page). |
442 | */ | 442 | */ |
443 | static void mark_buffer_async_read(struct buffer_head *bh) | 443 | static void mark_buffer_async_read(struct buffer_head *bh) |
444 | { | 444 | { |
445 | bh->b_end_io = end_buffer_async_read; | 445 | bh->b_end_io = end_buffer_async_read; |
446 | set_buffer_async_read(bh); | 446 | set_buffer_async_read(bh); |
447 | } | 447 | } |
448 | 448 | ||
449 | static void mark_buffer_async_write_endio(struct buffer_head *bh, | 449 | static void mark_buffer_async_write_endio(struct buffer_head *bh, |
450 | bh_end_io_t *handler) | 450 | bh_end_io_t *handler) |
451 | { | 451 | { |
452 | bh->b_end_io = handler; | 452 | bh->b_end_io = handler; |
453 | set_buffer_async_write(bh); | 453 | set_buffer_async_write(bh); |
454 | } | 454 | } |
455 | 455 | ||
456 | void mark_buffer_async_write(struct buffer_head *bh) | 456 | void mark_buffer_async_write(struct buffer_head *bh) |
457 | { | 457 | { |
458 | mark_buffer_async_write_endio(bh, end_buffer_async_write); | 458 | mark_buffer_async_write_endio(bh, end_buffer_async_write); |
459 | } | 459 | } |
460 | EXPORT_SYMBOL(mark_buffer_async_write); | 460 | EXPORT_SYMBOL(mark_buffer_async_write); |
461 | 461 | ||
462 | 462 | ||
463 | /* | 463 | /* |
464 | * fs/buffer.c contains helper functions for buffer-backed address space's | 464 | * fs/buffer.c contains helper functions for buffer-backed address space's |
465 | * fsync functions. A common requirement for buffer-based filesystems is | 465 | * fsync functions. A common requirement for buffer-based filesystems is |
466 | * that certain data from the backing blockdev needs to be written out for | 466 | * that certain data from the backing blockdev needs to be written out for |
467 | * a successful fsync(). For example, ext2 indirect blocks need to be | 467 | * a successful fsync(). For example, ext2 indirect blocks need to be |
468 | * written back and waited upon before fsync() returns. | 468 | * written back and waited upon before fsync() returns. |
469 | * | 469 | * |
470 | * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), | 470 | * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), |
471 | * inode_has_buffers() and invalidate_inode_buffers() are provided for the | 471 | * inode_has_buffers() and invalidate_inode_buffers() are provided for the |
472 | * management of a list of dependent buffers at ->i_mapping->private_list. | 472 | * management of a list of dependent buffers at ->i_mapping->private_list. |
473 | * | 473 | * |
474 | * Locking is a little subtle: try_to_free_buffers() will remove buffers | 474 | * Locking is a little subtle: try_to_free_buffers() will remove buffers |
475 | * from their controlling inode's queue when they are being freed. But | 475 | * from their controlling inode's queue when they are being freed. But |
476 | * try_to_free_buffers() will be operating against the *blockdev* mapping | 476 | * try_to_free_buffers() will be operating against the *blockdev* mapping |
477 | * at the time, not against the S_ISREG file which depends on those buffers. | 477 | * at the time, not against the S_ISREG file which depends on those buffers. |
478 | * So the locking for private_list is via the private_lock in the address_space | 478 | * So the locking for private_list is via the private_lock in the address_space |
479 | * which backs the buffers. Which is different from the address_space | 479 | * which backs the buffers. Which is different from the address_space |
480 | * against which the buffers are listed. So for a particular address_space, | 480 | * against which the buffers are listed. So for a particular address_space, |
481 | * mapping->private_lock does *not* protect mapping->private_list! In fact, | 481 | * mapping->private_lock does *not* protect mapping->private_list! In fact, |
482 | * mapping->private_list will always be protected by the backing blockdev's | 482 | * mapping->private_list will always be protected by the backing blockdev's |
483 | * ->private_lock. | 483 | * ->private_lock. |
484 | * | 484 | * |
485 | * Which introduces a requirement: all buffers on an address_space's | 485 | * Which introduces a requirement: all buffers on an address_space's |
486 | * ->private_list must be from the same address_space: the blockdev's. | 486 | * ->private_list must be from the same address_space: the blockdev's. |
487 | * | 487 | * |
488 | * address_spaces which do not place buffers at ->private_list via these | 488 | * address_spaces which do not place buffers at ->private_list via these |
489 | * utility functions are free to use private_lock and private_list for | 489 | * utility functions are free to use private_lock and private_list for |
490 | * whatever they want. The only requirement is that list_empty(private_list) | 490 | * whatever they want. The only requirement is that list_empty(private_list) |
491 | * be true at clear_inode() time. | 491 | * be true at clear_inode() time. |
492 | * | 492 | * |
493 | * FIXME: clear_inode should not call invalidate_inode_buffers(). The | 493 | * FIXME: clear_inode should not call invalidate_inode_buffers(). The |
494 | * filesystems should do that. invalidate_inode_buffers() should just go | 494 | * filesystems should do that. invalidate_inode_buffers() should just go |
495 | * BUG_ON(!list_empty). | 495 | * BUG_ON(!list_empty). |
496 | * | 496 | * |
497 | * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should | 497 | * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should |
498 | * take an address_space, not an inode. And it should be called | 498 | * take an address_space, not an inode. And it should be called |
499 | * mark_buffer_dirty_fsync() to clearly define why those buffers are being | 499 | * mark_buffer_dirty_fsync() to clearly define why those buffers are being |
500 | * queued up. | 500 | * queued up. |
501 | * | 501 | * |
502 | * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the | 502 | * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the |
503 | * list if it is already on a list. Because if the buffer is on a list, | 503 | * list if it is already on a list. Because if the buffer is on a list, |
504 | * it *must* already be on the right one. If not, the filesystem is being | 504 | * it *must* already be on the right one. If not, the filesystem is being |
505 | * silly. This will save a ton of locking. But first we have to ensure | 505 | * silly. This will save a ton of locking. But first we have to ensure |
506 | * that buffers are taken *off* the old inode's list when they are freed | 506 | * that buffers are taken *off* the old inode's list when they are freed |
507 | * (presumably in truncate). That requires careful auditing of all | 507 | * (presumably in truncate). That requires careful auditing of all |
508 | * filesystems (do it inside bforget()). It could also be done by bringing | 508 | * filesystems (do it inside bforget()). It could also be done by bringing |
509 | * b_inode back. | 509 | * b_inode back. |
510 | */ | 510 | */ |
511 | 511 | ||
512 | /* | 512 | /* |
513 | * The buffer's backing address_space's private_lock must be held | 513 | * The buffer's backing address_space's private_lock must be held |
514 | */ | 514 | */ |
515 | static void __remove_assoc_queue(struct buffer_head *bh) | 515 | static void __remove_assoc_queue(struct buffer_head *bh) |
516 | { | 516 | { |
517 | list_del_init(&bh->b_assoc_buffers); | 517 | list_del_init(&bh->b_assoc_buffers); |
518 | WARN_ON(!bh->b_assoc_map); | 518 | WARN_ON(!bh->b_assoc_map); |
519 | if (buffer_write_io_error(bh)) | 519 | if (buffer_write_io_error(bh)) |
520 | set_bit(AS_EIO, &bh->b_assoc_map->flags); | 520 | set_bit(AS_EIO, &bh->b_assoc_map->flags); |
521 | bh->b_assoc_map = NULL; | 521 | bh->b_assoc_map = NULL; |
522 | } | 522 | } |
523 | 523 | ||
524 | int inode_has_buffers(struct inode *inode) | 524 | int inode_has_buffers(struct inode *inode) |
525 | { | 525 | { |
526 | return !list_empty(&inode->i_data.private_list); | 526 | return !list_empty(&inode->i_data.private_list); |
527 | } | 527 | } |
528 | 528 | ||
529 | /* | 529 | /* |
530 | * osync is designed to support O_SYNC io. It waits synchronously for | 530 | * osync is designed to support O_SYNC io. It waits synchronously for |
531 | * all already-submitted IO to complete, but does not queue any new | 531 | * all already-submitted IO to complete, but does not queue any new |
532 | * writes to the disk. | 532 | * writes to the disk. |
533 | * | 533 | * |
534 | * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as | 534 | * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as |
535 | * you dirty the buffers, and then use osync_inode_buffers to wait for | 535 | * you dirty the buffers, and then use osync_inode_buffers to wait for |
536 | * completion. Any other dirty buffers which are not yet queued for | 536 | * completion. Any other dirty buffers which are not yet queued for |
537 | * write will not be flushed to disk by the osync. | 537 | * write will not be flushed to disk by the osync. |
538 | */ | 538 | */ |
539 | static int osync_buffers_list(spinlock_t *lock, struct list_head *list) | 539 | static int osync_buffers_list(spinlock_t *lock, struct list_head *list) |
540 | { | 540 | { |
541 | struct buffer_head *bh; | 541 | struct buffer_head *bh; |
542 | struct list_head *p; | 542 | struct list_head *p; |
543 | int err = 0; | 543 | int err = 0; |
544 | 544 | ||
545 | spin_lock(lock); | 545 | spin_lock(lock); |
546 | repeat: | 546 | repeat: |
547 | list_for_each_prev(p, list) { | 547 | list_for_each_prev(p, list) { |
548 | bh = BH_ENTRY(p); | 548 | bh = BH_ENTRY(p); |
549 | if (buffer_locked(bh)) { | 549 | if (buffer_locked(bh)) { |
550 | get_bh(bh); | 550 | get_bh(bh); |
551 | spin_unlock(lock); | 551 | spin_unlock(lock); |
552 | wait_on_buffer(bh); | 552 | wait_on_buffer(bh); |
553 | if (!buffer_uptodate(bh)) | 553 | if (!buffer_uptodate(bh)) |
554 | err = -EIO; | 554 | err = -EIO; |
555 | brelse(bh); | 555 | brelse(bh); |
556 | spin_lock(lock); | 556 | spin_lock(lock); |
557 | goto repeat; | 557 | goto repeat; |
558 | } | 558 | } |
559 | } | 559 | } |
560 | spin_unlock(lock); | 560 | spin_unlock(lock); |
561 | return err; | 561 | return err; |
562 | } | 562 | } |
563 | 563 | ||
564 | static void do_thaw_one(struct super_block *sb, void *unused) | 564 | static void do_thaw_one(struct super_block *sb, void *unused) |
565 | { | 565 | { |
566 | char b[BDEVNAME_SIZE]; | 566 | char b[BDEVNAME_SIZE]; |
567 | while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) | 567 | while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) |
568 | printk(KERN_WARNING "Emergency Thaw on %s\n", | 568 | printk(KERN_WARNING "Emergency Thaw on %s\n", |
569 | bdevname(sb->s_bdev, b)); | 569 | bdevname(sb->s_bdev, b)); |
570 | } | 570 | } |
571 | 571 | ||
572 | static void do_thaw_all(struct work_struct *work) | 572 | static void do_thaw_all(struct work_struct *work) |
573 | { | 573 | { |
574 | iterate_supers(do_thaw_one, NULL); | 574 | iterate_supers(do_thaw_one, NULL); |
575 | kfree(work); | 575 | kfree(work); |
576 | printk(KERN_WARNING "Emergency Thaw complete\n"); | 576 | printk(KERN_WARNING "Emergency Thaw complete\n"); |
577 | } | 577 | } |
578 | 578 | ||
579 | /** | 579 | /** |
580 | * emergency_thaw_all -- forcibly thaw every frozen filesystem | 580 | * emergency_thaw_all -- forcibly thaw every frozen filesystem |
581 | * | 581 | * |
582 | * Used for emergency unfreeze of all filesystems via SysRq | 582 | * Used for emergency unfreeze of all filesystems via SysRq |
583 | */ | 583 | */ |
584 | void emergency_thaw_all(void) | 584 | void emergency_thaw_all(void) |
585 | { | 585 | { |
586 | struct work_struct *work; | 586 | struct work_struct *work; |
587 | 587 | ||
588 | work = kmalloc(sizeof(*work), GFP_ATOMIC); | 588 | work = kmalloc(sizeof(*work), GFP_ATOMIC); |
589 | if (work) { | 589 | if (work) { |
590 | INIT_WORK(work, do_thaw_all); | 590 | INIT_WORK(work, do_thaw_all); |
591 | schedule_work(work); | 591 | schedule_work(work); |
592 | } | 592 | } |
593 | } | 593 | } |
594 | 594 | ||
595 | /** | 595 | /** |
596 | * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers | 596 | * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers |
597 | * @mapping: the mapping which wants those buffers written | 597 | * @mapping: the mapping which wants those buffers written |
598 | * | 598 | * |
599 | * Starts I/O against the buffers at mapping->private_list, and waits upon | 599 | * Starts I/O against the buffers at mapping->private_list, and waits upon |
600 | * that I/O. | 600 | * that I/O. |
601 | * | 601 | * |
602 | * Basically, this is a convenience function for fsync(). | 602 | * Basically, this is a convenience function for fsync(). |
603 | * @mapping is a file or directory which needs those buffers to be written for | 603 | * @mapping is a file or directory which needs those buffers to be written for |
604 | * a successful fsync(). | 604 | * a successful fsync(). |
605 | */ | 605 | */ |
606 | int sync_mapping_buffers(struct address_space *mapping) | 606 | int sync_mapping_buffers(struct address_space *mapping) |
607 | { | 607 | { |
608 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 608 | struct address_space *buffer_mapping = mapping->assoc_mapping; |
609 | 609 | ||
610 | if (buffer_mapping == NULL || list_empty(&mapping->private_list)) | 610 | if (buffer_mapping == NULL || list_empty(&mapping->private_list)) |
611 | return 0; | 611 | return 0; |
612 | 612 | ||
613 | return fsync_buffers_list(&buffer_mapping->private_lock, | 613 | return fsync_buffers_list(&buffer_mapping->private_lock, |
614 | &mapping->private_list); | 614 | &mapping->private_list); |
615 | } | 615 | } |
616 | EXPORT_SYMBOL(sync_mapping_buffers); | 616 | EXPORT_SYMBOL(sync_mapping_buffers); |
617 | 617 | ||
618 | /* | 618 | /* |
619 | * Called when we've recently written block `bblock', and it is known that | 619 | * Called when we've recently written block `bblock', and it is known that |
620 | * `bblock' was for a buffer_boundary() buffer. This means that the block at | 620 | * `bblock' was for a buffer_boundary() buffer. This means that the block at |
621 | * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's | 621 | * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's |
622 | * dirty, schedule it for IO. So that indirects merge nicely with their data. | 622 | * dirty, schedule it for IO. So that indirects merge nicely with their data. |
623 | */ | 623 | */ |
624 | void write_boundary_block(struct block_device *bdev, | 624 | void write_boundary_block(struct block_device *bdev, |
625 | sector_t bblock, unsigned blocksize) | 625 | sector_t bblock, unsigned blocksize) |
626 | { | 626 | { |
627 | struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); | 627 | struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); |
628 | if (bh) { | 628 | if (bh) { |
629 | if (buffer_dirty(bh)) | 629 | if (buffer_dirty(bh)) |
630 | ll_rw_block(WRITE, 1, &bh); | 630 | ll_rw_block(WRITE, 1, &bh); |
631 | put_bh(bh); | 631 | put_bh(bh); |
632 | } | 632 | } |
633 | } | 633 | } |
634 | 634 | ||
635 | void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) | 635 | void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) |
636 | { | 636 | { |
637 | struct address_space *mapping = inode->i_mapping; | 637 | struct address_space *mapping = inode->i_mapping; |
638 | struct address_space *buffer_mapping = bh->b_page->mapping; | 638 | struct address_space *buffer_mapping = bh->b_page->mapping; |
639 | 639 | ||
640 | mark_buffer_dirty(bh); | 640 | mark_buffer_dirty(bh); |
641 | if (!mapping->assoc_mapping) { | 641 | if (!mapping->assoc_mapping) { |
642 | mapping->assoc_mapping = buffer_mapping; | 642 | mapping->assoc_mapping = buffer_mapping; |
643 | } else { | 643 | } else { |
644 | BUG_ON(mapping->assoc_mapping != buffer_mapping); | 644 | BUG_ON(mapping->assoc_mapping != buffer_mapping); |
645 | } | 645 | } |
646 | if (!bh->b_assoc_map) { | 646 | if (!bh->b_assoc_map) { |
647 | spin_lock(&buffer_mapping->private_lock); | 647 | spin_lock(&buffer_mapping->private_lock); |
648 | list_move_tail(&bh->b_assoc_buffers, | 648 | list_move_tail(&bh->b_assoc_buffers, |
649 | &mapping->private_list); | 649 | &mapping->private_list); |
650 | bh->b_assoc_map = mapping; | 650 | bh->b_assoc_map = mapping; |
651 | spin_unlock(&buffer_mapping->private_lock); | 651 | spin_unlock(&buffer_mapping->private_lock); |
652 | } | 652 | } |
653 | } | 653 | } |
654 | EXPORT_SYMBOL(mark_buffer_dirty_inode); | 654 | EXPORT_SYMBOL(mark_buffer_dirty_inode); |
655 | 655 | ||
656 | /* | 656 | /* |
657 | * Mark the page dirty, and set it dirty in the radix tree, and mark the inode | 657 | * Mark the page dirty, and set it dirty in the radix tree, and mark the inode |
658 | * dirty. | 658 | * dirty. |
659 | * | 659 | * |
660 | * If warn is true, then emit a warning if the page is not uptodate and has | 660 | * If warn is true, then emit a warning if the page is not uptodate and has |
661 | * not been truncated. | 661 | * not been truncated. |
662 | */ | 662 | */ |
663 | static void __set_page_dirty(struct page *page, | 663 | static void __set_page_dirty(struct page *page, |
664 | struct address_space *mapping, int warn) | 664 | struct address_space *mapping, int warn) |
665 | { | 665 | { |
666 | spin_lock_irq(&mapping->tree_lock); | 666 | spin_lock_irq(&mapping->tree_lock); |
667 | if (page->mapping) { /* Race with truncate? */ | 667 | if (page->mapping) { /* Race with truncate? */ |
668 | WARN_ON_ONCE(warn && !PageUptodate(page)); | 668 | WARN_ON_ONCE(warn && !PageUptodate(page)); |
669 | account_page_dirtied(page, mapping); | 669 | account_page_dirtied(page, mapping); |
670 | radix_tree_tag_set(&mapping->page_tree, | 670 | radix_tree_tag_set(&mapping->page_tree, |
671 | page_index(page), PAGECACHE_TAG_DIRTY); | 671 | page_index(page), PAGECACHE_TAG_DIRTY); |
672 | } | 672 | } |
673 | spin_unlock_irq(&mapping->tree_lock); | 673 | spin_unlock_irq(&mapping->tree_lock); |
674 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 674 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
675 | } | 675 | } |
676 | 676 | ||
677 | /* | 677 | /* |
678 | * Add a page to the dirty page list. | 678 | * Add a page to the dirty page list. |
679 | * | 679 | * |
680 | * It is a sad fact of life that this function is called from several places | 680 | * It is a sad fact of life that this function is called from several places |
681 | * deeply under spinlocking. It may not sleep. | 681 | * deeply under spinlocking. It may not sleep. |
682 | * | 682 | * |
683 | * If the page has buffers, the uptodate buffers are set dirty, to preserve | 683 | * If the page has buffers, the uptodate buffers are set dirty, to preserve |
684 | * dirty-state coherency between the page and the buffers. It the page does | 684 | * dirty-state coherency between the page and the buffers. It the page does |
685 | * not have buffers then when they are later attached they will all be set | 685 | * not have buffers then when they are later attached they will all be set |
686 | * dirty. | 686 | * dirty. |
687 | * | 687 | * |
688 | * The buffers are dirtied before the page is dirtied. There's a small race | 688 | * The buffers are dirtied before the page is dirtied. There's a small race |
689 | * window in which a writepage caller may see the page cleanness but not the | 689 | * window in which a writepage caller may see the page cleanness but not the |
690 | * buffer dirtiness. That's fine. If this code were to set the page dirty | 690 | * buffer dirtiness. That's fine. If this code were to set the page dirty |
691 | * before the buffers, a concurrent writepage caller could clear the page dirty | 691 | * before the buffers, a concurrent writepage caller could clear the page dirty |
692 | * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean | 692 | * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean |
693 | * page on the dirty page list. | 693 | * page on the dirty page list. |
694 | * | 694 | * |
695 | * We use private_lock to lock against try_to_free_buffers while using the | 695 | * We use private_lock to lock against try_to_free_buffers while using the |
696 | * page's buffer list. Also use this to protect against clean buffers being | 696 | * page's buffer list. Also use this to protect against clean buffers being |
697 | * added to the page after it was set dirty. | 697 | * added to the page after it was set dirty. |
698 | * | 698 | * |
699 | * FIXME: may need to call ->reservepage here as well. That's rather up to the | 699 | * FIXME: may need to call ->reservepage here as well. That's rather up to the |
700 | * address_space though. | 700 | * address_space though. |
701 | */ | 701 | */ |
702 | int __set_page_dirty_buffers(struct page *page) | 702 | int __set_page_dirty_buffers(struct page *page) |
703 | { | 703 | { |
704 | int newly_dirty; | 704 | int newly_dirty; |
705 | struct address_space *mapping = page_mapping(page); | 705 | struct address_space *mapping = page_mapping(page); |
706 | 706 | ||
707 | if (unlikely(!mapping)) | 707 | if (unlikely(!mapping)) |
708 | return !TestSetPageDirty(page); | 708 | return !TestSetPageDirty(page); |
709 | 709 | ||
710 | spin_lock(&mapping->private_lock); | 710 | spin_lock(&mapping->private_lock); |
711 | if (page_has_buffers(page)) { | 711 | if (page_has_buffers(page)) { |
712 | struct buffer_head *head = page_buffers(page); | 712 | struct buffer_head *head = page_buffers(page); |
713 | struct buffer_head *bh = head; | 713 | struct buffer_head *bh = head; |
714 | 714 | ||
715 | do { | 715 | do { |
716 | set_buffer_dirty(bh); | 716 | set_buffer_dirty(bh); |
717 | bh = bh->b_this_page; | 717 | bh = bh->b_this_page; |
718 | } while (bh != head); | 718 | } while (bh != head); |
719 | } | 719 | } |
720 | newly_dirty = !TestSetPageDirty(page); | 720 | newly_dirty = !TestSetPageDirty(page); |
721 | spin_unlock(&mapping->private_lock); | 721 | spin_unlock(&mapping->private_lock); |
722 | 722 | ||
723 | if (newly_dirty) | 723 | if (newly_dirty) |
724 | __set_page_dirty(page, mapping, 1); | 724 | __set_page_dirty(page, mapping, 1); |
725 | return newly_dirty; | 725 | return newly_dirty; |
726 | } | 726 | } |
727 | EXPORT_SYMBOL(__set_page_dirty_buffers); | 727 | EXPORT_SYMBOL(__set_page_dirty_buffers); |
728 | 728 | ||
729 | /* | 729 | /* |
730 | * Write out and wait upon a list of buffers. | 730 | * Write out and wait upon a list of buffers. |
731 | * | 731 | * |
732 | * We have conflicting pressures: we want to make sure that all | 732 | * We have conflicting pressures: we want to make sure that all |
733 | * initially dirty buffers get waited on, but that any subsequently | 733 | * initially dirty buffers get waited on, but that any subsequently |
734 | * dirtied buffers don't. After all, we don't want fsync to last | 734 | * dirtied buffers don't. After all, we don't want fsync to last |
735 | * forever if somebody is actively writing to the file. | 735 | * forever if somebody is actively writing to the file. |
736 | * | 736 | * |
737 | * Do this in two main stages: first we copy dirty buffers to a | 737 | * Do this in two main stages: first we copy dirty buffers to a |
738 | * temporary inode list, queueing the writes as we go. Then we clean | 738 | * temporary inode list, queueing the writes as we go. Then we clean |
739 | * up, waiting for those writes to complete. | 739 | * up, waiting for those writes to complete. |
740 | * | 740 | * |
741 | * During this second stage, any subsequent updates to the file may end | 741 | * During this second stage, any subsequent updates to the file may end |
742 | * up refiling the buffer on the original inode's dirty list again, so | 742 | * up refiling the buffer on the original inode's dirty list again, so |
743 | * there is a chance we will end up with a buffer queued for write but | 743 | * there is a chance we will end up with a buffer queued for write but |
744 | * not yet completed on that list. So, as a final cleanup we go through | 744 | * not yet completed on that list. So, as a final cleanup we go through |
745 | * the osync code to catch these locked, dirty buffers without requeuing | 745 | * the osync code to catch these locked, dirty buffers without requeuing |
746 | * any newly dirty buffers for write. | 746 | * any newly dirty buffers for write. |
747 | */ | 747 | */ |
748 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) | 748 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) |
749 | { | 749 | { |
750 | struct buffer_head *bh; | 750 | struct buffer_head *bh; |
751 | struct list_head tmp; | 751 | struct list_head tmp; |
752 | struct address_space *mapping, *prev_mapping = NULL; | 752 | struct address_space *mapping, *prev_mapping = NULL; |
753 | int err = 0, err2; | 753 | int err = 0, err2; |
754 | 754 | ||
755 | INIT_LIST_HEAD(&tmp); | 755 | INIT_LIST_HEAD(&tmp); |
756 | 756 | ||
757 | spin_lock(lock); | 757 | spin_lock(lock); |
758 | while (!list_empty(list)) { | 758 | while (!list_empty(list)) { |
759 | bh = BH_ENTRY(list->next); | 759 | bh = BH_ENTRY(list->next); |
760 | mapping = bh->b_assoc_map; | 760 | mapping = bh->b_assoc_map; |
761 | __remove_assoc_queue(bh); | 761 | __remove_assoc_queue(bh); |
762 | /* Avoid race with mark_buffer_dirty_inode() which does | 762 | /* Avoid race with mark_buffer_dirty_inode() which does |
763 | * a lockless check and we rely on seeing the dirty bit */ | 763 | * a lockless check and we rely on seeing the dirty bit */ |
764 | smp_mb(); | 764 | smp_mb(); |
765 | if (buffer_dirty(bh) || buffer_locked(bh)) { | 765 | if (buffer_dirty(bh) || buffer_locked(bh)) { |
766 | list_add(&bh->b_assoc_buffers, &tmp); | 766 | list_add(&bh->b_assoc_buffers, &tmp); |
767 | bh->b_assoc_map = mapping; | 767 | bh->b_assoc_map = mapping; |
768 | if (buffer_dirty(bh)) { | 768 | if (buffer_dirty(bh)) { |
769 | get_bh(bh); | 769 | get_bh(bh); |
770 | spin_unlock(lock); | 770 | spin_unlock(lock); |
771 | /* | 771 | /* |
772 | * Ensure any pending I/O completes so that | 772 | * Ensure any pending I/O completes so that |
773 | * ll_rw_block() actually writes the current | 773 | * ll_rw_block() actually writes the current |
774 | * contents - it is a noop if I/O is still in | 774 | * contents - it is a noop if I/O is still in |
775 | * flight on potentially older contents. | 775 | * flight on potentially older contents. |
776 | */ | 776 | */ |
777 | ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); | 777 | ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); |
778 | 778 | ||
779 | /* | 779 | /* |
780 | * Kick off IO for the previous mapping. Note | 780 | * Kick off IO for the previous mapping. Note |
781 | * that we will not run the very last mapping, | 781 | * that we will not run the very last mapping, |
782 | * wait_on_buffer() will do that for us | 782 | * wait_on_buffer() will do that for us |
783 | * through sync_buffer(). | 783 | * through sync_buffer(). |
784 | */ | 784 | */ |
785 | if (prev_mapping && prev_mapping != mapping) | 785 | if (prev_mapping && prev_mapping != mapping) |
786 | blk_run_address_space(prev_mapping); | 786 | blk_run_address_space(prev_mapping); |
787 | prev_mapping = mapping; | 787 | prev_mapping = mapping; |
788 | 788 | ||
789 | brelse(bh); | 789 | brelse(bh); |
790 | spin_lock(lock); | 790 | spin_lock(lock); |
791 | } | 791 | } |
792 | } | 792 | } |
793 | } | 793 | } |
794 | 794 | ||
795 | while (!list_empty(&tmp)) { | 795 | while (!list_empty(&tmp)) { |
796 | bh = BH_ENTRY(tmp.prev); | 796 | bh = BH_ENTRY(tmp.prev); |
797 | get_bh(bh); | 797 | get_bh(bh); |
798 | mapping = bh->b_assoc_map; | 798 | mapping = bh->b_assoc_map; |
799 | __remove_assoc_queue(bh); | 799 | __remove_assoc_queue(bh); |
800 | /* Avoid race with mark_buffer_dirty_inode() which does | 800 | /* Avoid race with mark_buffer_dirty_inode() which does |
801 | * a lockless check and we rely on seeing the dirty bit */ | 801 | * a lockless check and we rely on seeing the dirty bit */ |
802 | smp_mb(); | 802 | smp_mb(); |
803 | if (buffer_dirty(bh)) { | 803 | if (buffer_dirty(bh)) { |
804 | list_add(&bh->b_assoc_buffers, | 804 | list_add(&bh->b_assoc_buffers, |
805 | &mapping->private_list); | 805 | &mapping->private_list); |
806 | bh->b_assoc_map = mapping; | 806 | bh->b_assoc_map = mapping; |
807 | } | 807 | } |
808 | spin_unlock(lock); | 808 | spin_unlock(lock); |
809 | wait_on_buffer(bh); | 809 | wait_on_buffer(bh); |
810 | if (!buffer_uptodate(bh)) | 810 | if (!buffer_uptodate(bh)) |
811 | err = -EIO; | 811 | err = -EIO; |
812 | brelse(bh); | 812 | brelse(bh); |
813 | spin_lock(lock); | 813 | spin_lock(lock); |
814 | } | 814 | } |
815 | 815 | ||
816 | spin_unlock(lock); | 816 | spin_unlock(lock); |
817 | err2 = osync_buffers_list(lock, list); | 817 | err2 = osync_buffers_list(lock, list); |
818 | if (err) | 818 | if (err) |
819 | return err; | 819 | return err; |
820 | else | 820 | else |
821 | return err2; | 821 | return err2; |
822 | } | 822 | } |
823 | 823 | ||
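For context, fsync_buffers_list() is normally reached through sync_mapping_buffers(), which a filesystem calls on its fsync path once per-inode metadata buffers have been queued with mark_buffer_dirty_inode(). A minimal sketch, with a hypothetical myfs_* helper that is not part of this commit:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Hypothetical helper: flush an inode's associated metadata buffers.
 * sync_mapping_buffers() ends up in fsync_buffers_list() whenever the
 * mapping has buffers queued on its private_list. */
static int myfs_sync_metadata(struct inode *inode)
{
	return sync_mapping_buffers(inode->i_mapping);
}
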
824 | /* | 824 | /* |
825 | * Invalidate any and all dirty buffers on a given inode. We are | 825 | * Invalidate any and all dirty buffers on a given inode. We are |
826 | * probably unmounting the fs, but that doesn't mean we have already | 826 | * probably unmounting the fs, but that doesn't mean we have already |
827 | * done a sync(). Just drop the buffers from the inode list. | 827 | * done a sync(). Just drop the buffers from the inode list. |
828 | * | 828 | * |
829 | * NOTE: we take the inode's blockdev's mapping's private_lock. Which | 829 | * NOTE: we take the inode's blockdev's mapping's private_lock. Which |
830 | * assumes that all the buffers are against the blockdev. Not true | 830 | * assumes that all the buffers are against the blockdev. Not true |
831 | * for reiserfs. | 831 | * for reiserfs. |
832 | */ | 832 | */ |
833 | void invalidate_inode_buffers(struct inode *inode) | 833 | void invalidate_inode_buffers(struct inode *inode) |
834 | { | 834 | { |
835 | if (inode_has_buffers(inode)) { | 835 | if (inode_has_buffers(inode)) { |
836 | struct address_space *mapping = &inode->i_data; | 836 | struct address_space *mapping = &inode->i_data; |
837 | struct list_head *list = &mapping->private_list; | 837 | struct list_head *list = &mapping->private_list; |
838 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 838 | struct address_space *buffer_mapping = mapping->assoc_mapping; |
839 | 839 | ||
840 | spin_lock(&buffer_mapping->private_lock); | 840 | spin_lock(&buffer_mapping->private_lock); |
841 | while (!list_empty(list)) | 841 | while (!list_empty(list)) |
842 | __remove_assoc_queue(BH_ENTRY(list->next)); | 842 | __remove_assoc_queue(BH_ENTRY(list->next)); |
843 | spin_unlock(&buffer_mapping->private_lock); | 843 | spin_unlock(&buffer_mapping->private_lock); |
844 | } | 844 | } |
845 | } | 845 | } |
846 | EXPORT_SYMBOL(invalidate_inode_buffers); | 846 | EXPORT_SYMBOL(invalidate_inode_buffers); |
847 | 847 | ||
848 | /* | 848 | /* |
849 | * Remove any clean buffers from the inode's buffer list. This is called | 849 | * Remove any clean buffers from the inode's buffer list. This is called |
850 | * when we're trying to free the inode itself. Those buffers can pin it. | 850 | * when we're trying to free the inode itself. Those buffers can pin it. |
851 | * | 851 | * |
852 | * Returns true if all buffers were removed. | 852 | * Returns true if all buffers were removed. |
853 | */ | 853 | */ |
854 | int remove_inode_buffers(struct inode *inode) | 854 | int remove_inode_buffers(struct inode *inode) |
855 | { | 855 | { |
856 | int ret = 1; | 856 | int ret = 1; |
857 | 857 | ||
858 | if (inode_has_buffers(inode)) { | 858 | if (inode_has_buffers(inode)) { |
859 | struct address_space *mapping = &inode->i_data; | 859 | struct address_space *mapping = &inode->i_data; |
860 | struct list_head *list = &mapping->private_list; | 860 | struct list_head *list = &mapping->private_list; |
861 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 861 | struct address_space *buffer_mapping = mapping->assoc_mapping; |
862 | 862 | ||
863 | spin_lock(&buffer_mapping->private_lock); | 863 | spin_lock(&buffer_mapping->private_lock); |
864 | while (!list_empty(list)) { | 864 | while (!list_empty(list)) { |
865 | struct buffer_head *bh = BH_ENTRY(list->next); | 865 | struct buffer_head *bh = BH_ENTRY(list->next); |
866 | if (buffer_dirty(bh)) { | 866 | if (buffer_dirty(bh)) { |
867 | ret = 0; | 867 | ret = 0; |
868 | break; | 868 | break; |
869 | } | 869 | } |
870 | __remove_assoc_queue(bh); | 870 | __remove_assoc_queue(bh); |
871 | } | 871 | } |
872 | spin_unlock(&buffer_mapping->private_lock); | 872 | spin_unlock(&buffer_mapping->private_lock); |
873 | } | 873 | } |
874 | return ret; | 874 | return ret; |
875 | } | 875 | } |
876 | 876 | ||
877 | /* | 877 | /* |
878 | * Create the appropriate buffers when given a page for data area and | 878 | * Create the appropriate buffers when given a page for data area and |
879 | * the size of each buffer.. Use the bh->b_this_page linked list to | 879 | * the size of each buffer.. Use the bh->b_this_page linked list to |
880 | * follow the buffers created. Return NULL if unable to create more | 880 | * follow the buffers created. Return NULL if unable to create more |
881 | * buffers. | 881 | * buffers. |
882 | * | 882 | * |
883 | * The retry flag is used to differentiate async IO (paging, swapping) | 883 | * The retry flag is used to differentiate async IO (paging, swapping) |
884 | * which may not fail, from ordinary buffer allocations. | 884 | * which may not fail, from ordinary buffer allocations. |
885 | */ | 885 | */ |
886 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, | 886 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, |
887 | int retry) | 887 | int retry) |
888 | { | 888 | { |
889 | struct buffer_head *bh, *head; | 889 | struct buffer_head *bh, *head; |
890 | long offset; | 890 | long offset; |
891 | 891 | ||
892 | try_again: | 892 | try_again: |
893 | head = NULL; | 893 | head = NULL; |
894 | offset = PAGE_SIZE; | 894 | offset = PAGE_SIZE; |
895 | while ((offset -= size) >= 0) { | 895 | while ((offset -= size) >= 0) { |
896 | bh = alloc_buffer_head(GFP_NOFS); | 896 | bh = alloc_buffer_head(GFP_NOFS); |
897 | if (!bh) | 897 | if (!bh) |
898 | goto no_grow; | 898 | goto no_grow; |
899 | 899 | ||
900 | bh->b_bdev = NULL; | 900 | bh->b_bdev = NULL; |
901 | bh->b_this_page = head; | 901 | bh->b_this_page = head; |
902 | bh->b_blocknr = -1; | 902 | bh->b_blocknr = -1; |
903 | head = bh; | 903 | head = bh; |
904 | 904 | ||
905 | bh->b_state = 0; | 905 | bh->b_state = 0; |
906 | atomic_set(&bh->b_count, 0); | 906 | atomic_set(&bh->b_count, 0); |
907 | bh->b_private = NULL; | 907 | bh->b_private = NULL; |
908 | bh->b_size = size; | 908 | bh->b_size = size; |
909 | 909 | ||
910 | /* Link the buffer to its page */ | 910 | /* Link the buffer to its page */ |
911 | set_bh_page(bh, page, offset); | 911 | set_bh_page(bh, page, offset); |
912 | 912 | ||
913 | init_buffer(bh, NULL, NULL); | 913 | init_buffer(bh, NULL, NULL); |
914 | } | 914 | } |
915 | return head; | 915 | return head; |
916 | /* | 916 | /* |
917 | * In case anything failed, we just free everything we got. | 917 | * In case anything failed, we just free everything we got. |
918 | */ | 918 | */ |
919 | no_grow: | 919 | no_grow: |
920 | if (head) { | 920 | if (head) { |
921 | do { | 921 | do { |
922 | bh = head; | 922 | bh = head; |
923 | head = head->b_this_page; | 923 | head = head->b_this_page; |
924 | free_buffer_head(bh); | 924 | free_buffer_head(bh); |
925 | } while (head); | 925 | } while (head); |
926 | } | 926 | } |
927 | 927 | ||
928 | /* | 928 | /* |
929 | * Return failure for non-async IO requests. Async IO requests | 929 | * Return failure for non-async IO requests. Async IO requests |
930 | * are not allowed to fail, so we have to wait until buffer heads | 930 | * are not allowed to fail, so we have to wait until buffer heads |
931 | * become available. But we don't want tasks sleeping with | 931 | * become available. But we don't want tasks sleeping with |
932 | * partially complete buffers, so all were released above. | 932 | * partially complete buffers, so all were released above. |
933 | */ | 933 | */ |
934 | if (!retry) | 934 | if (!retry) |
935 | return NULL; | 935 | return NULL; |
936 | 936 | ||
937 | /* We're _really_ low on memory. Now we just | 937 | /* We're _really_ low on memory. Now we just |
938 | * wait for old buffer heads to become free due to | 938 | * wait for old buffer heads to become free due to |
939 | * finishing IO. Since this is an async request and | 939 | * finishing IO. Since this is an async request and |
940 | * the reserve list is empty, we're sure there are | 940 | * the reserve list is empty, we're sure there are |
941 | * async buffer heads in use. | 941 | * async buffer heads in use. |
942 | */ | 942 | */ |
943 | free_more_memory(); | 943 | free_more_memory(); |
944 | goto try_again; | 944 | goto try_again; |
945 | } | 945 | } |
946 | EXPORT_SYMBOL_GPL(alloc_page_buffers); | 946 | EXPORT_SYMBOL_GPL(alloc_page_buffers); |
947 | 947 | ||
948 | static inline void | 948 | static inline void |
949 | link_dev_buffers(struct page *page, struct buffer_head *head) | 949 | link_dev_buffers(struct page *page, struct buffer_head *head) |
950 | { | 950 | { |
951 | struct buffer_head *bh, *tail; | 951 | struct buffer_head *bh, *tail; |
952 | 952 | ||
953 | bh = head; | 953 | bh = head; |
954 | do { | 954 | do { |
955 | tail = bh; | 955 | tail = bh; |
956 | bh = bh->b_this_page; | 956 | bh = bh->b_this_page; |
957 | } while (bh); | 957 | } while (bh); |
958 | tail->b_this_page = head; | 958 | tail->b_this_page = head; |
959 | attach_page_buffers(page, head); | 959 | attach_page_buffers(page, head); |
960 | } | 960 | } |
961 | 961 | ||
962 | /* | 962 | /* |
963 | * Initialise the state of a blockdev page's buffers. | 963 | * Initialise the state of a blockdev page's buffers. |
964 | */ | 964 | */ |
965 | static void | 965 | static void |
966 | init_page_buffers(struct page *page, struct block_device *bdev, | 966 | init_page_buffers(struct page *page, struct block_device *bdev, |
967 | sector_t block, int size) | 967 | sector_t block, int size) |
968 | { | 968 | { |
969 | struct buffer_head *head = page_buffers(page); | 969 | struct buffer_head *head = page_buffers(page); |
970 | struct buffer_head *bh = head; | 970 | struct buffer_head *bh = head; |
971 | int uptodate = PageUptodate(page); | 971 | int uptodate = PageUptodate(page); |
972 | 972 | ||
973 | do { | 973 | do { |
974 | if (!buffer_mapped(bh)) { | 974 | if (!buffer_mapped(bh)) { |
975 | init_buffer(bh, NULL, NULL); | 975 | init_buffer(bh, NULL, NULL); |
976 | bh->b_bdev = bdev; | 976 | bh->b_bdev = bdev; |
977 | bh->b_blocknr = block; | 977 | bh->b_blocknr = block; |
978 | if (uptodate) | 978 | if (uptodate) |
979 | set_buffer_uptodate(bh); | 979 | set_buffer_uptodate(bh); |
980 | set_buffer_mapped(bh); | 980 | set_buffer_mapped(bh); |
981 | } | 981 | } |
982 | block++; | 982 | block++; |
983 | bh = bh->b_this_page; | 983 | bh = bh->b_this_page; |
984 | } while (bh != head); | 984 | } while (bh != head); |
985 | } | 985 | } |
986 | 986 | ||
987 | /* | 987 | /* |
988 | * Create the page-cache page that contains the requested block. | 988 | * Create the page-cache page that contains the requested block. |
989 | * | 989 | * |
990 | * This is used purely for blockdev mappings. | 990 | * This is used purely for blockdev mappings. |
991 | */ | 991 | */ |
992 | static struct page * | 992 | static struct page * |
993 | grow_dev_page(struct block_device *bdev, sector_t block, | 993 | grow_dev_page(struct block_device *bdev, sector_t block, |
994 | pgoff_t index, int size) | 994 | pgoff_t index, int size) |
995 | { | 995 | { |
996 | struct inode *inode = bdev->bd_inode; | 996 | struct inode *inode = bdev->bd_inode; |
997 | struct page *page; | 997 | struct page *page; |
998 | struct buffer_head *bh; | 998 | struct buffer_head *bh; |
999 | 999 | ||
1000 | page = find_or_create_page(inode->i_mapping, index, | 1000 | page = find_or_create_page(inode->i_mapping, index, |
1001 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); | 1001 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); |
1002 | if (!page) | 1002 | if (!page) |
1003 | return NULL; | 1003 | return NULL; |
1004 | 1004 | ||
1005 | BUG_ON(!PageLocked(page)); | 1005 | BUG_ON(!PageLocked(page)); |
1006 | 1006 | ||
1007 | if (page_has_buffers(page)) { | 1007 | if (page_has_buffers(page)) { |
1008 | bh = page_buffers(page); | 1008 | bh = page_buffers(page); |
1009 | if (bh->b_size == size) { | 1009 | if (bh->b_size == size) { |
1010 | init_page_buffers(page, bdev, block, size); | 1010 | init_page_buffers(page, bdev, block, size); |
1011 | return page; | 1011 | return page; |
1012 | } | 1012 | } |
1013 | if (!try_to_free_buffers(page)) | 1013 | if (!try_to_free_buffers(page)) |
1014 | goto failed; | 1014 | goto failed; |
1015 | } | 1015 | } |
1016 | 1016 | ||
1017 | /* | 1017 | /* |
1018 | * Allocate some buffers for this page | 1018 | * Allocate some buffers for this page |
1019 | */ | 1019 | */ |
1020 | bh = alloc_page_buffers(page, size, 0); | 1020 | bh = alloc_page_buffers(page, size, 0); |
1021 | if (!bh) | 1021 | if (!bh) |
1022 | goto failed; | 1022 | goto failed; |
1023 | 1023 | ||
1024 | /* | 1024 | /* |
1025 | * Link the page to the buffers and initialise them. Take the | 1025 | * Link the page to the buffers and initialise them. Take the |
1026 | * lock to be atomic wrt __find_get_block(), which does not | 1026 | * lock to be atomic wrt __find_get_block(), which does not |
1027 | * run under the page lock. | 1027 | * run under the page lock. |
1028 | */ | 1028 | */ |
1029 | spin_lock(&inode->i_mapping->private_lock); | 1029 | spin_lock(&inode->i_mapping->private_lock); |
1030 | link_dev_buffers(page, bh); | 1030 | link_dev_buffers(page, bh); |
1031 | init_page_buffers(page, bdev, block, size); | 1031 | init_page_buffers(page, bdev, block, size); |
1032 | spin_unlock(&inode->i_mapping->private_lock); | 1032 | spin_unlock(&inode->i_mapping->private_lock); |
1033 | return page; | 1033 | return page; |
1034 | 1034 | ||
1035 | failed: | 1035 | failed: |
1036 | BUG(); | 1036 | BUG(); |
1037 | unlock_page(page); | 1037 | unlock_page(page); |
1038 | page_cache_release(page); | 1038 | page_cache_release(page); |
1039 | return NULL; | 1039 | return NULL; |
1040 | } | 1040 | } |
1041 | 1041 | ||
1042 | /* | 1042 | /* |
1043 | * Create buffers for the specified block device block's page. If | 1043 | * Create buffers for the specified block device block's page. If |
1044 | * that page was dirty, the buffers are set dirty also. | 1044 | * that page was dirty, the buffers are set dirty also. |
1045 | */ | 1045 | */ |
1046 | static int | 1046 | static int |
1047 | grow_buffers(struct block_device *bdev, sector_t block, int size) | 1047 | grow_buffers(struct block_device *bdev, sector_t block, int size) |
1048 | { | 1048 | { |
1049 | struct page *page; | 1049 | struct page *page; |
1050 | pgoff_t index; | 1050 | pgoff_t index; |
1051 | int sizebits; | 1051 | int sizebits; |
1052 | 1052 | ||
1053 | sizebits = -1; | 1053 | sizebits = -1; |
1054 | do { | 1054 | do { |
1055 | sizebits++; | 1055 | sizebits++; |
1056 | } while ((size << sizebits) < PAGE_SIZE); | 1056 | } while ((size << sizebits) < PAGE_SIZE); |
1057 | 1057 | ||
1058 | index = block >> sizebits; | 1058 | index = block >> sizebits; |
1059 | 1059 | ||
1060 | /* | 1060 | /* |
1061 | * Check for a block which wants to lie outside our maximum possible | 1061 | * Check for a block which wants to lie outside our maximum possible |
1062 | * pagecache index. (this comparison is done using sector_t types). | 1062 | * pagecache index. (this comparison is done using sector_t types). |
1063 | */ | 1063 | */ |
1064 | if (unlikely(index != block >> sizebits)) { | 1064 | if (unlikely(index != block >> sizebits)) { |
1065 | char b[BDEVNAME_SIZE]; | 1065 | char b[BDEVNAME_SIZE]; |
1066 | 1066 | ||
1067 | printk(KERN_ERR "%s: requested out-of-range block %llu for " | 1067 | printk(KERN_ERR "%s: requested out-of-range block %llu for " |
1068 | "device %s\n", | 1068 | "device %s\n", |
1069 | __func__, (unsigned long long)block, | 1069 | __func__, (unsigned long long)block, |
1070 | bdevname(bdev, b)); | 1070 | bdevname(bdev, b)); |
1071 | return -EIO; | 1071 | return -EIO; |
1072 | } | 1072 | } |
1073 | block = index << sizebits; | 1073 | block = index << sizebits; |
1074 | /* Create a page with the proper size buffers.. */ | 1074 | /* Create a page with the proper size buffers.. */ |
1075 | page = grow_dev_page(bdev, block, index, size); | 1075 | page = grow_dev_page(bdev, block, index, size); |
1076 | if (!page) | 1076 | if (!page) |
1077 | return 0; | 1077 | return 0; |
1078 | unlock_page(page); | 1078 | unlock_page(page); |
1079 | page_cache_release(page); | 1079 | page_cache_release(page); |
1080 | return 1; | 1080 | return 1; |
1081 | } | 1081 | } |
1082 | 1082 | ||
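As a worked example of the index arithmetic in grow_buffers(): with a 1024-byte block size on a 4096-byte page the loop leaves sizebits at 2, so block 1000 lands in page index 1000 >> 2 = 250, and the first block of that page is recomputed as 250 << 2 = 1000. The out-of-range check works because index is a pgoff_t: if block >> sizebits does not fit, the truncated index no longer equals the full sector_t shift and the request is rejected with -EIO.
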
1083 | static struct buffer_head * | 1083 | static struct buffer_head * |
1084 | __getblk_slow(struct block_device *bdev, sector_t block, int size) | 1084 | __getblk_slow(struct block_device *bdev, sector_t block, int size) |
1085 | { | 1085 | { |
1086 | /* Size must be multiple of hard sectorsize */ | 1086 | /* Size must be multiple of hard sectorsize */ |
1087 | if (unlikely(size & (bdev_logical_block_size(bdev)-1) || | 1087 | if (unlikely(size & (bdev_logical_block_size(bdev)-1) || |
1088 | (size < 512 || size > PAGE_SIZE))) { | 1088 | (size < 512 || size > PAGE_SIZE))) { |
1089 | printk(KERN_ERR "getblk(): invalid block size %d requested\n", | 1089 | printk(KERN_ERR "getblk(): invalid block size %d requested\n", |
1090 | size); | 1090 | size); |
1091 | printk(KERN_ERR "logical block size: %d\n", | 1091 | printk(KERN_ERR "logical block size: %d\n", |
1092 | bdev_logical_block_size(bdev)); | 1092 | bdev_logical_block_size(bdev)); |
1093 | 1093 | ||
1094 | dump_stack(); | 1094 | dump_stack(); |
1095 | return NULL; | 1095 | return NULL; |
1096 | } | 1096 | } |
1097 | 1097 | ||
1098 | for (;;) { | 1098 | for (;;) { |
1099 | struct buffer_head * bh; | 1099 | struct buffer_head * bh; |
1100 | int ret; | 1100 | int ret; |
1101 | 1101 | ||
1102 | bh = __find_get_block(bdev, block, size); | 1102 | bh = __find_get_block(bdev, block, size); |
1103 | if (bh) | 1103 | if (bh) |
1104 | return bh; | 1104 | return bh; |
1105 | 1105 | ||
1106 | ret = grow_buffers(bdev, block, size); | 1106 | ret = grow_buffers(bdev, block, size); |
1107 | if (ret < 0) | 1107 | if (ret < 0) |
1108 | return NULL; | 1108 | return NULL; |
1109 | if (ret == 0) | 1109 | if (ret == 0) |
1110 | free_more_memory(); | 1110 | free_more_memory(); |
1111 | } | 1111 | } |
1112 | } | 1112 | } |
1113 | 1113 | ||
1114 | /* | 1114 | /* |
1115 | * The relationship between dirty buffers and dirty pages: | 1115 | * The relationship between dirty buffers and dirty pages: |
1116 | * | 1116 | * |
1117 | * Whenever a page has any dirty buffers, the page's dirty bit is set, and | 1117 | * Whenever a page has any dirty buffers, the page's dirty bit is set, and |
1118 | * the page is tagged dirty in its radix tree. | 1118 | * the page is tagged dirty in its radix tree. |
1119 | * | 1119 | * |
1120 | * At all times, the dirtiness of the buffers represents the dirtiness of | 1120 | * At all times, the dirtiness of the buffers represents the dirtiness of |
1121 | * subsections of the page. If the page has buffers, the page dirty bit is | 1121 | * subsections of the page. If the page has buffers, the page dirty bit is |
1122 | * merely a hint about the true dirty state. | 1122 | * merely a hint about the true dirty state. |
1123 | * | 1123 | * |
1124 | * When a page is set dirty in its entirety, all its buffers are marked dirty | 1124 | * When a page is set dirty in its entirety, all its buffers are marked dirty |
1125 | * (if the page has buffers). | 1125 | * (if the page has buffers). |
1126 | * | 1126 | * |
1127 | * When a buffer is marked dirty, its page is dirtied, but the page's other | 1127 | * When a buffer is marked dirty, its page is dirtied, but the page's other |
1128 | * buffers are not. | 1128 | * buffers are not. |
1129 | * | 1129 | * |
1130 | * Also. When blockdev buffers are explicitly read with bread(), they | 1130 | * Also. When blockdev buffers are explicitly read with bread(), they |
1131 | * individually become uptodate. But their backing page remains not | 1131 | * individually become uptodate. But their backing page remains not |
1132 | * uptodate - even if all of its buffers are uptodate. A subsequent | 1132 | * uptodate - even if all of its buffers are uptodate. A subsequent |
1133 | * block_read_full_page() against that page will discover all the uptodate | 1133 | * block_read_full_page() against that page will discover all the uptodate |
1134 | * buffers, will set the page uptodate and will perform no I/O. | 1134 | * buffers, will set the page uptodate and will perform no I/O. |
1135 | */ | 1135 | */ |
1136 | 1136 | ||
1137 | /** | 1137 | /** |
1138 | * mark_buffer_dirty - mark a buffer_head as needing writeout | 1138 | * mark_buffer_dirty - mark a buffer_head as needing writeout |
1139 | * @bh: the buffer_head to mark dirty | 1139 | * @bh: the buffer_head to mark dirty |
1140 | * | 1140 | * |
1141 | * mark_buffer_dirty() will set the dirty bit against the buffer, then set its | 1141 | * mark_buffer_dirty() will set the dirty bit against the buffer, then set its |
1142 | * backing page dirty, then tag the page as dirty in its address_space's radix | 1142 | * backing page dirty, then tag the page as dirty in its address_space's radix |
1143 | * tree and then attach the address_space's inode to its superblock's dirty | 1143 | * tree and then attach the address_space's inode to its superblock's dirty |
1144 | * inode list. | 1144 | * inode list. |
1145 | * | 1145 | * |
1146 | * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, | 1146 | * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, |
1147 | * mapping->tree_lock and the global inode_lock. | 1147 | * mapping->tree_lock and the global inode_lock. |
1148 | */ | 1148 | */ |
1149 | void mark_buffer_dirty(struct buffer_head *bh) | 1149 | void mark_buffer_dirty(struct buffer_head *bh) |
1150 | { | 1150 | { |
1151 | WARN_ON_ONCE(!buffer_uptodate(bh)); | 1151 | WARN_ON_ONCE(!buffer_uptodate(bh)); |
1152 | 1152 | ||
1153 | /* | 1153 | /* |
1154 | * Very *carefully* optimize the it-is-already-dirty case. | 1154 | * Very *carefully* optimize the it-is-already-dirty case. |
1155 | * | 1155 | * |
1156 | * Don't let the final "is it dirty" escape to before we | 1156 | * Don't let the final "is it dirty" escape to before we |
1157 | * perhaps modified the buffer. | 1157 | * perhaps modified the buffer. |
1158 | */ | 1158 | */ |
1159 | if (buffer_dirty(bh)) { | 1159 | if (buffer_dirty(bh)) { |
1160 | smp_mb(); | 1160 | smp_mb(); |
1161 | if (buffer_dirty(bh)) | 1161 | if (buffer_dirty(bh)) |
1162 | return; | 1162 | return; |
1163 | } | 1163 | } |
1164 | 1164 | ||
1165 | if (!test_set_buffer_dirty(bh)) { | 1165 | if (!test_set_buffer_dirty(bh)) { |
1166 | struct page *page = bh->b_page; | 1166 | struct page *page = bh->b_page; |
1167 | if (!TestSetPageDirty(page)) { | 1167 | if (!TestSetPageDirty(page)) { |
1168 | struct address_space *mapping = page_mapping(page); | 1168 | struct address_space *mapping = page_mapping(page); |
1169 | if (mapping) | 1169 | if (mapping) |
1170 | __set_page_dirty(page, mapping, 0); | 1170 | __set_page_dirty(page, mapping, 0); |
1171 | } | 1171 | } |
1172 | } | 1172 | } |
1173 | } | 1173 | } |
1174 | EXPORT_SYMBOL(mark_buffer_dirty); | 1174 | EXPORT_SYMBOL(mark_buffer_dirty); |
1175 | 1175 | ||
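A minimal sketch of the usual read-modify-write cycle that ends in mark_buffer_dirty(); the myfs_* name and the block being modified are hypothetical:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Read a metadata block, modify it in memory and hand it back to
 * writeback; sb_bread() returns an uptodate buffer or NULL on I/O error. */
static int myfs_touch_block(struct super_block *sb, sector_t blocknr)
{
	struct buffer_head *bh = sb_bread(sb, blocknr);

	if (!bh)
		return -EIO;
	/* ... modify bh->b_data here ... */
	mark_buffer_dirty(bh);	/* dirties the buffer, its page and the inode */
	brelse(bh);		/* drop our reference; writeback will write it out */
	return 0;
}
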
1176 | /* | 1176 | /* |
1177 | * Decrement a buffer_head's reference count. If all buffers against a page | 1177 | * Decrement a buffer_head's reference count. If all buffers against a page |
1178 | * have zero reference count, are clean and unlocked, and if the page is clean | 1178 | * have zero reference count, are clean and unlocked, and if the page is clean |
1179 | * and unlocked then try_to_free_buffers() may strip the buffers from the page | 1179 | * and unlocked then try_to_free_buffers() may strip the buffers from the page |
1180 | * in preparation for freeing it (sometimes, rarely, buffers are removed from | 1180 | * in preparation for freeing it (sometimes, rarely, buffers are removed from |
1181 | * a page but it ends up not being freed, and buffers may later be reattached). | 1181 | * a page but it ends up not being freed, and buffers may later be reattached). |
1182 | */ | 1182 | */ |
1183 | void __brelse(struct buffer_head * buf) | 1183 | void __brelse(struct buffer_head * buf) |
1184 | { | 1184 | { |
1185 | if (atomic_read(&buf->b_count)) { | 1185 | if (atomic_read(&buf->b_count)) { |
1186 | put_bh(buf); | 1186 | put_bh(buf); |
1187 | return; | 1187 | return; |
1188 | } | 1188 | } |
1189 | WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); | 1189 | WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); |
1190 | } | 1190 | } |
1191 | EXPORT_SYMBOL(__brelse); | 1191 | EXPORT_SYMBOL(__brelse); |
1192 | 1192 | ||
1193 | /* | 1193 | /* |
1194 | * bforget() is like brelse(), except it discards any | 1194 | * bforget() is like brelse(), except it discards any |
1195 | * potentially dirty data. | 1195 | * potentially dirty data. |
1196 | */ | 1196 | */ |
1197 | void __bforget(struct buffer_head *bh) | 1197 | void __bforget(struct buffer_head *bh) |
1198 | { | 1198 | { |
1199 | clear_buffer_dirty(bh); | 1199 | clear_buffer_dirty(bh); |
1200 | if (bh->b_assoc_map) { | 1200 | if (bh->b_assoc_map) { |
1201 | struct address_space *buffer_mapping = bh->b_page->mapping; | 1201 | struct address_space *buffer_mapping = bh->b_page->mapping; |
1202 | 1202 | ||
1203 | spin_lock(&buffer_mapping->private_lock); | 1203 | spin_lock(&buffer_mapping->private_lock); |
1204 | list_del_init(&bh->b_assoc_buffers); | 1204 | list_del_init(&bh->b_assoc_buffers); |
1205 | bh->b_assoc_map = NULL; | 1205 | bh->b_assoc_map = NULL; |
1206 | spin_unlock(&buffer_mapping->private_lock); | 1206 | spin_unlock(&buffer_mapping->private_lock); |
1207 | } | 1207 | } |
1208 | __brelse(bh); | 1208 | __brelse(bh); |
1209 | } | 1209 | } |
1210 | EXPORT_SYMBOL(__bforget); | 1210 | EXPORT_SYMBOL(__bforget); |
1211 | 1211 | ||
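A short hypothetical sketch of when bforget() (the wrapper around __bforget()) is preferable to brelse(): once a filesystem frees a metadata block, any dirty data still cached for it must not reach the disk. Names here are illustrative:

#include <linux/buffer_head.h>

/* If the freed block is still in the cache, discard it instead of
 * letting writeback push stale contents to a now-unallocated block. */
static void myfs_forget_block(struct super_block *sb, sector_t freed)
{
	struct buffer_head *bh = sb_find_get_block(sb, freed);

	if (bh)
		bforget(bh);
}
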
1212 | static struct buffer_head *__bread_slow(struct buffer_head *bh) | 1212 | static struct buffer_head *__bread_slow(struct buffer_head *bh) |
1213 | { | 1213 | { |
1214 | lock_buffer(bh); | 1214 | lock_buffer(bh); |
1215 | if (buffer_uptodate(bh)) { | 1215 | if (buffer_uptodate(bh)) { |
1216 | unlock_buffer(bh); | 1216 | unlock_buffer(bh); |
1217 | return bh; | 1217 | return bh; |
1218 | } else { | 1218 | } else { |
1219 | get_bh(bh); | 1219 | get_bh(bh); |
1220 | bh->b_end_io = end_buffer_read_sync; | 1220 | bh->b_end_io = end_buffer_read_sync; |
1221 | submit_bh(READ, bh); | 1221 | submit_bh(READ, bh); |
1222 | wait_on_buffer(bh); | 1222 | wait_on_buffer(bh); |
1223 | if (buffer_uptodate(bh)) | 1223 | if (buffer_uptodate(bh)) |
1224 | return bh; | 1224 | return bh; |
1225 | } | 1225 | } |
1226 | brelse(bh); | 1226 | brelse(bh); |
1227 | return NULL; | 1227 | return NULL; |
1228 | } | 1228 | } |
1229 | 1229 | ||
1230 | /* | 1230 | /* |
1231 | * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). | 1231 | * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). |
1232 | * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their | 1232 | * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their |
1233 | * refcount elevated by one when they're in an LRU. A buffer can only appear | 1233 | * refcount elevated by one when they're in an LRU. A buffer can only appear |
1234 | * once in a particular CPU's LRU. A single buffer can be present in multiple | 1234 | * once in a particular CPU's LRU. A single buffer can be present in multiple |
1235 | * CPU's LRUs at the same time. | 1235 | * CPU's LRUs at the same time. |
1236 | * | 1236 | * |
1237 | * This is a transparent caching front-end to sb_bread(), sb_getblk() and | 1237 | * This is a transparent caching front-end to sb_bread(), sb_getblk() and |
1238 | * sb_find_get_block(). | 1238 | * sb_find_get_block(). |
1239 | * | 1239 | * |
1240 | * The LRUs themselves only need locking against invalidate_bh_lrus. We use | 1240 | * The LRUs themselves only need locking against invalidate_bh_lrus. We use |
1241 | * a local interrupt disable for that. | 1241 | * a local interrupt disable for that. |
1242 | */ | 1242 | */ |
1243 | 1243 | ||
1244 | #define BH_LRU_SIZE 8 | 1244 | #define BH_LRU_SIZE 8 |
1245 | 1245 | ||
1246 | struct bh_lru { | 1246 | struct bh_lru { |
1247 | struct buffer_head *bhs[BH_LRU_SIZE]; | 1247 | struct buffer_head *bhs[BH_LRU_SIZE]; |
1248 | }; | 1248 | }; |
1249 | 1249 | ||
1250 | static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; | 1250 | static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; |
1251 | 1251 | ||
1252 | #ifdef CONFIG_SMP | 1252 | #ifdef CONFIG_SMP |
1253 | #define bh_lru_lock() local_irq_disable() | 1253 | #define bh_lru_lock() local_irq_disable() |
1254 | #define bh_lru_unlock() local_irq_enable() | 1254 | #define bh_lru_unlock() local_irq_enable() |
1255 | #else | 1255 | #else |
1256 | #define bh_lru_lock() preempt_disable() | 1256 | #define bh_lru_lock() preempt_disable() |
1257 | #define bh_lru_unlock() preempt_enable() | 1257 | #define bh_lru_unlock() preempt_enable() |
1258 | #endif | 1258 | #endif |
1259 | 1259 | ||
1260 | static inline void check_irqs_on(void) | 1260 | static inline void check_irqs_on(void) |
1261 | { | 1261 | { |
1262 | #ifdef irqs_disabled | 1262 | #ifdef irqs_disabled |
1263 | BUG_ON(irqs_disabled()); | 1263 | BUG_ON(irqs_disabled()); |
1264 | #endif | 1264 | #endif |
1265 | } | 1265 | } |
1266 | 1266 | ||
1267 | /* | 1267 | /* |
1268 | * The LRU management algorithm is dopey-but-simple. Sorry. | 1268 | * The LRU management algorithm is dopey-but-simple. Sorry. |
1269 | */ | 1269 | */ |
1270 | static void bh_lru_install(struct buffer_head *bh) | 1270 | static void bh_lru_install(struct buffer_head *bh) |
1271 | { | 1271 | { |
1272 | struct buffer_head *evictee = NULL; | 1272 | struct buffer_head *evictee = NULL; |
1273 | struct bh_lru *lru; | 1273 | struct bh_lru *lru; |
1274 | 1274 | ||
1275 | check_irqs_on(); | 1275 | check_irqs_on(); |
1276 | bh_lru_lock(); | 1276 | bh_lru_lock(); |
1277 | lru = &__get_cpu_var(bh_lrus); | 1277 | lru = &__get_cpu_var(bh_lrus); |
1278 | if (lru->bhs[0] != bh) { | 1278 | if (lru->bhs[0] != bh) { |
1279 | struct buffer_head *bhs[BH_LRU_SIZE]; | 1279 | struct buffer_head *bhs[BH_LRU_SIZE]; |
1280 | int in; | 1280 | int in; |
1281 | int out = 0; | 1281 | int out = 0; |
1282 | 1282 | ||
1283 | get_bh(bh); | 1283 | get_bh(bh); |
1284 | bhs[out++] = bh; | 1284 | bhs[out++] = bh; |
1285 | for (in = 0; in < BH_LRU_SIZE; in++) { | 1285 | for (in = 0; in < BH_LRU_SIZE; in++) { |
1286 | struct buffer_head *bh2 = lru->bhs[in]; | 1286 | struct buffer_head *bh2 = lru->bhs[in]; |
1287 | 1287 | ||
1288 | if (bh2 == bh) { | 1288 | if (bh2 == bh) { |
1289 | __brelse(bh2); | 1289 | __brelse(bh2); |
1290 | } else { | 1290 | } else { |
1291 | if (out >= BH_LRU_SIZE) { | 1291 | if (out >= BH_LRU_SIZE) { |
1292 | BUG_ON(evictee != NULL); | 1292 | BUG_ON(evictee != NULL); |
1293 | evictee = bh2; | 1293 | evictee = bh2; |
1294 | } else { | 1294 | } else { |
1295 | bhs[out++] = bh2; | 1295 | bhs[out++] = bh2; |
1296 | } | 1296 | } |
1297 | } | 1297 | } |
1298 | } | 1298 | } |
1299 | while (out < BH_LRU_SIZE) | 1299 | while (out < BH_LRU_SIZE) |
1300 | bhs[out++] = NULL; | 1300 | bhs[out++] = NULL; |
1301 | memcpy(lru->bhs, bhs, sizeof(bhs)); | 1301 | memcpy(lru->bhs, bhs, sizeof(bhs)); |
1302 | } | 1302 | } |
1303 | bh_lru_unlock(); | 1303 | bh_lru_unlock(); |
1304 | 1304 | ||
1305 | if (evictee) | 1305 | if (evictee) |
1306 | __brelse(evictee); | 1306 | __brelse(evictee); |
1307 | } | 1307 | } |
1308 | 1308 | ||
1309 | /* | 1309 | /* |
1310 | * Look up the bh in this cpu's LRU. If it's there, move it to the head. | 1310 | * Look up the bh in this cpu's LRU. If it's there, move it to the head. |
1311 | */ | 1311 | */ |
1312 | static struct buffer_head * | 1312 | static struct buffer_head * |
1313 | lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) | 1313 | lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) |
1314 | { | 1314 | { |
1315 | struct buffer_head *ret = NULL; | 1315 | struct buffer_head *ret = NULL; |
1316 | struct bh_lru *lru; | 1316 | struct bh_lru *lru; |
1317 | unsigned int i; | 1317 | unsigned int i; |
1318 | 1318 | ||
1319 | check_irqs_on(); | 1319 | check_irqs_on(); |
1320 | bh_lru_lock(); | 1320 | bh_lru_lock(); |
1321 | lru = &__get_cpu_var(bh_lrus); | 1321 | lru = &__get_cpu_var(bh_lrus); |
1322 | for (i = 0; i < BH_LRU_SIZE; i++) { | 1322 | for (i = 0; i < BH_LRU_SIZE; i++) { |
1323 | struct buffer_head *bh = lru->bhs[i]; | 1323 | struct buffer_head *bh = lru->bhs[i]; |
1324 | 1324 | ||
1325 | if (bh && bh->b_bdev == bdev && | 1325 | if (bh && bh->b_bdev == bdev && |
1326 | bh->b_blocknr == block && bh->b_size == size) { | 1326 | bh->b_blocknr == block && bh->b_size == size) { |
1327 | if (i) { | 1327 | if (i) { |
1328 | while (i) { | 1328 | while (i) { |
1329 | lru->bhs[i] = lru->bhs[i - 1]; | 1329 | lru->bhs[i] = lru->bhs[i - 1]; |
1330 | i--; | 1330 | i--; |
1331 | } | 1331 | } |
1332 | lru->bhs[0] = bh; | 1332 | lru->bhs[0] = bh; |
1333 | } | 1333 | } |
1334 | get_bh(bh); | 1334 | get_bh(bh); |
1335 | ret = bh; | 1335 | ret = bh; |
1336 | break; | 1336 | break; |
1337 | } | 1337 | } |
1338 | } | 1338 | } |
1339 | bh_lru_unlock(); | 1339 | bh_lru_unlock(); |
1340 | return ret; | 1340 | return ret; |
1341 | } | 1341 | } |
1342 | 1342 | ||
1343 | /* | 1343 | /* |
1344 | * Perform a pagecache lookup for the matching buffer. If it's there, refresh | 1344 | * Perform a pagecache lookup for the matching buffer. If it's there, refresh |
1345 | * it in the LRU and mark it as accessed. If it is not present then return | 1345 | * it in the LRU and mark it as accessed. If it is not present then return |
1346 | * NULL | 1346 | * NULL |
1347 | */ | 1347 | */ |
1348 | struct buffer_head * | 1348 | struct buffer_head * |
1349 | __find_get_block(struct block_device *bdev, sector_t block, unsigned size) | 1349 | __find_get_block(struct block_device *bdev, sector_t block, unsigned size) |
1350 | { | 1350 | { |
1351 | struct buffer_head *bh = lookup_bh_lru(bdev, block, size); | 1351 | struct buffer_head *bh = lookup_bh_lru(bdev, block, size); |
1352 | 1352 | ||
1353 | if (bh == NULL) { | 1353 | if (bh == NULL) { |
1354 | bh = __find_get_block_slow(bdev, block); | 1354 | bh = __find_get_block_slow(bdev, block); |
1355 | if (bh) | 1355 | if (bh) |
1356 | bh_lru_install(bh); | 1356 | bh_lru_install(bh); |
1357 | } | 1357 | } |
1358 | if (bh) | 1358 | if (bh) |
1359 | touch_buffer(bh); | 1359 | touch_buffer(bh); |
1360 | return bh; | 1360 | return bh; |
1361 | } | 1361 | } |
1362 | EXPORT_SYMBOL(__find_get_block); | 1362 | EXPORT_SYMBOL(__find_get_block); |
1363 | 1363 | ||
1364 | /* | 1364 | /* |
1365 | * __getblk will locate (and, if necessary, create) the buffer_head | 1365 | * __getblk will locate (and, if necessary, create) the buffer_head |
1366 | * which corresponds to the passed block_device, block and size. The | 1366 | * which corresponds to the passed block_device, block and size. The |
1367 | * returned buffer has its reference count incremented. | 1367 | * returned buffer has its reference count incremented. |
1368 | * | 1368 | * |
1369 | * __getblk() cannot fail - it just keeps trying. If you pass it an | 1369 | * __getblk() cannot fail - it just keeps trying. If you pass it an |
1370 | * illegal block number, __getblk() will happily return a buffer_head | 1370 | * illegal block number, __getblk() will happily return a buffer_head |
1371 | * which represents the non-existent block. Very weird. | 1371 | * which represents the non-existent block. Very weird. |
1372 | * | 1372 | * |
1373 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() | 1373 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() |
1374 | * attempt is failing. FIXME, perhaps? | 1374 | * attempt is failing. FIXME, perhaps? |
1375 | */ | 1375 | */ |
1376 | struct buffer_head * | 1376 | struct buffer_head * |
1377 | __getblk(struct block_device *bdev, sector_t block, unsigned size) | 1377 | __getblk(struct block_device *bdev, sector_t block, unsigned size) |
1378 | { | 1378 | { |
1379 | struct buffer_head *bh = __find_get_block(bdev, block, size); | 1379 | struct buffer_head *bh = __find_get_block(bdev, block, size); |
1380 | 1380 | ||
1381 | might_sleep(); | 1381 | might_sleep(); |
1382 | if (bh == NULL) | 1382 | if (bh == NULL) |
1383 | bh = __getblk_slow(bdev, block, size); | 1383 | bh = __getblk_slow(bdev, block, size); |
1384 | return bh; | 1384 | return bh; |
1385 | } | 1385 | } |
1386 | EXPORT_SYMBOL(__getblk); | 1386 | EXPORT_SYMBOL(__getblk); |
1387 | 1387 | ||
1388 | /* | 1388 | /* |
1389 | * Do async read-ahead on a buffer.. | 1389 | * Do async read-ahead on a buffer.. |
1390 | */ | 1390 | */ |
1391 | void __breadahead(struct block_device *bdev, sector_t block, unsigned size) | 1391 | void __breadahead(struct block_device *bdev, sector_t block, unsigned size) |
1392 | { | 1392 | { |
1393 | struct buffer_head *bh = __getblk(bdev, block, size); | 1393 | struct buffer_head *bh = __getblk(bdev, block, size); |
1394 | if (likely(bh)) { | 1394 | if (likely(bh)) { |
1395 | ll_rw_block(READA, 1, &bh); | 1395 | ll_rw_block(READA, 1, &bh); |
1396 | brelse(bh); | 1396 | brelse(bh); |
1397 | } | 1397 | } |
1398 | } | 1398 | } |
1399 | EXPORT_SYMBOL(__breadahead); | 1399 | EXPORT_SYMBOL(__breadahead); |
1400 | 1400 | ||
1401 | /** | 1401 | /** |
1402 | * __bread() - reads a specified block and returns the bh | 1402 | * __bread() - reads a specified block and returns the bh |
1403 | * @bdev: the block_device to read from | 1403 | * @bdev: the block_device to read from |
1404 | * @block: number of block | 1404 | * @block: number of block |
1405 | * @size: size (in bytes) to read | 1405 | * @size: size (in bytes) to read |
1406 | * | 1406 | * |
1407 | * Reads a specified block and returns the buffer head that contains it. | 1407 | * Reads a specified block and returns the buffer head that contains it. |
1408 | * It returns NULL if the block was unreadable. | 1408 | * It returns NULL if the block was unreadable. |
1409 | */ | 1409 | */ |
1410 | struct buffer_head * | 1410 | struct buffer_head * |
1411 | __bread(struct block_device *bdev, sector_t block, unsigned size) | 1411 | __bread(struct block_device *bdev, sector_t block, unsigned size) |
1412 | { | 1412 | { |
1413 | struct buffer_head *bh = __getblk(bdev, block, size); | 1413 | struct buffer_head *bh = __getblk(bdev, block, size); |
1414 | 1414 | ||
1415 | if (likely(bh) && !buffer_uptodate(bh)) | 1415 | if (likely(bh) && !buffer_uptodate(bh)) |
1416 | bh = __bread_slow(bh); | 1416 | bh = __bread_slow(bh); |
1417 | return bh; | 1417 | return bh; |
1418 | } | 1418 | } |
1419 | EXPORT_SYMBOL(__bread); | 1419 | EXPORT_SYMBOL(__bread); |
1420 | 1420 | ||
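Callers rarely use __find_get_block(), __getblk() and __bread() directly; filesystems normally go through the sb_* wrappers from <linux/buffer_head.h>, which supply the superblock's block device and block size. Roughly (paraphrased here for context, not part of this diff; sb_breadahead() wraps __breadahead() the same way):

static inline struct buffer_head *sb_bread(struct super_block *sb, sector_t block)
{
	return __bread(sb->s_bdev, block, sb->s_blocksize);
}

static inline struct buffer_head *sb_getblk(struct super_block *sb, sector_t block)
{
	return __getblk(sb->s_bdev, block, sb->s_blocksize);
}

static inline struct buffer_head *sb_find_get_block(struct super_block *sb, sector_t block)
{
	return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
}
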
1421 | /* | 1421 | /* |
1422 | * invalidate_bh_lrus() is called rarely - but not only at unmount. | 1422 | * invalidate_bh_lrus() is called rarely - but not only at unmount. |
1423 | * This doesn't race because it runs in each cpu either in irq | 1423 | * This doesn't race because it runs in each cpu either in irq |
1424 | * or with preempt disabled. | 1424 | * or with preempt disabled. |
1425 | */ | 1425 | */ |
1426 | static void invalidate_bh_lru(void *arg) | 1426 | static void invalidate_bh_lru(void *arg) |
1427 | { | 1427 | { |
1428 | struct bh_lru *b = &get_cpu_var(bh_lrus); | 1428 | struct bh_lru *b = &get_cpu_var(bh_lrus); |
1429 | int i; | 1429 | int i; |
1430 | 1430 | ||
1431 | for (i = 0; i < BH_LRU_SIZE; i++) { | 1431 | for (i = 0; i < BH_LRU_SIZE; i++) { |
1432 | brelse(b->bhs[i]); | 1432 | brelse(b->bhs[i]); |
1433 | b->bhs[i] = NULL; | 1433 | b->bhs[i] = NULL; |
1434 | } | 1434 | } |
1435 | put_cpu_var(bh_lrus); | 1435 | put_cpu_var(bh_lrus); |
1436 | } | 1436 | } |
1437 | 1437 | ||
1438 | void invalidate_bh_lrus(void) | 1438 | void invalidate_bh_lrus(void) |
1439 | { | 1439 | { |
1440 | on_each_cpu(invalidate_bh_lru, NULL, 1); | 1440 | on_each_cpu(invalidate_bh_lru, NULL, 1); |
1441 | } | 1441 | } |
1442 | EXPORT_SYMBOL_GPL(invalidate_bh_lrus); | 1442 | EXPORT_SYMBOL_GPL(invalidate_bh_lrus); |
1443 | 1443 | ||
1444 | void set_bh_page(struct buffer_head *bh, | 1444 | void set_bh_page(struct buffer_head *bh, |
1445 | struct page *page, unsigned long offset) | 1445 | struct page *page, unsigned long offset) |
1446 | { | 1446 | { |
1447 | bh->b_page = page; | 1447 | bh->b_page = page; |
1448 | BUG_ON(offset >= PAGE_SIZE); | 1448 | BUG_ON(offset >= PAGE_SIZE); |
1449 | if (PageHighMem(page)) | 1449 | if (PageHighMem(page)) |
1450 | /* | 1450 | /* |
1451 | * This catches illegal uses and preserves the offset: | 1451 | * This catches illegal uses and preserves the offset: |
1452 | */ | 1452 | */ |
1453 | bh->b_data = (char *)(0 + offset); | 1453 | bh->b_data = (char *)(0 + offset); |
1454 | else | 1454 | else |
1455 | bh->b_data = page_address(page) + offset; | 1455 | bh->b_data = page_address(page) + offset; |
1456 | } | 1456 | } |
1457 | EXPORT_SYMBOL(set_bh_page); | 1457 | EXPORT_SYMBOL(set_bh_page); |
1458 | 1458 | ||
1459 | /* | 1459 | /* |
1460 | * Called when truncating a buffer on a page completely. | 1460 | * Called when truncating a buffer on a page completely. |
1461 | */ | 1461 | */ |
1462 | static void discard_buffer(struct buffer_head * bh) | 1462 | static void discard_buffer(struct buffer_head * bh) |
1463 | { | 1463 | { |
1464 | lock_buffer(bh); | 1464 | lock_buffer(bh); |
1465 | clear_buffer_dirty(bh); | 1465 | clear_buffer_dirty(bh); |
1466 | bh->b_bdev = NULL; | 1466 | bh->b_bdev = NULL; |
1467 | clear_buffer_mapped(bh); | 1467 | clear_buffer_mapped(bh); |
1468 | clear_buffer_req(bh); | 1468 | clear_buffer_req(bh); |
1469 | clear_buffer_new(bh); | 1469 | clear_buffer_new(bh); |
1470 | clear_buffer_delay(bh); | 1470 | clear_buffer_delay(bh); |
1471 | clear_buffer_unwritten(bh); | 1471 | clear_buffer_unwritten(bh); |
1472 | unlock_buffer(bh); | 1472 | unlock_buffer(bh); |
1473 | } | 1473 | } |
1474 | 1474 | ||
1475 | /** | 1475 | /** |
1476 | * block_invalidatepage - invalidate part or all of a buffer-backed page | 1476 | * block_invalidatepage - invalidate part or all of a buffer-backed page |
1477 | * | 1477 | * |
1478 | * @page: the page which is affected | 1478 | * @page: the page which is affected |
1479 | * @offset: the index of the truncation point | 1479 | * @offset: the index of the truncation point |
1480 | * | 1480 | * |
1481 | * block_invalidatepage() is called when all or part of the page has become | 1481 | * block_invalidatepage() is called when all or part of the page has become |
1482 | * invalidated by a truncate operation. | 1482 | * invalidated by a truncate operation. |
1483 | * | 1483 | * |
1484 | * block_invalidatepage() does not have to release all buffers, but it must | 1484 | * block_invalidatepage() does not have to release all buffers, but it must |
1485 | * ensure that no dirty buffer is left outside @offset and that no I/O | 1485 | * ensure that no dirty buffer is left outside @offset and that no I/O |
1486 | * is underway against any of the blocks which are outside the truncation | 1486 | * is underway against any of the blocks which are outside the truncation |
1487 | * point. Because the caller is about to free (and possibly reuse) those | 1487 | * point. Because the caller is about to free (and possibly reuse) those |
1488 | * blocks on-disk. | 1488 | * blocks on-disk. |
1489 | */ | 1489 | */ |
1490 | void block_invalidatepage(struct page *page, unsigned long offset) | 1490 | void block_invalidatepage(struct page *page, unsigned long offset) |
1491 | { | 1491 | { |
1492 | struct buffer_head *head, *bh, *next; | 1492 | struct buffer_head *head, *bh, *next; |
1493 | unsigned int curr_off = 0; | 1493 | unsigned int curr_off = 0; |
1494 | 1494 | ||
1495 | BUG_ON(!PageLocked(page)); | 1495 | BUG_ON(!PageLocked(page)); |
1496 | if (!page_has_buffers(page)) | 1496 | if (!page_has_buffers(page)) |
1497 | goto out; | 1497 | goto out; |
1498 | 1498 | ||
1499 | head = page_buffers(page); | 1499 | head = page_buffers(page); |
1500 | bh = head; | 1500 | bh = head; |
1501 | do { | 1501 | do { |
1502 | unsigned int next_off = curr_off + bh->b_size; | 1502 | unsigned int next_off = curr_off + bh->b_size; |
1503 | next = bh->b_this_page; | 1503 | next = bh->b_this_page; |
1504 | 1504 | ||
1505 | /* | 1505 | /* |
1506 | * is this block fully invalidated? | 1506 | * is this block fully invalidated? |
1507 | */ | 1507 | */ |
1508 | if (offset <= curr_off) | 1508 | if (offset <= curr_off) |
1509 | discard_buffer(bh); | 1509 | discard_buffer(bh); |
1510 | curr_off = next_off; | 1510 | curr_off = next_off; |
1511 | bh = next; | 1511 | bh = next; |
1512 | } while (bh != head); | 1512 | } while (bh != head); |
1513 | 1513 | ||
1514 | /* | 1514 | /* |
1515 | * We release buffers only if the entire page is being invalidated. | 1515 | * We release buffers only if the entire page is being invalidated. |
1516 | * The get_block cached value has been unconditionally invalidated, | 1516 | * The get_block cached value has been unconditionally invalidated, |
1517 | * so real IO is not possible anymore. | 1517 | * so real IO is not possible anymore. |
1518 | */ | 1518 | */ |
1519 | if (offset == 0) | 1519 | if (offset == 0) |
1520 | try_to_release_page(page, 0); | 1520 | try_to_release_page(page, 0); |
1521 | out: | 1521 | out: |
1522 | return; | 1522 | return; |
1523 | } | 1523 | } |
1524 | EXPORT_SYMBOL(block_invalidatepage); | 1524 | EXPORT_SYMBOL(block_invalidatepage); |
1525 | 1525 | ||
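Buffer-backed filesystems typically hook block_invalidatepage() straight into their address_space_operations so that truncation discards buffers past the new end of file. A minimal, hypothetical sketch (other methods omitted):

#include <linux/fs.h>
#include <linux/buffer_head.h>

static const struct address_space_operations myfs_aops = {
	.invalidatepage	= block_invalidatepage,
	/* .readpage, .writepage, etc. would be filled in by a real fs */
};
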
1526 | /* | 1526 | /* |
1527 | * We attach and possibly dirty the buffers atomically wrt | 1527 | * We attach and possibly dirty the buffers atomically wrt |
1528 | * __set_page_dirty_buffers() via private_lock. try_to_free_buffers | 1528 | * __set_page_dirty_buffers() via private_lock. try_to_free_buffers |
1529 | * is already excluded via the page lock. | 1529 | * is already excluded via the page lock. |
1530 | */ | 1530 | */ |
1531 | void create_empty_buffers(struct page *page, | 1531 | void create_empty_buffers(struct page *page, |
1532 | unsigned long blocksize, unsigned long b_state) | 1532 | unsigned long blocksize, unsigned long b_state) |
1533 | { | 1533 | { |
1534 | struct buffer_head *bh, *head, *tail; | 1534 | struct buffer_head *bh, *head, *tail; |
1535 | 1535 | ||
1536 | head = alloc_page_buffers(page, blocksize, 1); | 1536 | head = alloc_page_buffers(page, blocksize, 1); |
1537 | bh = head; | 1537 | bh = head; |
1538 | do { | 1538 | do { |
1539 | bh->b_state |= b_state; | 1539 | bh->b_state |= b_state; |
1540 | tail = bh; | 1540 | tail = bh; |
1541 | bh = bh->b_this_page; | 1541 | bh = bh->b_this_page; |
1542 | } while (bh); | 1542 | } while (bh); |
1543 | tail->b_this_page = head; | 1543 | tail->b_this_page = head; |
1544 | 1544 | ||
1545 | spin_lock(&page->mapping->private_lock); | 1545 | spin_lock(&page->mapping->private_lock); |
1546 | if (PageUptodate(page) || PageDirty(page)) { | 1546 | if (PageUptodate(page) || PageDirty(page)) { |
1547 | bh = head; | 1547 | bh = head; |
1548 | do { | 1548 | do { |
1549 | if (PageDirty(page)) | 1549 | if (PageDirty(page)) |
1550 | set_buffer_dirty(bh); | 1550 | set_buffer_dirty(bh); |
1551 | if (PageUptodate(page)) | 1551 | if (PageUptodate(page)) |
1552 | set_buffer_uptodate(bh); | 1552 | set_buffer_uptodate(bh); |
1553 | bh = bh->b_this_page; | 1553 | bh = bh->b_this_page; |
1554 | } while (bh != head); | 1554 | } while (bh != head); |
1555 | } | 1555 | } |
1556 | attach_page_buffers(page, head); | 1556 | attach_page_buffers(page, head); |
1557 | spin_unlock(&page->mapping->private_lock); | 1557 | spin_unlock(&page->mapping->private_lock); |
1558 | } | 1558 | } |
1559 | EXPORT_SYMBOL(create_empty_buffers); | 1559 | EXPORT_SYMBOL(create_empty_buffers); |
1560 | 1560 | ||
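A typical caller pattern, sketched with hypothetical names: paths that need per-block state on a page (readpage, writepage, write_begin) create buffers on demand before walking them.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Ensure the page has a buffer ring matching the inode's block size and
 * return its head; create_empty_buffers() inherits dirty/uptodate state
 * from the page as shown above. */
static struct buffer_head *myfs_page_buffers(struct inode *inode,
					     struct page *page)
{
	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
	return page_buffers(page);
}
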
1561 | /* | 1561 | /* |
1562 | * We are taking a block for data and we don't want any output from any | 1562 | * We are taking a block for data and we don't want any output from any |
1563 | * buffer-cache aliases starting from return from that function and | 1563 | * buffer-cache aliases starting from return from that function and |
1564 | * until the moment when something will explicitly mark the buffer | 1564 | * until the moment when something will explicitly mark the buffer |
1565 | * dirty (hopefully that will not happen until we will free that block ;-) | 1565 | * dirty (hopefully that will not happen until we will free that block ;-) |
1566 | * We don't even need to mark it not-uptodate - nobody can expect | 1566 | * We don't even need to mark it not-uptodate - nobody can expect |
1567 | * anything from a newly allocated buffer anyway. We used to use | 1567 | * anything from a newly allocated buffer anyway. We used to use |
1568 | * unmap_buffer() for such invalidation, but that was wrong. We definitely | 1568 | * unmap_buffer() for such invalidation, but that was wrong. We definitely |
1569 | * don't want to mark the alias unmapped, for example - it would confuse | 1569 | * don't want to mark the alias unmapped, for example - it would confuse |
1570 | * anyone who might pick it with bread() afterwards... | 1570 | * anyone who might pick it with bread() afterwards... |
1571 | * | 1571 | * |
1572 | * Also.. Note that bforget() doesn't lock the buffer. So there can | 1572 | * Also.. Note that bforget() doesn't lock the buffer. So there can |
1573 | * be writeout I/O going on against recently-freed buffers. We don't | 1573 | * be writeout I/O going on against recently-freed buffers. We don't |
1574 | * wait on that I/O in bforget() - it's more efficient to wait on the I/O | 1574 | * wait on that I/O in bforget() - it's more efficient to wait on the I/O |
1575 | * only if we really need to. That happens here. | 1575 | * only if we really need to. That happens here. |
1576 | */ | 1576 | */ |
1577 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block) | 1577 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block) |
1578 | { | 1578 | { |
1579 | struct buffer_head *old_bh; | 1579 | struct buffer_head *old_bh; |
1580 | 1580 | ||
1581 | might_sleep(); | 1581 | might_sleep(); |
1582 | 1582 | ||
1583 | old_bh = __find_get_block_slow(bdev, block); | 1583 | old_bh = __find_get_block_slow(bdev, block); |
1584 | if (old_bh) { | 1584 | if (old_bh) { |
1585 | clear_buffer_dirty(old_bh); | 1585 | clear_buffer_dirty(old_bh); |
1586 | wait_on_buffer(old_bh); | 1586 | wait_on_buffer(old_bh); |
1587 | clear_buffer_req(old_bh); | 1587 | clear_buffer_req(old_bh); |
1588 | __brelse(old_bh); | 1588 | __brelse(old_bh); |
1589 | } | 1589 | } |
1590 | } | 1590 | } |
1591 | EXPORT_SYMBOL(unmap_underlying_metadata); | 1591 | EXPORT_SYMBOL(unmap_underlying_metadata); |
1592 | 1592 | ||
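The usual caller is a get_block()/write path that has just allocated a new block: before writing new data it must shoot down any stale alias of that block in the block device's page cache. A hedged sketch of that pattern (the myfs_* name is hypothetical):

#include <linux/buffer_head.h>

/* When get_block() marked the buffer as newly allocated, kill any old
 * buffer_head that still maps the same on-disk block. */
static void myfs_handle_new_block(struct buffer_head *bh)
{
	if (buffer_new(bh))
		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
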
1593 | /* | 1593 | /* |
1594 | * NOTE! All mapped/uptodate combinations are valid: | 1594 | * NOTE! All mapped/uptodate combinations are valid: |
1595 | * | 1595 | * |
1596 | * Mapped Uptodate Meaning | 1596 | * Mapped Uptodate Meaning |
1597 | * | 1597 | * |
1598 | * No No "unknown" - must do get_block() | 1598 | * No No "unknown" - must do get_block() |
1599 | * No Yes "hole" - zero-filled | 1599 | * No Yes "hole" - zero-filled |
1600 | * Yes No "allocated" - allocated on disk, not read in | 1600 | * Yes No "allocated" - allocated on disk, not read in |
1601 | * Yes Yes "valid" - allocated and up-to-date in memory. | 1601 | * Yes Yes "valid" - allocated and up-to-date in memory. |
1602 | * | 1602 | * |
1603 | * "Dirty" is valid only with the last case (mapped+uptodate). | 1603 | * "Dirty" is valid only with the last case (mapped+uptodate). |
1604 | */ | 1604 | */ |
1605 | 1605 | ||
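An illustrative, purely hypothetical helper showing how a read path would act on the four states in the table above:

#include <linux/buffer_head.h>

/* Returns 0 when the buffer's contents are usable as-is, 1 when a read
 * from disk is needed, and -1 when get_block() must be called first. */
static int myfs_buffer_needs_read(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 0;	/* "hole" or "valid" */
	if (!buffer_mapped(bh))
		return -1;	/* "unknown" */
	return 1;		/* "allocated": on disk but not read in */
}
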
1606 | /* | 1606 | /* |
1607 | * While block_write_full_page is writing back the dirty buffers under | 1607 | * While block_write_full_page is writing back the dirty buffers under |
1608 | * the page lock, whoever dirtied the buffers may decide to clean them | 1608 | * the page lock, whoever dirtied the buffers may decide to clean them |
1609 | * again at any time. We handle that by only looking at the buffer | 1609 | * again at any time. We handle that by only looking at the buffer |
1610 | * state inside lock_buffer(). | 1610 | * state inside lock_buffer(). |
1611 | * | 1611 | * |
1612 | * If block_write_full_page() is called for regular writeback | 1612 | * If block_write_full_page() is called for regular writeback |
1613 | * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a | 1613 | * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a |
1614 | * locked buffer. This only can happen if someone has written the buffer | 1614 | * locked buffer. This only can happen if someone has written the buffer |
1615 | * directly, with submit_bh(). At the address_space level PageWriteback | 1615 | * directly, with submit_bh(). At the address_space level PageWriteback |
1616 | * prevents this contention from occurring. | 1616 | * prevents this contention from occurring. |
1617 | * | 1617 | * |
1618 | * If block_write_full_page() is called with wbc->sync_mode == | 1618 | * If block_write_full_page() is called with wbc->sync_mode == |
1619 | * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this | 1619 | * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this |
1620 | * causes the writes to be flagged as synchronous writes, but the | 1620 | * causes the writes to be flagged as synchronous writes, but the |
1621 | * block device queue will NOT be unplugged, since usually many pages | 1621 | * block device queue will NOT be unplugged, since usually many pages |
1622 | * will be pushed out before the higher-level caller actually | 1622 | * will be pushed out before the higher-level caller actually |
1623 | * waits for the writes to be completed. The various wait functions, | 1623 | * waits for the writes to be completed. The various wait functions, |
1624 | * such as wait_on_writeback_range(), will ultimately call sync_page() | 1624 | * such as wait_on_writeback_range(), will ultimately call sync_page() |
1625 | * which will ultimately call blk_run_backing_dev(), which will end up | 1625 | * which will ultimately call blk_run_backing_dev(), which will end up |
1626 | * unplugging the device queue. | 1626 | * unplugging the device queue. |
1627 | */ | 1627 | */ |
1628 | static int __block_write_full_page(struct inode *inode, struct page *page, | 1628 | static int __block_write_full_page(struct inode *inode, struct page *page, |
1629 | get_block_t *get_block, struct writeback_control *wbc, | 1629 | get_block_t *get_block, struct writeback_control *wbc, |
1630 | bh_end_io_t *handler) | 1630 | bh_end_io_t *handler) |
1631 | { | 1631 | { |
1632 | int err; | 1632 | int err; |
1633 | sector_t block; | 1633 | sector_t block; |
1634 | sector_t last_block; | 1634 | sector_t last_block; |
1635 | struct buffer_head *bh, *head; | 1635 | struct buffer_head *bh, *head; |
1636 | const unsigned blocksize = 1 << inode->i_blkbits; | 1636 | const unsigned blocksize = 1 << inode->i_blkbits; |
1637 | int nr_underway = 0; | 1637 | int nr_underway = 0; |
1638 | int write_op = (wbc->sync_mode == WB_SYNC_ALL ? | 1638 | int write_op = (wbc->sync_mode == WB_SYNC_ALL ? |
1639 | WRITE_SYNC_PLUG : WRITE); | 1639 | WRITE_SYNC_PLUG : WRITE); |
1640 | 1640 | ||
1641 | BUG_ON(!PageLocked(page)); | 1641 | BUG_ON(!PageLocked(page)); |
1642 | 1642 | ||
1643 | last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; | 1643 | last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; |
1644 | 1644 | ||
1645 | if (!page_has_buffers(page)) { | 1645 | if (!page_has_buffers(page)) { |
1646 | create_empty_buffers(page, blocksize, | 1646 | create_empty_buffers(page, blocksize, |
1647 | (1 << BH_Dirty)|(1 << BH_Uptodate)); | 1647 | (1 << BH_Dirty)|(1 << BH_Uptodate)); |
1648 | } | 1648 | } |
1649 | 1649 | ||
1650 | /* | 1650 | /* |
1651 | * Be very careful. We have no exclusion from __set_page_dirty_buffers | 1651 | * Be very careful. We have no exclusion from __set_page_dirty_buffers |
1652 | * here, and the (potentially unmapped) buffers may become dirty at | 1652 | * here, and the (potentially unmapped) buffers may become dirty at |
1653 | * any time. If a buffer becomes dirty here after we've inspected it | 1653 | * any time. If a buffer becomes dirty here after we've inspected it |
1654 | * then we just miss that fact, and the page stays dirty. | 1654 | * then we just miss that fact, and the page stays dirty. |
1655 | * | 1655 | * |
1656 | * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; | 1656 | * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; |
1657 | * handle that here by just cleaning them. | 1657 | * handle that here by just cleaning them. |
1658 | */ | 1658 | */ |
1659 | 1659 | ||
1660 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1660 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
1661 | head = page_buffers(page); | 1661 | head = page_buffers(page); |
1662 | bh = head; | 1662 | bh = head; |
1663 | 1663 | ||
1664 | /* | 1664 | /* |
1665 | * Get all the dirty buffers mapped to disk addresses and | 1665 | * Get all the dirty buffers mapped to disk addresses and |
1666 | * handle any aliases from the underlying blockdev's mapping. | 1666 | * handle any aliases from the underlying blockdev's mapping. |
1667 | */ | 1667 | */ |
1668 | do { | 1668 | do { |
1669 | if (block > last_block) { | 1669 | if (block > last_block) { |
1670 | /* | 1670 | /* |
1671 | * mapped buffers outside i_size will occur, because | 1671 | * mapped buffers outside i_size will occur, because |
1672 | * this page can be outside i_size when there is a | 1672 | * this page can be outside i_size when there is a |
1673 | * truncate in progress. | 1673 | * truncate in progress. |
1674 | */ | 1674 | */ |
1675 | /* | 1675 | /* |
1676 | * The buffer was zeroed by block_write_full_page() | 1676 | * The buffer was zeroed by block_write_full_page() |
1677 | */ | 1677 | */ |
1678 | clear_buffer_dirty(bh); | 1678 | clear_buffer_dirty(bh); |
1679 | set_buffer_uptodate(bh); | 1679 | set_buffer_uptodate(bh); |
1680 | } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && | 1680 | } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && |
1681 | buffer_dirty(bh)) { | 1681 | buffer_dirty(bh)) { |
1682 | WARN_ON(bh->b_size != blocksize); | 1682 | WARN_ON(bh->b_size != blocksize); |
1683 | err = get_block(inode, block, bh, 1); | 1683 | err = get_block(inode, block, bh, 1); |
1684 | if (err) | 1684 | if (err) |
1685 | goto recover; | 1685 | goto recover; |
1686 | clear_buffer_delay(bh); | 1686 | clear_buffer_delay(bh); |
1687 | if (buffer_new(bh)) { | 1687 | if (buffer_new(bh)) { |
1688 | /* blockdev mappings never come here */ | 1688 | /* blockdev mappings never come here */ |
1689 | clear_buffer_new(bh); | 1689 | clear_buffer_new(bh); |
1690 | unmap_underlying_metadata(bh->b_bdev, | 1690 | unmap_underlying_metadata(bh->b_bdev, |
1691 | bh->b_blocknr); | 1691 | bh->b_blocknr); |
1692 | } | 1692 | } |
1693 | } | 1693 | } |
1694 | bh = bh->b_this_page; | 1694 | bh = bh->b_this_page; |
1695 | block++; | 1695 | block++; |
1696 | } while (bh != head); | 1696 | } while (bh != head); |
1697 | 1697 | ||
1698 | do { | 1698 | do { |
1699 | if (!buffer_mapped(bh)) | 1699 | if (!buffer_mapped(bh)) |
1700 | continue; | 1700 | continue; |
1701 | /* | 1701 | /* |
1702 | * If it's a fully non-blocking write attempt and we cannot | 1702 | * If it's a fully non-blocking write attempt and we cannot |
1703 | * lock the buffer then redirty the page. Note that this can | 1703 | * lock the buffer then redirty the page. Note that this can |
1704 | * potentially cause a busy-wait loop from writeback threads | 1704 | * potentially cause a busy-wait loop from writeback threads |
1705 | * and kswapd activity, but those code paths have their own | 1705 | * and kswapd activity, but those code paths have their own |
1706 | * higher-level throttling. | 1706 | * higher-level throttling. |
1707 | */ | 1707 | */ |
1708 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { | 1708 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { |
1709 | lock_buffer(bh); | 1709 | lock_buffer(bh); |
1710 | } else if (!trylock_buffer(bh)) { | 1710 | } else if (!trylock_buffer(bh)) { |
1711 | redirty_page_for_writepage(wbc, page); | 1711 | redirty_page_for_writepage(wbc, page); |
1712 | continue; | 1712 | continue; |
1713 | } | 1713 | } |
1714 | if (test_clear_buffer_dirty(bh)) { | 1714 | if (test_clear_buffer_dirty(bh)) { |
1715 | mark_buffer_async_write_endio(bh, handler); | 1715 | mark_buffer_async_write_endio(bh, handler); |
1716 | } else { | 1716 | } else { |
1717 | unlock_buffer(bh); | 1717 | unlock_buffer(bh); |
1718 | } | 1718 | } |
1719 | } while ((bh = bh->b_this_page) != head); | 1719 | } while ((bh = bh->b_this_page) != head); |
1720 | 1720 | ||
1721 | /* | 1721 | /* |
1722 | * The page and its buffers are protected by PageWriteback(), so we can | 1722 | * The page and its buffers are protected by PageWriteback(), so we can |
1723 | * drop the bh refcounts early. | 1723 | * drop the bh refcounts early. |
1724 | */ | 1724 | */ |
1725 | BUG_ON(PageWriteback(page)); | 1725 | BUG_ON(PageWriteback(page)); |
1726 | set_page_writeback(page); | 1726 | set_page_writeback(page); |
1727 | 1727 | ||
1728 | do { | 1728 | do { |
1729 | struct buffer_head *next = bh->b_this_page; | 1729 | struct buffer_head *next = bh->b_this_page; |
1730 | if (buffer_async_write(bh)) { | 1730 | if (buffer_async_write(bh)) { |
1731 | submit_bh(write_op, bh); | 1731 | submit_bh(write_op, bh); |
1732 | nr_underway++; | 1732 | nr_underway++; |
1733 | } | 1733 | } |
1734 | bh = next; | 1734 | bh = next; |
1735 | } while (bh != head); | 1735 | } while (bh != head); |
1736 | unlock_page(page); | 1736 | unlock_page(page); |
1737 | 1737 | ||
1738 | err = 0; | 1738 | err = 0; |
1739 | done: | 1739 | done: |
1740 | if (nr_underway == 0) { | 1740 | if (nr_underway == 0) { |
1741 | /* | 1741 | /* |
1742 | * The page was marked dirty, but the buffers were | 1742 | * The page was marked dirty, but the buffers were |
1743 | * clean. Someone wrote them back by hand with | 1743 | * clean. Someone wrote them back by hand with |
1744 | * ll_rw_block/submit_bh. A rare case. | 1744 | * ll_rw_block/submit_bh. A rare case. |
1745 | */ | 1745 | */ |
1746 | end_page_writeback(page); | 1746 | end_page_writeback(page); |
1747 | 1747 | ||
1748 | /* | 1748 | /* |
1749 | * The page and buffer_heads can be released at any time from | 1749 | * The page and buffer_heads can be released at any time from |
1750 | * here on. | 1750 | * here on. |
1751 | */ | 1751 | */ |
1752 | } | 1752 | } |
1753 | return err; | 1753 | return err; |
1754 | 1754 | ||
1755 | recover: | 1755 | recover: |
1756 | /* | 1756 | /* |
1757 | * ENOSPC, or some other error. We may already have added some | 1757 | * ENOSPC, or some other error. We may already have added some |
1758 | * blocks to the file, so we need to write these out to avoid | 1758 | * blocks to the file, so we need to write these out to avoid |
1759 | * exposing stale data. | 1759 | * exposing stale data. |
1760 | * The page is currently locked and not marked for writeback | 1760 | * The page is currently locked and not marked for writeback |
1761 | */ | 1761 | */ |
1762 | bh = head; | 1762 | bh = head; |
1763 | /* Recovery: lock and submit the mapped buffers */ | 1763 | /* Recovery: lock and submit the mapped buffers */ |
1764 | do { | 1764 | do { |
1765 | if (buffer_mapped(bh) && buffer_dirty(bh) && | 1765 | if (buffer_mapped(bh) && buffer_dirty(bh) && |
1766 | !buffer_delay(bh)) { | 1766 | !buffer_delay(bh)) { |
1767 | lock_buffer(bh); | 1767 | lock_buffer(bh); |
1768 | mark_buffer_async_write_endio(bh, handler); | 1768 | mark_buffer_async_write_endio(bh, handler); |
1769 | } else { | 1769 | } else { |
1770 | /* | 1770 | /* |
1771 | * The buffer may have been set dirty during | 1771 | * The buffer may have been set dirty during |
1772 | * attachment to a dirty page. | 1772 | * attachment to a dirty page. |
1773 | */ | 1773 | */ |
1774 | clear_buffer_dirty(bh); | 1774 | clear_buffer_dirty(bh); |
1775 | } | 1775 | } |
1776 | } while ((bh = bh->b_this_page) != head); | 1776 | } while ((bh = bh->b_this_page) != head); |
1777 | SetPageError(page); | 1777 | SetPageError(page); |
1778 | BUG_ON(PageWriteback(page)); | 1778 | BUG_ON(PageWriteback(page)); |
1779 | mapping_set_error(page->mapping, err); | 1779 | mapping_set_error(page->mapping, err); |
1780 | set_page_writeback(page); | 1780 | set_page_writeback(page); |
1781 | do { | 1781 | do { |
1782 | struct buffer_head *next = bh->b_this_page; | 1782 | struct buffer_head *next = bh->b_this_page; |
1783 | if (buffer_async_write(bh)) { | 1783 | if (buffer_async_write(bh)) { |
1784 | clear_buffer_dirty(bh); | 1784 | clear_buffer_dirty(bh); |
1785 | submit_bh(write_op, bh); | 1785 | submit_bh(write_op, bh); |
1786 | nr_underway++; | 1786 | nr_underway++; |
1787 | } | 1787 | } |
1788 | bh = next; | 1788 | bh = next; |
1789 | } while (bh != head); | 1789 | } while (bh != head); |
1790 | unlock_page(page); | 1790 | unlock_page(page); |
1791 | goto done; | 1791 | goto done; |
1792 | } | 1792 | } |
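/*
 * Illustrative sketch, not part of this change: __block_write_full_page()
 * is normally reached through its public wrapper block_write_full_page(),
 * which a filesystem wires into its ->writepage.  "foo_writepage" and
 * "foo_get_block" are hypothetical names used only for illustration.
 */
static int foo_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, foo_get_block, wbc);
}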
1793 | 1793 | ||
1794 | /* | 1794 | /* |
1795 | * If a page has any new buffers, zero them out here, and mark them uptodate | 1795 | * If a page has any new buffers, zero them out here, and mark them uptodate |
1796 | * and dirty so they'll be written out (in order to prevent uninitialised | 1796 | * and dirty so they'll be written out (in order to prevent uninitialised |
1797 | * block data from leaking). And clear the new bit. | 1797 | * block data from leaking). And clear the new bit. |
1798 | */ | 1798 | */ |
1799 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) | 1799 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) |
1800 | { | 1800 | { |
1801 | unsigned int block_start, block_end; | 1801 | unsigned int block_start, block_end; |
1802 | struct buffer_head *head, *bh; | 1802 | struct buffer_head *head, *bh; |
1803 | 1803 | ||
1804 | BUG_ON(!PageLocked(page)); | 1804 | BUG_ON(!PageLocked(page)); |
1805 | if (!page_has_buffers(page)) | 1805 | if (!page_has_buffers(page)) |
1806 | return; | 1806 | return; |
1807 | 1807 | ||
1808 | bh = head = page_buffers(page); | 1808 | bh = head = page_buffers(page); |
1809 | block_start = 0; | 1809 | block_start = 0; |
1810 | do { | 1810 | do { |
1811 | block_end = block_start + bh->b_size; | 1811 | block_end = block_start + bh->b_size; |
1812 | 1812 | ||
1813 | if (buffer_new(bh)) { | 1813 | if (buffer_new(bh)) { |
1814 | if (block_end > from && block_start < to) { | 1814 | if (block_end > from && block_start < to) { |
1815 | if (!PageUptodate(page)) { | 1815 | if (!PageUptodate(page)) { |
1816 | unsigned start, size; | 1816 | unsigned start, size; |
1817 | 1817 | ||
1818 | start = max(from, block_start); | 1818 | start = max(from, block_start); |
1819 | size = min(to, block_end) - start; | 1819 | size = min(to, block_end) - start; |
1820 | 1820 | ||
1821 | zero_user(page, start, size); | 1821 | zero_user(page, start, size); |
1822 | set_buffer_uptodate(bh); | 1822 | set_buffer_uptodate(bh); |
1823 | } | 1823 | } |
1824 | 1824 | ||
1825 | clear_buffer_new(bh); | 1825 | clear_buffer_new(bh); |
1826 | mark_buffer_dirty(bh); | 1826 | mark_buffer_dirty(bh); |
1827 | } | 1827 | } |
1828 | } | 1828 | } |
1829 | 1829 | ||
1830 | block_start = block_end; | 1830 | block_start = block_end; |
1831 | bh = bh->b_this_page; | 1831 | bh = bh->b_this_page; |
1832 | } while (bh != head); | 1832 | } while (bh != head); |
1833 | } | 1833 | } |
1834 | EXPORT_SYMBOL(page_zero_new_buffers); | 1834 | EXPORT_SYMBOL(page_zero_new_buffers); |
1835 | 1835 | ||
1836 | int block_prepare_write(struct page *page, unsigned from, unsigned to, | 1836 | int block_prepare_write(struct page *page, unsigned from, unsigned to, |
1837 | get_block_t *get_block) | 1837 | get_block_t *get_block) |
1838 | { | 1838 | { |
1839 | struct inode *inode = page->mapping->host; | 1839 | struct inode *inode = page->mapping->host; |
1840 | unsigned block_start, block_end; | 1840 | unsigned block_start, block_end; |
1841 | sector_t block; | 1841 | sector_t block; |
1842 | int err = 0; | 1842 | int err = 0; |
1843 | unsigned blocksize, bbits; | 1843 | unsigned blocksize, bbits; |
1844 | struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; | 1844 | struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; |
1845 | 1845 | ||
1846 | BUG_ON(!PageLocked(page)); | 1846 | BUG_ON(!PageLocked(page)); |
1847 | BUG_ON(from > PAGE_CACHE_SIZE); | 1847 | BUG_ON(from > PAGE_CACHE_SIZE); |
1848 | BUG_ON(to > PAGE_CACHE_SIZE); | 1848 | BUG_ON(to > PAGE_CACHE_SIZE); |
1849 | BUG_ON(from > to); | 1849 | BUG_ON(from > to); |
1850 | 1850 | ||
1851 | blocksize = 1 << inode->i_blkbits; | 1851 | blocksize = 1 << inode->i_blkbits; |
1852 | if (!page_has_buffers(page)) | 1852 | if (!page_has_buffers(page)) |
1853 | create_empty_buffers(page, blocksize, 0); | 1853 | create_empty_buffers(page, blocksize, 0); |
1854 | head = page_buffers(page); | 1854 | head = page_buffers(page); |
1855 | 1855 | ||
1856 | bbits = inode->i_blkbits; | 1856 | bbits = inode->i_blkbits; |
1857 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); | 1857 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); |
1858 | 1858 | ||
1859 | for(bh = head, block_start = 0; bh != head || !block_start; | 1859 | for(bh = head, block_start = 0; bh != head || !block_start; |
1860 | block++, block_start=block_end, bh = bh->b_this_page) { | 1860 | block++, block_start=block_end, bh = bh->b_this_page) { |
1861 | block_end = block_start + blocksize; | 1861 | block_end = block_start + blocksize; |
1862 | if (block_end <= from || block_start >= to) { | 1862 | if (block_end <= from || block_start >= to) { |
1863 | if (PageUptodate(page)) { | 1863 | if (PageUptodate(page)) { |
1864 | if (!buffer_uptodate(bh)) | 1864 | if (!buffer_uptodate(bh)) |
1865 | set_buffer_uptodate(bh); | 1865 | set_buffer_uptodate(bh); |
1866 | } | 1866 | } |
1867 | continue; | 1867 | continue; |
1868 | } | 1868 | } |
1869 | if (buffer_new(bh)) | 1869 | if (buffer_new(bh)) |
1870 | clear_buffer_new(bh); | 1870 | clear_buffer_new(bh); |
1871 | if (!buffer_mapped(bh)) { | 1871 | if (!buffer_mapped(bh)) { |
1872 | WARN_ON(bh->b_size != blocksize); | 1872 | WARN_ON(bh->b_size != blocksize); |
1873 | err = get_block(inode, block, bh, 1); | 1873 | err = get_block(inode, block, bh, 1); |
1874 | if (err) | 1874 | if (err) |
1875 | break; | 1875 | break; |
1876 | if (buffer_new(bh)) { | 1876 | if (buffer_new(bh)) { |
1877 | unmap_underlying_metadata(bh->b_bdev, | 1877 | unmap_underlying_metadata(bh->b_bdev, |
1878 | bh->b_blocknr); | 1878 | bh->b_blocknr); |
1879 | if (PageUptodate(page)) { | 1879 | if (PageUptodate(page)) { |
1880 | clear_buffer_new(bh); | 1880 | clear_buffer_new(bh); |
1881 | set_buffer_uptodate(bh); | 1881 | set_buffer_uptodate(bh); |
1882 | mark_buffer_dirty(bh); | 1882 | mark_buffer_dirty(bh); |
1883 | continue; | 1883 | continue; |
1884 | } | 1884 | } |
1885 | if (block_end > to || block_start < from) | 1885 | if (block_end > to || block_start < from) |
1886 | zero_user_segments(page, | 1886 | zero_user_segments(page, |
1887 | to, block_end, | 1887 | to, block_end, |
1888 | block_start, from); | 1888 | block_start, from); |
1889 | continue; | 1889 | continue; |
1890 | } | 1890 | } |
1891 | } | 1891 | } |
1892 | if (PageUptodate(page)) { | 1892 | if (PageUptodate(page)) { |
1893 | if (!buffer_uptodate(bh)) | 1893 | if (!buffer_uptodate(bh)) |
1894 | set_buffer_uptodate(bh); | 1894 | set_buffer_uptodate(bh); |
1895 | continue; | 1895 | continue; |
1896 | } | 1896 | } |
1897 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && | 1897 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && |
1898 | !buffer_unwritten(bh) && | 1898 | !buffer_unwritten(bh) && |
1899 | (block_start < from || block_end > to)) { | 1899 | (block_start < from || block_end > to)) { |
1900 | ll_rw_block(READ, 1, &bh); | 1900 | ll_rw_block(READ, 1, &bh); |
1901 | *wait_bh++=bh; | 1901 | *wait_bh++=bh; |
1902 | } | 1902 | } |
1903 | } | 1903 | } |
1904 | /* | 1904 | /* |
1905 | * If we issued read requests - let them complete. | 1905 | * If we issued read requests - let them complete. |
1906 | */ | 1906 | */ |
1907 | while(wait_bh > wait) { | 1907 | while(wait_bh > wait) { |
1908 | wait_on_buffer(*--wait_bh); | 1908 | wait_on_buffer(*--wait_bh); |
1909 | if (!buffer_uptodate(*wait_bh)) | 1909 | if (!buffer_uptodate(*wait_bh)) |
1910 | err = -EIO; | 1910 | err = -EIO; |
1911 | } | 1911 | } |
1912 | if (unlikely(err)) { | 1912 | if (unlikely(err)) { |
1913 | page_zero_new_buffers(page, from, to); | 1913 | page_zero_new_buffers(page, from, to); |
1914 | ClearPageUptodate(page); | 1914 | ClearPageUptodate(page); |
1915 | } | 1915 | } |
1916 | return err; | 1916 | return err; |
1917 | } | 1917 | } |
1918 | EXPORT_SYMBOL(block_prepare_write); | 1918 | EXPORT_SYMBOL(block_prepare_write); |
1919 | 1919 | ||
1920 | static int __block_commit_write(struct inode *inode, struct page *page, | 1920 | static int __block_commit_write(struct inode *inode, struct page *page, |
1921 | unsigned from, unsigned to) | 1921 | unsigned from, unsigned to) |
1922 | { | 1922 | { |
1923 | unsigned block_start, block_end; | 1923 | unsigned block_start, block_end; |
1924 | int partial = 0; | 1924 | int partial = 0; |
1925 | unsigned blocksize; | 1925 | unsigned blocksize; |
1926 | struct buffer_head *bh, *head; | 1926 | struct buffer_head *bh, *head; |
1927 | 1927 | ||
1928 | blocksize = 1 << inode->i_blkbits; | 1928 | blocksize = 1 << inode->i_blkbits; |
1929 | 1929 | ||
1930 | for(bh = head = page_buffers(page), block_start = 0; | 1930 | for(bh = head = page_buffers(page), block_start = 0; |
1931 | bh != head || !block_start; | 1931 | bh != head || !block_start; |
1932 | block_start=block_end, bh = bh->b_this_page) { | 1932 | block_start=block_end, bh = bh->b_this_page) { |
1933 | block_end = block_start + blocksize; | 1933 | block_end = block_start + blocksize; |
1934 | if (block_end <= from || block_start >= to) { | 1934 | if (block_end <= from || block_start >= to) { |
1935 | if (!buffer_uptodate(bh)) | 1935 | if (!buffer_uptodate(bh)) |
1936 | partial = 1; | 1936 | partial = 1; |
1937 | } else { | 1937 | } else { |
1938 | set_buffer_uptodate(bh); | 1938 | set_buffer_uptodate(bh); |
1939 | mark_buffer_dirty(bh); | 1939 | mark_buffer_dirty(bh); |
1940 | } | 1940 | } |
1941 | clear_buffer_new(bh); | 1941 | clear_buffer_new(bh); |
1942 | } | 1942 | } |
1943 | 1943 | ||
1944 | /* | 1944 | /* |
1945 | * If this is a partial write which happened to make all buffers | 1945 | * If this is a partial write which happened to make all buffers |
1946 | * uptodate then we can optimize away a bogus readpage() for | 1946 | * uptodate then we can optimize away a bogus readpage() for |
1947 | * the next read(). Here we 'discover' whether the page went | 1947 | * the next read(). Here we 'discover' whether the page went |
1948 | * uptodate as a result of this (potentially partial) write. | 1948 | * uptodate as a result of this (potentially partial) write. |
1949 | */ | 1949 | */ |
1950 | if (!partial) | 1950 | if (!partial) |
1951 | SetPageUptodate(page); | 1951 | SetPageUptodate(page); |
1952 | return 0; | 1952 | return 0; |
1953 | } | 1953 | } |
1954 | 1954 | ||
1955 | int __block_write_begin(struct page *page, loff_t pos, unsigned len, | 1955 | int __block_write_begin(struct page *page, loff_t pos, unsigned len, |
1956 | get_block_t *get_block) | 1956 | get_block_t *get_block) |
1957 | { | 1957 | { |
1958 | unsigned start = pos & (PAGE_CACHE_SIZE - 1); | 1958 | unsigned start = pos & (PAGE_CACHE_SIZE - 1); |
1959 | 1959 | ||
1960 | return block_prepare_write(page, start, start + len, get_block); | 1960 | return block_prepare_write(page, start, start + len, get_block); |
1961 | } | 1961 | } |
1962 | EXPORT_SYMBOL(__block_write_begin); | 1962 | EXPORT_SYMBOL(__block_write_begin); |
1963 | 1963 | ||
1964 | /* | 1964 | /* |
1965 | * block_write_begin takes care of the basic task of block allocation and | 1965 | * block_write_begin takes care of the basic task of block allocation and |
1966 | * bringing partial write blocks uptodate first. | 1966 | * bringing partial write blocks uptodate first. |
1967 | * | 1967 | * |
1968 | * The filesystem needs to handle block truncation upon failure. | 1968 | * The filesystem needs to handle block truncation upon failure. |
1969 | */ | 1969 | */ |
1970 | int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, | 1970 | int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, |
1971 | unsigned flags, struct page **pagep, get_block_t *get_block) | 1971 | unsigned flags, struct page **pagep, get_block_t *get_block) |
1972 | { | 1972 | { |
1973 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1973 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1974 | struct page *page; | 1974 | struct page *page; |
1975 | int status; | 1975 | int status; |
1976 | 1976 | ||
1977 | page = grab_cache_page_write_begin(mapping, index, flags); | 1977 | page = grab_cache_page_write_begin(mapping, index, flags); |
1978 | if (!page) | 1978 | if (!page) |
1979 | return -ENOMEM; | 1979 | return -ENOMEM; |
1980 | 1980 | ||
1981 | status = __block_write_begin(page, pos, len, get_block); | 1981 | status = __block_write_begin(page, pos, len, get_block); |
1982 | if (unlikely(status)) { | 1982 | if (unlikely(status)) { |
1983 | unlock_page(page); | 1983 | unlock_page(page); |
1984 | page_cache_release(page); | 1984 | page_cache_release(page); |
1985 | page = NULL; | 1985 | page = NULL; |
1986 | } | 1986 | } |
1987 | 1987 | ||
1988 | *pagep = page; | 1988 | *pagep = page; |
1989 | return status; | 1989 | return status; |
1990 | } | 1990 | } |
1991 | EXPORT_SYMBOL(block_write_begin); | 1991 | EXPORT_SYMBOL(block_write_begin); |
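/*
 * Illustrative sketch, not part of this change: a minimal ->write_begin
 * can defer entirely to block_write_begin() above, passing the
 * filesystem's own get_block callback.  "foo_write_begin" and
 * "foo_get_block" are hypothetical; as noted above, on failure the
 * filesystem still has to truncate any blocks allocated beyond i_size.
 */
static int foo_write_begin(struct file *file, struct address_space *mapping,
			   loff_t pos, unsigned len, unsigned flags,
			   struct page **pagep, void **fsdata)
{
	return block_write_begin(mapping, pos, len, flags, pagep,
				 foo_get_block);
}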
1992 | 1992 | ||
1993 | int block_write_end(struct file *file, struct address_space *mapping, | 1993 | int block_write_end(struct file *file, struct address_space *mapping, |
1994 | loff_t pos, unsigned len, unsigned copied, | 1994 | loff_t pos, unsigned len, unsigned copied, |
1995 | struct page *page, void *fsdata) | 1995 | struct page *page, void *fsdata) |
1996 | { | 1996 | { |
1997 | struct inode *inode = mapping->host; | 1997 | struct inode *inode = mapping->host; |
1998 | unsigned start; | 1998 | unsigned start; |
1999 | 1999 | ||
2000 | start = pos & (PAGE_CACHE_SIZE - 1); | 2000 | start = pos & (PAGE_CACHE_SIZE - 1); |
2001 | 2001 | ||
2002 | if (unlikely(copied < len)) { | 2002 | if (unlikely(copied < len)) { |
2003 | /* | 2003 | /* |
2004 | * The buffers that were written will now be uptodate, so we | 2004 | * The buffers that were written will now be uptodate, so we |
2005 | * don't have to worry about a readpage reading them and | 2005 | * don't have to worry about a readpage reading them and |
2006 | * overwriting a partial write. However if we have encountered | 2006 | * overwriting a partial write. However if we have encountered |
2007 | * a short write and only partially written into a buffer, it | 2007 | * a short write and only partially written into a buffer, it |
2008 | * will not be marked uptodate, so a readpage might come in and | 2008 | * will not be marked uptodate, so a readpage might come in and |
2009 | * destroy our partial write. | 2009 | * destroy our partial write. |
2010 | * | 2010 | * |
2011 | * Do the simplest thing, and just treat any short write to a | 2011 | * Do the simplest thing, and just treat any short write to a |
2012 | * non uptodate page as a zero-length write, and force the | 2012 | * non uptodate page as a zero-length write, and force the |
2013 | * caller to redo the whole thing. | 2013 | * caller to redo the whole thing. |
2014 | */ | 2014 | */ |
2015 | if (!PageUptodate(page)) | 2015 | if (!PageUptodate(page)) |
2016 | copied = 0; | 2016 | copied = 0; |
2017 | 2017 | ||
2018 | page_zero_new_buffers(page, start+copied, start+len); | 2018 | page_zero_new_buffers(page, start+copied, start+len); |
2019 | } | 2019 | } |
2020 | flush_dcache_page(page); | 2020 | flush_dcache_page(page); |
2021 | 2021 | ||
2022 | /* This could be a short (even 0-length) commit */ | 2022 | /* This could be a short (even 0-length) commit */ |
2023 | __block_commit_write(inode, page, start, start+copied); | 2023 | __block_commit_write(inode, page, start, start+copied); |
2024 | 2024 | ||
2025 | return copied; | 2025 | return copied; |
2026 | } | 2026 | } |
2027 | EXPORT_SYMBOL(block_write_end); | 2027 | EXPORT_SYMBOL(block_write_end); |
2028 | 2028 | ||
2029 | int generic_write_end(struct file *file, struct address_space *mapping, | 2029 | int generic_write_end(struct file *file, struct address_space *mapping, |
2030 | loff_t pos, unsigned len, unsigned copied, | 2030 | loff_t pos, unsigned len, unsigned copied, |
2031 | struct page *page, void *fsdata) | 2031 | struct page *page, void *fsdata) |
2032 | { | 2032 | { |
2033 | struct inode *inode = mapping->host; | 2033 | struct inode *inode = mapping->host; |
2034 | int i_size_changed = 0; | 2034 | int i_size_changed = 0; |
2035 | 2035 | ||
2036 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | 2036 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); |
2037 | 2037 | ||
2038 | /* | 2038 | /* |
2039 | * No need to use i_size_read() here, the i_size | 2039 | * No need to use i_size_read() here, the i_size |
2040 | * cannot change under us because we hold i_mutex. | 2040 | * cannot change under us because we hold i_mutex. |
2041 | * | 2041 | * |
2042 | * But it's important to update i_size while still holding page lock: | 2042 | * But it's important to update i_size while still holding page lock: |
2043 | * page writeout could otherwise come in and zero beyond i_size. | 2043 | * page writeout could otherwise come in and zero beyond i_size. |
2044 | */ | 2044 | */ |
2045 | if (pos+copied > inode->i_size) { | 2045 | if (pos+copied > inode->i_size) { |
2046 | i_size_write(inode, pos+copied); | 2046 | i_size_write(inode, pos+copied); |
2047 | i_size_changed = 1; | 2047 | i_size_changed = 1; |
2048 | } | 2048 | } |
2049 | 2049 | ||
2050 | unlock_page(page); | 2050 | unlock_page(page); |
2051 | page_cache_release(page); | 2051 | page_cache_release(page); |
2052 | 2052 | ||
2053 | /* | 2053 | /* |
2054 | * Don't mark the inode dirty under page lock. First, it unnecessarily | 2054 | * Don't mark the inode dirty under page lock. First, it unnecessarily |
2055 | * makes the holding time of page lock longer. Second, it forces lock | 2055 | * makes the holding time of page lock longer. Second, it forces lock |
2056 | * ordering of page lock and transaction start for journaling | 2056 | * ordering of page lock and transaction start for journaling |
2057 | * filesystems. | 2057 | * filesystems. |
2058 | */ | 2058 | */ |
2059 | if (i_size_changed) | 2059 | if (i_size_changed) |
2060 | mark_inode_dirty(inode); | 2060 | mark_inode_dirty(inode); |
2061 | 2061 | ||
2062 | return copied; | 2062 | return copied; |
2063 | } | 2063 | } |
2064 | EXPORT_SYMBOL(generic_write_end); | 2064 | EXPORT_SYMBOL(generic_write_end); |
2065 | 2065 | ||
2066 | /* | 2066 | /* |
2067 | * block_is_partially_uptodate checks whether buffers within a page are | 2067 | * block_is_partially_uptodate checks whether buffers within a page are |
2068 | * uptodate or not. | 2068 | * uptodate or not. |
2069 | * | 2069 | * |
2070 | * Returns true if all buffers which correspond to a file portion | 2070 | * Returns true if all buffers which correspond to a file portion |
2071 | * we want to read are uptodate. | 2071 | * we want to read are uptodate. |
2072 | */ | 2072 | */ |
2073 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | 2073 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, |
2074 | unsigned long from) | 2074 | unsigned long from) |
2075 | { | 2075 | { |
2076 | struct inode *inode = page->mapping->host; | 2076 | struct inode *inode = page->mapping->host; |
2077 | unsigned block_start, block_end, blocksize; | 2077 | unsigned block_start, block_end, blocksize; |
2078 | unsigned to; | 2078 | unsigned to; |
2079 | struct buffer_head *bh, *head; | 2079 | struct buffer_head *bh, *head; |
2080 | int ret = 1; | 2080 | int ret = 1; |
2081 | 2081 | ||
2082 | if (!page_has_buffers(page)) | 2082 | if (!page_has_buffers(page)) |
2083 | return 0; | 2083 | return 0; |
2084 | 2084 | ||
2085 | blocksize = 1 << inode->i_blkbits; | 2085 | blocksize = 1 << inode->i_blkbits; |
2086 | to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); | 2086 | to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); |
2087 | to = from + to; | 2087 | to = from + to; |
2088 | if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) | 2088 | if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) |
2089 | return 0; | 2089 | return 0; |
2090 | 2090 | ||
2091 | head = page_buffers(page); | 2091 | head = page_buffers(page); |
2092 | bh = head; | 2092 | bh = head; |
2093 | block_start = 0; | 2093 | block_start = 0; |
2094 | do { | 2094 | do { |
2095 | block_end = block_start + blocksize; | 2095 | block_end = block_start + blocksize; |
2096 | if (block_end > from && block_start < to) { | 2096 | if (block_end > from && block_start < to) { |
2097 | if (!buffer_uptodate(bh)) { | 2097 | if (!buffer_uptodate(bh)) { |
2098 | ret = 0; | 2098 | ret = 0; |
2099 | break; | 2099 | break; |
2100 | } | 2100 | } |
2101 | if (block_end >= to) | 2101 | if (block_end >= to) |
2102 | break; | 2102 | break; |
2103 | } | 2103 | } |
2104 | block_start = block_end; | 2104 | block_start = block_end; |
2105 | bh = bh->b_this_page; | 2105 | bh = bh->b_this_page; |
2106 | } while (bh != head); | 2106 | } while (bh != head); |
2107 | 2107 | ||
2108 | return ret; | 2108 | return ret; |
2109 | } | 2109 | } |
2110 | EXPORT_SYMBOL(block_is_partially_uptodate); | 2110 | EXPORT_SYMBOL(block_is_partially_uptodate); |
2111 | 2111 | ||
2112 | /* | 2112 | /* |
2113 | * Generic "read page" function for block devices that have the normal | 2113 | * Generic "read page" function for block devices that have the normal |
2114 | * get_block functionality. This covers most block device filesystems. | 2114 | * get_block functionality. This covers most block device filesystems. |
2115 | * Reads the page asynchronously --- the unlock_buffer() and | 2115 | * Reads the page asynchronously --- the unlock_buffer() and |
2116 | * set/clear_buffer_uptodate() functions propagate buffer state into the | 2116 | * set/clear_buffer_uptodate() functions propagate buffer state into the |
2117 | * page struct once IO has completed. | 2117 | * page struct once IO has completed. |
2118 | */ | 2118 | */ |
2119 | int block_read_full_page(struct page *page, get_block_t *get_block) | 2119 | int block_read_full_page(struct page *page, get_block_t *get_block) |
2120 | { | 2120 | { |
2121 | struct inode *inode = page->mapping->host; | 2121 | struct inode *inode = page->mapping->host; |
2122 | sector_t iblock, lblock; | 2122 | sector_t iblock, lblock; |
2123 | struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; | 2123 | struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; |
2124 | unsigned int blocksize; | 2124 | unsigned int blocksize; |
2125 | int nr, i; | 2125 | int nr, i; |
2126 | int fully_mapped = 1; | 2126 | int fully_mapped = 1; |
2127 | 2127 | ||
2128 | BUG_ON(!PageLocked(page)); | 2128 | BUG_ON(!PageLocked(page)); |
2129 | blocksize = 1 << inode->i_blkbits; | 2129 | blocksize = 1 << inode->i_blkbits; |
2130 | if (!page_has_buffers(page)) | 2130 | if (!page_has_buffers(page)) |
2131 | create_empty_buffers(page, blocksize, 0); | 2131 | create_empty_buffers(page, blocksize, 0); |
2132 | head = page_buffers(page); | 2132 | head = page_buffers(page); |
2133 | 2133 | ||
2134 | iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 2134 | iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2135 | lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; | 2135 | lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; |
2136 | bh = head; | 2136 | bh = head; |
2137 | nr = 0; | 2137 | nr = 0; |
2138 | i = 0; | 2138 | i = 0; |
2139 | 2139 | ||
2140 | do { | 2140 | do { |
2141 | if (buffer_uptodate(bh)) | 2141 | if (buffer_uptodate(bh)) |
2142 | continue; | 2142 | continue; |
2143 | 2143 | ||
2144 | if (!buffer_mapped(bh)) { | 2144 | if (!buffer_mapped(bh)) { |
2145 | int err = 0; | 2145 | int err = 0; |
2146 | 2146 | ||
2147 | fully_mapped = 0; | 2147 | fully_mapped = 0; |
2148 | if (iblock < lblock) { | 2148 | if (iblock < lblock) { |
2149 | WARN_ON(bh->b_size != blocksize); | 2149 | WARN_ON(bh->b_size != blocksize); |
2150 | err = get_block(inode, iblock, bh, 0); | 2150 | err = get_block(inode, iblock, bh, 0); |
2151 | if (err) | 2151 | if (err) |
2152 | SetPageError(page); | 2152 | SetPageError(page); |
2153 | } | 2153 | } |
2154 | if (!buffer_mapped(bh)) { | 2154 | if (!buffer_mapped(bh)) { |
2155 | zero_user(page, i * blocksize, blocksize); | 2155 | zero_user(page, i * blocksize, blocksize); |
2156 | if (!err) | 2156 | if (!err) |
2157 | set_buffer_uptodate(bh); | 2157 | set_buffer_uptodate(bh); |
2158 | continue; | 2158 | continue; |
2159 | } | 2159 | } |
2160 | /* | 2160 | /* |
2161 | * get_block() might have updated the buffer | 2161 | * get_block() might have updated the buffer |
2162 | * synchronously | 2162 | * synchronously |
2163 | */ | 2163 | */ |
2164 | if (buffer_uptodate(bh)) | 2164 | if (buffer_uptodate(bh)) |
2165 | continue; | 2165 | continue; |
2166 | } | 2166 | } |
2167 | arr[nr++] = bh; | 2167 | arr[nr++] = bh; |
2168 | } while (i++, iblock++, (bh = bh->b_this_page) != head); | 2168 | } while (i++, iblock++, (bh = bh->b_this_page) != head); |
2169 | 2169 | ||
2170 | if (fully_mapped) | 2170 | if (fully_mapped) |
2171 | SetPageMappedToDisk(page); | 2171 | SetPageMappedToDisk(page); |
2172 | 2172 | ||
2173 | if (!nr) { | 2173 | if (!nr) { |
2174 | /* | 2174 | /* |
2175 | * All buffers are uptodate - we can set the page uptodate | 2175 | * All buffers are uptodate - we can set the page uptodate |
2176 | * as well. But not if get_block() returned an error. | 2176 | * as well. But not if get_block() returned an error. |
2177 | */ | 2177 | */ |
2178 | if (!PageError(page)) | 2178 | if (!PageError(page)) |
2179 | SetPageUptodate(page); | 2179 | SetPageUptodate(page); |
2180 | unlock_page(page); | 2180 | unlock_page(page); |
2181 | return 0; | 2181 | return 0; |
2182 | } | 2182 | } |
2183 | 2183 | ||
2184 | /* Stage two: lock the buffers */ | 2184 | /* Stage two: lock the buffers */ |
2185 | for (i = 0; i < nr; i++) { | 2185 | for (i = 0; i < nr; i++) { |
2186 | bh = arr[i]; | 2186 | bh = arr[i]; |
2187 | lock_buffer(bh); | 2187 | lock_buffer(bh); |
2188 | mark_buffer_async_read(bh); | 2188 | mark_buffer_async_read(bh); |
2189 | } | 2189 | } |
2190 | 2190 | ||
2191 | /* | 2191 | /* |
2192 | * Stage 3: start the IO. Check for uptodateness | 2192 | * Stage 3: start the IO. Check for uptodateness |
2193 | * inside the buffer lock in case another process reading | 2193 | * inside the buffer lock in case another process reading |
2194 | * the underlying blockdev brought it uptodate (the sct fix). | 2194 | * the underlying blockdev brought it uptodate (the sct fix). |
2195 | */ | 2195 | */ |
2196 | for (i = 0; i < nr; i++) { | 2196 | for (i = 0; i < nr; i++) { |
2197 | bh = arr[i]; | 2197 | bh = arr[i]; |
2198 | if (buffer_uptodate(bh)) | 2198 | if (buffer_uptodate(bh)) |
2199 | end_buffer_async_read(bh, 1); | 2199 | end_buffer_async_read(bh, 1); |
2200 | else | 2200 | else |
2201 | submit_bh(READ, bh); | 2201 | submit_bh(READ, bh); |
2202 | } | 2202 | } |
2203 | return 0; | 2203 | return 0; |
2204 | } | 2204 | } |
2205 | EXPORT_SYMBOL(block_read_full_page); | 2205 | EXPORT_SYMBOL(block_read_full_page); |
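/*
 * Illustrative sketch, not part of this change: a ->readpage built on the
 * generic helper above.  "foo_readpage" and "foo_get_block" are
 * hypothetical names.
 */
static int foo_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, foo_get_block);
}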
2206 | 2206 | ||
2207 | /* utility function for filesystems that need to do work on expanding | 2207 | /* utility function for filesystems that need to do work on expanding |
2208 | * truncates. Uses filesystem pagecache writes to allow the filesystem to | 2208 | * truncates. Uses filesystem pagecache writes to allow the filesystem to |
2209 | * deal with the hole. | 2209 | * deal with the hole. |
2210 | */ | 2210 | */ |
2211 | int generic_cont_expand_simple(struct inode *inode, loff_t size) | 2211 | int generic_cont_expand_simple(struct inode *inode, loff_t size) |
2212 | { | 2212 | { |
2213 | struct address_space *mapping = inode->i_mapping; | 2213 | struct address_space *mapping = inode->i_mapping; |
2214 | struct page *page; | 2214 | struct page *page; |
2215 | void *fsdata; | 2215 | void *fsdata; |
2216 | int err; | 2216 | int err; |
2217 | 2217 | ||
2218 | err = inode_newsize_ok(inode, size); | 2218 | err = inode_newsize_ok(inode, size); |
2219 | if (err) | 2219 | if (err) |
2220 | goto out; | 2220 | goto out; |
2221 | 2221 | ||
2222 | err = pagecache_write_begin(NULL, mapping, size, 0, | 2222 | err = pagecache_write_begin(NULL, mapping, size, 0, |
2223 | AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, | 2223 | AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, |
2224 | &page, &fsdata); | 2224 | &page, &fsdata); |
2225 | if (err) | 2225 | if (err) |
2226 | goto out; | 2226 | goto out; |
2227 | 2227 | ||
2228 | err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); | 2228 | err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); |
2229 | BUG_ON(err > 0); | 2229 | BUG_ON(err > 0); |
2230 | 2230 | ||
2231 | out: | 2231 | out: |
2232 | return err; | 2232 | return err; |
2233 | } | 2233 | } |
2234 | EXPORT_SYMBOL(generic_cont_expand_simple); | 2234 | EXPORT_SYMBOL(generic_cont_expand_simple); |
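/*
 * Illustrative sketch, not part of this change: an expanding truncate in a
 * filesystem's ->setattr path can use the helper above to grow i_size via
 * pagecache writes.  "foo_setsize" is a hypothetical name.
 */
static int foo_setsize(struct inode *inode, loff_t newsize)
{
	if (newsize > i_size_read(inode))
		return generic_cont_expand_simple(inode, newsize);
	return 0;
}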
2235 | 2235 | ||
2236 | static int cont_expand_zero(struct file *file, struct address_space *mapping, | 2236 | static int cont_expand_zero(struct file *file, struct address_space *mapping, |
2237 | loff_t pos, loff_t *bytes) | 2237 | loff_t pos, loff_t *bytes) |
2238 | { | 2238 | { |
2239 | struct inode *inode = mapping->host; | 2239 | struct inode *inode = mapping->host; |
2240 | unsigned blocksize = 1 << inode->i_blkbits; | 2240 | unsigned blocksize = 1 << inode->i_blkbits; |
2241 | struct page *page; | 2241 | struct page *page; |
2242 | void *fsdata; | 2242 | void *fsdata; |
2243 | pgoff_t index, curidx; | 2243 | pgoff_t index, curidx; |
2244 | loff_t curpos; | 2244 | loff_t curpos; |
2245 | unsigned zerofrom, offset, len; | 2245 | unsigned zerofrom, offset, len; |
2246 | int err = 0; | 2246 | int err = 0; |
2247 | 2247 | ||
2248 | index = pos >> PAGE_CACHE_SHIFT; | 2248 | index = pos >> PAGE_CACHE_SHIFT; |
2249 | offset = pos & ~PAGE_CACHE_MASK; | 2249 | offset = pos & ~PAGE_CACHE_MASK; |
2250 | 2250 | ||
2251 | while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { | 2251 | while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { |
2252 | zerofrom = curpos & ~PAGE_CACHE_MASK; | 2252 | zerofrom = curpos & ~PAGE_CACHE_MASK; |
2253 | if (zerofrom & (blocksize-1)) { | 2253 | if (zerofrom & (blocksize-1)) { |
2254 | *bytes |= (blocksize-1); | 2254 | *bytes |= (blocksize-1); |
2255 | (*bytes)++; | 2255 | (*bytes)++; |
2256 | } | 2256 | } |
2257 | len = PAGE_CACHE_SIZE - zerofrom; | 2257 | len = PAGE_CACHE_SIZE - zerofrom; |
2258 | 2258 | ||
2259 | err = pagecache_write_begin(file, mapping, curpos, len, | 2259 | err = pagecache_write_begin(file, mapping, curpos, len, |
2260 | AOP_FLAG_UNINTERRUPTIBLE, | 2260 | AOP_FLAG_UNINTERRUPTIBLE, |
2261 | &page, &fsdata); | 2261 | &page, &fsdata); |
2262 | if (err) | 2262 | if (err) |
2263 | goto out; | 2263 | goto out; |
2264 | zero_user(page, zerofrom, len); | 2264 | zero_user(page, zerofrom, len); |
2265 | err = pagecache_write_end(file, mapping, curpos, len, len, | 2265 | err = pagecache_write_end(file, mapping, curpos, len, len, |
2266 | page, fsdata); | 2266 | page, fsdata); |
2267 | if (err < 0) | 2267 | if (err < 0) |
2268 | goto out; | 2268 | goto out; |
2269 | BUG_ON(err != len); | 2269 | BUG_ON(err != len); |
2270 | err = 0; | 2270 | err = 0; |
2271 | 2271 | ||
2272 | balance_dirty_pages_ratelimited(mapping); | 2272 | balance_dirty_pages_ratelimited(mapping); |
2273 | } | 2273 | } |
2274 | 2274 | ||
2275 | /* page covers the boundary, find the boundary offset */ | 2275 | /* page covers the boundary, find the boundary offset */ |
2276 | if (index == curidx) { | 2276 | if (index == curidx) { |
2277 | zerofrom = curpos & ~PAGE_CACHE_MASK; | 2277 | zerofrom = curpos & ~PAGE_CACHE_MASK; |
2278 | /* if we are going to expand the file, the last block will be filled */ | 2278 | /* if we are going to expand the file, the last block will be filled */ |
2279 | if (offset <= zerofrom) { | 2279 | if (offset <= zerofrom) { |
2280 | goto out; | 2280 | goto out; |
2281 | } | 2281 | } |
2282 | if (zerofrom & (blocksize-1)) { | 2282 | if (zerofrom & (blocksize-1)) { |
2283 | *bytes |= (blocksize-1); | 2283 | *bytes |= (blocksize-1); |
2284 | (*bytes)++; | 2284 | (*bytes)++; |
2285 | } | 2285 | } |
2286 | len = offset - zerofrom; | 2286 | len = offset - zerofrom; |
2287 | 2287 | ||
2288 | err = pagecache_write_begin(file, mapping, curpos, len, | 2288 | err = pagecache_write_begin(file, mapping, curpos, len, |
2289 | AOP_FLAG_UNINTERRUPTIBLE, | 2289 | AOP_FLAG_UNINTERRUPTIBLE, |
2290 | &page, &fsdata); | 2290 | &page, &fsdata); |
2291 | if (err) | 2291 | if (err) |
2292 | goto out; | 2292 | goto out; |
2293 | zero_user(page, zerofrom, len); | 2293 | zero_user(page, zerofrom, len); |
2294 | err = pagecache_write_end(file, mapping, curpos, len, len, | 2294 | err = pagecache_write_end(file, mapping, curpos, len, len, |
2295 | page, fsdata); | 2295 | page, fsdata); |
2296 | if (err < 0) | 2296 | if (err < 0) |
2297 | goto out; | 2297 | goto out; |
2298 | BUG_ON(err != len); | 2298 | BUG_ON(err != len); |
2299 | err = 0; | 2299 | err = 0; |
2300 | } | 2300 | } |
2301 | out: | 2301 | out: |
2302 | return err; | 2302 | return err; |
2303 | } | 2303 | } |
2304 | 2304 | ||
2305 | /* | 2305 | /* |
2306 | * For moronic filesystems that do not allow holes in files. | 2306 | * For moronic filesystems that do not allow holes in files. |
2307 | * We may have to extend the file. | 2307 | * We may have to extend the file. |
2308 | */ | 2308 | */ |
2309 | int cont_write_begin(struct file *file, struct address_space *mapping, | 2309 | int cont_write_begin(struct file *file, struct address_space *mapping, |
2310 | loff_t pos, unsigned len, unsigned flags, | 2310 | loff_t pos, unsigned len, unsigned flags, |
2311 | struct page **pagep, void **fsdata, | 2311 | struct page **pagep, void **fsdata, |
2312 | get_block_t *get_block, loff_t *bytes) | 2312 | get_block_t *get_block, loff_t *bytes) |
2313 | { | 2313 | { |
2314 | struct inode *inode = mapping->host; | 2314 | struct inode *inode = mapping->host; |
2315 | unsigned blocksize = 1 << inode->i_blkbits; | 2315 | unsigned blocksize = 1 << inode->i_blkbits; |
2316 | unsigned zerofrom; | 2316 | unsigned zerofrom; |
2317 | int err; | 2317 | int err; |
2318 | 2318 | ||
2319 | err = cont_expand_zero(file, mapping, pos, bytes); | 2319 | err = cont_expand_zero(file, mapping, pos, bytes); |
2320 | if (err) | 2320 | if (err) |
2321 | return err; | 2321 | return err; |
2322 | 2322 | ||
2323 | zerofrom = *bytes & ~PAGE_CACHE_MASK; | 2323 | zerofrom = *bytes & ~PAGE_CACHE_MASK; |
2324 | if (pos+len > *bytes && zerofrom & (blocksize-1)) { | 2324 | if (pos+len > *bytes && zerofrom & (blocksize-1)) { |
2325 | *bytes |= (blocksize-1); | 2325 | *bytes |= (blocksize-1); |
2326 | (*bytes)++; | 2326 | (*bytes)++; |
2327 | } | 2327 | } |
2328 | 2328 | ||
2329 | return block_write_begin(mapping, pos, len, flags, pagep, get_block); | 2329 | return block_write_begin(mapping, pos, len, flags, pagep, get_block); |
2330 | } | 2330 | } |
2331 | EXPORT_SYMBOL(cont_write_begin); | 2331 | EXPORT_SYMBOL(cont_write_begin); |
2332 | 2332 | ||
2333 | int block_commit_write(struct page *page, unsigned from, unsigned to) | 2333 | int block_commit_write(struct page *page, unsigned from, unsigned to) |
2334 | { | 2334 | { |
2335 | struct inode *inode = page->mapping->host; | 2335 | struct inode *inode = page->mapping->host; |
2336 | __block_commit_write(inode,page,from,to); | 2336 | __block_commit_write(inode,page,from,to); |
2337 | return 0; | 2337 | return 0; |
2338 | } | 2338 | } |
2339 | EXPORT_SYMBOL(block_commit_write); | 2339 | EXPORT_SYMBOL(block_commit_write); |
2340 | 2340 | ||
2341 | /* | 2341 | /* |
2342 | * block_page_mkwrite() is not allowed to change the file size as it gets | 2342 | * block_page_mkwrite() is not allowed to change the file size as it gets |
2343 | * called from a page fault handler when a page is first dirtied. Hence we must | 2343 | * called from a page fault handler when a page is first dirtied. Hence we must |
2344 | * be careful to check for EOF conditions here. We set the page up correctly | 2344 | * be careful to check for EOF conditions here. We set the page up correctly |
2345 | * for a written page which means we get ENOSPC checking when writing into | 2345 | * for a written page which means we get ENOSPC checking when writing into |
2346 | * holes and correct delalloc and unwritten extent mapping on filesystems that | 2346 | * holes and correct delalloc and unwritten extent mapping on filesystems that |
2347 | * support these features. | 2347 | * support these features. |
2348 | * | 2348 | * |
2349 | * We are not allowed to take the i_mutex here so we have to play games to | 2349 | * We are not allowed to take the i_mutex here so we have to play games to |
2350 | * protect against truncate races as the page could now be beyond EOF. Because | 2350 | * protect against truncate races as the page could now be beyond EOF. Because |
2351 | * truncate writes the inode size before removing pages, once we have the | 2351 | * truncate writes the inode size before removing pages, once we have the |
2352 | * page lock we can determine safely if the page is beyond EOF. If it is not | 2352 | * page lock we can determine safely if the page is beyond EOF. If it is not |
2353 | * beyond EOF, then the page is guaranteed safe against truncation until we | 2353 | * beyond EOF, then the page is guaranteed safe against truncation until we |
2354 | * unlock the page. | 2354 | * unlock the page. |
2355 | */ | 2355 | */ |
2356 | int | 2356 | int |
2357 | block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, | 2357 | block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, |
2358 | get_block_t get_block) | 2358 | get_block_t get_block) |
2359 | { | 2359 | { |
2360 | struct page *page = vmf->page; | 2360 | struct page *page = vmf->page; |
2361 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 2361 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
2362 | unsigned long end; | 2362 | unsigned long end; |
2363 | loff_t size; | 2363 | loff_t size; |
2364 | int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ | 2364 | int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ |
2365 | 2365 | ||
2366 | lock_page(page); | 2366 | lock_page(page); |
2367 | size = i_size_read(inode); | 2367 | size = i_size_read(inode); |
2368 | if ((page->mapping != inode->i_mapping) || | 2368 | if ((page->mapping != inode->i_mapping) || |
2369 | (page_offset(page) > size)) { | 2369 | (page_offset(page) > size)) { |
2370 | /* page got truncated out from underneath us */ | 2370 | /* page got truncated out from underneath us */ |
2371 | unlock_page(page); | 2371 | unlock_page(page); |
2372 | goto out; | 2372 | goto out; |
2373 | } | 2373 | } |
2374 | 2374 | ||
2375 | /* page is wholly or partially inside EOF */ | 2375 | /* page is wholly or partially inside EOF */ |
2376 | if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) | 2376 | if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) |
2377 | end = size & ~PAGE_CACHE_MASK; | 2377 | end = size & ~PAGE_CACHE_MASK; |
2378 | else | 2378 | else |
2379 | end = PAGE_CACHE_SIZE; | 2379 | end = PAGE_CACHE_SIZE; |
2380 | 2380 | ||
2381 | ret = block_prepare_write(page, 0, end, get_block); | 2381 | ret = block_prepare_write(page, 0, end, get_block); |
2382 | if (!ret) | 2382 | if (!ret) |
2383 | ret = block_commit_write(page, 0, end); | 2383 | ret = block_commit_write(page, 0, end); |
2384 | 2384 | ||
2385 | if (unlikely(ret)) { | 2385 | if (unlikely(ret)) { |
2386 | unlock_page(page); | 2386 | unlock_page(page); |
2387 | if (ret == -ENOMEM) | 2387 | if (ret == -ENOMEM) |
2388 | ret = VM_FAULT_OOM; | 2388 | ret = VM_FAULT_OOM; |
2389 | else /* -ENOSPC, -EIO, etc */ | 2389 | else /* -ENOSPC, -EIO, etc */ |
2390 | ret = VM_FAULT_SIGBUS; | 2390 | ret = VM_FAULT_SIGBUS; |
2391 | } else | 2391 | } else |
2392 | ret = VM_FAULT_LOCKED; | 2392 | ret = VM_FAULT_LOCKED; |
2393 | 2393 | ||
2394 | out: | 2394 | out: |
2395 | return ret; | 2395 | return ret; |
2396 | } | 2396 | } |
2397 | EXPORT_SYMBOL(block_page_mkwrite); | 2397 | EXPORT_SYMBOL(block_page_mkwrite); |
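/*
 * Illustrative sketch, not part of this change: wiring block_page_mkwrite()
 * into a file's vm_operations so that mmap'ed writes go through the same
 * get_block path.  "foo_page_mkwrite", "foo_file_vm_ops" and
 * "foo_get_block" are hypothetical names.
 */
static int foo_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return block_page_mkwrite(vma, vmf, foo_get_block);
}

static const struct vm_operations_struct foo_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= foo_page_mkwrite,
};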
2398 | 2398 | ||
2399 | /* | 2399 | /* |
2400 | * nobh_write_begin()'s prereads are special: the buffer_heads are freed | 2400 | * nobh_write_begin()'s prereads are special: the buffer_heads are freed |
2401 | * immediately, while under the page lock. So it needs a special end_io | 2401 | * immediately, while under the page lock. So it needs a special end_io |
2402 | * handler which does not touch the bh after unlocking it. | 2402 | * handler which does not touch the bh after unlocking it. |
2403 | */ | 2403 | */ |
2404 | static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) | 2404 | static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) |
2405 | { | 2405 | { |
2406 | __end_buffer_read_notouch(bh, uptodate); | 2406 | __end_buffer_read_notouch(bh, uptodate); |
2407 | } | 2407 | } |
2408 | 2408 | ||
2409 | /* | 2409 | /* |
2410 | * Attach the singly-linked list of buffers created by nobh_write_begin to | 2410 | * Attach the singly-linked list of buffers created by nobh_write_begin to |
2411 | * the page (converting it to a circular linked list and taking care of page | 2411 | * the page (converting it to a circular linked list and taking care of page |
2412 | * dirty races). | 2412 | * dirty races). |
2413 | */ | 2413 | */ |
2414 | static void attach_nobh_buffers(struct page *page, struct buffer_head *head) | 2414 | static void attach_nobh_buffers(struct page *page, struct buffer_head *head) |
2415 | { | 2415 | { |
2416 | struct buffer_head *bh; | 2416 | struct buffer_head *bh; |
2417 | 2417 | ||
2418 | BUG_ON(!PageLocked(page)); | 2418 | BUG_ON(!PageLocked(page)); |
2419 | 2419 | ||
2420 | spin_lock(&page->mapping->private_lock); | 2420 | spin_lock(&page->mapping->private_lock); |
2421 | bh = head; | 2421 | bh = head; |
2422 | do { | 2422 | do { |
2423 | if (PageDirty(page)) | 2423 | if (PageDirty(page)) |
2424 | set_buffer_dirty(bh); | 2424 | set_buffer_dirty(bh); |
2425 | if (!bh->b_this_page) | 2425 | if (!bh->b_this_page) |
2426 | bh->b_this_page = head; | 2426 | bh->b_this_page = head; |
2427 | bh = bh->b_this_page; | 2427 | bh = bh->b_this_page; |
2428 | } while (bh != head); | 2428 | } while (bh != head); |
2429 | attach_page_buffers(page, head); | 2429 | attach_page_buffers(page, head); |
2430 | spin_unlock(&page->mapping->private_lock); | 2430 | spin_unlock(&page->mapping->private_lock); |
2431 | } | 2431 | } |
2432 | 2432 | ||
2433 | /* | 2433 | /* |
2434 | * On entry, the page is fully not uptodate. | 2434 | * On entry, the page is fully not uptodate. |
2435 | * On exit, the page is fully uptodate in the areas outside (from, to). | 2435 | * On exit, the page is fully uptodate in the areas outside (from, to). |
2436 | * The filesystem needs to handle block truncation upon failure. | 2436 | * The filesystem needs to handle block truncation upon failure. |
2437 | */ | 2437 | */ |
2438 | int nobh_write_begin(struct address_space *mapping, | 2438 | int nobh_write_begin(struct address_space *mapping, |
2439 | loff_t pos, unsigned len, unsigned flags, | 2439 | loff_t pos, unsigned len, unsigned flags, |
2440 | struct page **pagep, void **fsdata, | 2440 | struct page **pagep, void **fsdata, |
2441 | get_block_t *get_block) | 2441 | get_block_t *get_block) |
2442 | { | 2442 | { |
2443 | struct inode *inode = mapping->host; | 2443 | struct inode *inode = mapping->host; |
2444 | const unsigned blkbits = inode->i_blkbits; | 2444 | const unsigned blkbits = inode->i_blkbits; |
2445 | const unsigned blocksize = 1 << blkbits; | 2445 | const unsigned blocksize = 1 << blkbits; |
2446 | struct buffer_head *head, *bh; | 2446 | struct buffer_head *head, *bh; |
2447 | struct page *page; | 2447 | struct page *page; |
2448 | pgoff_t index; | 2448 | pgoff_t index; |
2449 | unsigned from, to; | 2449 | unsigned from, to; |
2450 | unsigned block_in_page; | 2450 | unsigned block_in_page; |
2451 | unsigned block_start, block_end; | 2451 | unsigned block_start, block_end; |
2452 | sector_t block_in_file; | 2452 | sector_t block_in_file; |
2453 | int nr_reads = 0; | 2453 | int nr_reads = 0; |
2454 | int ret = 0; | 2454 | int ret = 0; |
2455 | int is_mapped_to_disk = 1; | 2455 | int is_mapped_to_disk = 1; |
2456 | 2456 | ||
2457 | index = pos >> PAGE_CACHE_SHIFT; | 2457 | index = pos >> PAGE_CACHE_SHIFT; |
2458 | from = pos & (PAGE_CACHE_SIZE - 1); | 2458 | from = pos & (PAGE_CACHE_SIZE - 1); |
2459 | to = from + len; | 2459 | to = from + len; |
2460 | 2460 | ||
2461 | page = grab_cache_page_write_begin(mapping, index, flags); | 2461 | page = grab_cache_page_write_begin(mapping, index, flags); |
2462 | if (!page) | 2462 | if (!page) |
2463 | return -ENOMEM; | 2463 | return -ENOMEM; |
2464 | *pagep = page; | 2464 | *pagep = page; |
2465 | *fsdata = NULL; | 2465 | *fsdata = NULL; |
2466 | 2466 | ||
2467 | if (page_has_buffers(page)) { | 2467 | if (page_has_buffers(page)) { |
2468 | unlock_page(page); | 2468 | unlock_page(page); |
2469 | page_cache_release(page); | 2469 | page_cache_release(page); |
2470 | *pagep = NULL; | 2470 | *pagep = NULL; |
2471 | return block_write_begin(mapping, pos, len, flags, pagep, | 2471 | return block_write_begin(mapping, pos, len, flags, pagep, |
2472 | get_block); | 2472 | get_block); |
2473 | } | 2473 | } |
2474 | 2474 | ||
2475 | if (PageMappedToDisk(page)) | 2475 | if (PageMappedToDisk(page)) |
2476 | return 0; | 2476 | return 0; |
2477 | 2477 | ||
2478 | /* | 2478 | /* |
2479 | * Allocate buffers so that we can keep track of state, and potentially | 2479 | * Allocate buffers so that we can keep track of state, and potentially |
2480 | * attach them to the page if an error occurs. In the common case of | 2480 | * attach them to the page if an error occurs. In the common case of |
2481 | * no error, they will just be freed again without ever being attached | 2481 | * no error, they will just be freed again without ever being attached |
2482 | * to the page (which is all OK, because we're under the page lock). | 2482 | * to the page (which is all OK, because we're under the page lock). |
2483 | * | 2483 | * |
2484 | * Be careful: the buffer linked list is a NULL terminated one, rather | 2484 | * Be careful: the buffer linked list is a NULL terminated one, rather |
2485 | * than the circular one we're used to. | 2485 | * than the circular one we're used to. |
2486 | */ | 2486 | */ |
2487 | head = alloc_page_buffers(page, blocksize, 0); | 2487 | head = alloc_page_buffers(page, blocksize, 0); |
2488 | if (!head) { | 2488 | if (!head) { |
2489 | ret = -ENOMEM; | 2489 | ret = -ENOMEM; |
2490 | goto out_release; | 2490 | goto out_release; |
2491 | } | 2491 | } |
2492 | 2492 | ||
2493 | block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); | 2493 | block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); |
2494 | 2494 | ||
2495 | /* | 2495 | /* |
2496 | * We loop across all blocks in the page, whether or not they are | 2496 | * We loop across all blocks in the page, whether or not they are |
2497 | * part of the affected region. This is so we can discover if the | 2497 | * part of the affected region. This is so we can discover if the |
2498 | * page is fully mapped-to-disk. | 2498 | * page is fully mapped-to-disk. |
2499 | */ | 2499 | */ |
2500 | for (block_start = 0, block_in_page = 0, bh = head; | 2500 | for (block_start = 0, block_in_page = 0, bh = head; |
2501 | block_start < PAGE_CACHE_SIZE; | 2501 | block_start < PAGE_CACHE_SIZE; |
2502 | block_in_page++, block_start += blocksize, bh = bh->b_this_page) { | 2502 | block_in_page++, block_start += blocksize, bh = bh->b_this_page) { |
2503 | int create; | 2503 | int create; |
2504 | 2504 | ||
2505 | block_end = block_start + blocksize; | 2505 | block_end = block_start + blocksize; |
2506 | bh->b_state = 0; | 2506 | bh->b_state = 0; |
2507 | create = 1; | 2507 | create = 1; |
2508 | if (block_start >= to) | 2508 | if (block_start >= to) |
2509 | create = 0; | 2509 | create = 0; |
2510 | ret = get_block(inode, block_in_file + block_in_page, | 2510 | ret = get_block(inode, block_in_file + block_in_page, |
2511 | bh, create); | 2511 | bh, create); |
2512 | if (ret) | 2512 | if (ret) |
2513 | goto failed; | 2513 | goto failed; |
2514 | if (!buffer_mapped(bh)) | 2514 | if (!buffer_mapped(bh)) |
2515 | is_mapped_to_disk = 0; | 2515 | is_mapped_to_disk = 0; |
2516 | if (buffer_new(bh)) | 2516 | if (buffer_new(bh)) |
2517 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | 2517 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); |
2518 | if (PageUptodate(page)) { | 2518 | if (PageUptodate(page)) { |
2519 | set_buffer_uptodate(bh); | 2519 | set_buffer_uptodate(bh); |
2520 | continue; | 2520 | continue; |
2521 | } | 2521 | } |
2522 | if (buffer_new(bh) || !buffer_mapped(bh)) { | 2522 | if (buffer_new(bh) || !buffer_mapped(bh)) { |
2523 | zero_user_segments(page, block_start, from, | 2523 | zero_user_segments(page, block_start, from, |
2524 | to, block_end); | 2524 | to, block_end); |
2525 | continue; | 2525 | continue; |
2526 | } | 2526 | } |
2527 | if (buffer_uptodate(bh)) | 2527 | if (buffer_uptodate(bh)) |
2528 | continue; /* reiserfs does this */ | 2528 | continue; /* reiserfs does this */ |
2529 | if (block_start < from || block_end > to) { | 2529 | if (block_start < from || block_end > to) { |
2530 | lock_buffer(bh); | 2530 | lock_buffer(bh); |
2531 | bh->b_end_io = end_buffer_read_nobh; | 2531 | bh->b_end_io = end_buffer_read_nobh; |
2532 | submit_bh(READ, bh); | 2532 | submit_bh(READ, bh); |
2533 | nr_reads++; | 2533 | nr_reads++; |
2534 | } | 2534 | } |
2535 | } | 2535 | } |
2536 | 2536 | ||
2537 | if (nr_reads) { | 2537 | if (nr_reads) { |
2538 | /* | 2538 | /* |
2539 | * The page is locked, so these buffers are protected from | 2539 | * The page is locked, so these buffers are protected from |
2540 | * any VM or truncate activity. Hence we don't need to care | 2540 | * any VM or truncate activity. Hence we don't need to care |
2541 | * for the buffer_head refcounts. | 2541 | * for the buffer_head refcounts. |
2542 | */ | 2542 | */ |
2543 | for (bh = head; bh; bh = bh->b_this_page) { | 2543 | for (bh = head; bh; bh = bh->b_this_page) { |
2544 | wait_on_buffer(bh); | 2544 | wait_on_buffer(bh); |
2545 | if (!buffer_uptodate(bh)) | 2545 | if (!buffer_uptodate(bh)) |
2546 | ret = -EIO; | 2546 | ret = -EIO; |
2547 | } | 2547 | } |
2548 | if (ret) | 2548 | if (ret) |
2549 | goto failed; | 2549 | goto failed; |
2550 | } | 2550 | } |
2551 | 2551 | ||
2552 | if (is_mapped_to_disk) | 2552 | if (is_mapped_to_disk) |
2553 | SetPageMappedToDisk(page); | 2553 | SetPageMappedToDisk(page); |
2554 | 2554 | ||
2555 | *fsdata = head; /* to be released by nobh_write_end */ | 2555 | *fsdata = head; /* to be released by nobh_write_end */ |
2556 | 2556 | ||
2557 | return 0; | 2557 | return 0; |
2558 | 2558 | ||
2559 | failed: | 2559 | failed: |
2560 | BUG_ON(!ret); | 2560 | BUG_ON(!ret); |
2561 | /* | 2561 | /* |
2562 | * Error recovery is a bit difficult. We need to zero out blocks that | 2562 | * Error recovery is a bit difficult. We need to zero out blocks that |
2563 | * were newly allocated, and dirty them to ensure they get written out. | 2563 | * were newly allocated, and dirty them to ensure they get written out. |
2564 | * Buffers need to be attached to the page at this point, otherwise | 2564 | * Buffers need to be attached to the page at this point, otherwise |
2565 | * the handling of potential IO errors during writeout would be hard | 2565 | * the handling of potential IO errors during writeout would be hard |
2566 | * (could try doing synchronous writeout, but what if that fails too?) | 2566 | * (could try doing synchronous writeout, but what if that fails too?) |
2567 | */ | 2567 | */ |
2568 | attach_nobh_buffers(page, head); | 2568 | attach_nobh_buffers(page, head); |
2569 | page_zero_new_buffers(page, from, to); | 2569 | page_zero_new_buffers(page, from, to); |
2570 | 2570 | ||
2571 | out_release: | 2571 | out_release: |
2572 | unlock_page(page); | 2572 | unlock_page(page); |
2573 | page_cache_release(page); | 2573 | page_cache_release(page); |
2574 | *pagep = NULL; | 2574 | *pagep = NULL; |
2575 | 2575 | ||
2576 | return ret; | 2576 | return ret; |
2577 | } | 2577 | } |
2578 | EXPORT_SYMBOL(nobh_write_begin); | 2578 | EXPORT_SYMBOL(nobh_write_begin); |
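A filesystem that uses the nobh path normally wires nobh_write_begin() into its address_space_operations together with its own get_block_t callback. A rough sketch follows; myfs_get_block and myfs_truncate_failed_write are hypothetical stand-ins, and the truncate-on-failure duty comes from the comment above this function:

	static int myfs_write_begin(struct file *file, struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned flags,
				    struct page **pagep, void **fsdata)
	{
		int ret;

		ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
				       myfs_get_block);
		if (ret)
			/* trim any blocks instantiated beyond i_size */
			myfs_truncate_failed_write(mapping->host, pos + len);
		return ret;
	}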
2579 | 2579 | ||
2580 | int nobh_write_end(struct file *file, struct address_space *mapping, | 2580 | int nobh_write_end(struct file *file, struct address_space *mapping, |
2581 | loff_t pos, unsigned len, unsigned copied, | 2581 | loff_t pos, unsigned len, unsigned copied, |
2582 | struct page *page, void *fsdata) | 2582 | struct page *page, void *fsdata) |
2583 | { | 2583 | { |
2584 | struct inode *inode = page->mapping->host; | 2584 | struct inode *inode = page->mapping->host; |
2585 | struct buffer_head *head = fsdata; | 2585 | struct buffer_head *head = fsdata; |
2586 | struct buffer_head *bh; | 2586 | struct buffer_head *bh; |
2587 | BUG_ON(fsdata != NULL && page_has_buffers(page)); | 2587 | BUG_ON(fsdata != NULL && page_has_buffers(page)); |
2588 | 2588 | ||
2589 | if (unlikely(copied < len) && head) | 2589 | if (unlikely(copied < len) && head) |
2590 | attach_nobh_buffers(page, head); | 2590 | attach_nobh_buffers(page, head); |
2591 | if (page_has_buffers(page)) | 2591 | if (page_has_buffers(page)) |
2592 | return generic_write_end(file, mapping, pos, len, | 2592 | return generic_write_end(file, mapping, pos, len, |
2593 | copied, page, fsdata); | 2593 | copied, page, fsdata); |
2594 | 2594 | ||
2595 | SetPageUptodate(page); | 2595 | SetPageUptodate(page); |
2596 | set_page_dirty(page); | 2596 | set_page_dirty(page); |
2597 | if (pos+copied > inode->i_size) { | 2597 | if (pos+copied > inode->i_size) { |
2598 | i_size_write(inode, pos+copied); | 2598 | i_size_write(inode, pos+copied); |
2599 | mark_inode_dirty(inode); | 2599 | mark_inode_dirty(inode); |
2600 | } | 2600 | } |
2601 | 2601 | ||
2602 | unlock_page(page); | 2602 | unlock_page(page); |
2603 | page_cache_release(page); | 2603 | page_cache_release(page); |
2604 | 2604 | ||
2605 | while (head) { | 2605 | while (head) { |
2606 | bh = head; | 2606 | bh = head; |
2607 | head = head->b_this_page; | 2607 | head = head->b_this_page; |
2608 | free_buffer_head(bh); | 2608 | free_buffer_head(bh); |
2609 | } | 2609 | } |
2610 | 2610 | ||
2611 | return copied; | 2611 | return copied; |
2612 | } | 2612 | } |
2613 | EXPORT_SYMBOL(nobh_write_end); | 2613 | EXPORT_SYMBOL(nobh_write_end); |
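The matching ->write_end simply forwards to nobh_write_end(), which either hands off to generic_write_end() (if buffers ended up attached) or frees the buffer_heads stashed in *fsdata by nobh_write_begin(). A minimal sketch with the same hypothetical myfs naming:

	static int myfs_write_end(struct file *file, struct address_space *mapping,
				  loff_t pos, unsigned len, unsigned copied,
				  struct page *page, void *fsdata)
	{
		return nobh_write_end(file, mapping, pos, len, copied, page, fsdata);
	}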
2614 | 2614 | ||
2615 | /* | 2615 | /* |
2616 | * nobh_writepage() - based on block_write_full_page() except | 2616 | * nobh_writepage() - based on block_write_full_page() except |
2617 | * that it tries to operate without attaching bufferheads to | 2617 | * that it tries to operate without attaching bufferheads to |
2618 | * the page. | 2618 | * the page. |
2619 | */ | 2619 | */ |
2620 | int nobh_writepage(struct page *page, get_block_t *get_block, | 2620 | int nobh_writepage(struct page *page, get_block_t *get_block, |
2621 | struct writeback_control *wbc) | 2621 | struct writeback_control *wbc) |
2622 | { | 2622 | { |
2623 | struct inode * const inode = page->mapping->host; | 2623 | struct inode * const inode = page->mapping->host; |
2624 | loff_t i_size = i_size_read(inode); | 2624 | loff_t i_size = i_size_read(inode); |
2625 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; | 2625 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; |
2626 | unsigned offset; | 2626 | unsigned offset; |
2627 | int ret; | 2627 | int ret; |
2628 | 2628 | ||
2629 | /* Is the page fully inside i_size? */ | 2629 | /* Is the page fully inside i_size? */ |
2630 | if (page->index < end_index) | 2630 | if (page->index < end_index) |
2631 | goto out; | 2631 | goto out; |
2632 | 2632 | ||
2633 | /* Is the page fully outside i_size? (truncate in progress) */ | 2633 | /* Is the page fully outside i_size? (truncate in progress) */ |
2634 | offset = i_size & (PAGE_CACHE_SIZE-1); | 2634 | offset = i_size & (PAGE_CACHE_SIZE-1); |
2635 | if (page->index >= end_index+1 || !offset) { | 2635 | if (page->index >= end_index+1 || !offset) { |
2636 | /* | 2636 | /* |
2637 | * The page may have dirty, unmapped buffers. For example, | 2637 | * The page may have dirty, unmapped buffers. For example, |
2638 | * they may have been added in ext3_writepage(). Make them | 2638 | * they may have been added in ext3_writepage(). Make them |
2639 | * freeable here, so the page does not leak. | 2639 | * freeable here, so the page does not leak. |
2640 | */ | 2640 | */ |
2641 | #if 0 | 2641 | #if 0 |
2642 | /* Not really sure about this - do we need this ? */ | 2642 | /* Not really sure about this - do we need this ? */ |
2643 | if (page->mapping->a_ops->invalidatepage) | 2643 | if (page->mapping->a_ops->invalidatepage) |
2644 | page->mapping->a_ops->invalidatepage(page, offset); | 2644 | page->mapping->a_ops->invalidatepage(page, offset); |
2645 | #endif | 2645 | #endif |
2646 | unlock_page(page); | 2646 | unlock_page(page); |
2647 | return 0; /* don't care */ | 2647 | return 0; /* don't care */ |
2648 | } | 2648 | } |
2649 | 2649 | ||
2650 | /* | 2650 | /* |
2651 | * The page straddles i_size. It must be zeroed out on each and every | 2651 | * The page straddles i_size. It must be zeroed out on each and every |
2652 | * writepage invocation because it may be mmapped. "A file is mapped | 2652 | * writepage invocation because it may be mmapped. "A file is mapped |
2653 | * in multiples of the page size. For a file that is not a multiple of | 2653 | * in multiples of the page size. For a file that is not a multiple of |
2654 | * the page size, the remaining memory is zeroed when mapped, and | 2654 | * the page size, the remaining memory is zeroed when mapped, and |
2655 | * writes to that region are not written out to the file." | 2655 | * writes to that region are not written out to the file." |
2656 | */ | 2656 | */ |
2657 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); | 2657 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); |
2658 | out: | 2658 | out: |
2659 | ret = mpage_writepage(page, get_block, wbc); | 2659 | ret = mpage_writepage(page, get_block, wbc); |
2660 | if (ret == -EAGAIN) | 2660 | if (ret == -EAGAIN) |
2661 | ret = __block_write_full_page(inode, page, get_block, wbc, | 2661 | ret = __block_write_full_page(inode, page, get_block, wbc, |
2662 | end_buffer_async_write); | 2662 | end_buffer_async_write); |
2663 | return ret; | 2663 | return ret; |
2664 | } | 2664 | } |
2665 | EXPORT_SYMBOL(nobh_writepage); | 2665 | EXPORT_SYMBOL(nobh_writepage); |
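Writeback on such a filesystem routes through the same get_block callback; a sketch of the corresponding ->writepage, again with the hypothetical myfs_get_block:

	static int myfs_writepage(struct page *page, struct writeback_control *wbc)
	{
		return nobh_writepage(page, myfs_get_block, wbc);
	}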
2666 | 2666 | ||
2667 | int nobh_truncate_page(struct address_space *mapping, | 2667 | int nobh_truncate_page(struct address_space *mapping, |
2668 | loff_t from, get_block_t *get_block) | 2668 | loff_t from, get_block_t *get_block) |
2669 | { | 2669 | { |
2670 | pgoff_t index = from >> PAGE_CACHE_SHIFT; | 2670 | pgoff_t index = from >> PAGE_CACHE_SHIFT; |
2671 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 2671 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
2672 | unsigned blocksize; | 2672 | unsigned blocksize; |
2673 | sector_t iblock; | 2673 | sector_t iblock; |
2674 | unsigned length, pos; | 2674 | unsigned length, pos; |
2675 | struct inode *inode = mapping->host; | 2675 | struct inode *inode = mapping->host; |
2676 | struct page *page; | 2676 | struct page *page; |
2677 | struct buffer_head map_bh; | 2677 | struct buffer_head map_bh; |
2678 | int err; | 2678 | int err; |
2679 | 2679 | ||
2680 | blocksize = 1 << inode->i_blkbits; | 2680 | blocksize = 1 << inode->i_blkbits; |
2681 | length = offset & (blocksize - 1); | 2681 | length = offset & (blocksize - 1); |
2682 | 2682 | ||
2683 | /* Block boundary? Nothing to do */ | 2683 | /* Block boundary? Nothing to do */ |
2684 | if (!length) | 2684 | if (!length) |
2685 | return 0; | 2685 | return 0; |
2686 | 2686 | ||
2687 | length = blocksize - length; | 2687 | length = blocksize - length; |
2688 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 2688 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2689 | 2689 | ||
2690 | page = grab_cache_page(mapping, index); | 2690 | page = grab_cache_page(mapping, index); |
2691 | err = -ENOMEM; | 2691 | err = -ENOMEM; |
2692 | if (!page) | 2692 | if (!page) |
2693 | goto out; | 2693 | goto out; |
2694 | 2694 | ||
2695 | if (page_has_buffers(page)) { | 2695 | if (page_has_buffers(page)) { |
2696 | has_buffers: | 2696 | has_buffers: |
2697 | unlock_page(page); | 2697 | unlock_page(page); |
2698 | page_cache_release(page); | 2698 | page_cache_release(page); |
2699 | return block_truncate_page(mapping, from, get_block); | 2699 | return block_truncate_page(mapping, from, get_block); |
2700 | } | 2700 | } |
2701 | 2701 | ||
2702 | /* Find the buffer that contains "offset" */ | 2702 | /* Find the buffer that contains "offset" */ |
2703 | pos = blocksize; | 2703 | pos = blocksize; |
2704 | while (offset >= pos) { | 2704 | while (offset >= pos) { |
2705 | iblock++; | 2705 | iblock++; |
2706 | pos += blocksize; | 2706 | pos += blocksize; |
2707 | } | 2707 | } |
2708 | 2708 | ||
2709 | map_bh.b_size = blocksize; | 2709 | map_bh.b_size = blocksize; |
2710 | map_bh.b_state = 0; | 2710 | map_bh.b_state = 0; |
2711 | err = get_block(inode, iblock, &map_bh, 0); | 2711 | err = get_block(inode, iblock, &map_bh, 0); |
2712 | if (err) | 2712 | if (err) |
2713 | goto unlock; | 2713 | goto unlock; |
2714 | /* unmapped? It's a hole - nothing to do */ | 2714 | /* unmapped? It's a hole - nothing to do */ |
2715 | if (!buffer_mapped(&map_bh)) | 2715 | if (!buffer_mapped(&map_bh)) |
2716 | goto unlock; | 2716 | goto unlock; |
2717 | 2717 | ||
2718 | /* Ok, it's mapped. Make sure it's up-to-date */ | 2718 | /* Ok, it's mapped. Make sure it's up-to-date */ |
2719 | if (!PageUptodate(page)) { | 2719 | if (!PageUptodate(page)) { |
2720 | err = mapping->a_ops->readpage(NULL, page); | 2720 | err = mapping->a_ops->readpage(NULL, page); |
2721 | if (err) { | 2721 | if (err) { |
2722 | page_cache_release(page); | 2722 | page_cache_release(page); |
2723 | goto out; | 2723 | goto out; |
2724 | } | 2724 | } |
2725 | lock_page(page); | 2725 | lock_page(page); |
2726 | if (!PageUptodate(page)) { | 2726 | if (!PageUptodate(page)) { |
2727 | err = -EIO; | 2727 | err = -EIO; |
2728 | goto unlock; | 2728 | goto unlock; |
2729 | } | 2729 | } |
2730 | if (page_has_buffers(page)) | 2730 | if (page_has_buffers(page)) |
2731 | goto has_buffers; | 2731 | goto has_buffers; |
2732 | } | 2732 | } |
2733 | zero_user(page, offset, length); | 2733 | zero_user(page, offset, length); |
2734 | set_page_dirty(page); | 2734 | set_page_dirty(page); |
2735 | err = 0; | 2735 | err = 0; |
2736 | 2736 | ||
2737 | unlock: | 2737 | unlock: |
2738 | unlock_page(page); | 2738 | unlock_page(page); |
2739 | page_cache_release(page); | 2739 | page_cache_release(page); |
2740 | out: | 2740 | out: |
2741 | return err; | 2741 | return err; |
2742 | } | 2742 | } |
2743 | EXPORT_SYMBOL(nobh_truncate_page); | 2743 | EXPORT_SYMBOL(nobh_truncate_page); |
2744 | 2744 | ||
2745 | int block_truncate_page(struct address_space *mapping, | 2745 | int block_truncate_page(struct address_space *mapping, |
2746 | loff_t from, get_block_t *get_block) | 2746 | loff_t from, get_block_t *get_block) |
2747 | { | 2747 | { |
2748 | pgoff_t index = from >> PAGE_CACHE_SHIFT; | 2748 | pgoff_t index = from >> PAGE_CACHE_SHIFT; |
2749 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 2749 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
2750 | unsigned blocksize; | 2750 | unsigned blocksize; |
2751 | sector_t iblock; | 2751 | sector_t iblock; |
2752 | unsigned length, pos; | 2752 | unsigned length, pos; |
2753 | struct inode *inode = mapping->host; | 2753 | struct inode *inode = mapping->host; |
2754 | struct page *page; | 2754 | struct page *page; |
2755 | struct buffer_head *bh; | 2755 | struct buffer_head *bh; |
2756 | int err; | 2756 | int err; |
2757 | 2757 | ||
2758 | blocksize = 1 << inode->i_blkbits; | 2758 | blocksize = 1 << inode->i_blkbits; |
2759 | length = offset & (blocksize - 1); | 2759 | length = offset & (blocksize - 1); |
2760 | 2760 | ||
2761 | /* Block boundary? Nothing to do */ | 2761 | /* Block boundary? Nothing to do */ |
2762 | if (!length) | 2762 | if (!length) |
2763 | return 0; | 2763 | return 0; |
2764 | 2764 | ||
2765 | length = blocksize - length; | 2765 | length = blocksize - length; |
2766 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 2766 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2767 | 2767 | ||
2768 | page = grab_cache_page(mapping, index); | 2768 | page = grab_cache_page(mapping, index); |
2769 | err = -ENOMEM; | 2769 | err = -ENOMEM; |
2770 | if (!page) | 2770 | if (!page) |
2771 | goto out; | 2771 | goto out; |
2772 | 2772 | ||
2773 | if (!page_has_buffers(page)) | 2773 | if (!page_has_buffers(page)) |
2774 | create_empty_buffers(page, blocksize, 0); | 2774 | create_empty_buffers(page, blocksize, 0); |
2775 | 2775 | ||
2776 | /* Find the buffer that contains "offset" */ | 2776 | /* Find the buffer that contains "offset" */ |
2777 | bh = page_buffers(page); | 2777 | bh = page_buffers(page); |
2778 | pos = blocksize; | 2778 | pos = blocksize; |
2779 | while (offset >= pos) { | 2779 | while (offset >= pos) { |
2780 | bh = bh->b_this_page; | 2780 | bh = bh->b_this_page; |
2781 | iblock++; | 2781 | iblock++; |
2782 | pos += blocksize; | 2782 | pos += blocksize; |
2783 | } | 2783 | } |
2784 | 2784 | ||
2785 | err = 0; | 2785 | err = 0; |
2786 | if (!buffer_mapped(bh)) { | 2786 | if (!buffer_mapped(bh)) { |
2787 | WARN_ON(bh->b_size != blocksize); | 2787 | WARN_ON(bh->b_size != blocksize); |
2788 | err = get_block(inode, iblock, bh, 0); | 2788 | err = get_block(inode, iblock, bh, 0); |
2789 | if (err) | 2789 | if (err) |
2790 | goto unlock; | 2790 | goto unlock; |
2791 | /* unmapped? It's a hole - nothing to do */ | 2791 | /* unmapped? It's a hole - nothing to do */ |
2792 | if (!buffer_mapped(bh)) | 2792 | if (!buffer_mapped(bh)) |
2793 | goto unlock; | 2793 | goto unlock; |
2794 | } | 2794 | } |
2795 | 2795 | ||
2796 | /* Ok, it's mapped. Make sure it's up-to-date */ | 2796 | /* Ok, it's mapped. Make sure it's up-to-date */ |
2797 | if (PageUptodate(page)) | 2797 | if (PageUptodate(page)) |
2798 | set_buffer_uptodate(bh); | 2798 | set_buffer_uptodate(bh); |
2799 | 2799 | ||
2800 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { | 2800 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { |
2801 | err = -EIO; | 2801 | err = -EIO; |
2802 | ll_rw_block(READ, 1, &bh); | 2802 | ll_rw_block(READ, 1, &bh); |
2803 | wait_on_buffer(bh); | 2803 | wait_on_buffer(bh); |
2804 | /* Uhhuh. Read error. Complain and punt. */ | 2804 | /* Uhhuh. Read error. Complain and punt. */ |
2805 | if (!buffer_uptodate(bh)) | 2805 | if (!buffer_uptodate(bh)) |
2806 | goto unlock; | 2806 | goto unlock; |
2807 | } | 2807 | } |
2808 | 2808 | ||
2809 | zero_user(page, offset, length); | 2809 | zero_user(page, offset, length); |
2810 | mark_buffer_dirty(bh); | 2810 | mark_buffer_dirty(bh); |
2811 | err = 0; | 2811 | err = 0; |
2812 | 2812 | ||
2813 | unlock: | 2813 | unlock: |
2814 | unlock_page(page); | 2814 | unlock_page(page); |
2815 | page_cache_release(page); | 2815 | page_cache_release(page); |
2816 | out: | 2816 | out: |
2817 | return err; | 2817 | return err; |
2818 | } | 2818 | } |
2819 | EXPORT_SYMBOL(block_truncate_page); | 2819 | EXPORT_SYMBOL(block_truncate_page); |
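In a conventional buffer-head filesystem, the truncate path typically uses this helper to zero the tail of the final, partially covered block before i_size is cut back. A hedged sketch, with myfs_get_block again standing in for the filesystem's real callback:

	/* Zero the bytes of the last block beyond the new size. A no-op when
	 * newsize already sits on a block boundary. */
	static int myfs_zero_tail(struct inode *inode, loff_t newsize)
	{
		return block_truncate_page(inode->i_mapping, newsize, myfs_get_block);
	}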
2820 | 2820 | ||
2821 | /* | 2821 | /* |
2822 | * The generic ->writepage function for buffer-backed address_spaces; | 2822 | * The generic ->writepage function for buffer-backed address_spaces; |
2823 | * this form passes in the end_io handler used to finish the IO. | 2823 | * this form passes in the end_io handler used to finish the IO. |
2824 | */ | 2824 | */ |
2825 | int block_write_full_page_endio(struct page *page, get_block_t *get_block, | 2825 | int block_write_full_page_endio(struct page *page, get_block_t *get_block, |
2826 | struct writeback_control *wbc, bh_end_io_t *handler) | 2826 | struct writeback_control *wbc, bh_end_io_t *handler) |
2827 | { | 2827 | { |
2828 | struct inode * const inode = page->mapping->host; | 2828 | struct inode * const inode = page->mapping->host; |
2829 | loff_t i_size = i_size_read(inode); | 2829 | loff_t i_size = i_size_read(inode); |
2830 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; | 2830 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; |
2831 | unsigned offset; | 2831 | unsigned offset; |
2832 | 2832 | ||
2833 | /* Is the page fully inside i_size? */ | 2833 | /* Is the page fully inside i_size? */ |
2834 | if (page->index < end_index) | 2834 | if (page->index < end_index) |
2835 | return __block_write_full_page(inode, page, get_block, wbc, | 2835 | return __block_write_full_page(inode, page, get_block, wbc, |
2836 | handler); | 2836 | handler); |
2837 | 2837 | ||
2838 | /* Is the page fully outside i_size? (truncate in progress) */ | 2838 | /* Is the page fully outside i_size? (truncate in progress) */ |
2839 | offset = i_size & (PAGE_CACHE_SIZE-1); | 2839 | offset = i_size & (PAGE_CACHE_SIZE-1); |
2840 | if (page->index >= end_index+1 || !offset) { | 2840 | if (page->index >= end_index+1 || !offset) { |
2841 | /* | 2841 | /* |
2842 | * The page may have dirty, unmapped buffers. For example, | 2842 | * The page may have dirty, unmapped buffers. For example, |
2843 | * they may have been added in ext3_writepage(). Make them | 2843 | * they may have been added in ext3_writepage(). Make them |
2844 | * freeable here, so the page does not leak. | 2844 | * freeable here, so the page does not leak. |
2845 | */ | 2845 | */ |
2846 | do_invalidatepage(page, 0); | 2846 | do_invalidatepage(page, 0); |
2847 | unlock_page(page); | 2847 | unlock_page(page); |
2848 | return 0; /* don't care */ | 2848 | return 0; /* don't care */ |
2849 | } | 2849 | } |
2850 | 2850 | ||
2851 | /* | 2851 | /* |
2852 | * The page straddles i_size. It must be zeroed out on each and every | 2852 | * The page straddles i_size. It must be zeroed out on each and every |
2853 | * writepage invocation because it may be mmapped. "A file is mapped | 2853 | * writepage invocation because it may be mmapped. "A file is mapped |
2854 | * in multiples of the page size. For a file that is not a multiple of | 2854 | * in multiples of the page size. For a file that is not a multiple of |
2855 | * the page size, the remaining memory is zeroed when mapped, and | 2855 | * the page size, the remaining memory is zeroed when mapped, and |
2856 | * writes to that region are not written out to the file." | 2856 | * writes to that region are not written out to the file." |
2857 | */ | 2857 | */ |
2858 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); | 2858 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); |
2859 | return __block_write_full_page(inode, page, get_block, wbc, handler); | 2859 | return __block_write_full_page(inode, page, get_block, wbc, handler); |
2860 | } | 2860 | } |
2861 | EXPORT_SYMBOL(block_write_full_page_endio); | 2861 | EXPORT_SYMBOL(block_write_full_page_endio); |
2862 | 2862 | ||
2863 | /* | 2863 | /* |
2864 | * The generic ->writepage function for buffer-backed address_spaces | 2864 | * The generic ->writepage function for buffer-backed address_spaces |
2865 | */ | 2865 | */ |
2866 | int block_write_full_page(struct page *page, get_block_t *get_block, | 2866 | int block_write_full_page(struct page *page, get_block_t *get_block, |
2867 | struct writeback_control *wbc) | 2867 | struct writeback_control *wbc) |
2868 | { | 2868 | { |
2869 | return block_write_full_page_endio(page, get_block, wbc, | 2869 | return block_write_full_page_endio(page, get_block, wbc, |
2870 | end_buffer_async_write); | 2870 | end_buffer_async_write); |
2871 | } | 2871 | } |
2872 | EXPORT_SYMBOL(block_write_full_page); | 2872 | EXPORT_SYMBOL(block_write_full_page); |
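A buffer-head based filesystem that needs no special completion handler can use this directly as its ->writepage; one that does would call block_write_full_page_endio() with its own bh_end_io_t instead. Sketch, with a hypothetical examplefs_get_block callback:

	static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
	{
		return block_write_full_page(page, examplefs_get_block, wbc);
	}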
2873 | 2873 | ||
2874 | sector_t generic_block_bmap(struct address_space *mapping, sector_t block, | 2874 | sector_t generic_block_bmap(struct address_space *mapping, sector_t block, |
2875 | get_block_t *get_block) | 2875 | get_block_t *get_block) |
2876 | { | 2876 | { |
2877 | struct buffer_head tmp; | 2877 | struct buffer_head tmp; |
2878 | struct inode *inode = mapping->host; | 2878 | struct inode *inode = mapping->host; |
2879 | tmp.b_state = 0; | 2879 | tmp.b_state = 0; |
2880 | tmp.b_blocknr = 0; | 2880 | tmp.b_blocknr = 0; |
2881 | tmp.b_size = 1 << inode->i_blkbits; | 2881 | tmp.b_size = 1 << inode->i_blkbits; |
2882 | get_block(inode, block, &tmp, 0); | 2882 | get_block(inode, block, &tmp, 0); |
2883 | return tmp.b_blocknr; | 2883 | return tmp.b_blocknr; |
2884 | } | 2884 | } |
2885 | EXPORT_SYMBOL(generic_block_bmap); | 2885 | EXPORT_SYMBOL(generic_block_bmap); |
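FIBMAP support is usually just a thin wrapper around this helper; note that it returns 0 for holes (and silently for mapping errors), since get_block() is called without create. Sketch, again assuming a hypothetical examplefs_get_block:

	static sector_t examplefs_bmap(struct address_space *mapping, sector_t block)
	{
		return generic_block_bmap(mapping, block, examplefs_get_block);
	}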
2886 | 2886 | ||
2887 | static void end_bio_bh_io_sync(struct bio *bio, int err) | 2887 | static void end_bio_bh_io_sync(struct bio *bio, int err) |
2888 | { | 2888 | { |
2889 | struct buffer_head *bh = bio->bi_private; | 2889 | struct buffer_head *bh = bio->bi_private; |
2890 | 2890 | ||
2891 | if (err == -EOPNOTSUPP) { | 2891 | if (err == -EOPNOTSUPP) { |
2892 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | 2892 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); |
2893 | set_bit(BH_Eopnotsupp, &bh->b_state); | 2893 | set_bit(BH_Eopnotsupp, &bh->b_state); |
2894 | } | 2894 | } |
2895 | 2895 | ||
2896 | if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) | 2896 | if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) |
2897 | set_bit(BH_Quiet, &bh->b_state); | 2897 | set_bit(BH_Quiet, &bh->b_state); |
2898 | 2898 | ||
2899 | bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); | 2899 | bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); |
2900 | bio_put(bio); | 2900 | bio_put(bio); |
2901 | } | 2901 | } |
2902 | 2902 | ||
2903 | int submit_bh(int rw, struct buffer_head * bh) | 2903 | int submit_bh(int rw, struct buffer_head * bh) |
2904 | { | 2904 | { |
2905 | struct bio *bio; | 2905 | struct bio *bio; |
2906 | int ret = 0; | 2906 | int ret = 0; |
2907 | 2907 | ||
2908 | BUG_ON(!buffer_locked(bh)); | 2908 | BUG_ON(!buffer_locked(bh)); |
2909 | BUG_ON(!buffer_mapped(bh)); | 2909 | BUG_ON(!buffer_mapped(bh)); |
2910 | BUG_ON(!bh->b_end_io); | 2910 | BUG_ON(!bh->b_end_io); |
2911 | BUG_ON(buffer_delay(bh)); | 2911 | BUG_ON(buffer_delay(bh)); |
2912 | BUG_ON(buffer_unwritten(bh)); | 2912 | BUG_ON(buffer_unwritten(bh)); |
2913 | 2913 | ||
2914 | /* | 2914 | /* |
2915 | * Mask in barrier bit for a write (could be either a WRITE or a | ||
2916 | * WRITE_SYNC | ||
2917 | */ | ||
2918 | if (buffer_ordered(bh) && (rw & WRITE)) | ||
2919 | rw |= WRITE_BARRIER; | ||
2920 | |||
2921 | /* | ||
2922 | * Only clear out a write error when rewriting | 2915 | * Only clear out a write error when rewriting |
2923 | */ | 2916 | */ |
2924 | if (test_set_buffer_req(bh) && (rw & WRITE)) | 2917 | if (test_set_buffer_req(bh) && (rw & WRITE)) |
2925 | clear_buffer_write_io_error(bh); | 2918 | clear_buffer_write_io_error(bh); |
2926 | 2919 | ||
2927 | /* | 2920 | /* |
2928 | * from here on down, it's all bio -- do the initial mapping, | 2921 | * from here on down, it's all bio -- do the initial mapping, |
2929 | * submit_bio -> generic_make_request may further map this bio around | 2922 | * submit_bio -> generic_make_request may further map this bio around |
2930 | */ | 2923 | */ |
2931 | bio = bio_alloc(GFP_NOIO, 1); | 2924 | bio = bio_alloc(GFP_NOIO, 1); |
2932 | 2925 | ||
2933 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 2926 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
2934 | bio->bi_bdev = bh->b_bdev; | 2927 | bio->bi_bdev = bh->b_bdev; |
2935 | bio->bi_io_vec[0].bv_page = bh->b_page; | 2928 | bio->bi_io_vec[0].bv_page = bh->b_page; |
2936 | bio->bi_io_vec[0].bv_len = bh->b_size; | 2929 | bio->bi_io_vec[0].bv_len = bh->b_size; |
2937 | bio->bi_io_vec[0].bv_offset = bh_offset(bh); | 2930 | bio->bi_io_vec[0].bv_offset = bh_offset(bh); |
2938 | 2931 | ||
2939 | bio->bi_vcnt = 1; | 2932 | bio->bi_vcnt = 1; |
2940 | bio->bi_idx = 0; | 2933 | bio->bi_idx = 0; |
2941 | bio->bi_size = bh->b_size; | 2934 | bio->bi_size = bh->b_size; |
2942 | 2935 | ||
2943 | bio->bi_end_io = end_bio_bh_io_sync; | 2936 | bio->bi_end_io = end_bio_bh_io_sync; |
2944 | bio->bi_private = bh; | 2937 | bio->bi_private = bh; |
2945 | 2938 | ||
2946 | bio_get(bio); | 2939 | bio_get(bio); |
2947 | submit_bio(rw, bio); | 2940 | submit_bio(rw, bio); |
2948 | 2941 | ||
2949 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 2942 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) |
2950 | ret = -EOPNOTSUPP; | 2943 | ret = -EOPNOTSUPP; |
2951 | 2944 | ||
2952 | bio_put(bio); | 2945 | bio_put(bio); |
2953 | return ret; | 2946 | return ret; |
2954 | } | 2947 | } |
2955 | EXPORT_SYMBOL(submit_bh); | 2948 | EXPORT_SYMBOL(submit_bh); |
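With the buffer_ordered() test removed above, submit_bh() now issues exactly the rw flags it is given, so any barrier or sync behaviour must be encoded by the caller. A minimal synchronous write of a locked, dirty, mapped buffer follows the pattern already used elsewhere in this file:

	lock_buffer(bh);
	if (test_clear_buffer_dirty(bh)) {
		get_bh(bh);
		bh->b_end_io = end_buffer_write_sync;
		submit_bh(WRITE_SYNC, bh);	/* or WRITE, WRITE_BARRIER, ... */
		wait_on_buffer(bh);
	} else {
		unlock_buffer(bh);
	}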
2956 | 2949 | ||
2957 | /** | 2950 | /** |
2958 | * ll_rw_block: low-level access to block devices (DEPRECATED) | 2951 | * ll_rw_block: low-level access to block devices (DEPRECATED) |
2959 | * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) | 2952 | * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) |
2960 | * @nr: number of &struct buffer_heads in the array | 2953 | * @nr: number of &struct buffer_heads in the array |
2961 | * @bhs: array of pointers to &struct buffer_head | 2954 | * @bhs: array of pointers to &struct buffer_head |
2962 | * | 2955 | * |
2963 | * ll_rw_block() takes an array of pointers to &struct buffer_heads, and | 2956 | * ll_rw_block() takes an array of pointers to &struct buffer_heads, and |
2964 | * requests an I/O operation on them, either a %READ or a %WRITE. The third | 2957 | * requests an I/O operation on them, either a %READ or a %WRITE. The third |
2965 | * %SWRITE is like %WRITE only we make sure that the *current* data in buffers | 2958 | * %SWRITE is like %WRITE only we make sure that the *current* data in buffers |
2966 | * are sent to disk. The fourth %READA option is described in the documentation | 2959 | * are sent to disk. The fourth %READA option is described in the documentation |
2967 | * for generic_make_request() which ll_rw_block() calls. | 2960 | * for generic_make_request() which ll_rw_block() calls. |
2968 | * | 2961 | * |
2969 | * This function drops any buffer that it cannot get a lock on (with the | 2962 | * This function drops any buffer that it cannot get a lock on (with the |
2970 | * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be | 2963 | * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be |
2971 | * clean when doing a write request, and any buffer that appears to be | 2964 | * clean when doing a write request, and any buffer that appears to be |
2972 | * up-to-date when doing read request. Further it marks as clean buffers that | 2965 | * up-to-date when doing read request. Further it marks as clean buffers that |
2973 | * are processed for writing (the buffer cache won't assume that they are | 2966 | * are processed for writing (the buffer cache won't assume that they are |
2974 | * actually clean until the buffer gets unlocked). | 2967 | * actually clean until the buffer gets unlocked). |
2975 | * | 2968 | * |
2976 | * ll_rw_block sets b_end_io to simple completion handler that marks | 2969 | * ll_rw_block sets b_end_io to simple completion handler that marks |
2977 | * the buffer up-to-date (if appropriate), unlocks the buffer and wakes | 2970 | * the buffer up-to-date (if appropriate), unlocks the buffer and wakes |
2978 | * any waiters. | 2971 | * any waiters. |
2979 | * | 2972 | * |
2980 | * All of the buffers must be for the same device, and must also be a | 2973 | * All of the buffers must be for the same device, and must also be a |
2981 | * multiple of the current approved size for the device. | 2974 | * multiple of the current approved size for the device. |
2982 | */ | 2975 | */ |
2983 | void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) | 2976 | void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) |
2984 | { | 2977 | { |
2985 | int i; | 2978 | int i; |
2986 | 2979 | ||
2987 | for (i = 0; i < nr; i++) { | 2980 | for (i = 0; i < nr; i++) { |
2988 | struct buffer_head *bh = bhs[i]; | 2981 | struct buffer_head *bh = bhs[i]; |
2989 | 2982 | ||
2990 | if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG) | 2983 | if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG) |
2991 | lock_buffer(bh); | 2984 | lock_buffer(bh); |
2992 | else if (!trylock_buffer(bh)) | 2985 | else if (!trylock_buffer(bh)) |
2993 | continue; | 2986 | continue; |
2994 | 2987 | ||
2995 | if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC || | 2988 | if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC || |
2996 | rw == SWRITE_SYNC_PLUG) { | 2989 | rw == SWRITE_SYNC_PLUG) { |
2997 | if (test_clear_buffer_dirty(bh)) { | 2990 | if (test_clear_buffer_dirty(bh)) { |
2998 | bh->b_end_io = end_buffer_write_sync; | 2991 | bh->b_end_io = end_buffer_write_sync; |
2999 | get_bh(bh); | 2992 | get_bh(bh); |
3000 | if (rw == SWRITE_SYNC) | 2993 | if (rw == SWRITE_SYNC) |
3001 | submit_bh(WRITE_SYNC, bh); | 2994 | submit_bh(WRITE_SYNC, bh); |
3002 | else | 2995 | else |
3003 | submit_bh(WRITE, bh); | 2996 | submit_bh(WRITE, bh); |
3004 | continue; | 2997 | continue; |
3005 | } | 2998 | } |
3006 | } else { | 2999 | } else { |
3007 | if (!buffer_uptodate(bh)) { | 3000 | if (!buffer_uptodate(bh)) { |
3008 | bh->b_end_io = end_buffer_read_sync; | 3001 | bh->b_end_io = end_buffer_read_sync; |
3009 | get_bh(bh); | 3002 | get_bh(bh); |
3010 | submit_bh(rw, bh); | 3003 | submit_bh(rw, bh); |
3011 | continue; | 3004 | continue; |
3012 | } | 3005 | } |
3013 | } | 3006 | } |
3014 | unlock_buffer(bh); | 3007 | unlock_buffer(bh); |
3015 | } | 3008 | } |
3016 | } | 3009 | } |
3017 | EXPORT_SYMBOL(ll_rw_block); | 3010 | EXPORT_SYMBOL(ll_rw_block); |
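A typical (if deprecated) use is opportunistic read-ahead: submit a batch of buffers and wait only for the one actually needed. Sketch, where bh and bh_ahead are hypothetical buffer_heads for adjacent blocks:

	struct buffer_head *bhs[2] = { bh, bh_ahead };

	ll_rw_block(READ, 2, bhs);
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh))
		return -EIO;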
3018 | 3011 | ||
3019 | /* | 3012 | /* |
3020 | * For a data-integrity writeout, we need to wait upon any in-progress I/O | 3013 | * For a data-integrity writeout, we need to wait upon any in-progress I/O |
3021 | * and then start new I/O and then wait upon it. The caller must have a ref on | 3014 | * and then start new I/O and then wait upon it. The caller must have a ref on |
3022 | * the buffer_head. | 3015 | * the buffer_head. |
3023 | */ | 3016 | */ |
3024 | int sync_dirty_buffer(struct buffer_head *bh) | 3017 | int __sync_dirty_buffer(struct buffer_head *bh, int rw) |
3025 | { | 3018 | { |
3026 | int ret = 0; | 3019 | int ret = 0; |
3027 | 3020 | ||
3028 | WARN_ON(atomic_read(&bh->b_count) < 1); | 3021 | WARN_ON(atomic_read(&bh->b_count) < 1); |
3029 | lock_buffer(bh); | 3022 | lock_buffer(bh); |
3030 | if (test_clear_buffer_dirty(bh)) { | 3023 | if (test_clear_buffer_dirty(bh)) { |
3031 | get_bh(bh); | 3024 | get_bh(bh); |
3032 | bh->b_end_io = end_buffer_write_sync; | 3025 | bh->b_end_io = end_buffer_write_sync; |
3033 | ret = submit_bh(WRITE_SYNC, bh); | 3026 | ret = submit_bh(rw, bh); |
3034 | wait_on_buffer(bh); | 3027 | wait_on_buffer(bh); |
3035 | if (buffer_eopnotsupp(bh)) { | 3028 | if (buffer_eopnotsupp(bh)) { |
3036 | clear_buffer_eopnotsupp(bh); | 3029 | clear_buffer_eopnotsupp(bh); |
3037 | ret = -EOPNOTSUPP; | 3030 | ret = -EOPNOTSUPP; |
3038 | } | 3031 | } |
3039 | if (!ret && !buffer_uptodate(bh)) | 3032 | if (!ret && !buffer_uptodate(bh)) |
3040 | ret = -EIO; | 3033 | ret = -EIO; |
3041 | } else { | 3034 | } else { |
3042 | unlock_buffer(bh); | 3035 | unlock_buffer(bh); |
3043 | } | 3036 | } |
3044 | return ret; | 3037 | return ret; |
3038 | } | ||
3039 | EXPORT_SYMBOL(__sync_dirty_buffer); | ||
3040 | |||
3041 | int sync_dirty_buffer(struct buffer_head *bh) | ||
3042 | { | ||
3043 | return __sync_dirty_buffer(bh, WRITE_SYNC); | ||
3045 | } | 3044 | } |
3046 | EXPORT_SYMBOL(sync_dirty_buffer); | 3045 | EXPORT_SYMBOL(sync_dirty_buffer); |
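This split is the point of the patch: sync_dirty_buffer() keeps its old WRITE_SYNC behaviour, while a caller that used to set BH_Ordered can now request the exact write type through __sync_dirty_buffer(). A hedged sketch of how a journalling commit-block write might use it (barrier_enabled is a hypothetical stand-in for the journal's barrier setting):

	if (barrier_enabled)
		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
	else
		ret = sync_dirty_buffer(bh);
	if (ret == -EOPNOTSUPP) {
		/* the device rejected the barrier: redirty and retry plainly */
		set_buffer_dirty(bh);
		ret = sync_dirty_buffer(bh);
	}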
3047 | 3046 | ||
3048 | /* | 3047 | /* |
3049 | * try_to_free_buffers() checks if all the buffers on this particular page | 3048 | * try_to_free_buffers() checks if all the buffers on this particular page |
3050 | * are unused, and releases them if so. | 3049 | * are unused, and releases them if so. |
3051 | * | 3050 | * |
3052 | * Exclusion against try_to_free_buffers may be obtained by either | 3051 | * Exclusion against try_to_free_buffers may be obtained by either |
3053 | * locking the page or by holding its mapping's private_lock. | 3052 | * locking the page or by holding its mapping's private_lock. |
3054 | * | 3053 | * |
3055 | * If the page is dirty but all the buffers are clean then we need to | 3054 | * If the page is dirty but all the buffers are clean then we need to |
3056 | * be sure to mark the page clean as well. This is because the page | 3055 | * be sure to mark the page clean as well. This is because the page |
3057 | * may be against a block device, and a later reattachment of buffers | 3056 | * may be against a block device, and a later reattachment of buffers |
3058 | * to a dirty page will set *all* buffers dirty. Which would corrupt | 3057 | * to a dirty page will set *all* buffers dirty. Which would corrupt |
3059 | * filesystem data on the same device. | 3058 | * filesystem data on the same device. |
3060 | * | 3059 | * |
3061 | * The same applies to regular filesystem pages: if all the buffers are | 3060 | * The same applies to regular filesystem pages: if all the buffers are |
3062 | * clean then we set the page clean and proceed. To do that, we require | 3061 | * clean then we set the page clean and proceed. To do that, we require |
3063 | * total exclusion from __set_page_dirty_buffers(). That is obtained with | 3062 | * total exclusion from __set_page_dirty_buffers(). That is obtained with |
3064 | * private_lock. | 3063 | * private_lock. |
3065 | * | 3064 | * |
3066 | * try_to_free_buffers() is non-blocking. | 3065 | * try_to_free_buffers() is non-blocking. |
3067 | */ | 3066 | */ |
3068 | static inline int buffer_busy(struct buffer_head *bh) | 3067 | static inline int buffer_busy(struct buffer_head *bh) |
3069 | { | 3068 | { |
3070 | return atomic_read(&bh->b_count) | | 3069 | return atomic_read(&bh->b_count) | |
3071 | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); | 3070 | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); |
3072 | } | 3071 | } |
3073 | 3072 | ||
3074 | static int | 3073 | static int |
3075 | drop_buffers(struct page *page, struct buffer_head **buffers_to_free) | 3074 | drop_buffers(struct page *page, struct buffer_head **buffers_to_free) |
3076 | { | 3075 | { |
3077 | struct buffer_head *head = page_buffers(page); | 3076 | struct buffer_head *head = page_buffers(page); |
3078 | struct buffer_head *bh; | 3077 | struct buffer_head *bh; |
3079 | 3078 | ||
3080 | bh = head; | 3079 | bh = head; |
3081 | do { | 3080 | do { |
3082 | if (buffer_write_io_error(bh) && page->mapping) | 3081 | if (buffer_write_io_error(bh) && page->mapping) |
3083 | set_bit(AS_EIO, &page->mapping->flags); | 3082 | set_bit(AS_EIO, &page->mapping->flags); |
3084 | if (buffer_busy(bh)) | 3083 | if (buffer_busy(bh)) |
3085 | goto failed; | 3084 | goto failed; |
3086 | bh = bh->b_this_page; | 3085 | bh = bh->b_this_page; |
3087 | } while (bh != head); | 3086 | } while (bh != head); |
3088 | 3087 | ||
3089 | do { | 3088 | do { |
3090 | struct buffer_head *next = bh->b_this_page; | 3089 | struct buffer_head *next = bh->b_this_page; |
3091 | 3090 | ||
3092 | if (bh->b_assoc_map) | 3091 | if (bh->b_assoc_map) |
3093 | __remove_assoc_queue(bh); | 3092 | __remove_assoc_queue(bh); |
3094 | bh = next; | 3093 | bh = next; |
3095 | } while (bh != head); | 3094 | } while (bh != head); |
3096 | *buffers_to_free = head; | 3095 | *buffers_to_free = head; |
3097 | __clear_page_buffers(page); | 3096 | __clear_page_buffers(page); |
3098 | return 1; | 3097 | return 1; |
3099 | failed: | 3098 | failed: |
3100 | return 0; | 3099 | return 0; |
3101 | } | 3100 | } |
3102 | 3101 | ||
3103 | int try_to_free_buffers(struct page *page) | 3102 | int try_to_free_buffers(struct page *page) |
3104 | { | 3103 | { |
3105 | struct address_space * const mapping = page->mapping; | 3104 | struct address_space * const mapping = page->mapping; |
3106 | struct buffer_head *buffers_to_free = NULL; | 3105 | struct buffer_head *buffers_to_free = NULL; |
3107 | int ret = 0; | 3106 | int ret = 0; |
3108 | 3107 | ||
3109 | BUG_ON(!PageLocked(page)); | 3108 | BUG_ON(!PageLocked(page)); |
3110 | if (PageWriteback(page)) | 3109 | if (PageWriteback(page)) |
3111 | return 0; | 3110 | return 0; |
3112 | 3111 | ||
3113 | if (mapping == NULL) { /* can this still happen? */ | 3112 | if (mapping == NULL) { /* can this still happen? */ |
3114 | ret = drop_buffers(page, &buffers_to_free); | 3113 | ret = drop_buffers(page, &buffers_to_free); |
3115 | goto out; | 3114 | goto out; |
3116 | } | 3115 | } |
3117 | 3116 | ||
3118 | spin_lock(&mapping->private_lock); | 3117 | spin_lock(&mapping->private_lock); |
3119 | ret = drop_buffers(page, &buffers_to_free); | 3118 | ret = drop_buffers(page, &buffers_to_free); |
3120 | 3119 | ||
3121 | /* | 3120 | /* |
3122 | * If the filesystem writes its buffers by hand (eg ext3) | 3121 | * If the filesystem writes its buffers by hand (eg ext3) |
3123 | * then we can have clean buffers against a dirty page. We | 3122 | * then we can have clean buffers against a dirty page. We |
3124 | * clean the page here; otherwise the VM will never notice | 3123 | * clean the page here; otherwise the VM will never notice |
3125 | * that the filesystem did any IO at all. | 3124 | * that the filesystem did any IO at all. |
3126 | * | 3125 | * |
3127 | * Also, during truncate, discard_buffer will have marked all | 3126 | * Also, during truncate, discard_buffer will have marked all |
3128 | * the page's buffers clean. We discover that here and clean | 3127 | * the page's buffers clean. We discover that here and clean |
3129 | * the page also. | 3128 | * the page also. |
3130 | * | 3129 | * |
3131 | * private_lock must be held over this entire operation in order | 3130 | * private_lock must be held over this entire operation in order |
3132 | * to synchronise against __set_page_dirty_buffers and prevent the | 3131 | * to synchronise against __set_page_dirty_buffers and prevent the |
3133 | * dirty bit from being lost. | 3132 | * dirty bit from being lost. |
3134 | */ | 3133 | */ |
3135 | if (ret) | 3134 | if (ret) |
3136 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 3135 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
3137 | spin_unlock(&mapping->private_lock); | 3136 | spin_unlock(&mapping->private_lock); |
3138 | out: | 3137 | out: |
3139 | if (buffers_to_free) { | 3138 | if (buffers_to_free) { |
3140 | struct buffer_head *bh = buffers_to_free; | 3139 | struct buffer_head *bh = buffers_to_free; |
3141 | 3140 | ||
3142 | do { | 3141 | do { |
3143 | struct buffer_head *next = bh->b_this_page; | 3142 | struct buffer_head *next = bh->b_this_page; |
3144 | free_buffer_head(bh); | 3143 | free_buffer_head(bh); |
3145 | bh = next; | 3144 | bh = next; |
3146 | } while (bh != buffers_to_free); | 3145 | } while (bh != buffers_to_free); |
3147 | } | 3146 | } |
3148 | return ret; | 3147 | return ret; |
3149 | } | 3148 | } |
3150 | EXPORT_SYMBOL(try_to_free_buffers); | 3149 | EXPORT_SYMBOL(try_to_free_buffers); |
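Filesystems without special buffer bookkeeping can expose this directly as their ->releasepage. Sketch (the gfp mask is unused here because, as noted above, try_to_free_buffers() never blocks):

	static int examplefs_releasepage(struct page *page, gfp_t gfp_mask)
	{
		return try_to_free_buffers(page);
	}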
3151 | 3150 | ||
3152 | void block_sync_page(struct page *page) | 3151 | void block_sync_page(struct page *page) |
3153 | { | 3152 | { |
3154 | struct address_space *mapping; | 3153 | struct address_space *mapping; |
3155 | 3154 | ||
3156 | smp_mb(); | 3155 | smp_mb(); |
3157 | mapping = page_mapping(page); | 3156 | mapping = page_mapping(page); |
3158 | if (mapping) | 3157 | if (mapping) |
3159 | blk_run_backing_dev(mapping->backing_dev_info, page); | 3158 | blk_run_backing_dev(mapping->backing_dev_info, page); |
3160 | } | 3159 | } |
3161 | EXPORT_SYMBOL(block_sync_page); | 3160 | EXPORT_SYMBOL(block_sync_page); |
3162 | 3161 | ||
3163 | /* | 3162 | /* |
3164 | * There are no bdflush tunables left. But distributions are | 3163 | * There are no bdflush tunables left. But distributions are |
3165 | * still running obsolete flush daemons, so we terminate them here. | 3164 | * still running obsolete flush daemons, so we terminate them here. |
3166 | * | 3165 | * |
3167 | * Use of bdflush() is deprecated and will be removed in a future kernel. | 3166 | * Use of bdflush() is deprecated and will be removed in a future kernel. |
3168 | * The `flush-X' kernel threads fully replace bdflush daemons and this call. | 3167 | * The `flush-X' kernel threads fully replace bdflush daemons and this call. |
3169 | */ | 3168 | */ |
3170 | SYSCALL_DEFINE2(bdflush, int, func, long, data) | 3169 | SYSCALL_DEFINE2(bdflush, int, func, long, data) |
3171 | { | 3170 | { |
3172 | static int msg_count; | 3171 | static int msg_count; |
3173 | 3172 | ||
3174 | if (!capable(CAP_SYS_ADMIN)) | 3173 | if (!capable(CAP_SYS_ADMIN)) |
3175 | return -EPERM; | 3174 | return -EPERM; |
3176 | 3175 | ||
3177 | if (msg_count < 5) { | 3176 | if (msg_count < 5) { |
3178 | msg_count++; | 3177 | msg_count++; |
3179 | printk(KERN_INFO | 3178 | printk(KERN_INFO |
3180 | "warning: process `%s' used the obsolete bdflush" | 3179 | "warning: process `%s' used the obsolete bdflush" |
3181 | " system call\n", current->comm); | 3180 | " system call\n", current->comm); |
3182 | printk(KERN_INFO "Fix your initscripts?\n"); | 3181 | printk(KERN_INFO "Fix your initscripts?\n"); |
3183 | } | 3182 | } |
3184 | 3183 | ||
3185 | if (func == 1) | 3184 | if (func == 1) |
3186 | do_exit(0); | 3185 | do_exit(0); |
3187 | return 0; | 3186 | return 0; |
3188 | } | 3187 | } |
3189 | 3188 | ||
3190 | /* | 3189 | /* |
3191 | * Buffer-head allocation | 3190 | * Buffer-head allocation |
3192 | */ | 3191 | */ |
3193 | static struct kmem_cache *bh_cachep; | 3192 | static struct kmem_cache *bh_cachep; |
3194 | 3193 | ||
3195 | /* | 3194 | /* |
3196 | * Once the number of bh's in the machine exceeds this level, we start | 3195 | * Once the number of bh's in the machine exceeds this level, we start |
3197 | * stripping them in writeback. | 3196 | * stripping them in writeback. |
3198 | */ | 3197 | */ |
3199 | static int max_buffer_heads; | 3198 | static int max_buffer_heads; |
3200 | 3199 | ||
3201 | int buffer_heads_over_limit; | 3200 | int buffer_heads_over_limit; |
3202 | 3201 | ||
3203 | struct bh_accounting { | 3202 | struct bh_accounting { |
3204 | int nr; /* Number of live bh's */ | 3203 | int nr; /* Number of live bh's */ |
3205 | int ratelimit; /* Limit cacheline bouncing */ | 3204 | int ratelimit; /* Limit cacheline bouncing */ |
3206 | }; | 3205 | }; |
3207 | 3206 | ||
3208 | static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; | 3207 | static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; |
3209 | 3208 | ||
3210 | static void recalc_bh_state(void) | 3209 | static void recalc_bh_state(void) |
3211 | { | 3210 | { |
3212 | int i; | 3211 | int i; |
3213 | int tot = 0; | 3212 | int tot = 0; |
3214 | 3213 | ||
3215 | if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) | 3214 | if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) |
3216 | return; | 3215 | return; |
3217 | __get_cpu_var(bh_accounting).ratelimit = 0; | 3216 | __get_cpu_var(bh_accounting).ratelimit = 0; |
3218 | for_each_online_cpu(i) | 3217 | for_each_online_cpu(i) |
3219 | tot += per_cpu(bh_accounting, i).nr; | 3218 | tot += per_cpu(bh_accounting, i).nr; |
3220 | buffer_heads_over_limit = (tot > max_buffer_heads); | 3219 | buffer_heads_over_limit = (tot > max_buffer_heads); |
3221 | } | 3220 | } |
3222 | 3221 | ||
3223 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) | 3222 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) |
3224 | { | 3223 | { |
3225 | struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); | 3224 | struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); |
3226 | if (ret) { | 3225 | if (ret) { |
3227 | INIT_LIST_HEAD(&ret->b_assoc_buffers); | 3226 | INIT_LIST_HEAD(&ret->b_assoc_buffers); |
3228 | get_cpu_var(bh_accounting).nr++; | 3227 | get_cpu_var(bh_accounting).nr++; |
3229 | recalc_bh_state(); | 3228 | recalc_bh_state(); |
3230 | put_cpu_var(bh_accounting); | 3229 | put_cpu_var(bh_accounting); |
3231 | } | 3230 | } |
3232 | return ret; | 3231 | return ret; |
3233 | } | 3232 | } |
3234 | EXPORT_SYMBOL(alloc_buffer_head); | 3233 | EXPORT_SYMBOL(alloc_buffer_head); |
3235 | 3234 | ||
3236 | void free_buffer_head(struct buffer_head *bh) | 3235 | void free_buffer_head(struct buffer_head *bh) |
3237 | { | 3236 | { |
3238 | BUG_ON(!list_empty(&bh->b_assoc_buffers)); | 3237 | BUG_ON(!list_empty(&bh->b_assoc_buffers)); |
3239 | kmem_cache_free(bh_cachep, bh); | 3238 | kmem_cache_free(bh_cachep, bh); |
3240 | get_cpu_var(bh_accounting).nr--; | 3239 | get_cpu_var(bh_accounting).nr--; |
3241 | recalc_bh_state(); | 3240 | recalc_bh_state(); |
3242 | put_cpu_var(bh_accounting); | 3241 | put_cpu_var(bh_accounting); |
3243 | } | 3242 | } |
3244 | EXPORT_SYMBOL(free_buffer_head); | 3243 | EXPORT_SYMBOL(free_buffer_head); |
3245 | 3244 | ||
3246 | static void buffer_exit_cpu(int cpu) | 3245 | static void buffer_exit_cpu(int cpu) |
3247 | { | 3246 | { |
3248 | int i; | 3247 | int i; |
3249 | struct bh_lru *b = &per_cpu(bh_lrus, cpu); | 3248 | struct bh_lru *b = &per_cpu(bh_lrus, cpu); |
3250 | 3249 | ||
3251 | for (i = 0; i < BH_LRU_SIZE; i++) { | 3250 | for (i = 0; i < BH_LRU_SIZE; i++) { |
3252 | brelse(b->bhs[i]); | 3251 | brelse(b->bhs[i]); |
3253 | b->bhs[i] = NULL; | 3252 | b->bhs[i] = NULL; |
3254 | } | 3253 | } |
3255 | get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; | 3254 | get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; |
3256 | per_cpu(bh_accounting, cpu).nr = 0; | 3255 | per_cpu(bh_accounting, cpu).nr = 0; |
3257 | put_cpu_var(bh_accounting); | 3256 | put_cpu_var(bh_accounting); |
3258 | } | 3257 | } |
3259 | 3258 | ||
3260 | static int buffer_cpu_notify(struct notifier_block *self, | 3259 | static int buffer_cpu_notify(struct notifier_block *self, |
3261 | unsigned long action, void *hcpu) | 3260 | unsigned long action, void *hcpu) |
3262 | { | 3261 | { |
3263 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) | 3262 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) |
3264 | buffer_exit_cpu((unsigned long)hcpu); | 3263 | buffer_exit_cpu((unsigned long)hcpu); |
3265 | return NOTIFY_OK; | 3264 | return NOTIFY_OK; |
3266 | } | 3265 | } |
3267 | 3266 | ||
3268 | /** | 3267 | /** |
3269 | * bh_uptodate_or_lock - Test whether the buffer is uptodate | 3268 | * bh_uptodate_or_lock - Test whether the buffer is uptodate |
3270 | * @bh: struct buffer_head | 3269 | * @bh: struct buffer_head |
3271 | * | 3270 | * |
3272 | * Return true if the buffer is up-to-date and false, | 3271 | * Return true if the buffer is up-to-date and false, |
3273 | * with the buffer locked, if not. | 3272 | * with the buffer locked, if not. |
3274 | */ | 3273 | */ |
3275 | int bh_uptodate_or_lock(struct buffer_head *bh) | 3274 | int bh_uptodate_or_lock(struct buffer_head *bh) |
3276 | { | 3275 | { |
3277 | if (!buffer_uptodate(bh)) { | 3276 | if (!buffer_uptodate(bh)) { |
3278 | lock_buffer(bh); | 3277 | lock_buffer(bh); |
3279 | if (!buffer_uptodate(bh)) | 3278 | if (!buffer_uptodate(bh)) |
3280 | return 0; | 3279 | return 0; |
3281 | unlock_buffer(bh); | 3280 | unlock_buffer(bh); |
3282 | } | 3281 | } |
3283 | return 1; | 3282 | return 1; |
3284 | } | 3283 | } |
3285 | EXPORT_SYMBOL(bh_uptodate_or_lock); | 3284 | EXPORT_SYMBOL(bh_uptodate_or_lock); |
3286 | 3285 | ||
3287 | /** | 3286 | /** |
3288 | * bh_submit_read - Submit a locked buffer for reading | 3287 | * bh_submit_read - Submit a locked buffer for reading |
3289 | * @bh: struct buffer_head | 3288 | * @bh: struct buffer_head |
3290 | * | 3289 | * |
3291 | * Returns zero on success and -EIO on error. | 3290 | * Returns zero on success and -EIO on error. |
3292 | */ | 3291 | */ |
3293 | int bh_submit_read(struct buffer_head *bh) | 3292 | int bh_submit_read(struct buffer_head *bh) |
3294 | { | 3293 | { |
3295 | BUG_ON(!buffer_locked(bh)); | 3294 | BUG_ON(!buffer_locked(bh)); |
3296 | 3295 | ||
3297 | if (buffer_uptodate(bh)) { | 3296 | if (buffer_uptodate(bh)) { |
3298 | unlock_buffer(bh); | 3297 | unlock_buffer(bh); |
3299 | return 0; | 3298 | return 0; |
3300 | } | 3299 | } |
3301 | 3300 | ||
3302 | get_bh(bh); | 3301 | get_bh(bh); |
3303 | bh->b_end_io = end_buffer_read_sync; | 3302 | bh->b_end_io = end_buffer_read_sync; |
3304 | submit_bh(READ, bh); | 3303 | submit_bh(READ, bh); |
3305 | wait_on_buffer(bh); | 3304 | wait_on_buffer(bh); |
3306 | if (buffer_uptodate(bh)) | 3305 | if (buffer_uptodate(bh)) |
3307 | return 0; | 3306 | return 0; |
3308 | return -EIO; | 3307 | return -EIO; |
3309 | } | 3308 | } |
3310 | EXPORT_SYMBOL(bh_submit_read); | 3309 | EXPORT_SYMBOL(bh_submit_read); |
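The two helpers above are meant to be paired: bh_uptodate_or_lock() only takes the buffer lock when a read is still needed, and bh_submit_read() then performs the I/O and drops the lock via end_buffer_read_sync(). The intended calling pattern:

	if (!bh_uptodate_or_lock(bh)) {
		err = bh_submit_read(bh);
		if (err)
			return err;
	}
	/* bh is now uptodate and unlocked */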
3311 | 3310 | ||
3312 | void __init buffer_init(void) | 3311 | void __init buffer_init(void) |
3313 | { | 3312 | { |
3314 | int nrpages; | 3313 | int nrpages; |
3315 | 3314 | ||
3316 | bh_cachep = kmem_cache_create("buffer_head", | 3315 | bh_cachep = kmem_cache_create("buffer_head", |
3317 | sizeof(struct buffer_head), 0, | 3316 | sizeof(struct buffer_head), 0, |
3318 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| | 3317 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| |
3319 | SLAB_MEM_SPREAD), | 3318 | SLAB_MEM_SPREAD), |
3320 | NULL); | 3319 | NULL); |
3321 | 3320 | ||
3322 | /* | 3321 | /* |
3323 | * Limit the bh occupancy to 10% of ZONE_NORMAL | 3322 | * Limit the bh occupancy to 10% of ZONE_NORMAL |
fs/jbd/commit.c
1 | /* | 1 | /* |
2 | * linux/fs/jbd/commit.c | 2 | * linux/fs/jbd/commit.c |
3 | * | 3 | * |
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | 4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 |
5 | * | 5 | * |
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | 6 | * Copyright 1998 Red Hat corp --- All Rights Reserved |
7 | * | 7 | * |
8 | * This file is part of the Linux kernel and is made available under | 8 | * This file is part of the Linux kernel and is made available under |
9 | * the terms of the GNU General Public License, version 2, or at your | 9 | * the terms of the GNU General Public License, version 2, or at your |
10 | * option, any later version, incorporated herein by reference. | 10 | * option, any later version, incorporated herein by reference. |
11 | * | 11 | * |
12 | * Journal commit routines for the generic filesystem journaling code; | 12 | * Journal commit routines for the generic filesystem journaling code; |
13 | * part of the ext2fs journaling system. | 13 | * part of the ext2fs journaling system. |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/time.h> | 16 | #include <linux/time.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/jbd.h> | 18 | #include <linux/jbd.h> |
19 | #include <linux/errno.h> | 19 | #include <linux/errno.h> |
20 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/bio.h> | 22 | #include <linux/bio.h> |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * Default IO end handler for temporary BJ_IO buffer_heads. | 25 | * Default IO end handler for temporary BJ_IO buffer_heads. |
26 | */ | 26 | */ |
27 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | 27 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) |
28 | { | 28 | { |
29 | BUFFER_TRACE(bh, ""); | 29 | BUFFER_TRACE(bh, ""); |
30 | if (uptodate) | 30 | if (uptodate) |
31 | set_buffer_uptodate(bh); | 31 | set_buffer_uptodate(bh); |
32 | else | 32 | else |
33 | clear_buffer_uptodate(bh); | 33 | clear_buffer_uptodate(bh); |
34 | unlock_buffer(bh); | 34 | unlock_buffer(bh); |
35 | } | 35 | } |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * When an ext3-ordered file is truncated, it is possible that many pages are | 38 | * When an ext3-ordered file is truncated, it is possible that many pages are |
39 | * not successfully freed, because they are attached to a committing transaction. | 39 | * not successfully freed, because they are attached to a committing transaction. |
40 | * After the transaction commits, these pages are left on the LRU, with no | 40 | * After the transaction commits, these pages are left on the LRU, with no |
41 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | 41 | * ->mapping, and with attached buffers. These pages are trivially reclaimable |
42 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | 42 | * by the VM, but their apparent absence upsets the VM accounting, and it makes |
43 | * the numbers in /proc/meminfo look odd. | 43 | * the numbers in /proc/meminfo look odd. |
44 | * | 44 | * |
45 | * So here, we have a buffer which has just come off the forget list. Look to | 45 | * So here, we have a buffer which has just come off the forget list. Look to |
46 | * see if we can strip all buffers from the backing page. | 46 | * see if we can strip all buffers from the backing page. |
47 | * | 47 | * |
48 | * Called under journal->j_list_lock. The caller provided us with a ref | 48 | * Called under journal->j_list_lock. The caller provided us with a ref |
49 | * against the buffer, and we drop that here. | 49 | * against the buffer, and we drop that here. |
50 | */ | 50 | */ |
51 | static void release_buffer_page(struct buffer_head *bh) | 51 | static void release_buffer_page(struct buffer_head *bh) |
52 | { | 52 | { |
53 | struct page *page; | 53 | struct page *page; |
54 | 54 | ||
55 | if (buffer_dirty(bh)) | 55 | if (buffer_dirty(bh)) |
56 | goto nope; | 56 | goto nope; |
57 | if (atomic_read(&bh->b_count) != 1) | 57 | if (atomic_read(&bh->b_count) != 1) |
58 | goto nope; | 58 | goto nope; |
59 | page = bh->b_page; | 59 | page = bh->b_page; |
60 | if (!page) | 60 | if (!page) |
61 | goto nope; | 61 | goto nope; |
62 | if (page->mapping) | 62 | if (page->mapping) |
63 | goto nope; | 63 | goto nope; |
64 | 64 | ||
65 | /* OK, it's a truncated page */ | 65 | /* OK, it's a truncated page */ |
66 | if (!trylock_page(page)) | 66 | if (!trylock_page(page)) |
67 | goto nope; | 67 | goto nope; |
68 | 68 | ||
69 | page_cache_get(page); | 69 | page_cache_get(page); |
70 | __brelse(bh); | 70 | __brelse(bh); |
71 | try_to_free_buffers(page); | 71 | try_to_free_buffers(page); |
72 | unlock_page(page); | 72 | unlock_page(page); |
73 | page_cache_release(page); | 73 | page_cache_release(page); |
74 | return; | 74 | return; |
75 | 75 | ||
76 | nope: | 76 | nope: |
77 | __brelse(bh); | 77 | __brelse(bh); |
78 | } | 78 | } |
79 | 79 | ||
80 | /* | 80 | /* |
81 | * Decrement reference counter for data buffer. If it has been marked | 81 | * Decrement reference counter for data buffer. If it has been marked |
82 | * 'BH_Freed', release it and the page to which it belongs if possible. | 82 | * 'BH_Freed', release it and the page to which it belongs if possible. |
83 | */ | 83 | */ |
84 | static void release_data_buffer(struct buffer_head *bh) | 84 | static void release_data_buffer(struct buffer_head *bh) |
85 | { | 85 | { |
86 | if (buffer_freed(bh)) { | 86 | if (buffer_freed(bh)) { |
87 | clear_buffer_freed(bh); | 87 | clear_buffer_freed(bh); |
88 | release_buffer_page(bh); | 88 | release_buffer_page(bh); |
89 | } else | 89 | } else |
90 | put_bh(bh); | 90 | put_bh(bh); |
91 | } | 91 | } |
92 | 92 | ||
93 | /* | 93 | /* |
94 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | 94 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is |
95 | * held. For ranking reasons we must trylock. If we lose, schedule away and | 95 | * held. For ranking reasons we must trylock. If we lose, schedule away and |
96 | * return 0. j_list_lock is dropped in this case. | 96 | * return 0. j_list_lock is dropped in this case. |
97 | */ | 97 | */ |
98 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) | 98 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) |
99 | { | 99 | { |
100 | if (!jbd_trylock_bh_state(bh)) { | 100 | if (!jbd_trylock_bh_state(bh)) { |
101 | spin_unlock(&journal->j_list_lock); | 101 | spin_unlock(&journal->j_list_lock); |
102 | schedule(); | 102 | schedule(); |
103 | return 0; | 103 | return 0; |
104 | } | 104 | } |
105 | return 1; | 105 | return 1; |
106 | } | 106 | } |
107 | 107 | ||
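The comment above describes the classic trylock back-off used when two locks must occasionally be taken against their documented order. Purely as a hedged userspace analogy (pthread mutexes standing in for j_list_lock and the bh_state bit lock; none of this is kernel code), the protocol looks like:

#include <pthread.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;	/* ~ journal->j_list_lock */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;	/* ~ jbd bh_state lock    */

/* Called with "outer" held; returns with both locks held. */
static void lock_inner_from_outer(void)
{
	if (pthread_mutex_trylock(&inner) != 0) {
		pthread_mutex_unlock(&outer);	/* back off to avoid an ABBA deadlock   */
		pthread_mutex_lock(&inner);	/* safe order: take the inner lock first */
		pthread_mutex_lock(&outer);	/* ... then retake the outer lock        */
	}
}

int main(void)
{
	pthread_mutex_lock(&outer);
	lock_inner_from_outer();
	pthread_mutex_unlock(&inner);
	pthread_mutex_unlock(&outer);
	return 0;
}

In the JBD code the recovery happens in the callers: when inverted_lock() returns 0 they take jbd_lock_bh_state() and j_list_lock in the safe order and then re-validate the buffer, since its state may have changed while the locks were dropped.
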
108 | /* Done it all: now write the commit record. We should have | 108 | /* Done it all: now write the commit record. We should have |
109 | * cleaned up our previous buffers by now, so if we are in abort | 109 | * cleaned up our previous buffers by now, so if we are in abort |
110 | * mode we can now just skip the rest of the journal write | 110 | * mode we can now just skip the rest of the journal write |
111 | * entirely. | 111 | * entirely. |
112 | * | 112 | * |
113 | * Returns 1 if the journal needs to be aborted or 0 on success | 113 | * Returns 1 if the journal needs to be aborted or 0 on success |
114 | */ | 114 | */ |
115 | static int journal_write_commit_record(journal_t *journal, | 115 | static int journal_write_commit_record(journal_t *journal, |
116 | transaction_t *commit_transaction) | 116 | transaction_t *commit_transaction) |
117 | { | 117 | { |
118 | struct journal_head *descriptor; | 118 | struct journal_head *descriptor; |
119 | struct buffer_head *bh; | 119 | struct buffer_head *bh; |
120 | journal_header_t *header; | 120 | journal_header_t *header; |
121 | int ret; | 121 | int ret; |
122 | int barrier_done = 0; | ||
123 | 122 | ||
124 | if (is_journal_aborted(journal)) | 123 | if (is_journal_aborted(journal)) |
125 | return 0; | 124 | return 0; |
126 | 125 | ||
127 | descriptor = journal_get_descriptor_buffer(journal); | 126 | descriptor = journal_get_descriptor_buffer(journal); |
128 | if (!descriptor) | 127 | if (!descriptor) |
129 | return 1; | 128 | return 1; |
130 | 129 | ||
131 | bh = jh2bh(descriptor); | 130 | bh = jh2bh(descriptor); |
132 | 131 | ||
133 | header = (journal_header_t *)(bh->b_data); | 132 | header = (journal_header_t *)(bh->b_data); |
134 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | 133 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); |
135 | header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); | 134 | header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); |
136 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 135 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
137 | 136 | ||
138 | JBUFFER_TRACE(descriptor, "write commit block"); | 137 | JBUFFER_TRACE(descriptor, "write commit block"); |
139 | set_buffer_dirty(bh); | 138 | set_buffer_dirty(bh); |
139 | |||
140 | if (journal->j_flags & JFS_BARRIER) { | 140 | if (journal->j_flags & JFS_BARRIER) { |
141 | set_buffer_ordered(bh); | 141 | ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER); |
142 | barrier_done = 1; | ||
143 | } | ||
144 | ret = sync_dirty_buffer(bh); | ||
145 | if (barrier_done) | ||
146 | clear_buffer_ordered(bh); | ||
147 | /* is it possible for another commit to fail at roughly | ||
148 | * the same time as this one? If so, we don't want to | ||
149 | * trust the barrier flag in the super, but instead want | ||
150 | * to remember if we sent a barrier request | ||
151 | */ | ||
152 | if (ret == -EOPNOTSUPP && barrier_done) { | ||
153 | char b[BDEVNAME_SIZE]; | ||
154 | 142 | ||
155 | printk(KERN_WARNING | 143 | /* |
156 | "JBD: barrier-based sync failed on %s - " | 144 | * Is it possible for another commit to fail at roughly |
157 | "disabling barriers\n", | 145 | * the same time as this one? If so, we don't want to |
158 | bdevname(journal->j_dev, b)); | 146 | * trust the barrier flag in the super, but instead want |
159 | spin_lock(&journal->j_state_lock); | 147 | * to remember if we sent a barrier request |
160 | journal->j_flags &= ~JFS_BARRIER; | 148 | */ |
161 | spin_unlock(&journal->j_state_lock); | 149 | if (ret == -EOPNOTSUPP) { |
150 | char b[BDEVNAME_SIZE]; | ||
162 | 151 | ||
163 | /* And try again, without the barrier */ | 152 | printk(KERN_WARNING |
164 | set_buffer_uptodate(bh); | 153 | "JBD: barrier-based sync failed on %s - " |
165 | set_buffer_dirty(bh); | 154 | "disabling barriers\n", |
155 | bdevname(journal->j_dev, b)); | ||
156 | spin_lock(&journal->j_state_lock); | ||
157 | journal->j_flags &= ~JFS_BARRIER; | ||
158 | spin_unlock(&journal->j_state_lock); | ||
159 | |||
160 | /* And try again, without the barrier */ | ||
161 | set_buffer_uptodate(bh); | ||
162 | set_buffer_dirty(bh); | ||
163 | ret = sync_dirty_buffer(bh); | ||
164 | } | ||
165 | } else { | ||
166 | ret = sync_dirty_buffer(bh); | 166 | ret = sync_dirty_buffer(bh); |
167 | } | 167 | } |
168 | |||
168 | put_bh(bh); /* One for getblk() */ | 169 | put_bh(bh); /* One for getblk() */ |
169 | journal_put_journal_head(descriptor); | 170 | journal_put_journal_head(descriptor); |
170 | 171 | ||
171 | return (ret == -EIO); | 172 | return (ret == -EIO); |
172 | } | 173 | } |
173 | 174 | ||
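This hunk is the heart of the commit: rather than setting and clearing BH_Ordered around sync_dirty_buffer() so the flag can be rediscovered at submit time, the caller now states the write type explicitly through __sync_dirty_buffer(). A hedged sketch of the new calling convention follows (illustration only, kernel context assumed, flush_block_with_barrier is an invented name; the pre-commit sequence was set_buffer_ordered(bh); sync_dirty_buffer(bh); clear_buffer_ordered(bh)):

static int flush_block_with_barrier(struct buffer_head *bh)
{
	int ret;

	set_buffer_dirty(bh);
	ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
	if (ret == -EOPNOTSUPP) {
		/* Device rejected the barrier: re-dirty and retry as a plain sync write. */
		set_buffer_uptodate(bh);
		set_buffer_dirty(bh);
		ret = sync_dirty_buffer(bh);
	}
	return ret;
}

The -EOPNOTSUPP fallback keeps its old behaviour; what disappears is the barrier_done bookkeeping, since there is no longer a per-buffer flag that must be cleared on every exit path.
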
174 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, | 175 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, |
175 | int write_op) | 176 | int write_op) |
176 | { | 177 | { |
177 | int i; | 178 | int i; |
178 | 179 | ||
179 | for (i = 0; i < bufs; i++) { | 180 | for (i = 0; i < bufs; i++) { |
180 | wbuf[i]->b_end_io = end_buffer_write_sync; | 181 | wbuf[i]->b_end_io = end_buffer_write_sync; |
181 | /* We use-up our safety reference in submit_bh() */ | 182 | /* We use-up our safety reference in submit_bh() */ |
182 | submit_bh(write_op, wbuf[i]); | 183 | submit_bh(write_op, wbuf[i]); |
183 | } | 184 | } |
184 | } | 185 | } |
185 | 186 | ||
186 | /* | 187 | /* |
187 | * Submit all the data buffers to disk | 188 | * Submit all the data buffers to disk |
188 | */ | 189 | */ |
189 | static int journal_submit_data_buffers(journal_t *journal, | 190 | static int journal_submit_data_buffers(journal_t *journal, |
190 | transaction_t *commit_transaction, | 191 | transaction_t *commit_transaction, |
191 | int write_op) | 192 | int write_op) |
192 | { | 193 | { |
193 | struct journal_head *jh; | 194 | struct journal_head *jh; |
194 | struct buffer_head *bh; | 195 | struct buffer_head *bh; |
195 | int locked; | 196 | int locked; |
196 | int bufs = 0; | 197 | int bufs = 0; |
197 | struct buffer_head **wbuf = journal->j_wbuf; | 198 | struct buffer_head **wbuf = journal->j_wbuf; |
198 | int err = 0; | 199 | int err = 0; |
199 | 200 | ||
200 | /* | 201 | /* |
201 | * Whenever we unlock the journal and sleep, things can get added | 202 | * Whenever we unlock the journal and sleep, things can get added |
202 | * onto ->t_sync_datalist, so we have to keep looping back to | 203 | * onto ->t_sync_datalist, so we have to keep looping back to |
203 | * write_out_data until we *know* that the list is empty. | 204 | * write_out_data until we *know* that the list is empty. |
204 | * | 205 | * |
205 | * Cleanup any flushed data buffers from the data list. Even in | 206 | * Cleanup any flushed data buffers from the data list. Even in |
206 | * abort mode, we want to flush this out as soon as possible. | 207 | * abort mode, we want to flush this out as soon as possible. |
207 | */ | 208 | */ |
208 | write_out_data: | 209 | write_out_data: |
209 | cond_resched(); | 210 | cond_resched(); |
210 | spin_lock(&journal->j_list_lock); | 211 | spin_lock(&journal->j_list_lock); |
211 | 212 | ||
212 | while (commit_transaction->t_sync_datalist) { | 213 | while (commit_transaction->t_sync_datalist) { |
213 | jh = commit_transaction->t_sync_datalist; | 214 | jh = commit_transaction->t_sync_datalist; |
214 | bh = jh2bh(jh); | 215 | bh = jh2bh(jh); |
215 | locked = 0; | 216 | locked = 0; |
216 | 217 | ||
217 | /* Get reference just to make sure buffer does not disappear | 218 | /* Get reference just to make sure buffer does not disappear |
218 | * when we are forced to drop various locks */ | 219 | * when we are forced to drop various locks */ |
219 | get_bh(bh); | 220 | get_bh(bh); |
220 | /* If the buffer is dirty, we need to submit IO and hence | 221 | /* If the buffer is dirty, we need to submit IO and hence |
221 | * we need the buffer lock. We try to lock the buffer without | 222 | * we need the buffer lock. We try to lock the buffer without |
222 | * blocking. If we fail, we need to drop j_list_lock and do | 223 | * blocking. If we fail, we need to drop j_list_lock and do |
223 | * blocking lock_buffer(). | 224 | * blocking lock_buffer(). |
224 | */ | 225 | */ |
225 | if (buffer_dirty(bh)) { | 226 | if (buffer_dirty(bh)) { |
226 | if (!trylock_buffer(bh)) { | 227 | if (!trylock_buffer(bh)) { |
227 | BUFFER_TRACE(bh, "needs blocking lock"); | 228 | BUFFER_TRACE(bh, "needs blocking lock"); |
228 | spin_unlock(&journal->j_list_lock); | 229 | spin_unlock(&journal->j_list_lock); |
229 | /* Write out all data to prevent deadlocks */ | 230 | /* Write out all data to prevent deadlocks */ |
230 | journal_do_submit_data(wbuf, bufs, write_op); | 231 | journal_do_submit_data(wbuf, bufs, write_op); |
231 | bufs = 0; | 232 | bufs = 0; |
232 | lock_buffer(bh); | 233 | lock_buffer(bh); |
233 | spin_lock(&journal->j_list_lock); | 234 | spin_lock(&journal->j_list_lock); |
234 | } | 235 | } |
235 | locked = 1; | 236 | locked = 1; |
236 | } | 237 | } |
237 | /* We have to get bh_state lock. Again out of order, sigh. */ | 238 | /* We have to get bh_state lock. Again out of order, sigh. */ |
238 | if (!inverted_lock(journal, bh)) { | 239 | if (!inverted_lock(journal, bh)) { |
239 | jbd_lock_bh_state(bh); | 240 | jbd_lock_bh_state(bh); |
240 | spin_lock(&journal->j_list_lock); | 241 | spin_lock(&journal->j_list_lock); |
241 | } | 242 | } |
242 | /* Someone already cleaned up the buffer? */ | 243 | /* Someone already cleaned up the buffer? */ |
243 | if (!buffer_jbd(bh) || bh2jh(bh) != jh | 244 | if (!buffer_jbd(bh) || bh2jh(bh) != jh |
244 | || jh->b_transaction != commit_transaction | 245 | || jh->b_transaction != commit_transaction |
245 | || jh->b_jlist != BJ_SyncData) { | 246 | || jh->b_jlist != BJ_SyncData) { |
246 | jbd_unlock_bh_state(bh); | 247 | jbd_unlock_bh_state(bh); |
247 | if (locked) | 248 | if (locked) |
248 | unlock_buffer(bh); | 249 | unlock_buffer(bh); |
249 | BUFFER_TRACE(bh, "already cleaned up"); | 250 | BUFFER_TRACE(bh, "already cleaned up"); |
250 | release_data_buffer(bh); | 251 | release_data_buffer(bh); |
251 | continue; | 252 | continue; |
252 | } | 253 | } |
253 | if (locked && test_clear_buffer_dirty(bh)) { | 254 | if (locked && test_clear_buffer_dirty(bh)) { |
254 | BUFFER_TRACE(bh, "needs writeout, adding to array"); | 255 | BUFFER_TRACE(bh, "needs writeout, adding to array"); |
255 | wbuf[bufs++] = bh; | 256 | wbuf[bufs++] = bh; |
256 | __journal_file_buffer(jh, commit_transaction, | 257 | __journal_file_buffer(jh, commit_transaction, |
257 | BJ_Locked); | 258 | BJ_Locked); |
258 | jbd_unlock_bh_state(bh); | 259 | jbd_unlock_bh_state(bh); |
259 | if (bufs == journal->j_wbufsize) { | 260 | if (bufs == journal->j_wbufsize) { |
260 | spin_unlock(&journal->j_list_lock); | 261 | spin_unlock(&journal->j_list_lock); |
261 | journal_do_submit_data(wbuf, bufs, write_op); | 262 | journal_do_submit_data(wbuf, bufs, write_op); |
262 | bufs = 0; | 263 | bufs = 0; |
263 | goto write_out_data; | 264 | goto write_out_data; |
264 | } | 265 | } |
265 | } else if (!locked && buffer_locked(bh)) { | 266 | } else if (!locked && buffer_locked(bh)) { |
266 | __journal_file_buffer(jh, commit_transaction, | 267 | __journal_file_buffer(jh, commit_transaction, |
267 | BJ_Locked); | 268 | BJ_Locked); |
268 | jbd_unlock_bh_state(bh); | 269 | jbd_unlock_bh_state(bh); |
269 | put_bh(bh); | 270 | put_bh(bh); |
270 | } else { | 271 | } else { |
271 | BUFFER_TRACE(bh, "writeout complete: unfile"); | 272 | BUFFER_TRACE(bh, "writeout complete: unfile"); |
272 | if (unlikely(!buffer_uptodate(bh))) | 273 | if (unlikely(!buffer_uptodate(bh))) |
273 | err = -EIO; | 274 | err = -EIO; |
274 | __journal_unfile_buffer(jh); | 275 | __journal_unfile_buffer(jh); |
275 | jbd_unlock_bh_state(bh); | 276 | jbd_unlock_bh_state(bh); |
276 | if (locked) | 277 | if (locked) |
277 | unlock_buffer(bh); | 278 | unlock_buffer(bh); |
278 | journal_remove_journal_head(bh); | 279 | journal_remove_journal_head(bh); |
279 | /* One for our safety reference, other for | 280 | /* One for our safety reference, other for |
280 | * journal_remove_journal_head() */ | 281 | * journal_remove_journal_head() */ |
281 | put_bh(bh); | 282 | put_bh(bh); |
282 | release_data_buffer(bh); | 283 | release_data_buffer(bh); |
283 | } | 284 | } |
284 | 285 | ||
285 | if (need_resched() || spin_needbreak(&journal->j_list_lock)) { | 286 | if (need_resched() || spin_needbreak(&journal->j_list_lock)) { |
286 | spin_unlock(&journal->j_list_lock); | 287 | spin_unlock(&journal->j_list_lock); |
287 | goto write_out_data; | 288 | goto write_out_data; |
288 | } | 289 | } |
289 | } | 290 | } |
290 | spin_unlock(&journal->j_list_lock); | 291 | spin_unlock(&journal->j_list_lock); |
291 | journal_do_submit_data(wbuf, bufs, write_op); | 292 | journal_do_submit_data(wbuf, bufs, write_op); |
292 | 293 | ||
293 | return err; | 294 | return err; |
294 | } | 295 | } |
295 | 296 | ||
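journal_submit_data_buffers() above batches dirty data buffers into the fixed-size wbuf[] array and flushes the batch through journal_do_submit_data() whenever bufs reaches journal->j_wbufsize or the list runs out. As a hedged userspace analogy of that batching shape (ordinary C, nothing kernel-specific):

#include <stdio.h>

#define BATCH_SIZE 4					/* ~ journal->j_wbufsize */

static void submit_batch(const int *batch, int n)	/* ~ journal_do_submit_data() */
{
	for (int i = 0; i < n; i++)
		printf("submit item %d\n", batch[i]);
}

int main(void)
{
	int batch[BATCH_SIZE];
	int bufs = 0;

	for (int item = 0; item < 10; item++) {
		batch[bufs++] = item;
		if (bufs == BATCH_SIZE) {		/* array is full: flush it now */
			submit_batch(batch, bufs);
			bufs = 0;
		}
	}
	submit_batch(batch, bufs);			/* flush whatever is left at the end */
	return 0;
}

The kernel version has the extra wrinkle that flushing may require dropping j_list_lock first, which is why the real loop jumps back to the write_out_data label after each flush.
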
296 | /* | 297 | /* |
297 | * journal_commit_transaction | 298 | * journal_commit_transaction |
298 | * | 299 | * |
299 | * The primary function for committing a transaction to the log. This | 300 | * The primary function for committing a transaction to the log. This |
300 | * function is called by the journal thread to begin a complete commit. | 301 | * function is called by the journal thread to begin a complete commit. |
301 | */ | 302 | */ |
302 | void journal_commit_transaction(journal_t *journal) | 303 | void journal_commit_transaction(journal_t *journal) |
303 | { | 304 | { |
304 | transaction_t *commit_transaction; | 305 | transaction_t *commit_transaction; |
305 | struct journal_head *jh, *new_jh, *descriptor; | 306 | struct journal_head *jh, *new_jh, *descriptor; |
306 | struct buffer_head **wbuf = journal->j_wbuf; | 307 | struct buffer_head **wbuf = journal->j_wbuf; |
307 | int bufs; | 308 | int bufs; |
308 | int flags; | 309 | int flags; |
309 | int err; | 310 | int err; |
310 | unsigned int blocknr; | 311 | unsigned int blocknr; |
311 | ktime_t start_time; | 312 | ktime_t start_time; |
312 | u64 commit_time; | 313 | u64 commit_time; |
313 | char *tagp = NULL; | 314 | char *tagp = NULL; |
314 | journal_header_t *header; | 315 | journal_header_t *header; |
315 | journal_block_tag_t *tag = NULL; | 316 | journal_block_tag_t *tag = NULL; |
316 | int space_left = 0; | 317 | int space_left = 0; |
317 | int first_tag = 0; | 318 | int first_tag = 0; |
318 | int tag_flag; | 319 | int tag_flag; |
319 | int i; | 320 | int i; |
320 | int write_op = WRITE; | 321 | int write_op = WRITE; |
321 | 322 | ||
322 | /* | 323 | /* |
323 | * First job: lock down the current transaction and wait for | 324 | * First job: lock down the current transaction and wait for |
324 | * all outstanding updates to complete. | 325 | * all outstanding updates to complete. |
325 | */ | 326 | */ |
326 | 327 | ||
327 | #ifdef COMMIT_STATS | 328 | #ifdef COMMIT_STATS |
328 | spin_lock(&journal->j_list_lock); | 329 | spin_lock(&journal->j_list_lock); |
329 | summarise_journal_usage(journal); | 330 | summarise_journal_usage(journal); |
330 | spin_unlock(&journal->j_list_lock); | 331 | spin_unlock(&journal->j_list_lock); |
331 | #endif | 332 | #endif |
332 | 333 | ||
333 | /* Do we need to erase the effects of a prior journal_flush? */ | 334 | /* Do we need to erase the effects of a prior journal_flush? */ |
334 | if (journal->j_flags & JFS_FLUSHED) { | 335 | if (journal->j_flags & JFS_FLUSHED) { |
335 | jbd_debug(3, "super block updated\n"); | 336 | jbd_debug(3, "super block updated\n"); |
336 | journal_update_superblock(journal, 1); | 337 | journal_update_superblock(journal, 1); |
337 | } else { | 338 | } else { |
338 | jbd_debug(3, "superblock not updated\n"); | 339 | jbd_debug(3, "superblock not updated\n"); |
339 | } | 340 | } |
340 | 341 | ||
341 | J_ASSERT(journal->j_running_transaction != NULL); | 342 | J_ASSERT(journal->j_running_transaction != NULL); |
342 | J_ASSERT(journal->j_committing_transaction == NULL); | 343 | J_ASSERT(journal->j_committing_transaction == NULL); |
343 | 344 | ||
344 | commit_transaction = journal->j_running_transaction; | 345 | commit_transaction = journal->j_running_transaction; |
345 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | 346 | J_ASSERT(commit_transaction->t_state == T_RUNNING); |
346 | 347 | ||
347 | jbd_debug(1, "JBD: starting commit of transaction %d\n", | 348 | jbd_debug(1, "JBD: starting commit of transaction %d\n", |
348 | commit_transaction->t_tid); | 349 | commit_transaction->t_tid); |
349 | 350 | ||
350 | spin_lock(&journal->j_state_lock); | 351 | spin_lock(&journal->j_state_lock); |
351 | commit_transaction->t_state = T_LOCKED; | 352 | commit_transaction->t_state = T_LOCKED; |
352 | 353 | ||
353 | /* | 354 | /* |
354 | * Use plugged writes here, since we want to submit several before | 355 | * Use plugged writes here, since we want to submit several before |
355 | * we unplug the device. We don't do explicit unplugging in here, | 356 | * we unplug the device. We don't do explicit unplugging in here, |
356 | * instead we rely on sync_buffer() doing the unplug for us. | 357 | * instead we rely on sync_buffer() doing the unplug for us. |
357 | */ | 358 | */ |
358 | if (commit_transaction->t_synchronous_commit) | 359 | if (commit_transaction->t_synchronous_commit) |
359 | write_op = WRITE_SYNC_PLUG; | 360 | write_op = WRITE_SYNC_PLUG; |
360 | spin_lock(&commit_transaction->t_handle_lock); | 361 | spin_lock(&commit_transaction->t_handle_lock); |
361 | while (commit_transaction->t_updates) { | 362 | while (commit_transaction->t_updates) { |
362 | DEFINE_WAIT(wait); | 363 | DEFINE_WAIT(wait); |
363 | 364 | ||
364 | prepare_to_wait(&journal->j_wait_updates, &wait, | 365 | prepare_to_wait(&journal->j_wait_updates, &wait, |
365 | TASK_UNINTERRUPTIBLE); | 366 | TASK_UNINTERRUPTIBLE); |
366 | if (commit_transaction->t_updates) { | 367 | if (commit_transaction->t_updates) { |
367 | spin_unlock(&commit_transaction->t_handle_lock); | 368 | spin_unlock(&commit_transaction->t_handle_lock); |
368 | spin_unlock(&journal->j_state_lock); | 369 | spin_unlock(&journal->j_state_lock); |
369 | schedule(); | 370 | schedule(); |
370 | spin_lock(&journal->j_state_lock); | 371 | spin_lock(&journal->j_state_lock); |
371 | spin_lock(&commit_transaction->t_handle_lock); | 372 | spin_lock(&commit_transaction->t_handle_lock); |
372 | } | 373 | } |
373 | finish_wait(&journal->j_wait_updates, &wait); | 374 | finish_wait(&journal->j_wait_updates, &wait); |
374 | } | 375 | } |
375 | spin_unlock(&commit_transaction->t_handle_lock); | 376 | spin_unlock(&commit_transaction->t_handle_lock); |
376 | 377 | ||
377 | J_ASSERT (commit_transaction->t_outstanding_credits <= | 378 | J_ASSERT (commit_transaction->t_outstanding_credits <= |
378 | journal->j_max_transaction_buffers); | 379 | journal->j_max_transaction_buffers); |
379 | 380 | ||
380 | /* | 381 | /* |
381 | * First thing we are allowed to do is to discard any remaining | 382 | * First thing we are allowed to do is to discard any remaining |
382 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume | 383 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume |
383 | * that there are no such buffers: if a large filesystem | 384 | * that there are no such buffers: if a large filesystem |
384 | * operation like a truncate needs to split itself over multiple | 385 | * operation like a truncate needs to split itself over multiple |
385 | * transactions, then it may try to do a journal_restart() while | 386 | * transactions, then it may try to do a journal_restart() while |
386 | * there are still BJ_Reserved buffers outstanding. These must | 387 | * there are still BJ_Reserved buffers outstanding. These must |
387 | * be released cleanly from the current transaction. | 388 | * be released cleanly from the current transaction. |
388 | * | 389 | * |
389 | * In this case, the filesystem must still reserve write access | 390 | * In this case, the filesystem must still reserve write access |
390 | * again before modifying the buffer in the new transaction, but | 391 | * again before modifying the buffer in the new transaction, but |
391 | * we do not require it to remember exactly which old buffers it | 392 | * we do not require it to remember exactly which old buffers it |
392 | * has reserved. This is consistent with the existing behaviour | 393 | * has reserved. This is consistent with the existing behaviour |
393 | * that multiple journal_get_write_access() calls to the same | 394 | * that multiple journal_get_write_access() calls to the same |
394 | * buffer are perfectly permissable. | 395 | * buffer are perfectly permissable. |
395 | */ | 396 | */ |
396 | while (commit_transaction->t_reserved_list) { | 397 | while (commit_transaction->t_reserved_list) { |
397 | jh = commit_transaction->t_reserved_list; | 398 | jh = commit_transaction->t_reserved_list; |
398 | JBUFFER_TRACE(jh, "reserved, unused: refile"); | 399 | JBUFFER_TRACE(jh, "reserved, unused: refile"); |
399 | /* | 400 | /* |
400 | * A journal_get_undo_access()+journal_release_buffer() may | 401 | * A journal_get_undo_access()+journal_release_buffer() may |
401 | * leave undo-committed data. | 402 | * leave undo-committed data. |
402 | */ | 403 | */ |
403 | if (jh->b_committed_data) { | 404 | if (jh->b_committed_data) { |
404 | struct buffer_head *bh = jh2bh(jh); | 405 | struct buffer_head *bh = jh2bh(jh); |
405 | 406 | ||
406 | jbd_lock_bh_state(bh); | 407 | jbd_lock_bh_state(bh); |
407 | jbd_free(jh->b_committed_data, bh->b_size); | 408 | jbd_free(jh->b_committed_data, bh->b_size); |
408 | jh->b_committed_data = NULL; | 409 | jh->b_committed_data = NULL; |
409 | jbd_unlock_bh_state(bh); | 410 | jbd_unlock_bh_state(bh); |
410 | } | 411 | } |
411 | journal_refile_buffer(journal, jh); | 412 | journal_refile_buffer(journal, jh); |
412 | } | 413 | } |
413 | 414 | ||
414 | /* | 415 | /* |
415 | * Now try to drop any written-back buffers from the journal's | 416 | * Now try to drop any written-back buffers from the journal's |
416 | * checkpoint lists. We do this *before* commit because it potentially | 417 | * checkpoint lists. We do this *before* commit because it potentially |
417 | * frees some memory | 418 | * frees some memory |
418 | */ | 419 | */ |
419 | spin_lock(&journal->j_list_lock); | 420 | spin_lock(&journal->j_list_lock); |
420 | __journal_clean_checkpoint_list(journal); | 421 | __journal_clean_checkpoint_list(journal); |
421 | spin_unlock(&journal->j_list_lock); | 422 | spin_unlock(&journal->j_list_lock); |
422 | 423 | ||
423 | jbd_debug (3, "JBD: commit phase 1\n"); | 424 | jbd_debug (3, "JBD: commit phase 1\n"); |
424 | 425 | ||
425 | /* | 426 | /* |
426 | * Switch to a new revoke table. | 427 | * Switch to a new revoke table. |
427 | */ | 428 | */ |
428 | journal_switch_revoke_table(journal); | 429 | journal_switch_revoke_table(journal); |
429 | 430 | ||
430 | commit_transaction->t_state = T_FLUSH; | 431 | commit_transaction->t_state = T_FLUSH; |
431 | journal->j_committing_transaction = commit_transaction; | 432 | journal->j_committing_transaction = commit_transaction; |
432 | journal->j_running_transaction = NULL; | 433 | journal->j_running_transaction = NULL; |
433 | start_time = ktime_get(); | 434 | start_time = ktime_get(); |
434 | commit_transaction->t_log_start = journal->j_head; | 435 | commit_transaction->t_log_start = journal->j_head; |
435 | wake_up(&journal->j_wait_transaction_locked); | 436 | wake_up(&journal->j_wait_transaction_locked); |
436 | spin_unlock(&journal->j_state_lock); | 437 | spin_unlock(&journal->j_state_lock); |
437 | 438 | ||
438 | jbd_debug (3, "JBD: commit phase 2\n"); | 439 | jbd_debug (3, "JBD: commit phase 2\n"); |
439 | 440 | ||
440 | /* | 441 | /* |
441 | * Now start flushing things to disk, in the order they appear | 442 | * Now start flushing things to disk, in the order they appear |
442 | * on the transaction lists. Data blocks go first. | 443 | * on the transaction lists. Data blocks go first. |
443 | */ | 444 | */ |
444 | err = journal_submit_data_buffers(journal, commit_transaction, | 445 | err = journal_submit_data_buffers(journal, commit_transaction, |
445 | write_op); | 446 | write_op); |
446 | 447 | ||
447 | /* | 448 | /* |
448 | * Wait for all previously submitted IO to complete. | 449 | * Wait for all previously submitted IO to complete. |
449 | */ | 450 | */ |
450 | spin_lock(&journal->j_list_lock); | 451 | spin_lock(&journal->j_list_lock); |
451 | while (commit_transaction->t_locked_list) { | 452 | while (commit_transaction->t_locked_list) { |
452 | struct buffer_head *bh; | 453 | struct buffer_head *bh; |
453 | 454 | ||
454 | jh = commit_transaction->t_locked_list->b_tprev; | 455 | jh = commit_transaction->t_locked_list->b_tprev; |
455 | bh = jh2bh(jh); | 456 | bh = jh2bh(jh); |
456 | get_bh(bh); | 457 | get_bh(bh); |
457 | if (buffer_locked(bh)) { | 458 | if (buffer_locked(bh)) { |
458 | spin_unlock(&journal->j_list_lock); | 459 | spin_unlock(&journal->j_list_lock); |
459 | wait_on_buffer(bh); | 460 | wait_on_buffer(bh); |
460 | spin_lock(&journal->j_list_lock); | 461 | spin_lock(&journal->j_list_lock); |
461 | } | 462 | } |
462 | if (unlikely(!buffer_uptodate(bh))) { | 463 | if (unlikely(!buffer_uptodate(bh))) { |
463 | if (!trylock_page(bh->b_page)) { | 464 | if (!trylock_page(bh->b_page)) { |
464 | spin_unlock(&journal->j_list_lock); | 465 | spin_unlock(&journal->j_list_lock); |
465 | lock_page(bh->b_page); | 466 | lock_page(bh->b_page); |
466 | spin_lock(&journal->j_list_lock); | 467 | spin_lock(&journal->j_list_lock); |
467 | } | 468 | } |
468 | if (bh->b_page->mapping) | 469 | if (bh->b_page->mapping) |
469 | set_bit(AS_EIO, &bh->b_page->mapping->flags); | 470 | set_bit(AS_EIO, &bh->b_page->mapping->flags); |
470 | 471 | ||
471 | unlock_page(bh->b_page); | 472 | unlock_page(bh->b_page); |
472 | SetPageError(bh->b_page); | 473 | SetPageError(bh->b_page); |
473 | err = -EIO; | 474 | err = -EIO; |
474 | } | 475 | } |
475 | if (!inverted_lock(journal, bh)) { | 476 | if (!inverted_lock(journal, bh)) { |
476 | put_bh(bh); | 477 | put_bh(bh); |
477 | spin_lock(&journal->j_list_lock); | 478 | spin_lock(&journal->j_list_lock); |
478 | continue; | 479 | continue; |
479 | } | 480 | } |
480 | if (buffer_jbd(bh) && bh2jh(bh) == jh && | 481 | if (buffer_jbd(bh) && bh2jh(bh) == jh && |
481 | jh->b_transaction == commit_transaction && | 482 | jh->b_transaction == commit_transaction && |
482 | jh->b_jlist == BJ_Locked) { | 483 | jh->b_jlist == BJ_Locked) { |
483 | __journal_unfile_buffer(jh); | 484 | __journal_unfile_buffer(jh); |
484 | jbd_unlock_bh_state(bh); | 485 | jbd_unlock_bh_state(bh); |
485 | journal_remove_journal_head(bh); | 486 | journal_remove_journal_head(bh); |
486 | put_bh(bh); | 487 | put_bh(bh); |
487 | } else { | 488 | } else { |
488 | jbd_unlock_bh_state(bh); | 489 | jbd_unlock_bh_state(bh); |
489 | } | 490 | } |
490 | release_data_buffer(bh); | 491 | release_data_buffer(bh); |
491 | cond_resched_lock(&journal->j_list_lock); | 492 | cond_resched_lock(&journal->j_list_lock); |
492 | } | 493 | } |
493 | spin_unlock(&journal->j_list_lock); | 494 | spin_unlock(&journal->j_list_lock); |
494 | 495 | ||
495 | if (err) { | 496 | if (err) { |
496 | char b[BDEVNAME_SIZE]; | 497 | char b[BDEVNAME_SIZE]; |
497 | 498 | ||
498 | printk(KERN_WARNING | 499 | printk(KERN_WARNING |
499 | "JBD: Detected IO errors while flushing file data " | 500 | "JBD: Detected IO errors while flushing file data " |
500 | "on %s\n", bdevname(journal->j_fs_dev, b)); | 501 | "on %s\n", bdevname(journal->j_fs_dev, b)); |
501 | if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) | 502 | if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) |
502 | journal_abort(journal, err); | 503 | journal_abort(journal, err); |
503 | err = 0; | 504 | err = 0; |
504 | } | 505 | } |
505 | 506 | ||
506 | journal_write_revoke_records(journal, commit_transaction, write_op); | 507 | journal_write_revoke_records(journal, commit_transaction, write_op); |
507 | 508 | ||
508 | /* | 509 | /* |
509 | * If we found any dirty or locked buffers, then we should have | 510 | * If we found any dirty or locked buffers, then we should have |
510 | * looped back up to the write_out_data label. If there weren't | 511 | * looped back up to the write_out_data label. If there weren't |
511 | * any then journal_clean_data_list should have wiped the list | 512 | * any then journal_clean_data_list should have wiped the list |
512 | * clean by now, so check that it is in fact empty. | 513 | * clean by now, so check that it is in fact empty. |
513 | */ | 514 | */ |
514 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); | 515 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); |
515 | 516 | ||
516 | jbd_debug (3, "JBD: commit phase 3\n"); | 517 | jbd_debug (3, "JBD: commit phase 3\n"); |
517 | 518 | ||
518 | /* | 519 | /* |
519 | * Way to go: we have now written out all of the data for a | 520 | * Way to go: we have now written out all of the data for a |
520 | * transaction! Now comes the tricky part: we need to write out | 521 | * transaction! Now comes the tricky part: we need to write out |
521 | * metadata. Loop over the transaction's entire buffer list: | 522 | * metadata. Loop over the transaction's entire buffer list: |
522 | */ | 523 | */ |
523 | spin_lock(&journal->j_state_lock); | 524 | spin_lock(&journal->j_state_lock); |
524 | commit_transaction->t_state = T_COMMIT; | 525 | commit_transaction->t_state = T_COMMIT; |
525 | spin_unlock(&journal->j_state_lock); | 526 | spin_unlock(&journal->j_state_lock); |
526 | 527 | ||
527 | J_ASSERT(commit_transaction->t_nr_buffers <= | 528 | J_ASSERT(commit_transaction->t_nr_buffers <= |
528 | commit_transaction->t_outstanding_credits); | 529 | commit_transaction->t_outstanding_credits); |
529 | 530 | ||
530 | descriptor = NULL; | 531 | descriptor = NULL; |
531 | bufs = 0; | 532 | bufs = 0; |
532 | while (commit_transaction->t_buffers) { | 533 | while (commit_transaction->t_buffers) { |
533 | 534 | ||
534 | /* Find the next buffer to be journaled... */ | 535 | /* Find the next buffer to be journaled... */ |
535 | 536 | ||
536 | jh = commit_transaction->t_buffers; | 537 | jh = commit_transaction->t_buffers; |
537 | 538 | ||
538 | /* If we're in abort mode, we just un-journal the buffer and | 539 | /* If we're in abort mode, we just un-journal the buffer and |
539 | release it. */ | 540 | release it. */ |
540 | 541 | ||
541 | if (is_journal_aborted(journal)) { | 542 | if (is_journal_aborted(journal)) { |
542 | clear_buffer_jbddirty(jh2bh(jh)); | 543 | clear_buffer_jbddirty(jh2bh(jh)); |
543 | JBUFFER_TRACE(jh, "journal is aborting: refile"); | 544 | JBUFFER_TRACE(jh, "journal is aborting: refile"); |
544 | journal_refile_buffer(journal, jh); | 545 | journal_refile_buffer(journal, jh); |
545 | /* If that was the last one, we need to clean up | 546 | /* If that was the last one, we need to clean up |
546 | * any descriptor buffers which may have been | 547 | * any descriptor buffers which may have been |
547 | * already allocated, even if we are now | 548 | * already allocated, even if we are now |
548 | * aborting. */ | 549 | * aborting. */ |
549 | if (!commit_transaction->t_buffers) | 550 | if (!commit_transaction->t_buffers) |
550 | goto start_journal_io; | 551 | goto start_journal_io; |
551 | continue; | 552 | continue; |
552 | } | 553 | } |
553 | 554 | ||
554 | /* Make sure we have a descriptor block in which to | 555 | /* Make sure we have a descriptor block in which to |
555 | record the metadata buffer. */ | 556 | record the metadata buffer. */ |
556 | 557 | ||
557 | if (!descriptor) { | 558 | if (!descriptor) { |
558 | struct buffer_head *bh; | 559 | struct buffer_head *bh; |
559 | 560 | ||
560 | J_ASSERT (bufs == 0); | 561 | J_ASSERT (bufs == 0); |
561 | 562 | ||
562 | jbd_debug(4, "JBD: get descriptor\n"); | 563 | jbd_debug(4, "JBD: get descriptor\n"); |
563 | 564 | ||
564 | descriptor = journal_get_descriptor_buffer(journal); | 565 | descriptor = journal_get_descriptor_buffer(journal); |
565 | if (!descriptor) { | 566 | if (!descriptor) { |
566 | journal_abort(journal, -EIO); | 567 | journal_abort(journal, -EIO); |
567 | continue; | 568 | continue; |
568 | } | 569 | } |
569 | 570 | ||
570 | bh = jh2bh(descriptor); | 571 | bh = jh2bh(descriptor); |
571 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", | 572 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", |
572 | (unsigned long long)bh->b_blocknr, bh->b_data); | 573 | (unsigned long long)bh->b_blocknr, bh->b_data); |
573 | header = (journal_header_t *)&bh->b_data[0]; | 574 | header = (journal_header_t *)&bh->b_data[0]; |
574 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | 575 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); |
575 | header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); | 576 | header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); |
576 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 577 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
577 | 578 | ||
578 | tagp = &bh->b_data[sizeof(journal_header_t)]; | 579 | tagp = &bh->b_data[sizeof(journal_header_t)]; |
579 | space_left = bh->b_size - sizeof(journal_header_t); | 580 | space_left = bh->b_size - sizeof(journal_header_t); |
580 | first_tag = 1; | 581 | first_tag = 1; |
581 | set_buffer_jwrite(bh); | 582 | set_buffer_jwrite(bh); |
582 | set_buffer_dirty(bh); | 583 | set_buffer_dirty(bh); |
583 | wbuf[bufs++] = bh; | 584 | wbuf[bufs++] = bh; |
584 | 585 | ||
585 | /* Record it so that we can wait for IO | 586 | /* Record it so that we can wait for IO |
586 | completion later */ | 587 | completion later */ |
587 | BUFFER_TRACE(bh, "ph3: file as descriptor"); | 588 | BUFFER_TRACE(bh, "ph3: file as descriptor"); |
588 | journal_file_buffer(descriptor, commit_transaction, | 589 | journal_file_buffer(descriptor, commit_transaction, |
589 | BJ_LogCtl); | 590 | BJ_LogCtl); |
590 | } | 591 | } |
591 | 592 | ||
592 | /* Where is the buffer to be written? */ | 593 | /* Where is the buffer to be written? */ |
593 | 594 | ||
594 | err = journal_next_log_block(journal, &blocknr); | 595 | err = journal_next_log_block(journal, &blocknr); |
595 | /* If the block mapping failed, just abandon the buffer | 596 | /* If the block mapping failed, just abandon the buffer |
596 | and repeat this loop: we'll fall into the | 597 | and repeat this loop: we'll fall into the |
597 | refile-on-abort condition above. */ | 598 | refile-on-abort condition above. */ |
598 | if (err) { | 599 | if (err) { |
599 | journal_abort(journal, err); | 600 | journal_abort(journal, err); |
600 | continue; | 601 | continue; |
601 | } | 602 | } |
602 | 603 | ||
603 | /* | 604 | /* |
604 | * start_this_handle() uses t_outstanding_credits to determine | 605 | * start_this_handle() uses t_outstanding_credits to determine |
605 | * the free space in the log, but this counter is changed | 606 | * the free space in the log, but this counter is changed |
606 | * by journal_next_log_block() also. | 607 | * by journal_next_log_block() also. |
607 | */ | 608 | */ |
608 | commit_transaction->t_outstanding_credits--; | 609 | commit_transaction->t_outstanding_credits--; |
609 | 610 | ||
610 | /* Bump b_count to prevent truncate from stumbling over | 611 | /* Bump b_count to prevent truncate from stumbling over |
611 | the shadowed buffer! @@@ This can go if we ever get | 612 | the shadowed buffer! @@@ This can go if we ever get |
612 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ | 613 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ |
613 | atomic_inc(&jh2bh(jh)->b_count); | 614 | atomic_inc(&jh2bh(jh)->b_count); |
614 | 615 | ||
615 | /* Make a temporary IO buffer with which to write it out | 616 | /* Make a temporary IO buffer with which to write it out |
616 | (this will requeue both the metadata buffer and the | 617 | (this will requeue both the metadata buffer and the |
617 | temporary IO buffer). new_bh goes on BJ_IO*/ | 618 | temporary IO buffer). new_bh goes on BJ_IO*/ |
618 | 619 | ||
619 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); | 620 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); |
620 | /* | 621 | /* |
621 | * akpm: journal_write_metadata_buffer() sets | 622 | * akpm: journal_write_metadata_buffer() sets |
622 | * new_bh->b_transaction to commit_transaction. | 623 | * new_bh->b_transaction to commit_transaction. |
623 | * We need to clean this up before we release new_bh | 624 | * We need to clean this up before we release new_bh |
624 | * (which is of type BJ_IO) | 625 | * (which is of type BJ_IO) |
625 | */ | 626 | */ |
626 | JBUFFER_TRACE(jh, "ph3: write metadata"); | 627 | JBUFFER_TRACE(jh, "ph3: write metadata"); |
627 | flags = journal_write_metadata_buffer(commit_transaction, | 628 | flags = journal_write_metadata_buffer(commit_transaction, |
628 | jh, &new_jh, blocknr); | 629 | jh, &new_jh, blocknr); |
629 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); | 630 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); |
630 | wbuf[bufs++] = jh2bh(new_jh); | 631 | wbuf[bufs++] = jh2bh(new_jh); |
631 | 632 | ||
632 | /* Record the new block's tag in the current descriptor | 633 | /* Record the new block's tag in the current descriptor |
633 | buffer */ | 634 | buffer */ |
634 | 635 | ||
635 | tag_flag = 0; | 636 | tag_flag = 0; |
636 | if (flags & 1) | 637 | if (flags & 1) |
637 | tag_flag |= JFS_FLAG_ESCAPE; | 638 | tag_flag |= JFS_FLAG_ESCAPE; |
638 | if (!first_tag) | 639 | if (!first_tag) |
639 | tag_flag |= JFS_FLAG_SAME_UUID; | 640 | tag_flag |= JFS_FLAG_SAME_UUID; |
640 | 641 | ||
641 | tag = (journal_block_tag_t *) tagp; | 642 | tag = (journal_block_tag_t *) tagp; |
642 | tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); | 643 | tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); |
643 | tag->t_flags = cpu_to_be32(tag_flag); | 644 | tag->t_flags = cpu_to_be32(tag_flag); |
644 | tagp += sizeof(journal_block_tag_t); | 645 | tagp += sizeof(journal_block_tag_t); |
645 | space_left -= sizeof(journal_block_tag_t); | 646 | space_left -= sizeof(journal_block_tag_t); |
646 | 647 | ||
647 | if (first_tag) { | 648 | if (first_tag) { |
648 | memcpy (tagp, journal->j_uuid, 16); | 649 | memcpy (tagp, journal->j_uuid, 16); |
649 | tagp += 16; | 650 | tagp += 16; |
650 | space_left -= 16; | 651 | space_left -= 16; |
651 | first_tag = 0; | 652 | first_tag = 0; |
652 | } | 653 | } |
653 | 654 | ||
654 | /* If there's no more to do, or if the descriptor is full, | 655 | /* If there's no more to do, or if the descriptor is full, |
655 | let the IO rip! */ | 656 | let the IO rip! */ |
656 | 657 | ||
657 | if (bufs == journal->j_wbufsize || | 658 | if (bufs == journal->j_wbufsize || |
658 | commit_transaction->t_buffers == NULL || | 659 | commit_transaction->t_buffers == NULL || |
659 | space_left < sizeof(journal_block_tag_t) + 16) { | 660 | space_left < sizeof(journal_block_tag_t) + 16) { |
660 | 661 | ||
661 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); | 662 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); |
662 | 663 | ||
663 | /* Write an end-of-descriptor marker before | 664 | /* Write an end-of-descriptor marker before |
664 | submitting the IOs. "tag" still points to | 665 | submitting the IOs. "tag" still points to |
665 | the last tag we set up. */ | 666 | the last tag we set up. */ |
666 | 667 | ||
667 | tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); | 668 | tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); |
668 | 669 | ||
669 | start_journal_io: | 670 | start_journal_io: |
670 | for (i = 0; i < bufs; i++) { | 671 | for (i = 0; i < bufs; i++) { |
671 | struct buffer_head *bh = wbuf[i]; | 672 | struct buffer_head *bh = wbuf[i]; |
672 | lock_buffer(bh); | 673 | lock_buffer(bh); |
673 | clear_buffer_dirty(bh); | 674 | clear_buffer_dirty(bh); |
674 | set_buffer_uptodate(bh); | 675 | set_buffer_uptodate(bh); |
675 | bh->b_end_io = journal_end_buffer_io_sync; | 676 | bh->b_end_io = journal_end_buffer_io_sync; |
676 | submit_bh(write_op, bh); | 677 | submit_bh(write_op, bh); |
677 | } | 678 | } |
678 | cond_resched(); | 679 | cond_resched(); |
679 | 680 | ||
680 | /* Force a new descriptor to be generated next | 681 | /* Force a new descriptor to be generated next |
681 | time round the loop. */ | 682 | time round the loop. */ |
682 | descriptor = NULL; | 683 | descriptor = NULL; |
683 | bufs = 0; | 684 | bufs = 0; |
684 | } | 685 | } |
685 | } | 686 | } |
686 | 687 | ||
687 | /* Lo and behold: we have just managed to send a transaction to | 688 | /* Lo and behold: we have just managed to send a transaction to |
688 | the log. Before we can commit it, wait for the IO so far to | 689 | the log. Before we can commit it, wait for the IO so far to |
689 | complete. Control buffers being written are on the | 690 | complete. Control buffers being written are on the |
690 | transaction's t_log_list queue, and metadata buffers are on | 691 | transaction's t_log_list queue, and metadata buffers are on |
691 | the t_iobuf_list queue. | 692 | the t_iobuf_list queue. |
692 | 693 | ||
693 | Wait for the buffers in reverse order. That way we are | 694 | Wait for the buffers in reverse order. That way we are |
694 | less likely to be woken up until all IOs have completed, and | 695 | less likely to be woken up until all IOs have completed, and |
695 | so we incur less scheduling load. | 696 | so we incur less scheduling load. |
696 | */ | 697 | */ |
697 | 698 | ||
698 | jbd_debug(3, "JBD: commit phase 4\n"); | 699 | jbd_debug(3, "JBD: commit phase 4\n"); |
699 | 700 | ||
700 | /* | 701 | /* |
701 | * akpm: these are BJ_IO, and j_list_lock is not needed. | 702 | * akpm: these are BJ_IO, and j_list_lock is not needed. |
702 | * See __journal_try_to_free_buffer. | 703 | * See __journal_try_to_free_buffer. |
703 | */ | 704 | */ |
704 | wait_for_iobuf: | 705 | wait_for_iobuf: |
705 | while (commit_transaction->t_iobuf_list != NULL) { | 706 | while (commit_transaction->t_iobuf_list != NULL) { |
706 | struct buffer_head *bh; | 707 | struct buffer_head *bh; |
707 | 708 | ||
708 | jh = commit_transaction->t_iobuf_list->b_tprev; | 709 | jh = commit_transaction->t_iobuf_list->b_tprev; |
709 | bh = jh2bh(jh); | 710 | bh = jh2bh(jh); |
710 | if (buffer_locked(bh)) { | 711 | if (buffer_locked(bh)) { |
711 | wait_on_buffer(bh); | 712 | wait_on_buffer(bh); |
712 | goto wait_for_iobuf; | 713 | goto wait_for_iobuf; |
713 | } | 714 | } |
714 | if (cond_resched()) | 715 | if (cond_resched()) |
715 | goto wait_for_iobuf; | 716 | goto wait_for_iobuf; |
716 | 717 | ||
717 | if (unlikely(!buffer_uptodate(bh))) | 718 | if (unlikely(!buffer_uptodate(bh))) |
718 | err = -EIO; | 719 | err = -EIO; |
719 | 720 | ||
720 | clear_buffer_jwrite(bh); | 721 | clear_buffer_jwrite(bh); |
721 | 722 | ||
722 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); | 723 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); |
723 | journal_unfile_buffer(journal, jh); | 724 | journal_unfile_buffer(journal, jh); |
724 | 725 | ||
725 | /* | 726 | /* |
726 | * ->t_iobuf_list should contain only dummy buffer_heads | 727 | * ->t_iobuf_list should contain only dummy buffer_heads |
727 | * which were created by journal_write_metadata_buffer(). | 728 | * which were created by journal_write_metadata_buffer(). |
728 | */ | 729 | */ |
729 | BUFFER_TRACE(bh, "dumping temporary bh"); | 730 | BUFFER_TRACE(bh, "dumping temporary bh"); |
730 | journal_put_journal_head(jh); | 731 | journal_put_journal_head(jh); |
731 | __brelse(bh); | 732 | __brelse(bh); |
732 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); | 733 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); |
733 | free_buffer_head(bh); | 734 | free_buffer_head(bh); |
734 | 735 | ||
735 | /* We also have to unlock and free the corresponding | 736 | /* We also have to unlock and free the corresponding |
736 | shadowed buffer */ | 737 | shadowed buffer */ |
737 | jh = commit_transaction->t_shadow_list->b_tprev; | 738 | jh = commit_transaction->t_shadow_list->b_tprev; |
738 | bh = jh2bh(jh); | 739 | bh = jh2bh(jh); |
739 | clear_bit(BH_JWrite, &bh->b_state); | 740 | clear_bit(BH_JWrite, &bh->b_state); |
740 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | 741 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); |
741 | 742 | ||
742 | /* The metadata is now released for reuse, but we need | 743 | /* The metadata is now released for reuse, but we need |
743 | to remember it against this transaction so that when | 744 | to remember it against this transaction so that when |
744 | we finally commit, we can do any checkpointing | 745 | we finally commit, we can do any checkpointing |
745 | required. */ | 746 | required. */ |
746 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | 747 | JBUFFER_TRACE(jh, "file as BJ_Forget"); |
747 | journal_file_buffer(jh, commit_transaction, BJ_Forget); | 748 | journal_file_buffer(jh, commit_transaction, BJ_Forget); |
748 | /* Wake up any transactions which were waiting for this | 749 | /* Wake up any transactions which were waiting for this |
749 | IO to complete */ | 750 | IO to complete */ |
750 | wake_up_bit(&bh->b_state, BH_Unshadow); | 751 | wake_up_bit(&bh->b_state, BH_Unshadow); |
751 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | 752 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); |
752 | __brelse(bh); | 753 | __brelse(bh); |
753 | } | 754 | } |
754 | 755 | ||
755 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | 756 | J_ASSERT (commit_transaction->t_shadow_list == NULL); |
756 | 757 | ||
757 | jbd_debug(3, "JBD: commit phase 5\n"); | 758 | jbd_debug(3, "JBD: commit phase 5\n"); |
758 | 759 | ||
759 | /* Here we wait for the revoke record and descriptor record buffers */ | 760 | /* Here we wait for the revoke record and descriptor record buffers */ |
760 | wait_for_ctlbuf: | 761 | wait_for_ctlbuf: |
761 | while (commit_transaction->t_log_list != NULL) { | 762 | while (commit_transaction->t_log_list != NULL) { |
762 | struct buffer_head *bh; | 763 | struct buffer_head *bh; |
763 | 764 | ||
764 | jh = commit_transaction->t_log_list->b_tprev; | 765 | jh = commit_transaction->t_log_list->b_tprev; |
765 | bh = jh2bh(jh); | 766 | bh = jh2bh(jh); |
766 | if (buffer_locked(bh)) { | 767 | if (buffer_locked(bh)) { |
767 | wait_on_buffer(bh); | 768 | wait_on_buffer(bh); |
768 | goto wait_for_ctlbuf; | 769 | goto wait_for_ctlbuf; |
769 | } | 770 | } |
770 | if (cond_resched()) | 771 | if (cond_resched()) |
771 | goto wait_for_ctlbuf; | 772 | goto wait_for_ctlbuf; |
772 | 773 | ||
773 | if (unlikely(!buffer_uptodate(bh))) | 774 | if (unlikely(!buffer_uptodate(bh))) |
774 | err = -EIO; | 775 | err = -EIO; |
775 | 776 | ||
776 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); | 777 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); |
777 | clear_buffer_jwrite(bh); | 778 | clear_buffer_jwrite(bh); |
778 | journal_unfile_buffer(journal, jh); | 779 | journal_unfile_buffer(journal, jh); |
779 | journal_put_journal_head(jh); | 780 | journal_put_journal_head(jh); |
780 | __brelse(bh); /* One for getblk */ | 781 | __brelse(bh); /* One for getblk */ |
781 | /* AKPM: bforget here */ | 782 | /* AKPM: bforget here */ |
782 | } | 783 | } |
783 | 784 | ||
784 | if (err) | 785 | if (err) |
785 | journal_abort(journal, err); | 786 | journal_abort(journal, err); |
786 | 787 | ||
787 | jbd_debug(3, "JBD: commit phase 6\n"); | 788 | jbd_debug(3, "JBD: commit phase 6\n"); |
788 | 789 | ||
789 | /* All metadata is written, now write commit record and do cleanup */ | 790 | /* All metadata is written, now write commit record and do cleanup */ |
790 | spin_lock(&journal->j_state_lock); | 791 | spin_lock(&journal->j_state_lock); |
791 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | 792 | J_ASSERT(commit_transaction->t_state == T_COMMIT); |
792 | commit_transaction->t_state = T_COMMIT_RECORD; | 793 | commit_transaction->t_state = T_COMMIT_RECORD; |
793 | spin_unlock(&journal->j_state_lock); | 794 | spin_unlock(&journal->j_state_lock); |
794 | 795 | ||
795 | if (journal_write_commit_record(journal, commit_transaction)) | 796 | if (journal_write_commit_record(journal, commit_transaction)) |
796 | err = -EIO; | 797 | err = -EIO; |
797 | 798 | ||
798 | if (err) | 799 | if (err) |
799 | journal_abort(journal, err); | 800 | journal_abort(journal, err); |
800 | 801 | ||
801 | /* End of a transaction! Finally, we can do checkpoint | 802 | /* End of a transaction! Finally, we can do checkpoint |
802 | processing: any buffers committed as a result of this | 803 | processing: any buffers committed as a result of this |
803 | transaction can be removed from any checkpoint list it was on | 804 | transaction can be removed from any checkpoint list it was on |
804 | before. */ | 805 | before. */ |
805 | 806 | ||
806 | jbd_debug(3, "JBD: commit phase 7\n"); | 807 | jbd_debug(3, "JBD: commit phase 7\n"); |
807 | 808 | ||
808 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); | 809 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); |
809 | J_ASSERT(commit_transaction->t_buffers == NULL); | 810 | J_ASSERT(commit_transaction->t_buffers == NULL); |
810 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | 811 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); |
811 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | 812 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); |
812 | J_ASSERT(commit_transaction->t_shadow_list == NULL); | 813 | J_ASSERT(commit_transaction->t_shadow_list == NULL); |
813 | J_ASSERT(commit_transaction->t_log_list == NULL); | 814 | J_ASSERT(commit_transaction->t_log_list == NULL); |
814 | 815 | ||
815 | restart_loop: | 816 | restart_loop: |
816 | /* | 817 | /* |
817 | * As there are other places (journal_unmap_buffer()) adding buffers | 818 | * As there are other places (journal_unmap_buffer()) adding buffers |
818 | * to this list we have to be careful and hold the j_list_lock. | 819 | * to this list we have to be careful and hold the j_list_lock. |
819 | */ | 820 | */ |
820 | spin_lock(&journal->j_list_lock); | 821 | spin_lock(&journal->j_list_lock); |
821 | while (commit_transaction->t_forget) { | 822 | while (commit_transaction->t_forget) { |
822 | transaction_t *cp_transaction; | 823 | transaction_t *cp_transaction; |
823 | struct buffer_head *bh; | 824 | struct buffer_head *bh; |
824 | 825 | ||
825 | jh = commit_transaction->t_forget; | 826 | jh = commit_transaction->t_forget; |
826 | spin_unlock(&journal->j_list_lock); | 827 | spin_unlock(&journal->j_list_lock); |
827 | bh = jh2bh(jh); | 828 | bh = jh2bh(jh); |
828 | jbd_lock_bh_state(bh); | 829 | jbd_lock_bh_state(bh); |
829 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || | 830 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || |
830 | jh->b_transaction == journal->j_running_transaction); | 831 | jh->b_transaction == journal->j_running_transaction); |
831 | 832 | ||
832 | /* | 833 | /* |
833 | * If there is undo-protected committed data against | 834 | * If there is undo-protected committed data against |
834 | * this buffer, then we can remove it now. If it is a | 835 | * this buffer, then we can remove it now. If it is a |
835 | * buffer needing such protection, the old frozen_data | 836 | * buffer needing such protection, the old frozen_data |
836 | * field now points to a committed version of the | 837 | * field now points to a committed version of the |
837 | * buffer, so rotate that field to the new committed | 838 | * buffer, so rotate that field to the new committed |
838 | * data. | 839 | * data. |
839 | * | 840 | * |
840 | * Otherwise, we can just throw away the frozen data now. | 841 | * Otherwise, we can just throw away the frozen data now. |
841 | */ | 842 | */ |
842 | if (jh->b_committed_data) { | 843 | if (jh->b_committed_data) { |
843 | jbd_free(jh->b_committed_data, bh->b_size); | 844 | jbd_free(jh->b_committed_data, bh->b_size); |
844 | jh->b_committed_data = NULL; | 845 | jh->b_committed_data = NULL; |
845 | if (jh->b_frozen_data) { | 846 | if (jh->b_frozen_data) { |
846 | jh->b_committed_data = jh->b_frozen_data; | 847 | jh->b_committed_data = jh->b_frozen_data; |
847 | jh->b_frozen_data = NULL; | 848 | jh->b_frozen_data = NULL; |
848 | } | 849 | } |
849 | } else if (jh->b_frozen_data) { | 850 | } else if (jh->b_frozen_data) { |
850 | jbd_free(jh->b_frozen_data, bh->b_size); | 851 | jbd_free(jh->b_frozen_data, bh->b_size); |
851 | jh->b_frozen_data = NULL; | 852 | jh->b_frozen_data = NULL; |
852 | } | 853 | } |
853 | 854 | ||
854 | spin_lock(&journal->j_list_lock); | 855 | spin_lock(&journal->j_list_lock); |
855 | cp_transaction = jh->b_cp_transaction; | 856 | cp_transaction = jh->b_cp_transaction; |
856 | if (cp_transaction) { | 857 | if (cp_transaction) { |
857 | JBUFFER_TRACE(jh, "remove from old cp transaction"); | 858 | JBUFFER_TRACE(jh, "remove from old cp transaction"); |
858 | __journal_remove_checkpoint(jh); | 859 | __journal_remove_checkpoint(jh); |
859 | } | 860 | } |
860 | 861 | ||
861 | /* Only re-checkpoint the buffer_head if it is marked | 862 | /* Only re-checkpoint the buffer_head if it is marked |
862 | * dirty. If the buffer was added to the BJ_Forget list | 863 | * dirty. If the buffer was added to the BJ_Forget list |
863 | * by journal_forget, it may no longer be dirty and | 864 | * by journal_forget, it may no longer be dirty and |
864 | * there's no point in keeping a checkpoint record for | 865 | * there's no point in keeping a checkpoint record for |
865 | * it. */ | 866 | * it. */ |
866 | 867 | ||
867 | /* A buffer which has been freed while still being | 868 | /* A buffer which has been freed while still being |
868 | * journaled by a previous transaction may end up still | 869 | * journaled by a previous transaction may end up still |
869 | * being dirty here, but we want to avoid writing back | 870 | * being dirty here, but we want to avoid writing back |
870 | * that buffer in the future after the "add to orphan" | 871 | * that buffer in the future after the "add to orphan" |
871 | * operation been committed, That's not only a performance | 872 | * operation been committed, That's not only a performance |
872 | * gain, it also stops aliasing problems if the buffer is | 873 | * gain, it also stops aliasing problems if the buffer is |
873 | * left behind for writeback and gets reallocated for another | 874 | * left behind for writeback and gets reallocated for another |
874 | * use in a different page. */ | 875 | * use in a different page. */ |
875 | if (buffer_freed(bh) && !jh->b_next_transaction) { | 876 | if (buffer_freed(bh) && !jh->b_next_transaction) { |
876 | clear_buffer_freed(bh); | 877 | clear_buffer_freed(bh); |
877 | clear_buffer_jbddirty(bh); | 878 | clear_buffer_jbddirty(bh); |
878 | } | 879 | } |
879 | 880 | ||
880 | if (buffer_jbddirty(bh)) { | 881 | if (buffer_jbddirty(bh)) { |
881 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); | 882 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); |
882 | __journal_insert_checkpoint(jh, commit_transaction); | 883 | __journal_insert_checkpoint(jh, commit_transaction); |
883 | if (is_journal_aborted(journal)) | 884 | if (is_journal_aborted(journal)) |
884 | clear_buffer_jbddirty(bh); | 885 | clear_buffer_jbddirty(bh); |
885 | JBUFFER_TRACE(jh, "refile for checkpoint writeback"); | 886 | JBUFFER_TRACE(jh, "refile for checkpoint writeback"); |
886 | __journal_refile_buffer(jh); | 887 | __journal_refile_buffer(jh); |
887 | jbd_unlock_bh_state(bh); | 888 | jbd_unlock_bh_state(bh); |
888 | } else { | 889 | } else { |
889 | J_ASSERT_BH(bh, !buffer_dirty(bh)); | 890 | J_ASSERT_BH(bh, !buffer_dirty(bh)); |
890 | /* The buffer on BJ_Forget list and not jbddirty means | 891 | /* The buffer on BJ_Forget list and not jbddirty means |
891 | * it has been freed by this transaction and hence it | 892 | * it has been freed by this transaction and hence it |
892 | * could not have been reallocated until this | 893 | * could not have been reallocated until this |
893 | * transaction has committed. *BUT* it could be | 894 | * transaction has committed. *BUT* it could be |
894 | * reallocated once we have written all the data to | 895 | * reallocated once we have written all the data to |
895 | * disk and before we process the buffer on BJ_Forget | 896 | * disk and before we process the buffer on BJ_Forget |
896 | * list. */ | 897 | * list. */ |
897 | JBUFFER_TRACE(jh, "refile or unfile freed buffer"); | 898 | JBUFFER_TRACE(jh, "refile or unfile freed buffer"); |
898 | __journal_refile_buffer(jh); | 899 | __journal_refile_buffer(jh); |
899 | if (!jh->b_transaction) { | 900 | if (!jh->b_transaction) { |
900 | jbd_unlock_bh_state(bh); | 901 | jbd_unlock_bh_state(bh); |
901 | /* needs a brelse */ | 902 | /* needs a brelse */ |
902 | journal_remove_journal_head(bh); | 903 | journal_remove_journal_head(bh); |
903 | release_buffer_page(bh); | 904 | release_buffer_page(bh); |
904 | } else | 905 | } else |
905 | jbd_unlock_bh_state(bh); | 906 | jbd_unlock_bh_state(bh); |
906 | } | 907 | } |
907 | cond_resched_lock(&journal->j_list_lock); | 908 | cond_resched_lock(&journal->j_list_lock); |
908 | } | 909 | } |
909 | spin_unlock(&journal->j_list_lock); | 910 | spin_unlock(&journal->j_list_lock); |
910 | /* | 911 | /* |
911 | * This is a bit sleazy. We use j_list_lock to protect transition | 912 | * This is a bit sleazy. We use j_list_lock to protect transition |
912 | * of a transaction into T_FINISHED state and calling | 913 | * of a transaction into T_FINISHED state and calling |
913 | * __journal_drop_transaction(). Otherwise we could race with | 914 | * __journal_drop_transaction(). Otherwise we could race with |
914 | * other checkpointing code processing the transaction... | 915 | * other checkpointing code processing the transaction... |
915 | */ | 916 | */ |
916 | spin_lock(&journal->j_state_lock); | 917 | spin_lock(&journal->j_state_lock); |
917 | spin_lock(&journal->j_list_lock); | 918 | spin_lock(&journal->j_list_lock); |
918 | /* | 919 | /* |
919 | * Now recheck if some buffers did not get attached to the transaction | 920 | * Now recheck if some buffers did not get attached to the transaction |
920 | * while the lock was dropped... | 921 | * while the lock was dropped... |
921 | */ | 922 | */ |
922 | if (commit_transaction->t_forget) { | 923 | if (commit_transaction->t_forget) { |
923 | spin_unlock(&journal->j_list_lock); | 924 | spin_unlock(&journal->j_list_lock); |
924 | spin_unlock(&journal->j_state_lock); | 925 | spin_unlock(&journal->j_state_lock); |
925 | goto restart_loop; | 926 | goto restart_loop; |
926 | } | 927 | } |
927 | 928 | ||
928 | /* Done with this transaction! */ | 929 | /* Done with this transaction! */ |
929 | 930 | ||
930 | jbd_debug(3, "JBD: commit phase 8\n"); | 931 | jbd_debug(3, "JBD: commit phase 8\n"); |
931 | 932 | ||
932 | J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); | 933 | J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); |
933 | 934 | ||
934 | commit_transaction->t_state = T_FINISHED; | 935 | commit_transaction->t_state = T_FINISHED; |
935 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 936 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
936 | journal->j_commit_sequence = commit_transaction->t_tid; | 937 | journal->j_commit_sequence = commit_transaction->t_tid; |
937 | journal->j_committing_transaction = NULL; | 938 | journal->j_committing_transaction = NULL; |
938 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); | 939 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); |
939 | 940 | ||
940 | /* | 941 | /* |
941 | * weight the commit time higher than the average time so we don't | 942 | * weight the commit time higher than the average time so we don't |
942 | * react too strongly to vast changes in commit time | 943 | * react too strongly to vast changes in commit time |
943 | */ | 944 | */ |
944 | if (likely(journal->j_average_commit_time)) | 945 | if (likely(journal->j_average_commit_time)) |
945 | journal->j_average_commit_time = (commit_time*3 + | 946 | journal->j_average_commit_time = (commit_time*3 + |
946 | journal->j_average_commit_time) / 4; | 947 | journal->j_average_commit_time) / 4; |
947 | else | 948 | else |
948 | journal->j_average_commit_time = commit_time; | 949 | journal->j_average_commit_time = commit_time; |
949 | 950 | ||
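(Aside on the weighting above: j_average_commit_time is a simple exponential moving average that weights the most recent commit 3:1 against the running value. A standalone userspace sketch of the same arithmetic, with made-up nanosecond samples; all names here are illustrative only.)

#include <stdint.h>
#include <stdio.h>

/* Same update rule as above: new average = (3 * latest + old average) / 4,
 * seeded with the first sample when there is no history yet. */
static uint64_t update_avg_commit_time(uint64_t avg, uint64_t commit_time)
{
	if (avg)
		return (commit_time * 3 + avg) / 4;
	return commit_time;
}

int main(void)
{
	uint64_t samples[] = { 1000, 1200, 8000, 1100 };	/* ns, made up */
	uint64_t avg = 0;

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		avg = update_avg_commit_time(avg, samples[i]);
		printf("sample %llu -> average %llu\n",
		       (unsigned long long)samples[i], (unsigned long long)avg);
	}
	return 0;
}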
950 | spin_unlock(&journal->j_state_lock); | 951 | spin_unlock(&journal->j_state_lock); |
951 | 952 | ||
952 | if (commit_transaction->t_checkpoint_list == NULL && | 953 | if (commit_transaction->t_checkpoint_list == NULL && |
953 | commit_transaction->t_checkpoint_io_list == NULL) { | 954 | commit_transaction->t_checkpoint_io_list == NULL) { |
954 | __journal_drop_transaction(journal, commit_transaction); | 955 | __journal_drop_transaction(journal, commit_transaction); |
955 | } else { | 956 | } else { |
956 | if (journal->j_checkpoint_transactions == NULL) { | 957 | if (journal->j_checkpoint_transactions == NULL) { |
957 | journal->j_checkpoint_transactions = commit_transaction; | 958 | journal->j_checkpoint_transactions = commit_transaction; |
958 | commit_transaction->t_cpnext = commit_transaction; | 959 | commit_transaction->t_cpnext = commit_transaction; |
959 | commit_transaction->t_cpprev = commit_transaction; | 960 | commit_transaction->t_cpprev = commit_transaction; |
960 | } else { | 961 | } else { |
961 | commit_transaction->t_cpnext = | 962 | commit_transaction->t_cpnext = |
962 | journal->j_checkpoint_transactions; | 963 | journal->j_checkpoint_transactions; |
963 | commit_transaction->t_cpprev = | 964 | commit_transaction->t_cpprev = |
964 | commit_transaction->t_cpnext->t_cpprev; | 965 | commit_transaction->t_cpnext->t_cpprev; |
965 | commit_transaction->t_cpnext->t_cpprev = | 966 | commit_transaction->t_cpnext->t_cpprev = |
fs/jbd2/commit.c
1 | /* | 1 | /* |
2 | * linux/fs/jbd2/commit.c | 2 | * linux/fs/jbd2/commit.c |
3 | * | 3 | * |
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | 4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 |
5 | * | 5 | * |
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | 6 | * Copyright 1998 Red Hat corp --- All Rights Reserved |
7 | * | 7 | * |
8 | * This file is part of the Linux kernel and is made available under | 8 | * This file is part of the Linux kernel and is made available under |
9 | * the terms of the GNU General Public License, version 2, or at your | 9 | * the terms of the GNU General Public License, version 2, or at your |
10 | * option, any later version, incorporated herein by reference. | 10 | * option, any later version, incorporated herein by reference. |
11 | * | 11 | * |
12 | * Journal commit routines for the generic filesystem journaling code; | 12 | * Journal commit routines for the generic filesystem journaling code; |
13 | * part of the ext2fs journaling system. | 13 | * part of the ext2fs journaling system. |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/time.h> | 16 | #include <linux/time.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/jbd2.h> | 18 | #include <linux/jbd2.h> |
19 | #include <linux/errno.h> | 19 | #include <linux/errno.h> |
20 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | #include <linux/crc32.h> | 24 | #include <linux/crc32.h> |
25 | #include <linux/writeback.h> | 25 | #include <linux/writeback.h> |
26 | #include <linux/backing-dev.h> | 26 | #include <linux/backing-dev.h> |
27 | #include <linux/bio.h> | 27 | #include <linux/bio.h> |
28 | #include <linux/blkdev.h> | 28 | #include <linux/blkdev.h> |
29 | #include <trace/events/jbd2.h> | 29 | #include <trace/events/jbd2.h> |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Default IO end handler for temporary BJ_IO buffer_heads. | 32 | * Default IO end handler for temporary BJ_IO buffer_heads. |
33 | */ | 33 | */ |
34 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | 34 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) |
35 | { | 35 | { |
36 | BUFFER_TRACE(bh, ""); | 36 | BUFFER_TRACE(bh, ""); |
37 | if (uptodate) | 37 | if (uptodate) |
38 | set_buffer_uptodate(bh); | 38 | set_buffer_uptodate(bh); |
39 | else | 39 | else |
40 | clear_buffer_uptodate(bh); | 40 | clear_buffer_uptodate(bh); |
41 | unlock_buffer(bh); | 41 | unlock_buffer(bh); |
42 | } | 42 | } |
43 | 43 | ||
44 | /* | 44 | /* |
45 | * When an ext4 file is truncated, it is possible that some pages are not | 45 | * When an ext4 file is truncated, it is possible that some pages are not |
46 | * successfully freed, because they are attached to a committing transaction. | 46 | * successfully freed, because they are attached to a committing transaction. |
47 | * After the transaction commits, these pages are left on the LRU, with no | 47 | * After the transaction commits, these pages are left on the LRU, with no |
48 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | 48 | * ->mapping, and with attached buffers. These pages are trivially reclaimable |
49 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | 49 | * by the VM, but their apparent absence upsets the VM accounting, and it makes |
50 | * the numbers in /proc/meminfo look odd. | 50 | * the numbers in /proc/meminfo look odd. |
51 | * | 51 | * |
52 | * So here, we have a buffer which has just come off the forget list. Look to | 52 | * So here, we have a buffer which has just come off the forget list. Look to |
53 | * see if we can strip all buffers from the backing page. | 53 | * see if we can strip all buffers from the backing page. |
54 | * | 54 | * |
55 | * Called under lock_journal(), and possibly under journal_datalist_lock. The | 55 | * Called under lock_journal(), and possibly under journal_datalist_lock. The |
56 | * caller provided us with a ref against the buffer, and we drop that here. | 56 | * caller provided us with a ref against the buffer, and we drop that here. |
57 | */ | 57 | */ |
58 | static void release_buffer_page(struct buffer_head *bh) | 58 | static void release_buffer_page(struct buffer_head *bh) |
59 | { | 59 | { |
60 | struct page *page; | 60 | struct page *page; |
61 | 61 | ||
62 | if (buffer_dirty(bh)) | 62 | if (buffer_dirty(bh)) |
63 | goto nope; | 63 | goto nope; |
64 | if (atomic_read(&bh->b_count) != 1) | 64 | if (atomic_read(&bh->b_count) != 1) |
65 | goto nope; | 65 | goto nope; |
66 | page = bh->b_page; | 66 | page = bh->b_page; |
67 | if (!page) | 67 | if (!page) |
68 | goto nope; | 68 | goto nope; |
69 | if (page->mapping) | 69 | if (page->mapping) |
70 | goto nope; | 70 | goto nope; |
71 | 71 | ||
72 | /* OK, it's a truncated page */ | 72 | /* OK, it's a truncated page */ |
73 | if (!trylock_page(page)) | 73 | if (!trylock_page(page)) |
74 | goto nope; | 74 | goto nope; |
75 | 75 | ||
76 | page_cache_get(page); | 76 | page_cache_get(page); |
77 | __brelse(bh); | 77 | __brelse(bh); |
78 | try_to_free_buffers(page); | 78 | try_to_free_buffers(page); |
79 | unlock_page(page); | 79 | unlock_page(page); |
80 | page_cache_release(page); | 80 | page_cache_release(page); |
81 | return; | 81 | return; |
82 | 82 | ||
83 | nope: | 83 | nope: |
84 | __brelse(bh); | 84 | __brelse(bh); |
85 | } | 85 | } |
86 | 86 | ||
87 | /* | 87 | /* |
88 | * Done it all: now submit the commit record. We should have | 88 | * Done it all: now submit the commit record. We should have |
89 | * cleaned up our previous buffers by now, so if we are in abort | 89 | * cleaned up our previous buffers by now, so if we are in abort |
90 | * mode we can now just skip the rest of the journal write | 90 | * mode we can now just skip the rest of the journal write |
91 | * entirely. | 91 | * entirely. |
92 | * | 92 | * |
93 | * Returns 1 if the journal needs to be aborted or 0 on success | 93 | * Returns 1 if the journal needs to be aborted or 0 on success |
94 | */ | 94 | */ |
95 | static int journal_submit_commit_record(journal_t *journal, | 95 | static int journal_submit_commit_record(journal_t *journal, |
96 | transaction_t *commit_transaction, | 96 | transaction_t *commit_transaction, |
97 | struct buffer_head **cbh, | 97 | struct buffer_head **cbh, |
98 | __u32 crc32_sum) | 98 | __u32 crc32_sum) |
99 | { | 99 | { |
100 | struct journal_head *descriptor; | 100 | struct journal_head *descriptor; |
101 | struct commit_header *tmp; | 101 | struct commit_header *tmp; |
102 | struct buffer_head *bh; | 102 | struct buffer_head *bh; |
103 | int ret; | 103 | int ret; |
104 | int barrier_done = 0; | ||
105 | struct timespec now = current_kernel_time(); | 104 | struct timespec now = current_kernel_time(); |
106 | 105 | ||
107 | if (is_journal_aborted(journal)) | 106 | if (is_journal_aborted(journal)) |
108 | return 0; | 107 | return 0; |
109 | 108 | ||
110 | descriptor = jbd2_journal_get_descriptor_buffer(journal); | 109 | descriptor = jbd2_journal_get_descriptor_buffer(journal); |
111 | if (!descriptor) | 110 | if (!descriptor) |
112 | return 1; | 111 | return 1; |
113 | 112 | ||
114 | bh = jh2bh(descriptor); | 113 | bh = jh2bh(descriptor); |
115 | 114 | ||
116 | tmp = (struct commit_header *)bh->b_data; | 115 | tmp = (struct commit_header *)bh->b_data; |
117 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 116 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
118 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | 117 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); |
119 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 118 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
120 | tmp->h_commit_sec = cpu_to_be64(now.tv_sec); | 119 | tmp->h_commit_sec = cpu_to_be64(now.tv_sec); |
121 | tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); | 120 | tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); |
122 | 121 | ||
123 | if (JBD2_HAS_COMPAT_FEATURE(journal, | 122 | if (JBD2_HAS_COMPAT_FEATURE(journal, |
124 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | 123 | JBD2_FEATURE_COMPAT_CHECKSUM)) { |
125 | tmp->h_chksum_type = JBD2_CRC32_CHKSUM; | 124 | tmp->h_chksum_type = JBD2_CRC32_CHKSUM; |
126 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; | 125 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; |
127 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); | 126 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); |
128 | } | 127 | } |
129 | 128 | ||
130 | JBUFFER_TRACE(descriptor, "submit commit block"); | 129 | JBUFFER_TRACE(descriptor, "submit commit block"); |
131 | lock_buffer(bh); | 130 | lock_buffer(bh); |
132 | clear_buffer_dirty(bh); | 131 | clear_buffer_dirty(bh); |
133 | set_buffer_uptodate(bh); | 132 | set_buffer_uptodate(bh); |
134 | bh->b_end_io = journal_end_buffer_io_sync; | 133 | bh->b_end_io = journal_end_buffer_io_sync; |
135 | 134 | ||
136 | if (journal->j_flags & JBD2_BARRIER && | 135 | if (journal->j_flags & JBD2_BARRIER && |
137 | !JBD2_HAS_INCOMPAT_FEATURE(journal, | 136 | !JBD2_HAS_INCOMPAT_FEATURE(journal, |
138 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 137 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
139 | set_buffer_ordered(bh); | 138 | ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh); |
140 | barrier_done = 1; | 139 | if (ret == -EOPNOTSUPP) { |
141 | } | 140 | printk(KERN_WARNING |
142 | ret = submit_bh(WRITE_SYNC_PLUG, bh); | 141 | "JBD2: Disabling barriers on %s, " |
143 | if (barrier_done) | 142 | "not supported by device\n", journal->j_devname); |
144 | clear_buffer_ordered(bh); | 143 | write_lock(&journal->j_state_lock); |
144 | journal->j_flags &= ~JBD2_BARRIER; | ||
145 | write_unlock(&journal->j_state_lock); | ||
145 | 146 | ||
146 | /* is it possible for another commit to fail at roughly | 147 | /* And try again, without the barrier */ |
147 | * the same time as this one? If so, we don't want to | 148 | lock_buffer(bh); |
148 | * trust the barrier flag in the super, but instead want | 149 | set_buffer_uptodate(bh); |
149 | * to remember if we sent a barrier request | 150 | clear_buffer_dirty(bh); |
150 | */ | 151 | ret = submit_bh(WRITE_SYNC_PLUG, bh); |
151 | if (ret == -EOPNOTSUPP && barrier_done) { | 152 | } |
152 | printk(KERN_WARNING | 153 | } else { |
153 | "JBD2: Disabling barriers on %s, " | ||
154 | "not supported by device\n", journal->j_devname); | ||
155 | write_lock(&journal->j_state_lock); | ||
156 | journal->j_flags &= ~JBD2_BARRIER; | ||
157 | write_unlock(&journal->j_state_lock); | ||
158 | |||
159 | /* And try again, without the barrier */ | ||
160 | lock_buffer(bh); | ||
161 | set_buffer_uptodate(bh); | ||
162 | clear_buffer_dirty(bh); | ||
163 | ret = submit_bh(WRITE_SYNC_PLUG, bh); | 154 | ret = submit_bh(WRITE_SYNC_PLUG, bh); |
164 | } | 155 | } |
165 | *cbh = bh; | 156 | *cbh = bh; |
166 | return ret; | 157 | return ret; |
167 | } | 158 | } |
168 | 159 | ||
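The hunk above is the heart of this change on the jbd2 side: rather than tagging the buffer with the removed ordered-buffer flag, the commit record is now submitted with WRITE_BARRIER directly, and if submit_bh() comes back with -EOPNOTSUPP the journal clears JBD2_BARRIER and resubmits the same block without the barrier. A rough userspace-only model of that fallback pattern; submit_block(), SUBMIT_BARRIER and barriers_enabled are hypothetical stand-ins, not kernel interfaces.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define SUBMIT_BARRIER	0x1

static bool barriers_enabled = true;	/* plays the role of JBD2_BARRIER */

/* Hypothetical backend: pretend the device rejects barrier writes. */
static int submit_block(const void *buf, int flags)
{
	(void)buf;
	if (flags & SUBMIT_BARRIER)
		return -EOPNOTSUPP;
	return 0;
}

static int write_commit_block(const void *buf)
{
	int ret;

	if (!barriers_enabled)
		return submit_block(buf, 0);

	ret = submit_block(buf, SUBMIT_BARRIER);
	if (ret == -EOPNOTSUPP) {
		fprintf(stderr, "disabling barriers, not supported by device\n");
		barriers_enabled = false;	/* like clearing JBD2_BARRIER */
		ret = submit_block(buf, 0);	/* try again, without the barrier */
	}
	return ret;
}

int main(void)
{
	char block[512] = { 0 };

	printf("first commit:  %d\n", write_commit_block(block));
	printf("second commit: %d\n", write_commit_block(block));
	return 0;
}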
169 | /* | 160 | /* |
170 | * This function along with journal_submit_commit_record | 161 | * This function along with journal_submit_commit_record |
171 | * allows to write the commit record asynchronously. | 162 | * allows to write the commit record asynchronously. |
172 | */ | 163 | */ |
173 | static int journal_wait_on_commit_record(journal_t *journal, | 164 | static int journal_wait_on_commit_record(journal_t *journal, |
174 | struct buffer_head *bh) | 165 | struct buffer_head *bh) |
175 | { | 166 | { |
176 | int ret = 0; | 167 | int ret = 0; |
177 | 168 | ||
178 | retry: | 169 | retry: |
179 | clear_buffer_dirty(bh); | 170 | clear_buffer_dirty(bh); |
180 | wait_on_buffer(bh); | 171 | wait_on_buffer(bh); |
181 | if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { | 172 | if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { |
182 | printk(KERN_WARNING | 173 | printk(KERN_WARNING |
183 | "JBD2: %s: disabling barries on %s - not supported " | 174 | "JBD2: %s: disabling barries on %s - not supported " |
184 | "by device\n", __func__, journal->j_devname); | 175 | "by device\n", __func__, journal->j_devname); |
185 | write_lock(&journal->j_state_lock); | 176 | write_lock(&journal->j_state_lock); |
186 | journal->j_flags &= ~JBD2_BARRIER; | 177 | journal->j_flags &= ~JBD2_BARRIER; |
187 | write_unlock(&journal->j_state_lock); | 178 | write_unlock(&journal->j_state_lock); |
188 | 179 | ||
189 | lock_buffer(bh); | 180 | lock_buffer(bh); |
190 | clear_buffer_dirty(bh); | 181 | clear_buffer_dirty(bh); |
191 | set_buffer_uptodate(bh); | 182 | set_buffer_uptodate(bh); |
192 | bh->b_end_io = journal_end_buffer_io_sync; | 183 | bh->b_end_io = journal_end_buffer_io_sync; |
193 | 184 | ||
194 | ret = submit_bh(WRITE_SYNC_PLUG, bh); | 185 | ret = submit_bh(WRITE_SYNC_PLUG, bh); |
195 | if (ret) { | 186 | if (ret) { |
196 | unlock_buffer(bh); | 187 | unlock_buffer(bh); |
197 | return ret; | 188 | return ret; |
198 | } | 189 | } |
199 | goto retry; | 190 | goto retry; |
200 | } | 191 | } |
201 | 192 | ||
202 | if (unlikely(!buffer_uptodate(bh))) | 193 | if (unlikely(!buffer_uptodate(bh))) |
203 | ret = -EIO; | 194 | ret = -EIO; |
204 | put_bh(bh); /* One for getblk() */ | 195 | put_bh(bh); /* One for getblk() */ |
205 | jbd2_journal_put_journal_head(bh2jh(bh)); | 196 | jbd2_journal_put_journal_head(bh2jh(bh)); |
206 | 197 | ||
207 | return ret; | 198 | return ret; |
208 | } | 199 | } |
209 | 200 | ||
210 | /* | 201 | /* |
211 | * write the filemap data using writepage() address_space_operations. | 202 | * write the filemap data using writepage() address_space_operations. |
212 | * We don't do block allocation here even for delalloc. We don't | 203 | * We don't do block allocation here even for delalloc. We don't |
213 | * use writepages() because with dealyed allocation we may be doing | 204 | * use writepages() because with dealyed allocation we may be doing |
214 | * block allocation in writepages(). | 205 | * block allocation in writepages(). |
215 | */ | 206 | */ |
216 | static int journal_submit_inode_data_buffers(struct address_space *mapping) | 207 | static int journal_submit_inode_data_buffers(struct address_space *mapping) |
217 | { | 208 | { |
218 | int ret; | 209 | int ret; |
219 | struct writeback_control wbc = { | 210 | struct writeback_control wbc = { |
220 | .sync_mode = WB_SYNC_ALL, | 211 | .sync_mode = WB_SYNC_ALL, |
221 | .nr_to_write = mapping->nrpages * 2, | 212 | .nr_to_write = mapping->nrpages * 2, |
222 | .range_start = 0, | 213 | .range_start = 0, |
223 | .range_end = i_size_read(mapping->host), | 214 | .range_end = i_size_read(mapping->host), |
224 | }; | 215 | }; |
225 | 216 | ||
226 | ret = generic_writepages(mapping, &wbc); | 217 | ret = generic_writepages(mapping, &wbc); |
227 | return ret; | 218 | return ret; |
228 | } | 219 | } |
229 | 220 | ||
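For context only: the closest userspace analogue to "start (and wait for) writeback of the byte range [0, i_size] of one file" is sync_file_range(2). It is not what the journal does here (the journal drives ->writepage on the inode's mapping, as above, precisely to avoid delalloc block allocation); this is just a runnable way to see the range-writeback idea. Error handling is minimal.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct stat st;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (fstat(fd, &st) < 0) {
		perror("fstat");
		return 1;
	}

	/* Kick off writeback for [0, i_size) and wait for it to finish. */
	if (sync_file_range(fd, 0, st.st_size,
			    SYNC_FILE_RANGE_WAIT_BEFORE |
			    SYNC_FILE_RANGE_WRITE |
			    SYNC_FILE_RANGE_WAIT_AFTER) != 0)
		perror("sync_file_range");

	close(fd);
	return 0;
}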
230 | /* | 221 | /* |
231 | * Submit all the data buffers of inode associated with the transaction to | 222 | * Submit all the data buffers of inode associated with the transaction to |
232 | * disk. | 223 | * disk. |
233 | * | 224 | * |
234 | * We are in a committing transaction. Therefore no new inode can be added to | 225 | * We are in a committing transaction. Therefore no new inode can be added to |
235 | * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently | 226 | * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently |
236 | * operate on from being released while we write out pages. | 227 | * operate on from being released while we write out pages. |
237 | */ | 228 | */ |
238 | static int journal_submit_data_buffers(journal_t *journal, | 229 | static int journal_submit_data_buffers(journal_t *journal, |
239 | transaction_t *commit_transaction) | 230 | transaction_t *commit_transaction) |
240 | { | 231 | { |
241 | struct jbd2_inode *jinode; | 232 | struct jbd2_inode *jinode; |
242 | int err, ret = 0; | 233 | int err, ret = 0; |
243 | struct address_space *mapping; | 234 | struct address_space *mapping; |
244 | 235 | ||
245 | spin_lock(&journal->j_list_lock); | 236 | spin_lock(&journal->j_list_lock); |
246 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { | 237 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { |
247 | mapping = jinode->i_vfs_inode->i_mapping; | 238 | mapping = jinode->i_vfs_inode->i_mapping; |
248 | jinode->i_flags |= JI_COMMIT_RUNNING; | 239 | jinode->i_flags |= JI_COMMIT_RUNNING; |
249 | spin_unlock(&journal->j_list_lock); | 240 | spin_unlock(&journal->j_list_lock); |
250 | /* | 241 | /* |
251 | * submit the inode data buffers. We use writepage | 242 | * submit the inode data buffers. We use writepage |
252 | * instead of writepages. Because writepages can do | 243 | * instead of writepages. Because writepages can do |
253 | * block allocation with delalloc. We need to write | 244 | * block allocation with delalloc. We need to write |
254 | * only allocated blocks here. | 245 | * only allocated blocks here. |
255 | */ | 246 | */ |
256 | trace_jbd2_submit_inode_data(jinode->i_vfs_inode); | 247 | trace_jbd2_submit_inode_data(jinode->i_vfs_inode); |
257 | err = journal_submit_inode_data_buffers(mapping); | 248 | err = journal_submit_inode_data_buffers(mapping); |
258 | if (!ret) | 249 | if (!ret) |
259 | ret = err; | 250 | ret = err; |
260 | spin_lock(&journal->j_list_lock); | 251 | spin_lock(&journal->j_list_lock); |
261 | J_ASSERT(jinode->i_transaction == commit_transaction); | 252 | J_ASSERT(jinode->i_transaction == commit_transaction); |
262 | commit_transaction->t_flushed_data_blocks = 1; | 253 | commit_transaction->t_flushed_data_blocks = 1; |
263 | jinode->i_flags &= ~JI_COMMIT_RUNNING; | 254 | jinode->i_flags &= ~JI_COMMIT_RUNNING; |
264 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | 255 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); |
265 | } | 256 | } |
266 | spin_unlock(&journal->j_list_lock); | 257 | spin_unlock(&journal->j_list_lock); |
267 | return ret; | 258 | return ret; |
268 | } | 259 | } |
269 | 260 | ||
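The locking in journal_submit_data_buffers() is a generic "busy flag under a dropped lock" idiom: mark the inode with JI_COMMIT_RUNNING, drop j_list_lock for the actual page writeout, retake the lock, clear the flag and wake anyone waiting on the bit. A minimal pthread sketch of the same idea; every name below is invented for illustration.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-in for a jbd2_inode on the committing transaction's inode list. */
struct jnode {
	bool commit_running;	/* plays the role of JI_COMMIT_RUNNING */
	pthread_mutex_t lock;	/* plays the role of j_list_lock */
	pthread_cond_t idle;	/* plays the role of wake_up_bit() */
};

static void flush_node(struct jnode *n)
{
	pthread_mutex_lock(&n->lock);
	n->commit_running = true;	/* protect the node while the lock is dropped */
	pthread_mutex_unlock(&n->lock);

	usleep(1000);			/* pretend to write the inode's pages out */

	pthread_mutex_lock(&n->lock);
	n->commit_running = false;
	pthread_cond_broadcast(&n->idle);	/* let waiters (e.g. release paths) continue */
	pthread_mutex_unlock(&n->lock);
}

static void wait_until_idle(struct jnode *n)
{
	pthread_mutex_lock(&n->lock);
	while (n->commit_running)
		pthread_cond_wait(&n->idle, &n->lock);
	pthread_mutex_unlock(&n->lock);
}

int main(void)
{
	struct jnode n;

	n.commit_running = false;
	pthread_mutex_init(&n.lock, NULL);
	pthread_cond_init(&n.idle, NULL);

	flush_node(&n);
	wait_until_idle(&n);
	printf("node flushed and idle\n");
	return 0;
}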
270 | /* | 261 | /* |
271 | * Wait for data submitted for writeout, refile inodes to proper | 262 | * Wait for data submitted for writeout, refile inodes to proper |
272 | * transaction if needed. | 263 | * transaction if needed. |
273 | * | 264 | * |
274 | */ | 265 | */ |
275 | static int journal_finish_inode_data_buffers(journal_t *journal, | 266 | static int journal_finish_inode_data_buffers(journal_t *journal, |
276 | transaction_t *commit_transaction) | 267 | transaction_t *commit_transaction) |
277 | { | 268 | { |
278 | struct jbd2_inode *jinode, *next_i; | 269 | struct jbd2_inode *jinode, *next_i; |
279 | int err, ret = 0; | 270 | int err, ret = 0; |
280 | 271 | ||
281 | /* For locking, see the comment in journal_submit_data_buffers() */ | 272 | /* For locking, see the comment in journal_submit_data_buffers() */ |
282 | spin_lock(&journal->j_list_lock); | 273 | spin_lock(&journal->j_list_lock); |
283 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { | 274 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { |
284 | jinode->i_flags |= JI_COMMIT_RUNNING; | 275 | jinode->i_flags |= JI_COMMIT_RUNNING; |
285 | spin_unlock(&journal->j_list_lock); | 276 | spin_unlock(&journal->j_list_lock); |
286 | err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); | 277 | err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); |
287 | if (err) { | 278 | if (err) { |
288 | /* | 279 | /* |
289 | * Because AS_EIO is cleared by | 280 | * Because AS_EIO is cleared by |
290 | * filemap_fdatawait_range(), set it again so | 281 | * filemap_fdatawait_range(), set it again so |
291 | * that user process can get -EIO from fsync(). | 282 | * that user process can get -EIO from fsync(). |
292 | */ | 283 | */ |
293 | set_bit(AS_EIO, | 284 | set_bit(AS_EIO, |
294 | &jinode->i_vfs_inode->i_mapping->flags); | 285 | &jinode->i_vfs_inode->i_mapping->flags); |
295 | 286 | ||
296 | if (!ret) | 287 | if (!ret) |
297 | ret = err; | 288 | ret = err; |
298 | } | 289 | } |
299 | spin_lock(&journal->j_list_lock); | 290 | spin_lock(&journal->j_list_lock); |
300 | jinode->i_flags &= ~JI_COMMIT_RUNNING; | 291 | jinode->i_flags &= ~JI_COMMIT_RUNNING; |
301 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | 292 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); |
302 | } | 293 | } |
303 | 294 | ||
304 | /* Now refile inode to proper lists */ | 295 | /* Now refile inode to proper lists */ |
305 | list_for_each_entry_safe(jinode, next_i, | 296 | list_for_each_entry_safe(jinode, next_i, |
306 | &commit_transaction->t_inode_list, i_list) { | 297 | &commit_transaction->t_inode_list, i_list) { |
307 | list_del(&jinode->i_list); | 298 | list_del(&jinode->i_list); |
308 | if (jinode->i_next_transaction) { | 299 | if (jinode->i_next_transaction) { |
309 | jinode->i_transaction = jinode->i_next_transaction; | 300 | jinode->i_transaction = jinode->i_next_transaction; |
310 | jinode->i_next_transaction = NULL; | 301 | jinode->i_next_transaction = NULL; |
311 | list_add(&jinode->i_list, | 302 | list_add(&jinode->i_list, |
312 | &jinode->i_transaction->t_inode_list); | 303 | &jinode->i_transaction->t_inode_list); |
313 | } else { | 304 | } else { |
314 | jinode->i_transaction = NULL; | 305 | jinode->i_transaction = NULL; |
315 | } | 306 | } |
316 | } | 307 | } |
317 | spin_unlock(&journal->j_list_lock); | 308 | spin_unlock(&journal->j_list_lock); |
318 | 309 | ||
319 | return ret; | 310 | return ret; |
320 | } | 311 | } |
321 | 312 | ||
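Both loops above keep going after a failure but report only the first error seen (and re-set AS_EIO so a later fsync() still observes it). The error-collection idiom in isolation, with a hypothetical per-item flush callback:

#include <stdio.h>

/* Flush every item but remember only the first failure, mirroring the
 * "if (!ret) ret = err;" pattern above.  flush_one() is hypothetical. */
static int flush_all(int (*flush_one)(int idx), int nr)
{
	int ret = 0;
	int idx;

	for (idx = 0; idx < nr; idx++) {
		int err = flush_one(idx);

		if (err && !ret)
			ret = err;	/* keep the first error, keep going */
	}
	return ret;
}

static int demo_flush(int idx)
{
	return idx == 1 ? -5 : 0;	/* pretend item 1 fails with -EIO (-5) */
}

int main(void)
{
	printf("flush_all() -> %d\n", flush_all(demo_flush, 3));
	return 0;
}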
322 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) | 313 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) |
323 | { | 314 | { |
324 | struct page *page = bh->b_page; | 315 | struct page *page = bh->b_page; |
325 | char *addr; | 316 | char *addr; |
326 | __u32 checksum; | 317 | __u32 checksum; |
327 | 318 | ||
328 | addr = kmap_atomic(page, KM_USER0); | 319 | addr = kmap_atomic(page, KM_USER0); |
329 | checksum = crc32_be(crc32_sum, | 320 | checksum = crc32_be(crc32_sum, |
330 | (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); | 321 | (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); |
331 | kunmap_atomic(addr, KM_USER0); | 322 | kunmap_atomic(addr, KM_USER0); |
332 | 323 | ||
333 | return checksum; | 324 | return checksum; |
334 | } | 325 | } |
335 | 326 | ||
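jbd2_checksum_data() above folds each block that goes to the log into a running big-endian CRC32 via the kernel's crc32_be(). For reference, a bit-at-a-time version of that checksum, assuming the standard 0x04C11DB7 polynomial; the kernel's table-driven implementation is faster but should yield the same running value.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bit-at-a-time big-endian CRC32: fold each byte in MSB-first and carry the
 * running value in 'crc', in the style of crc32_be(crc, data, len). */
static uint32_t crc32_be_bitwise(uint32_t crc, const unsigned char *p, size_t len)
{
	while (len--) {
		int i;

		crc ^= (uint32_t)*p++ << 24;
		for (i = 0; i < 8; i++)
			crc = (crc & 0x80000000u) ?
			      (crc << 1) ^ 0x04c11db7u : crc << 1;
	}
	return crc;
}

int main(void)
{
	const char block[] = "journal block contents";
	uint32_t crc = ~0u;	/* same seed as crc32_sum = ~0 in the commit path */

	crc = crc32_be_bitwise(crc, (const unsigned char *)block, strlen(block));
	printf("running checksum: 0x%08x\n", crc);
	return 0;
}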
336 | static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | 327 | static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, |
337 | unsigned long long block) | 328 | unsigned long long block) |
338 | { | 329 | { |
339 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); | 330 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); |
340 | if (tag_bytes > JBD2_TAG_SIZE32) | 331 | if (tag_bytes > JBD2_TAG_SIZE32) |
341 | tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); | 332 | tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); |
342 | } | 333 | } |
343 | 334 | ||
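write_tag_block() above stores a 64-bit journal block number as two big-endian 32-bit on-disk fields, writing the high half only when the journal uses tags larger than JBD2_TAG_SIZE32. The same split in plain userspace C; htobe32() stands in for cpu_to_be32(), and demo_tag is an illustrative struct, not the real on-disk jbd2 tag layout.

#include <endian.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct demo_tag {
	uint32_t blocknr;	/* low 32 bits, big-endian */
	uint32_t blocknr_high;	/* high 32 bits, big-endian */
};

static void fill_tag(struct demo_tag *tag, uint64_t block, int large_tags)
{
	tag->blocknr = htobe32((uint32_t)(block & 0xffffffffu));
	if (large_tags)
		/* Same "(block >> 31) >> 1" split as above: the upper 32 bits. */
		tag->blocknr_high = htobe32((uint32_t)((block >> 31) >> 1));
}

int main(void)
{
	struct demo_tag tag = { 0, 0 };
	uint64_t block = 0x123456789abcULL;

	fill_tag(&tag, block, 1);
	printf("low  = 0x%08" PRIx32 "\n", be32toh(tag.blocknr));
	printf("high = 0x%08" PRIx32 "\n", be32toh(tag.blocknr_high));
	return 0;
}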
344 | /* | 335 | /* |
345 | * jbd2_journal_commit_transaction | 336 | * jbd2_journal_commit_transaction |
346 | * | 337 | * |
347 | * The primary function for committing a transaction to the log. This | 338 | * The primary function for committing a transaction to the log. This |
348 | * function is called by the journal thread to begin a complete commit. | 339 | * function is called by the journal thread to begin a complete commit. |
349 | */ | 340 | */ |
350 | void jbd2_journal_commit_transaction(journal_t *journal) | 341 | void jbd2_journal_commit_transaction(journal_t *journal) |
351 | { | 342 | { |
352 | struct transaction_stats_s stats; | 343 | struct transaction_stats_s stats; |
353 | transaction_t *commit_transaction; | 344 | transaction_t *commit_transaction; |
354 | struct journal_head *jh, *new_jh, *descriptor; | 345 | struct journal_head *jh, *new_jh, *descriptor; |
355 | struct buffer_head **wbuf = journal->j_wbuf; | 346 | struct buffer_head **wbuf = journal->j_wbuf; |
356 | int bufs; | 347 | int bufs; |
357 | int flags; | 348 | int flags; |
358 | int err; | 349 | int err; |
359 | unsigned long long blocknr; | 350 | unsigned long long blocknr; |
360 | ktime_t start_time; | 351 | ktime_t start_time; |
361 | u64 commit_time; | 352 | u64 commit_time; |
362 | char *tagp = NULL; | 353 | char *tagp = NULL; |
363 | journal_header_t *header; | 354 | journal_header_t *header; |
364 | journal_block_tag_t *tag = NULL; | 355 | journal_block_tag_t *tag = NULL; |
365 | int space_left = 0; | 356 | int space_left = 0; |
366 | int first_tag = 0; | 357 | int first_tag = 0; |
367 | int tag_flag; | 358 | int tag_flag; |
368 | int i, to_free = 0; | 359 | int i, to_free = 0; |
369 | int tag_bytes = journal_tag_bytes(journal); | 360 | int tag_bytes = journal_tag_bytes(journal); |
370 | struct buffer_head *cbh = NULL; /* For transactional checksums */ | 361 | struct buffer_head *cbh = NULL; /* For transactional checksums */ |
371 | __u32 crc32_sum = ~0; | 362 | __u32 crc32_sum = ~0; |
372 | int write_op = WRITE; | 363 | int write_op = WRITE; |
373 | 364 | ||
374 | /* | 365 | /* |
375 | * First job: lock down the current transaction and wait for | 366 | * First job: lock down the current transaction and wait for |
376 | * all outstanding updates to complete. | 367 | * all outstanding updates to complete. |
377 | */ | 368 | */ |
378 | 369 | ||
379 | #ifdef COMMIT_STATS | 370 | #ifdef COMMIT_STATS |
380 | spin_lock(&journal->j_list_lock); | 371 | spin_lock(&journal->j_list_lock); |
381 | summarise_journal_usage(journal); | 372 | summarise_journal_usage(journal); |
382 | spin_unlock(&journal->j_list_lock); | 373 | spin_unlock(&journal->j_list_lock); |
383 | #endif | 374 | #endif |
384 | 375 | ||
385 | /* Do we need to erase the effects of a prior jbd2_journal_flush? */ | 376 | /* Do we need to erase the effects of a prior jbd2_journal_flush? */ |
386 | if (journal->j_flags & JBD2_FLUSHED) { | 377 | if (journal->j_flags & JBD2_FLUSHED) { |
387 | jbd_debug(3, "super block updated\n"); | 378 | jbd_debug(3, "super block updated\n"); |
388 | jbd2_journal_update_superblock(journal, 1); | 379 | jbd2_journal_update_superblock(journal, 1); |
389 | } else { | 380 | } else { |
390 | jbd_debug(3, "superblock not updated\n"); | 381 | jbd_debug(3, "superblock not updated\n"); |
391 | } | 382 | } |
392 | 383 | ||
393 | J_ASSERT(journal->j_running_transaction != NULL); | 384 | J_ASSERT(journal->j_running_transaction != NULL); |
394 | J_ASSERT(journal->j_committing_transaction == NULL); | 385 | J_ASSERT(journal->j_committing_transaction == NULL); |
395 | 386 | ||
396 | commit_transaction = journal->j_running_transaction; | 387 | commit_transaction = journal->j_running_transaction; |
397 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | 388 | J_ASSERT(commit_transaction->t_state == T_RUNNING); |
398 | 389 | ||
399 | trace_jbd2_start_commit(journal, commit_transaction); | 390 | trace_jbd2_start_commit(journal, commit_transaction); |
400 | jbd_debug(1, "JBD: starting commit of transaction %d\n", | 391 | jbd_debug(1, "JBD: starting commit of transaction %d\n", |
401 | commit_transaction->t_tid); | 392 | commit_transaction->t_tid); |
402 | 393 | ||
403 | write_lock(&journal->j_state_lock); | 394 | write_lock(&journal->j_state_lock); |
404 | commit_transaction->t_state = T_LOCKED; | 395 | commit_transaction->t_state = T_LOCKED; |
405 | 396 | ||
406 | /* | 397 | /* |
407 | * Use plugged writes here, since we want to submit several before | 398 | * Use plugged writes here, since we want to submit several before |
408 | * we unplug the device. We don't do explicit unplugging in here, | 399 | * we unplug the device. We don't do explicit unplugging in here, |
409 | * instead we rely on sync_buffer() doing the unplug for us. | 400 | * instead we rely on sync_buffer() doing the unplug for us. |
410 | */ | 401 | */ |
411 | if (commit_transaction->t_synchronous_commit) | 402 | if (commit_transaction->t_synchronous_commit) |
412 | write_op = WRITE_SYNC_PLUG; | 403 | write_op = WRITE_SYNC_PLUG; |
413 | trace_jbd2_commit_locking(journal, commit_transaction); | 404 | trace_jbd2_commit_locking(journal, commit_transaction); |
414 | stats.run.rs_wait = commit_transaction->t_max_wait; | 405 | stats.run.rs_wait = commit_transaction->t_max_wait; |
415 | stats.run.rs_locked = jiffies; | 406 | stats.run.rs_locked = jiffies; |
416 | stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, | 407 | stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, |
417 | stats.run.rs_locked); | 408 | stats.run.rs_locked); |
418 | 409 | ||
419 | spin_lock(&commit_transaction->t_handle_lock); | 410 | spin_lock(&commit_transaction->t_handle_lock); |
420 | while (atomic_read(&commit_transaction->t_updates)) { | 411 | while (atomic_read(&commit_transaction->t_updates)) { |
421 | DEFINE_WAIT(wait); | 412 | DEFINE_WAIT(wait); |
422 | 413 | ||
423 | prepare_to_wait(&journal->j_wait_updates, &wait, | 414 | prepare_to_wait(&journal->j_wait_updates, &wait, |
424 | TASK_UNINTERRUPTIBLE); | 415 | TASK_UNINTERRUPTIBLE); |
425 | if (atomic_read(&commit_transaction->t_updates)) { | 416 | if (atomic_read(&commit_transaction->t_updates)) { |
426 | spin_unlock(&commit_transaction->t_handle_lock); | 417 | spin_unlock(&commit_transaction->t_handle_lock); |
427 | write_unlock(&journal->j_state_lock); | 418 | write_unlock(&journal->j_state_lock); |
428 | schedule(); | 419 | schedule(); |
429 | write_lock(&journal->j_state_lock); | 420 | write_lock(&journal->j_state_lock); |
430 | spin_lock(&commit_transaction->t_handle_lock); | 421 | spin_lock(&commit_transaction->t_handle_lock); |
431 | } | 422 | } |
432 | finish_wait(&journal->j_wait_updates, &wait); | 423 | finish_wait(&journal->j_wait_updates, &wait); |
433 | } | 424 | } |
434 | spin_unlock(&commit_transaction->t_handle_lock); | 425 | spin_unlock(&commit_transaction->t_handle_lock); |
435 | 426 | ||
436 | J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= | 427 | J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= |
437 | journal->j_max_transaction_buffers); | 428 | journal->j_max_transaction_buffers); |
438 | 429 | ||
439 | /* | 430 | /* |
440 | * First thing we are allowed to do is to discard any remaining | 431 | * First thing we are allowed to do is to discard any remaining |
441 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume | 432 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume |
442 | * that there are no such buffers: if a large filesystem | 433 | * that there are no such buffers: if a large filesystem |
443 | * operation like a truncate needs to split itself over multiple | 434 | * operation like a truncate needs to split itself over multiple |
444 | * transactions, then it may try to do a jbd2_journal_restart() while | 435 | * transactions, then it may try to do a jbd2_journal_restart() while |
445 | * there are still BJ_Reserved buffers outstanding. These must | 436 | * there are still BJ_Reserved buffers outstanding. These must |
446 | * be released cleanly from the current transaction. | 437 | * be released cleanly from the current transaction. |
447 | * | 438 | * |
448 | * In this case, the filesystem must still reserve write access | 439 | * In this case, the filesystem must still reserve write access |
449 | * again before modifying the buffer in the new transaction, but | 440 | * again before modifying the buffer in the new transaction, but |
450 | * we do not require it to remember exactly which old buffers it | 441 | * we do not require it to remember exactly which old buffers it |
451 | * has reserved. This is consistent with the existing behaviour | 442 | * has reserved. This is consistent with the existing behaviour |
452 | * that multiple jbd2_journal_get_write_access() calls to the same | 443 | * that multiple jbd2_journal_get_write_access() calls to the same |
453 | * buffer are perfectly permissable. | 444 | * buffer are perfectly permissable. |
454 | */ | 445 | */ |
455 | while (commit_transaction->t_reserved_list) { | 446 | while (commit_transaction->t_reserved_list) { |
456 | jh = commit_transaction->t_reserved_list; | 447 | jh = commit_transaction->t_reserved_list; |
457 | JBUFFER_TRACE(jh, "reserved, unused: refile"); | 448 | JBUFFER_TRACE(jh, "reserved, unused: refile"); |
458 | /* | 449 | /* |
459 | * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may | 450 | * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may |
460 | * leave undo-committed data. | 451 | * leave undo-committed data. |
461 | */ | 452 | */ |
462 | if (jh->b_committed_data) { | 453 | if (jh->b_committed_data) { |
463 | struct buffer_head *bh = jh2bh(jh); | 454 | struct buffer_head *bh = jh2bh(jh); |
464 | 455 | ||
465 | jbd_lock_bh_state(bh); | 456 | jbd_lock_bh_state(bh); |
466 | jbd2_free(jh->b_committed_data, bh->b_size); | 457 | jbd2_free(jh->b_committed_data, bh->b_size); |
467 | jh->b_committed_data = NULL; | 458 | jh->b_committed_data = NULL; |
468 | jbd_unlock_bh_state(bh); | 459 | jbd_unlock_bh_state(bh); |
469 | } | 460 | } |
470 | jbd2_journal_refile_buffer(journal, jh); | 461 | jbd2_journal_refile_buffer(journal, jh); |
471 | } | 462 | } |
472 | 463 | ||
473 | /* | 464 | /* |
474 | * Now try to drop any written-back buffers from the journal's | 465 | * Now try to drop any written-back buffers from the journal's |
475 | * checkpoint lists. We do this *before* commit because it potentially | 466 | * checkpoint lists. We do this *before* commit because it potentially |
476 | * frees some memory | 467 | * frees some memory |
477 | */ | 468 | */ |
478 | spin_lock(&journal->j_list_lock); | 469 | spin_lock(&journal->j_list_lock); |
479 | __jbd2_journal_clean_checkpoint_list(journal); | 470 | __jbd2_journal_clean_checkpoint_list(journal); |
480 | spin_unlock(&journal->j_list_lock); | 471 | spin_unlock(&journal->j_list_lock); |
481 | 472 | ||
482 | jbd_debug (3, "JBD: commit phase 1\n"); | 473 | jbd_debug (3, "JBD: commit phase 1\n"); |
483 | 474 | ||
484 | /* | 475 | /* |
485 | * Switch to a new revoke table. | 476 | * Switch to a new revoke table. |
486 | */ | 477 | */ |
487 | jbd2_journal_switch_revoke_table(journal); | 478 | jbd2_journal_switch_revoke_table(journal); |
488 | 479 | ||
489 | trace_jbd2_commit_flushing(journal, commit_transaction); | 480 | trace_jbd2_commit_flushing(journal, commit_transaction); |
490 | stats.run.rs_flushing = jiffies; | 481 | stats.run.rs_flushing = jiffies; |
491 | stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, | 482 | stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, |
492 | stats.run.rs_flushing); | 483 | stats.run.rs_flushing); |
493 | 484 | ||
494 | commit_transaction->t_state = T_FLUSH; | 485 | commit_transaction->t_state = T_FLUSH; |
495 | journal->j_committing_transaction = commit_transaction; | 486 | journal->j_committing_transaction = commit_transaction; |
496 | journal->j_running_transaction = NULL; | 487 | journal->j_running_transaction = NULL; |
497 | start_time = ktime_get(); | 488 | start_time = ktime_get(); |
498 | commit_transaction->t_log_start = journal->j_head; | 489 | commit_transaction->t_log_start = journal->j_head; |
499 | wake_up(&journal->j_wait_transaction_locked); | 490 | wake_up(&journal->j_wait_transaction_locked); |
500 | write_unlock(&journal->j_state_lock); | 491 | write_unlock(&journal->j_state_lock); |
501 | 492 | ||
502 | jbd_debug (3, "JBD: commit phase 2\n"); | 493 | jbd_debug (3, "JBD: commit phase 2\n"); |
503 | 494 | ||
504 | /* | 495 | /* |
505 | * Now start flushing things to disk, in the order they appear | 496 | * Now start flushing things to disk, in the order they appear |
506 | * on the transaction lists. Data blocks go first. | 497 | * on the transaction lists. Data blocks go first. |
507 | */ | 498 | */ |
508 | err = journal_submit_data_buffers(journal, commit_transaction); | 499 | err = journal_submit_data_buffers(journal, commit_transaction); |
509 | if (err) | 500 | if (err) |
510 | jbd2_journal_abort(journal, err); | 501 | jbd2_journal_abort(journal, err); |
511 | 502 | ||
512 | jbd2_journal_write_revoke_records(journal, commit_transaction, | 503 | jbd2_journal_write_revoke_records(journal, commit_transaction, |
513 | write_op); | 504 | write_op); |
514 | 505 | ||
515 | jbd_debug(3, "JBD: commit phase 2\n"); | 506 | jbd_debug(3, "JBD: commit phase 2\n"); |
516 | 507 | ||
517 | /* | 508 | /* |
518 | * Way to go: we have now written out all of the data for a | 509 | * Way to go: we have now written out all of the data for a |
519 | * transaction! Now comes the tricky part: we need to write out | 510 | * transaction! Now comes the tricky part: we need to write out |
520 | * metadata. Loop over the transaction's entire buffer list: | 511 | * metadata. Loop over the transaction's entire buffer list: |
521 | */ | 512 | */ |
522 | write_lock(&journal->j_state_lock); | 513 | write_lock(&journal->j_state_lock); |
523 | commit_transaction->t_state = T_COMMIT; | 514 | commit_transaction->t_state = T_COMMIT; |
524 | write_unlock(&journal->j_state_lock); | 515 | write_unlock(&journal->j_state_lock); |
525 | 516 | ||
526 | trace_jbd2_commit_logging(journal, commit_transaction); | 517 | trace_jbd2_commit_logging(journal, commit_transaction); |
527 | stats.run.rs_logging = jiffies; | 518 | stats.run.rs_logging = jiffies; |
528 | stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, | 519 | stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, |
529 | stats.run.rs_logging); | 520 | stats.run.rs_logging); |
530 | stats.run.rs_blocks = | 521 | stats.run.rs_blocks = |
531 | atomic_read(&commit_transaction->t_outstanding_credits); | 522 | atomic_read(&commit_transaction->t_outstanding_credits); |
532 | stats.run.rs_blocks_logged = 0; | 523 | stats.run.rs_blocks_logged = 0; |
533 | 524 | ||
534 | J_ASSERT(commit_transaction->t_nr_buffers <= | 525 | J_ASSERT(commit_transaction->t_nr_buffers <= |
535 | atomic_read(&commit_transaction->t_outstanding_credits)); | 526 | atomic_read(&commit_transaction->t_outstanding_credits)); |
536 | 527 | ||
537 | err = 0; | 528 | err = 0; |
538 | descriptor = NULL; | 529 | descriptor = NULL; |
539 | bufs = 0; | 530 | bufs = 0; |
540 | while (commit_transaction->t_buffers) { | 531 | while (commit_transaction->t_buffers) { |
541 | 532 | ||
542 | /* Find the next buffer to be journaled... */ | 533 | /* Find the next buffer to be journaled... */ |
543 | 534 | ||
544 | jh = commit_transaction->t_buffers; | 535 | jh = commit_transaction->t_buffers; |
545 | 536 | ||
546 | /* If we're in abort mode, we just un-journal the buffer and | 537 | /* If we're in abort mode, we just un-journal the buffer and |
547 | release it. */ | 538 | release it. */ |
548 | 539 | ||
549 | if (is_journal_aborted(journal)) { | 540 | if (is_journal_aborted(journal)) { |
550 | clear_buffer_jbddirty(jh2bh(jh)); | 541 | clear_buffer_jbddirty(jh2bh(jh)); |
551 | JBUFFER_TRACE(jh, "journal is aborting: refile"); | 542 | JBUFFER_TRACE(jh, "journal is aborting: refile"); |
552 | jbd2_buffer_abort_trigger(jh, | 543 | jbd2_buffer_abort_trigger(jh, |
553 | jh->b_frozen_data ? | 544 | jh->b_frozen_data ? |
554 | jh->b_frozen_triggers : | 545 | jh->b_frozen_triggers : |
555 | jh->b_triggers); | 546 | jh->b_triggers); |
556 | jbd2_journal_refile_buffer(journal, jh); | 547 | jbd2_journal_refile_buffer(journal, jh); |
557 | /* If that was the last one, we need to clean up | 548 | /* If that was the last one, we need to clean up |
558 | * any descriptor buffers which may have been | 549 | * any descriptor buffers which may have been |
559 | * already allocated, even if we are now | 550 | * already allocated, even if we are now |
560 | * aborting. */ | 551 | * aborting. */ |
561 | if (!commit_transaction->t_buffers) | 552 | if (!commit_transaction->t_buffers) |
562 | goto start_journal_io; | 553 | goto start_journal_io; |
563 | continue; | 554 | continue; |
564 | } | 555 | } |
565 | 556 | ||
566 | /* Make sure we have a descriptor block in which to | 557 | /* Make sure we have a descriptor block in which to |
567 | record the metadata buffer. */ | 558 | record the metadata buffer. */ |
568 | 559 | ||
569 | if (!descriptor) { | 560 | if (!descriptor) { |
570 | struct buffer_head *bh; | 561 | struct buffer_head *bh; |
571 | 562 | ||
572 | J_ASSERT (bufs == 0); | 563 | J_ASSERT (bufs == 0); |
573 | 564 | ||
574 | jbd_debug(4, "JBD: get descriptor\n"); | 565 | jbd_debug(4, "JBD: get descriptor\n"); |
575 | 566 | ||
576 | descriptor = jbd2_journal_get_descriptor_buffer(journal); | 567 | descriptor = jbd2_journal_get_descriptor_buffer(journal); |
577 | if (!descriptor) { | 568 | if (!descriptor) { |
578 | jbd2_journal_abort(journal, -EIO); | 569 | jbd2_journal_abort(journal, -EIO); |
579 | continue; | 570 | continue; |
580 | } | 571 | } |
581 | 572 | ||
582 | bh = jh2bh(descriptor); | 573 | bh = jh2bh(descriptor); |
583 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", | 574 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", |
584 | (unsigned long long)bh->b_blocknr, bh->b_data); | 575 | (unsigned long long)bh->b_blocknr, bh->b_data); |
585 | header = (journal_header_t *)&bh->b_data[0]; | 576 | header = (journal_header_t *)&bh->b_data[0]; |
586 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 577 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
587 | header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); | 578 | header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); |
588 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 579 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
589 | 580 | ||
590 | tagp = &bh->b_data[sizeof(journal_header_t)]; | 581 | tagp = &bh->b_data[sizeof(journal_header_t)]; |
591 | space_left = bh->b_size - sizeof(journal_header_t); | 582 | space_left = bh->b_size - sizeof(journal_header_t); |
592 | first_tag = 1; | 583 | first_tag = 1; |
593 | set_buffer_jwrite(bh); | 584 | set_buffer_jwrite(bh); |
594 | set_buffer_dirty(bh); | 585 | set_buffer_dirty(bh); |
595 | wbuf[bufs++] = bh; | 586 | wbuf[bufs++] = bh; |
596 | 587 | ||
597 | /* Record it so that we can wait for IO | 588 | /* Record it so that we can wait for IO |
598 | completion later */ | 589 | completion later */ |
599 | BUFFER_TRACE(bh, "ph3: file as descriptor"); | 590 | BUFFER_TRACE(bh, "ph3: file as descriptor"); |
600 | jbd2_journal_file_buffer(descriptor, commit_transaction, | 591 | jbd2_journal_file_buffer(descriptor, commit_transaction, |
601 | BJ_LogCtl); | 592 | BJ_LogCtl); |
602 | } | 593 | } |
603 | 594 | ||
604 | /* Where is the buffer to be written? */ | 595 | /* Where is the buffer to be written? */ |
605 | 596 | ||
606 | err = jbd2_journal_next_log_block(journal, &blocknr); | 597 | err = jbd2_journal_next_log_block(journal, &blocknr); |
607 | /* If the block mapping failed, just abandon the buffer | 598 | /* If the block mapping failed, just abandon the buffer |
608 | and repeat this loop: we'll fall into the | 599 | and repeat this loop: we'll fall into the |
609 | refile-on-abort condition above. */ | 600 | refile-on-abort condition above. */ |
610 | if (err) { | 601 | if (err) { |
611 | jbd2_journal_abort(journal, err); | 602 | jbd2_journal_abort(journal, err); |
612 | continue; | 603 | continue; |
613 | } | 604 | } |
614 | 605 | ||
615 | /* | 606 | /* |
616 | * start_this_handle() uses t_outstanding_credits to determine | 607 | * start_this_handle() uses t_outstanding_credits to determine |
617 | * the free space in the log, but this counter is changed | 608 | * the free space in the log, but this counter is changed |
618 | * by jbd2_journal_next_log_block() also. | 609 | * by jbd2_journal_next_log_block() also. |
619 | */ | 610 | */ |
620 | atomic_dec(&commit_transaction->t_outstanding_credits); | 611 | atomic_dec(&commit_transaction->t_outstanding_credits); |
621 | 612 | ||
622 | /* Bump b_count to prevent truncate from stumbling over | 613 | /* Bump b_count to prevent truncate from stumbling over |
623 | the shadowed buffer! @@@ This can go if we ever get | 614 | the shadowed buffer! @@@ This can go if we ever get |
624 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ | 615 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ |
625 | atomic_inc(&jh2bh(jh)->b_count); | 616 | atomic_inc(&jh2bh(jh)->b_count); |
626 | 617 | ||
627 | /* Make a temporary IO buffer with which to write it out | 618 | /* Make a temporary IO buffer with which to write it out |
628 | (this will requeue both the metadata buffer and the | 619 | (this will requeue both the metadata buffer and the |
629 | temporary IO buffer). new_bh goes on BJ_IO*/ | 620 | temporary IO buffer). new_bh goes on BJ_IO*/ |
630 | 621 | ||
631 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); | 622 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); |
632 | /* | 623 | /* |
633 | * akpm: jbd2_journal_write_metadata_buffer() sets | 624 | * akpm: jbd2_journal_write_metadata_buffer() sets |
634 | * new_bh->b_transaction to commit_transaction. | 625 | * new_bh->b_transaction to commit_transaction. |
635 | * We need to clean this up before we release new_bh | 626 | * We need to clean this up before we release new_bh |
636 | * (which is of type BJ_IO) | 627 | * (which is of type BJ_IO) |
637 | */ | 628 | */ |
638 | JBUFFER_TRACE(jh, "ph3: write metadata"); | 629 | JBUFFER_TRACE(jh, "ph3: write metadata"); |
639 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, | 630 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, |
640 | jh, &new_jh, blocknr); | 631 | jh, &new_jh, blocknr); |
641 | if (flags < 0) { | 632 | if (flags < 0) { |
642 | jbd2_journal_abort(journal, flags); | 633 | jbd2_journal_abort(journal, flags); |
643 | continue; | 634 | continue; |
644 | } | 635 | } |
645 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); | 636 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); |
646 | wbuf[bufs++] = jh2bh(new_jh); | 637 | wbuf[bufs++] = jh2bh(new_jh); |
647 | 638 | ||
648 | /* Record the new block's tag in the current descriptor | 639 | /* Record the new block's tag in the current descriptor |
649 | buffer */ | 640 | buffer */ |
650 | 641 | ||
651 | tag_flag = 0; | 642 | tag_flag = 0; |
652 | if (flags & 1) | 643 | if (flags & 1) |
653 | tag_flag |= JBD2_FLAG_ESCAPE; | 644 | tag_flag |= JBD2_FLAG_ESCAPE; |
654 | if (!first_tag) | 645 | if (!first_tag) |
655 | tag_flag |= JBD2_FLAG_SAME_UUID; | 646 | tag_flag |= JBD2_FLAG_SAME_UUID; |
656 | 647 | ||
657 | tag = (journal_block_tag_t *) tagp; | 648 | tag = (journal_block_tag_t *) tagp; |
658 | write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); | 649 | write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); |
659 | tag->t_flags = cpu_to_be32(tag_flag); | 650 | tag->t_flags = cpu_to_be32(tag_flag); |
660 | tagp += tag_bytes; | 651 | tagp += tag_bytes; |
661 | space_left -= tag_bytes; | 652 | space_left -= tag_bytes; |
662 | 653 | ||
663 | if (first_tag) { | 654 | if (first_tag) { |
664 | memcpy (tagp, journal->j_uuid, 16); | 655 | memcpy (tagp, journal->j_uuid, 16); |
665 | tagp += 16; | 656 | tagp += 16; |
666 | space_left -= 16; | 657 | space_left -= 16; |
667 | first_tag = 0; | 658 | first_tag = 0; |
668 | } | 659 | } |
669 | 660 | ||
670 | /* If there's no more to do, or if the descriptor is full, | 661 | /* If there's no more to do, or if the descriptor is full, |
671 | let the IO rip! */ | 662 | let the IO rip! */ |
672 | 663 | ||
673 | if (bufs == journal->j_wbufsize || | 664 | if (bufs == journal->j_wbufsize || |
674 | commit_transaction->t_buffers == NULL || | 665 | commit_transaction->t_buffers == NULL || |
675 | space_left < tag_bytes + 16) { | 666 | space_left < tag_bytes + 16) { |
676 | 667 | ||
677 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); | 668 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); |
678 | 669 | ||
679 | /* Write an end-of-descriptor marker before | 670 | /* Write an end-of-descriptor marker before |
680 | submitting the IOs. "tag" still points to | 671 | submitting the IOs. "tag" still points to |
681 | the last tag we set up. */ | 672 | the last tag we set up. */ |
682 | 673 | ||
683 | tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); | 674 | tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); |
684 | 675 | ||
685 | start_journal_io: | 676 | start_journal_io: |
686 | for (i = 0; i < bufs; i++) { | 677 | for (i = 0; i < bufs; i++) { |
687 | struct buffer_head *bh = wbuf[i]; | 678 | struct buffer_head *bh = wbuf[i]; |
688 | /* | 679 | /* |
689 | * Compute checksum. | 680 | * Compute checksum. |
690 | */ | 681 | */ |
691 | if (JBD2_HAS_COMPAT_FEATURE(journal, | 682 | if (JBD2_HAS_COMPAT_FEATURE(journal, |
692 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | 683 | JBD2_FEATURE_COMPAT_CHECKSUM)) { |
693 | crc32_sum = | 684 | crc32_sum = |
694 | jbd2_checksum_data(crc32_sum, bh); | 685 | jbd2_checksum_data(crc32_sum, bh); |
695 | } | 686 | } |
696 | 687 | ||
697 | lock_buffer(bh); | 688 | lock_buffer(bh); |
698 | clear_buffer_dirty(bh); | 689 | clear_buffer_dirty(bh); |
699 | set_buffer_uptodate(bh); | 690 | set_buffer_uptodate(bh); |
700 | bh->b_end_io = journal_end_buffer_io_sync; | 691 | bh->b_end_io = journal_end_buffer_io_sync; |
701 | submit_bh(write_op, bh); | 692 | submit_bh(write_op, bh); |
702 | } | 693 | } |
703 | cond_resched(); | 694 | cond_resched(); |
704 | stats.run.rs_blocks_logged += bufs; | 695 | stats.run.rs_blocks_logged += bufs; |
705 | 696 | ||
706 | /* Force a new descriptor to be generated next | 697 | /* Force a new descriptor to be generated next |
707 | time round the loop. */ | 698 | time round the loop. */ |
708 | descriptor = NULL; | 699 | descriptor = NULL; |
709 | bufs = 0; | 700 | bufs = 0; |
710 | } | 701 | } |
711 | } | 702 | } |
712 | 703 | ||
713 | /* | 704 | /* |
714 | * If the journal is not located on the file system device, | 705 | * If the journal is not located on the file system device, |
715 | * then we must flush the file system device before we issue | 706 | * then we must flush the file system device before we issue |
716 | * the commit record | 707 | * the commit record |
717 | */ | 708 | */ |
718 | if (commit_transaction->t_flushed_data_blocks && | 709 | if (commit_transaction->t_flushed_data_blocks && |
719 | (journal->j_fs_dev != journal->j_dev) && | 710 | (journal->j_fs_dev != journal->j_dev) && |
720 | (journal->j_flags & JBD2_BARRIER)) | 711 | (journal->j_flags & JBD2_BARRIER)) |
721 | blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, | 712 | blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, |
722 | BLKDEV_IFL_WAIT); | 713 | BLKDEV_IFL_WAIT); |
723 | 714 | ||
724 | /* Done it all: now write the commit record asynchronously. */ | 715 | /* Done it all: now write the commit record asynchronously. */ |
725 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, | 716 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, |
726 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 717 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
727 | err = journal_submit_commit_record(journal, commit_transaction, | 718 | err = journal_submit_commit_record(journal, commit_transaction, |
728 | &cbh, crc32_sum); | 719 | &cbh, crc32_sum); |
729 | if (err) | 720 | if (err) |
730 | __jbd2_journal_abort_hard(journal); | 721 | __jbd2_journal_abort_hard(journal); |
731 | if (journal->j_flags & JBD2_BARRIER) | 722 | if (journal->j_flags & JBD2_BARRIER) |
732 | blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL, | 723 | blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL, |
733 | BLKDEV_IFL_WAIT); | 724 | BLKDEV_IFL_WAIT); |
734 | } | 725 | } |
735 | 726 | ||
736 | err = journal_finish_inode_data_buffers(journal, commit_transaction); | 727 | err = journal_finish_inode_data_buffers(journal, commit_transaction); |
737 | if (err) { | 728 | if (err) { |
738 | printk(KERN_WARNING | 729 | printk(KERN_WARNING |
739 | "JBD2: Detected IO errors while flushing file data " | 730 | "JBD2: Detected IO errors while flushing file data " |
740 | "on %s\n", journal->j_devname); | 731 | "on %s\n", journal->j_devname); |
741 | if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) | 732 | if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) |
742 | jbd2_journal_abort(journal, err); | 733 | jbd2_journal_abort(journal, err); |
743 | err = 0; | 734 | err = 0; |
744 | } | 735 | } |
745 | 736 | ||
746 | /* Lo and behold: we have just managed to send a transaction to | 737 | /* Lo and behold: we have just managed to send a transaction to |
747 | the log. Before we can commit it, wait for the IO so far to | 738 | the log. Before we can commit it, wait for the IO so far to |
748 | complete. Control buffers being written are on the | 739 | complete. Control buffers being written are on the |
749 | transaction's t_log_list queue, and metadata buffers are on | 740 | transaction's t_log_list queue, and metadata buffers are on |
750 | the t_iobuf_list queue. | 741 | the t_iobuf_list queue. |
751 | 742 | ||
752 | Wait for the buffers in reverse order. That way we are | 743 | Wait for the buffers in reverse order. That way we are |
753 | less likely to be woken up until all IOs have completed, and | 744 | less likely to be woken up until all IOs have completed, and |
754 | so we incur less scheduling load. | 745 | so we incur less scheduling load. |
755 | */ | 746 | */ |
756 | 747 | ||
757 | jbd_debug(3, "JBD: commit phase 3\n"); | 748 | jbd_debug(3, "JBD: commit phase 3\n"); |
758 | 749 | ||
759 | /* | 750 | /* |
760 | * akpm: these are BJ_IO, and j_list_lock is not needed. | 751 | * akpm: these are BJ_IO, and j_list_lock is not needed. |
761 | * See __journal_try_to_free_buffer. | 752 | * See __journal_try_to_free_buffer. |
762 | */ | 753 | */ |
763 | wait_for_iobuf: | 754 | wait_for_iobuf: |
764 | while (commit_transaction->t_iobuf_list != NULL) { | 755 | while (commit_transaction->t_iobuf_list != NULL) { |
765 | struct buffer_head *bh; | 756 | struct buffer_head *bh; |
766 | 757 | ||
767 | jh = commit_transaction->t_iobuf_list->b_tprev; | 758 | jh = commit_transaction->t_iobuf_list->b_tprev; |
768 | bh = jh2bh(jh); | 759 | bh = jh2bh(jh); |
769 | if (buffer_locked(bh)) { | 760 | if (buffer_locked(bh)) { |
770 | wait_on_buffer(bh); | 761 | wait_on_buffer(bh); |
771 | goto wait_for_iobuf; | 762 | goto wait_for_iobuf; |
772 | } | 763 | } |
773 | if (cond_resched()) | 764 | if (cond_resched()) |
774 | goto wait_for_iobuf; | 765 | goto wait_for_iobuf; |
775 | 766 | ||
776 | if (unlikely(!buffer_uptodate(bh))) | 767 | if (unlikely(!buffer_uptodate(bh))) |
777 | err = -EIO; | 768 | err = -EIO; |
778 | 769 | ||
779 | clear_buffer_jwrite(bh); | 770 | clear_buffer_jwrite(bh); |
780 | 771 | ||
781 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); | 772 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); |
782 | jbd2_journal_unfile_buffer(journal, jh); | 773 | jbd2_journal_unfile_buffer(journal, jh); |
783 | 774 | ||
784 | /* | 775 | /* |
785 | * ->t_iobuf_list should contain only dummy buffer_heads | 776 | * ->t_iobuf_list should contain only dummy buffer_heads |
786 | * which were created by jbd2_journal_write_metadata_buffer(). | 777 | * which were created by jbd2_journal_write_metadata_buffer(). |
787 | */ | 778 | */ |
788 | BUFFER_TRACE(bh, "dumping temporary bh"); | 779 | BUFFER_TRACE(bh, "dumping temporary bh"); |
789 | jbd2_journal_put_journal_head(jh); | 780 | jbd2_journal_put_journal_head(jh); |
790 | __brelse(bh); | 781 | __brelse(bh); |
791 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); | 782 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); |
792 | free_buffer_head(bh); | 783 | free_buffer_head(bh); |
793 | 784 | ||
794 | /* We also have to unlock and free the corresponding | 785 | /* We also have to unlock and free the corresponding |
795 | shadowed buffer */ | 786 | shadowed buffer */ |
796 | jh = commit_transaction->t_shadow_list->b_tprev; | 787 | jh = commit_transaction->t_shadow_list->b_tprev; |
797 | bh = jh2bh(jh); | 788 | bh = jh2bh(jh); |
798 | clear_bit(BH_JWrite, &bh->b_state); | 789 | clear_bit(BH_JWrite, &bh->b_state); |
799 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | 790 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); |
800 | 791 | ||
801 | /* The metadata is now released for reuse, but we need | 792 | /* The metadata is now released for reuse, but we need |
802 | to remember it against this transaction so that when | 793 | to remember it against this transaction so that when |
803 | we finally commit, we can do any checkpointing | 794 | we finally commit, we can do any checkpointing |
804 | required. */ | 795 | required. */ |
805 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | 796 | JBUFFER_TRACE(jh, "file as BJ_Forget"); |
806 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); | 797 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); |
807 | /* Wake up any transactions which were waiting for this | 798 | /* Wake up any transactions which were waiting for this |
808 | IO to complete */ | 799 | IO to complete */ |
809 | wake_up_bit(&bh->b_state, BH_Unshadow); | 800 | wake_up_bit(&bh->b_state, BH_Unshadow); |
810 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | 801 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); |
811 | __brelse(bh); | 802 | __brelse(bh); |
812 | } | 803 | } |
813 | 804 | ||
814 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | 805 | J_ASSERT (commit_transaction->t_shadow_list == NULL); |
815 | 806 | ||
816 | jbd_debug(3, "JBD: commit phase 4\n"); | 807 | jbd_debug(3, "JBD: commit phase 4\n"); |
817 | 808 | ||
818 | /* Here we wait for the revoke record and descriptor record buffers */ | 809 | /* Here we wait for the revoke record and descriptor record buffers */ |
819 | wait_for_ctlbuf: | 810 | wait_for_ctlbuf: |
820 | while (commit_transaction->t_log_list != NULL) { | 811 | while (commit_transaction->t_log_list != NULL) { |
821 | struct buffer_head *bh; | 812 | struct buffer_head *bh; |
822 | 813 | ||
823 | jh = commit_transaction->t_log_list->b_tprev; | 814 | jh = commit_transaction->t_log_list->b_tprev; |
824 | bh = jh2bh(jh); | 815 | bh = jh2bh(jh); |
825 | if (buffer_locked(bh)) { | 816 | if (buffer_locked(bh)) { |
826 | wait_on_buffer(bh); | 817 | wait_on_buffer(bh); |
827 | goto wait_for_ctlbuf; | 818 | goto wait_for_ctlbuf; |
828 | } | 819 | } |
829 | if (cond_resched()) | 820 | if (cond_resched()) |
830 | goto wait_for_ctlbuf; | 821 | goto wait_for_ctlbuf; |
831 | 822 | ||
832 | if (unlikely(!buffer_uptodate(bh))) | 823 | if (unlikely(!buffer_uptodate(bh))) |
833 | err = -EIO; | 824 | err = -EIO; |
834 | 825 | ||
835 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); | 826 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); |
836 | clear_buffer_jwrite(bh); | 827 | clear_buffer_jwrite(bh); |
837 | jbd2_journal_unfile_buffer(journal, jh); | 828 | jbd2_journal_unfile_buffer(journal, jh); |
838 | jbd2_journal_put_journal_head(jh); | 829 | jbd2_journal_put_journal_head(jh); |
839 | __brelse(bh); /* One for getblk */ | 830 | __brelse(bh); /* One for getblk */ |
840 | /* AKPM: bforget here */ | 831 | /* AKPM: bforget here */ |
841 | } | 832 | } |
842 | 833 | ||
843 | if (err) | 834 | if (err) |
844 | jbd2_journal_abort(journal, err); | 835 | jbd2_journal_abort(journal, err); |
845 | 836 | ||
846 | jbd_debug(3, "JBD: commit phase 5\n"); | 837 | jbd_debug(3, "JBD: commit phase 5\n"); |
847 | 838 | ||
848 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | 839 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
849 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 840 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
850 | err = journal_submit_commit_record(journal, commit_transaction, | 841 | err = journal_submit_commit_record(journal, commit_transaction, |
851 | &cbh, crc32_sum); | 842 | &cbh, crc32_sum); |
852 | if (err) | 843 | if (err) |
853 | __jbd2_journal_abort_hard(journal); | 844 | __jbd2_journal_abort_hard(journal); |
854 | } | 845 | } |
855 | if (!err && !is_journal_aborted(journal)) | 846 | if (!err && !is_journal_aborted(journal)) |
856 | err = journal_wait_on_commit_record(journal, cbh); | 847 | err = journal_wait_on_commit_record(journal, cbh); |
857 | 848 | ||
858 | if (err) | 849 | if (err) |
859 | jbd2_journal_abort(journal, err); | 850 | jbd2_journal_abort(journal, err); |
860 | 851 | ||
861 | /* End of a transaction! Finally, we can do checkpoint | 852 | /* End of a transaction! Finally, we can do checkpoint |
862 | processing: any buffers committed as a result of this | 853 | processing: any buffers committed as a result of this |
863 | transaction can be removed from any checkpoint list they were on | 854 | transaction can be removed from any checkpoint list they were on |
864 | before. */ | 855 | before. */ |
865 | 856 | ||
866 | jbd_debug(3, "JBD: commit phase 6\n"); | 857 | jbd_debug(3, "JBD: commit phase 6\n"); |
867 | 858 | ||
868 | J_ASSERT(list_empty(&commit_transaction->t_inode_list)); | 859 | J_ASSERT(list_empty(&commit_transaction->t_inode_list)); |
869 | J_ASSERT(commit_transaction->t_buffers == NULL); | 860 | J_ASSERT(commit_transaction->t_buffers == NULL); |
870 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | 861 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); |
871 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | 862 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); |
872 | J_ASSERT(commit_transaction->t_shadow_list == NULL); | 863 | J_ASSERT(commit_transaction->t_shadow_list == NULL); |
873 | J_ASSERT(commit_transaction->t_log_list == NULL); | 864 | J_ASSERT(commit_transaction->t_log_list == NULL); |
874 | 865 | ||
875 | restart_loop: | 866 | restart_loop: |
876 | /* | 867 | /* |
877 | * As there are other places (journal_unmap_buffer()) adding buffers | 868 | * As there are other places (journal_unmap_buffer()) adding buffers |
878 | * to this list we have to be careful and hold the j_list_lock. | 869 | * to this list we have to be careful and hold the j_list_lock. |
879 | */ | 870 | */ |
880 | spin_lock(&journal->j_list_lock); | 871 | spin_lock(&journal->j_list_lock); |
881 | while (commit_transaction->t_forget) { | 872 | while (commit_transaction->t_forget) { |
882 | transaction_t *cp_transaction; | 873 | transaction_t *cp_transaction; |
883 | struct buffer_head *bh; | 874 | struct buffer_head *bh; |
884 | 875 | ||
885 | jh = commit_transaction->t_forget; | 876 | jh = commit_transaction->t_forget; |
886 | spin_unlock(&journal->j_list_lock); | 877 | spin_unlock(&journal->j_list_lock); |
887 | bh = jh2bh(jh); | 878 | bh = jh2bh(jh); |
888 | jbd_lock_bh_state(bh); | 879 | jbd_lock_bh_state(bh); |
889 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); | 880 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); |
890 | 881 | ||
891 | /* | 882 | /* |
892 | * If there is undo-protected committed data against | 883 | * If there is undo-protected committed data against |
893 | * this buffer, then we can remove it now. If it is a | 884 | * this buffer, then we can remove it now. If it is a |
894 | * buffer needing such protection, the old frozen_data | 885 | * buffer needing such protection, the old frozen_data |
895 | * field now points to a committed version of the | 886 | * field now points to a committed version of the |
896 | * buffer, so rotate that field to the new committed | 887 | * buffer, so rotate that field to the new committed |
897 | * data. | 888 | * data. |
898 | * | 889 | * |
899 | * Otherwise, we can just throw away the frozen data now. | 890 | * Otherwise, we can just throw away the frozen data now. |
900 | * | 891 | * |
901 | * We also know that the frozen data has already fired | 892 | * We also know that the frozen data has already fired |
902 | * its triggers if they exist, so we can clear that too. | 893 | * its triggers if they exist, so we can clear that too. |
903 | */ | 894 | */ |
904 | if (jh->b_committed_data) { | 895 | if (jh->b_committed_data) { |
905 | jbd2_free(jh->b_committed_data, bh->b_size); | 896 | jbd2_free(jh->b_committed_data, bh->b_size); |
906 | jh->b_committed_data = NULL; | 897 | jh->b_committed_data = NULL; |
907 | if (jh->b_frozen_data) { | 898 | if (jh->b_frozen_data) { |
908 | jh->b_committed_data = jh->b_frozen_data; | 899 | jh->b_committed_data = jh->b_frozen_data; |
909 | jh->b_frozen_data = NULL; | 900 | jh->b_frozen_data = NULL; |
910 | jh->b_frozen_triggers = NULL; | 901 | jh->b_frozen_triggers = NULL; |
911 | } | 902 | } |
912 | } else if (jh->b_frozen_data) { | 903 | } else if (jh->b_frozen_data) { |
913 | jbd2_free(jh->b_frozen_data, bh->b_size); | 904 | jbd2_free(jh->b_frozen_data, bh->b_size); |
914 | jh->b_frozen_data = NULL; | 905 | jh->b_frozen_data = NULL; |
915 | jh->b_frozen_triggers = NULL; | 906 | jh->b_frozen_triggers = NULL; |
916 | } | 907 | } |
917 | 908 | ||
918 | spin_lock(&journal->j_list_lock); | 909 | spin_lock(&journal->j_list_lock); |
919 | cp_transaction = jh->b_cp_transaction; | 910 | cp_transaction = jh->b_cp_transaction; |
920 | if (cp_transaction) { | 911 | if (cp_transaction) { |
921 | JBUFFER_TRACE(jh, "remove from old cp transaction"); | 912 | JBUFFER_TRACE(jh, "remove from old cp transaction"); |
922 | cp_transaction->t_chp_stats.cs_dropped++; | 913 | cp_transaction->t_chp_stats.cs_dropped++; |
923 | __jbd2_journal_remove_checkpoint(jh); | 914 | __jbd2_journal_remove_checkpoint(jh); |
924 | } | 915 | } |
925 | 916 | ||
926 | /* Only re-checkpoint the buffer_head if it is marked | 917 | /* Only re-checkpoint the buffer_head if it is marked |
927 | * dirty. If the buffer was added to the BJ_Forget list | 918 | * dirty. If the buffer was added to the BJ_Forget list |
928 | * by jbd2_journal_forget, it may no longer be dirty and | 919 | * by jbd2_journal_forget, it may no longer be dirty and |
929 | * there's no point in keeping a checkpoint record for | 920 | * there's no point in keeping a checkpoint record for |
930 | * it. */ | 921 | * it. */ |
931 | 922 | ||
932 | /* A buffer which has been freed while still being | 923 | /* A buffer which has been freed while still being |
933 | * journaled by a previous transaction may end up still | 924 | * journaled by a previous transaction may end up still |
934 | * being dirty here, but we want to avoid writing back | 925 | * being dirty here, but we want to avoid writing back |
935 | * that buffer in the future after the "add to orphan" | 926 | * that buffer in the future after the "add to orphan" |
936 | * operation has been committed. That's not only a performance | 927 | * operation has been committed. That's not only a performance |
937 | * gain, it also stops aliasing problems if the buffer is | 928 | * gain, it also stops aliasing problems if the buffer is |
938 | * left behind for writeback and gets reallocated for another | 929 | * left behind for writeback and gets reallocated for another |
939 | * use in a different page. */ | 930 | * use in a different page. */ |
940 | if (buffer_freed(bh) && !jh->b_next_transaction) { | 931 | if (buffer_freed(bh) && !jh->b_next_transaction) { |
941 | clear_buffer_freed(bh); | 932 | clear_buffer_freed(bh); |
942 | clear_buffer_jbddirty(bh); | 933 | clear_buffer_jbddirty(bh); |
943 | } | 934 | } |
944 | 935 | ||
945 | if (buffer_jbddirty(bh)) { | 936 | if (buffer_jbddirty(bh)) { |
946 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); | 937 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); |
947 | __jbd2_journal_insert_checkpoint(jh, commit_transaction); | 938 | __jbd2_journal_insert_checkpoint(jh, commit_transaction); |
948 | if (is_journal_aborted(journal)) | 939 | if (is_journal_aborted(journal)) |
949 | clear_buffer_jbddirty(bh); | 940 | clear_buffer_jbddirty(bh); |
950 | JBUFFER_TRACE(jh, "refile for checkpoint writeback"); | 941 | JBUFFER_TRACE(jh, "refile for checkpoint writeback"); |
951 | __jbd2_journal_refile_buffer(jh); | 942 | __jbd2_journal_refile_buffer(jh); |
952 | jbd_unlock_bh_state(bh); | 943 | jbd_unlock_bh_state(bh); |
953 | } else { | 944 | } else { |
954 | J_ASSERT_BH(bh, !buffer_dirty(bh)); | 945 | J_ASSERT_BH(bh, !buffer_dirty(bh)); |
955 | /* The buffer on BJ_Forget list and not jbddirty means | 946 | /* The buffer on BJ_Forget list and not jbddirty means |
956 | * it has been freed by this transaction and hence it | 947 | * it has been freed by this transaction and hence it |
957 | * could not have been reallocated until this | 948 | * could not have been reallocated until this |
958 | * transaction has committed. *BUT* it could be | 949 | * transaction has committed. *BUT* it could be |
959 | * reallocated once we have written all the data to | 950 | * reallocated once we have written all the data to |
960 | * disk and before we process the buffer on BJ_Forget | 951 | * disk and before we process the buffer on BJ_Forget |
961 | * list. */ | 952 | * list. */ |
962 | JBUFFER_TRACE(jh, "refile or unfile freed buffer"); | 953 | JBUFFER_TRACE(jh, "refile or unfile freed buffer"); |
963 | __jbd2_journal_refile_buffer(jh); | 954 | __jbd2_journal_refile_buffer(jh); |
964 | if (!jh->b_transaction) { | 955 | if (!jh->b_transaction) { |
965 | jbd_unlock_bh_state(bh); | 956 | jbd_unlock_bh_state(bh); |
966 | /* needs a brelse */ | 957 | /* needs a brelse */ |
967 | jbd2_journal_remove_journal_head(bh); | 958 | jbd2_journal_remove_journal_head(bh); |
968 | release_buffer_page(bh); | 959 | release_buffer_page(bh); |
969 | } else | 960 | } else |
970 | jbd_unlock_bh_state(bh); | 961 | jbd_unlock_bh_state(bh); |
971 | } | 962 | } |
972 | cond_resched_lock(&journal->j_list_lock); | 963 | cond_resched_lock(&journal->j_list_lock); |
973 | } | 964 | } |
974 | spin_unlock(&journal->j_list_lock); | 965 | spin_unlock(&journal->j_list_lock); |
975 | /* | 966 | /* |
976 | * This is a bit sleazy. We use j_list_lock to protect transition | 967 | * This is a bit sleazy. We use j_list_lock to protect transition |
977 | * of a transaction into T_FINISHED state and calling | 968 | * of a transaction into T_FINISHED state and calling |
978 | * __jbd2_journal_drop_transaction(). Otherwise we could race with | 969 | * __jbd2_journal_drop_transaction(). Otherwise we could race with |
979 | * other checkpointing code processing the transaction... | 970 | * other checkpointing code processing the transaction... |
980 | */ | 971 | */ |
981 | write_lock(&journal->j_state_lock); | 972 | write_lock(&journal->j_state_lock); |
982 | spin_lock(&journal->j_list_lock); | 973 | spin_lock(&journal->j_list_lock); |
983 | /* | 974 | /* |
984 | * Now recheck if some buffers did not get attached to the transaction | 975 | * Now recheck if some buffers did not get attached to the transaction |
985 | * while the lock was dropped... | 976 | * while the lock was dropped... |
986 | */ | 977 | */ |
987 | if (commit_transaction->t_forget) { | 978 | if (commit_transaction->t_forget) { |
988 | spin_unlock(&journal->j_list_lock); | 979 | spin_unlock(&journal->j_list_lock); |
989 | write_unlock(&journal->j_state_lock); | 980 | write_unlock(&journal->j_state_lock); |
990 | goto restart_loop; | 981 | goto restart_loop; |
991 | } | 982 | } |
992 | 983 | ||
993 | /* Done with this transaction! */ | 984 | /* Done with this transaction! */ |
994 | 985 | ||
995 | jbd_debug(3, "JBD: commit phase 7\n"); | 986 | jbd_debug(3, "JBD: commit phase 7\n"); |
996 | 987 | ||
997 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | 988 | J_ASSERT(commit_transaction->t_state == T_COMMIT); |
998 | 989 | ||
999 | commit_transaction->t_start = jiffies; | 990 | commit_transaction->t_start = jiffies; |
1000 | stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, | 991 | stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, |
1001 | commit_transaction->t_start); | 992 | commit_transaction->t_start); |
1002 | 993 | ||
1003 | /* | 994 | /* |
1004 | * File the transaction statistics | 995 | * File the transaction statistics |
1005 | */ | 996 | */ |
1006 | stats.ts_tid = commit_transaction->t_tid; | 997 | stats.ts_tid = commit_transaction->t_tid; |
1007 | stats.run.rs_handle_count = | 998 | stats.run.rs_handle_count = |
1008 | atomic_read(&commit_transaction->t_handle_count); | 999 | atomic_read(&commit_transaction->t_handle_count); |
1009 | trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, | 1000 | trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, |
1010 | commit_transaction->t_tid, &stats.run); | 1001 | commit_transaction->t_tid, &stats.run); |
1011 | 1002 | ||
1012 | /* | 1003 | /* |
1013 | * Calculate overall stats | 1004 | * Calculate overall stats |
1014 | */ | 1005 | */ |
1015 | spin_lock(&journal->j_history_lock); | 1006 | spin_lock(&journal->j_history_lock); |
1016 | journal->j_stats.ts_tid++; | 1007 | journal->j_stats.ts_tid++; |
1017 | journal->j_stats.run.rs_wait += stats.run.rs_wait; | 1008 | journal->j_stats.run.rs_wait += stats.run.rs_wait; |
1018 | journal->j_stats.run.rs_running += stats.run.rs_running; | 1009 | journal->j_stats.run.rs_running += stats.run.rs_running; |
1019 | journal->j_stats.run.rs_locked += stats.run.rs_locked; | 1010 | journal->j_stats.run.rs_locked += stats.run.rs_locked; |
1020 | journal->j_stats.run.rs_flushing += stats.run.rs_flushing; | 1011 | journal->j_stats.run.rs_flushing += stats.run.rs_flushing; |
1021 | journal->j_stats.run.rs_logging += stats.run.rs_logging; | 1012 | journal->j_stats.run.rs_logging += stats.run.rs_logging; |
1022 | journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; | 1013 | journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; |
1023 | journal->j_stats.run.rs_blocks += stats.run.rs_blocks; | 1014 | journal->j_stats.run.rs_blocks += stats.run.rs_blocks; |
1024 | journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; | 1015 | journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; |
1025 | spin_unlock(&journal->j_history_lock); | 1016 | spin_unlock(&journal->j_history_lock); |
1026 | 1017 | ||
1027 | commit_transaction->t_state = T_FINISHED; | 1018 | commit_transaction->t_state = T_FINISHED; |
1028 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 1019 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
1029 | journal->j_commit_sequence = commit_transaction->t_tid; | 1020 | journal->j_commit_sequence = commit_transaction->t_tid; |
1030 | journal->j_committing_transaction = NULL; | 1021 | journal->j_committing_transaction = NULL; |
1031 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); | 1022 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); |
1032 | 1023 | ||
1033 | /* | 1024 | /* |
1034 | * weight the commit time higher than the average time so we don't | 1025 | * weight the commit time higher than the average time so we don't |
1035 | * react too strongly to vast changes in the commit time | 1026 | * react too strongly to vast changes in the commit time |
1036 | */ | 1027 | */ |
1037 | if (likely(journal->j_average_commit_time)) | 1028 | if (likely(journal->j_average_commit_time)) |
1038 | journal->j_average_commit_time = (commit_time + | 1029 | journal->j_average_commit_time = (commit_time + |
1039 | journal->j_average_commit_time*3) / 4; | 1030 | journal->j_average_commit_time*3) / 4; |
1040 | else | 1031 | else |
1041 | journal->j_average_commit_time = commit_time; | 1032 | journal->j_average_commit_time = commit_time; |
1042 | write_unlock(&journal->j_state_lock); | 1033 | write_unlock(&journal->j_state_lock); |
1043 | 1034 | ||
1044 | if (commit_transaction->t_checkpoint_list == NULL && | 1035 | if (commit_transaction->t_checkpoint_list == NULL && |
1045 | commit_transaction->t_checkpoint_io_list == NULL) { | 1036 | commit_transaction->t_checkpoint_io_list == NULL) { |
1046 | __jbd2_journal_drop_transaction(journal, commit_transaction); | 1037 | __jbd2_journal_drop_transaction(journal, commit_transaction); |
1047 | to_free = 1; | 1038 | to_free = 1; |
1048 | } else { | 1039 | } else { |
1049 | if (journal->j_checkpoint_transactions == NULL) { | 1040 | if (journal->j_checkpoint_transactions == NULL) { |
1050 | journal->j_checkpoint_transactions = commit_transaction; | 1041 | journal->j_checkpoint_transactions = commit_transaction; |
1051 | commit_transaction->t_cpnext = commit_transaction; | 1042 | commit_transaction->t_cpnext = commit_transaction; |
1052 | commit_transaction->t_cpprev = commit_transaction; | 1043 | commit_transaction->t_cpprev = commit_transaction; |
1053 | } else { | 1044 | } else { |
1054 | commit_transaction->t_cpnext = | 1045 | commit_transaction->t_cpnext = |
1055 | journal->j_checkpoint_transactions; | 1046 | journal->j_checkpoint_transactions; |
1056 | commit_transaction->t_cpprev = | 1047 | commit_transaction->t_cpprev = |
1057 | commit_transaction->t_cpnext->t_cpprev; | 1048 | commit_transaction->t_cpnext->t_cpprev; |
1058 | commit_transaction->t_cpnext->t_cpprev = | 1049 | commit_transaction->t_cpnext->t_cpprev = |
1059 | commit_transaction; | 1050 | commit_transaction; |
1060 | commit_transaction->t_cpprev->t_cpnext = | 1051 | commit_transaction->t_cpprev->t_cpnext = |
1061 | commit_transaction; | 1052 | commit_transaction; |
1062 | } | 1053 | } |
1063 | } | 1054 | } |
1064 | spin_unlock(&journal->j_list_lock); | 1055 | spin_unlock(&journal->j_list_lock); |
1065 | 1056 | ||
1066 | if (journal->j_commit_callback) | 1057 | if (journal->j_commit_callback) |
1067 | journal->j_commit_callback(journal, commit_transaction); | 1058 | journal->j_commit_callback(journal, commit_transaction); |
1068 | 1059 | ||
1069 | trace_jbd2_end_commit(journal, commit_transaction); | 1060 | trace_jbd2_end_commit(journal, commit_transaction); |
1070 | jbd_debug(1, "JBD: commit %d complete, head %d\n", | 1061 | jbd_debug(1, "JBD: commit %d complete, head %d\n", |
1071 | journal->j_commit_sequence, journal->j_tail_sequence); | 1062 | journal->j_commit_sequence, journal->j_tail_sequence); |
1072 | if (to_free) | 1063 | if (to_free) |
1073 | kfree(commit_transaction); | 1064 | kfree(commit_transaction); |
1074 | 1065 | ||
1075 | wake_up(&journal->j_wait_done_commit); | 1066 | wake_up(&journal->j_wait_done_commit); |
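The commit path above orders the commit record with explicit cache flushes rather than an ordered buffer write: when the journal sits on a different device than the file system, the file-system device is flushed first, the commit record is submitted, and the journal device is flushed afterwards. The condensed sketch below only reuses names visible in the hunk (journal_submit_commit_record(), journal_wait_on_commit_record(), blkdev_issue_flush()); the wrapper itself and its simplified ordering are illustrative, not the kernel's literal control flow.

#include <linux/jbd2.h>
#include <linux/blkdev.h>

/* Sketch only: condenses the flush/submit/flush ordering shown above. */
static int sketch_commit_record(journal_t *journal, transaction_t *tx,
				struct buffer_head **cbh, __u32 crc32_sum)
{
	int err;

	/* Journal on a separate device: flush file-system data first. */
	if (journal->j_fs_dev != journal->j_dev &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
				   BLKDEV_IFL_WAIT);

	err = journal_submit_commit_record(journal, tx, cbh, crc32_sum);
	if (err)
		return err;

	/* Then make the commit record itself durable. */
	if (journal->j_flags & JBD2_BARRIER)
		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
				   BLKDEV_IFL_WAIT);

	return journal_wait_on_commit_record(journal, *cbh);
}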
fs/nilfs2/super.c
1 | /* | 1 | /* |
2 | * super.c - NILFS module and super block management. | 2 | * super.c - NILFS module and super block management. |
3 | * | 3 | * |
4 | * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. | 4 | * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
8 | * the Free Software Foundation; either version 2 of the License, or | 8 | * the Free Software Foundation; either version 2 of the License, or |
9 | * (at your option) any later version. | 9 | * (at your option) any later version. |
10 | * | 10 | * |
11 | * This program is distributed in the hope that it will be useful, | 11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU General Public License for more details. | 14 | * GNU General Public License for more details. |
15 | * | 15 | * |
16 | * You should have received a copy of the GNU General Public License | 16 | * You should have received a copy of the GNU General Public License |
17 | * along with this program; if not, write to the Free Software | 17 | * along with this program; if not, write to the Free Software |
18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
19 | * | 19 | * |
20 | * Written by Ryusuke Konishi <ryusuke@osrg.net> | 20 | * Written by Ryusuke Konishi <ryusuke@osrg.net> |
21 | */ | 21 | */ |
22 | /* | 22 | /* |
23 | * linux/fs/ext2/super.c | 23 | * linux/fs/ext2/super.c |
24 | * | 24 | * |
25 | * Copyright (C) 1992, 1993, 1994, 1995 | 25 | * Copyright (C) 1992, 1993, 1994, 1995 |
26 | * Remy Card (card@masi.ibp.fr) | 26 | * Remy Card (card@masi.ibp.fr) |
27 | * Laboratoire MASI - Institut Blaise Pascal | 27 | * Laboratoire MASI - Institut Blaise Pascal |
28 | * Universite Pierre et Marie Curie (Paris VI) | 28 | * Universite Pierre et Marie Curie (Paris VI) |
29 | * | 29 | * |
30 | * from | 30 | * from |
31 | * | 31 | * |
32 | * linux/fs/minix/inode.c | 32 | * linux/fs/minix/inode.c |
33 | * | 33 | * |
34 | * Copyright (C) 1991, 1992 Linus Torvalds | 34 | * Copyright (C) 1991, 1992 Linus Torvalds |
35 | * | 35 | * |
36 | * Big-endian to little-endian byte-swapping/bitmaps by | 36 | * Big-endian to little-endian byte-swapping/bitmaps by |
37 | * David S. Miller (davem@caip.rutgers.edu), 1995 | 37 | * David S. Miller (davem@caip.rutgers.edu), 1995 |
38 | */ | 38 | */ |
39 | 39 | ||
40 | #include <linux/module.h> | 40 | #include <linux/module.h> |
41 | #include <linux/string.h> | 41 | #include <linux/string.h> |
42 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
43 | #include <linux/init.h> | 43 | #include <linux/init.h> |
44 | #include <linux/blkdev.h> | 44 | #include <linux/blkdev.h> |
45 | #include <linux/parser.h> | 45 | #include <linux/parser.h> |
46 | #include <linux/random.h> | 46 | #include <linux/random.h> |
47 | #include <linux/crc32.h> | 47 | #include <linux/crc32.h> |
48 | #include <linux/smp_lock.h> | 48 | #include <linux/smp_lock.h> |
49 | #include <linux/vfs.h> | 49 | #include <linux/vfs.h> |
50 | #include <linux/writeback.h> | 50 | #include <linux/writeback.h> |
51 | #include <linux/kobject.h> | 51 | #include <linux/kobject.h> |
52 | #include <linux/exportfs.h> | 52 | #include <linux/exportfs.h> |
53 | #include <linux/seq_file.h> | 53 | #include <linux/seq_file.h> |
54 | #include <linux/mount.h> | 54 | #include <linux/mount.h> |
55 | #include "nilfs.h" | 55 | #include "nilfs.h" |
56 | #include "mdt.h" | 56 | #include "mdt.h" |
57 | #include "alloc.h" | 57 | #include "alloc.h" |
58 | #include "btree.h" | 58 | #include "btree.h" |
59 | #include "btnode.h" | 59 | #include "btnode.h" |
60 | #include "page.h" | 60 | #include "page.h" |
61 | #include "cpfile.h" | 61 | #include "cpfile.h" |
62 | #include "ifile.h" | 62 | #include "ifile.h" |
63 | #include "dat.h" | 63 | #include "dat.h" |
64 | #include "segment.h" | 64 | #include "segment.h" |
65 | #include "segbuf.h" | 65 | #include "segbuf.h" |
66 | 66 | ||
67 | MODULE_AUTHOR("NTT Corp."); | 67 | MODULE_AUTHOR("NTT Corp."); |
68 | MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " | 68 | MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " |
69 | "(NILFS)"); | 69 | "(NILFS)"); |
70 | MODULE_LICENSE("GPL"); | 70 | MODULE_LICENSE("GPL"); |
71 | 71 | ||
72 | struct kmem_cache *nilfs_inode_cachep; | 72 | struct kmem_cache *nilfs_inode_cachep; |
73 | struct kmem_cache *nilfs_transaction_cachep; | 73 | struct kmem_cache *nilfs_transaction_cachep; |
74 | struct kmem_cache *nilfs_segbuf_cachep; | 74 | struct kmem_cache *nilfs_segbuf_cachep; |
75 | struct kmem_cache *nilfs_btree_path_cache; | 75 | struct kmem_cache *nilfs_btree_path_cache; |
76 | 76 | ||
77 | static int nilfs_remount(struct super_block *sb, int *flags, char *data); | 77 | static int nilfs_remount(struct super_block *sb, int *flags, char *data); |
78 | 78 | ||
79 | static void nilfs_set_error(struct nilfs_sb_info *sbi) | 79 | static void nilfs_set_error(struct nilfs_sb_info *sbi) |
80 | { | 80 | { |
81 | struct the_nilfs *nilfs = sbi->s_nilfs; | 81 | struct the_nilfs *nilfs = sbi->s_nilfs; |
82 | struct nilfs_super_block **sbp; | 82 | struct nilfs_super_block **sbp; |
83 | 83 | ||
84 | down_write(&nilfs->ns_sem); | 84 | down_write(&nilfs->ns_sem); |
85 | if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { | 85 | if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { |
86 | nilfs->ns_mount_state |= NILFS_ERROR_FS; | 86 | nilfs->ns_mount_state |= NILFS_ERROR_FS; |
87 | sbp = nilfs_prepare_super(sbi, 0); | 87 | sbp = nilfs_prepare_super(sbi, 0); |
88 | if (likely(sbp)) { | 88 | if (likely(sbp)) { |
89 | sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); | 89 | sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); |
90 | if (sbp[1]) | 90 | if (sbp[1]) |
91 | sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); | 91 | sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); |
92 | nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); | 92 | nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); |
93 | } | 93 | } |
94 | } | 94 | } |
95 | up_write(&nilfs->ns_sem); | 95 | up_write(&nilfs->ns_sem); |
96 | } | 96 | } |
97 | 97 | ||
98 | /** | 98 | /** |
99 | * nilfs_error() - report failure condition on a filesystem | 99 | * nilfs_error() - report failure condition on a filesystem |
100 | * | 100 | * |
101 | * nilfs_error() sets an ERROR_FS flag on the superblock as well as | 101 | * nilfs_error() sets an ERROR_FS flag on the superblock as well as |
102 | * reporting an error message. It should be called when NILFS detects | 102 | * reporting an error message. It should be called when NILFS detects |
103 | * inconsistencies or corruption of metadata on disk. For recoverable | 103 | * inconsistencies or corruption of metadata on disk. For recoverable |
104 | * errors such as a single-shot I/O error, nilfs_warning() or the printk() | 104 | * errors such as a single-shot I/O error, nilfs_warning() or the printk() |
105 | * function should be used instead. | 105 | * function should be used instead. |
106 | * | 106 | * |
107 | * The segment constructor must not call this function because it can | 107 | * The segment constructor must not call this function because it can |
108 | * kill itself. | 108 | * kill itself. |
109 | */ | 109 | */ |
110 | void nilfs_error(struct super_block *sb, const char *function, | 110 | void nilfs_error(struct super_block *sb, const char *function, |
111 | const char *fmt, ...) | 111 | const char *fmt, ...) |
112 | { | 112 | { |
113 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | 113 | struct nilfs_sb_info *sbi = NILFS_SB(sb); |
114 | va_list args; | 114 | va_list args; |
115 | 115 | ||
116 | va_start(args, fmt); | 116 | va_start(args, fmt); |
117 | printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); | 117 | printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); |
118 | vprintk(fmt, args); | 118 | vprintk(fmt, args); |
119 | printk("\n"); | 119 | printk("\n"); |
120 | va_end(args); | 120 | va_end(args); |
121 | 121 | ||
122 | if (!(sb->s_flags & MS_RDONLY)) { | 122 | if (!(sb->s_flags & MS_RDONLY)) { |
123 | nilfs_set_error(sbi); | 123 | nilfs_set_error(sbi); |
124 | 124 | ||
125 | if (nilfs_test_opt(sbi, ERRORS_RO)) { | 125 | if (nilfs_test_opt(sbi, ERRORS_RO)) { |
126 | printk(KERN_CRIT "Remounting filesystem read-only\n"); | 126 | printk(KERN_CRIT "Remounting filesystem read-only\n"); |
127 | sb->s_flags |= MS_RDONLY; | 127 | sb->s_flags |= MS_RDONLY; |
128 | } | 128 | } |
129 | } | 129 | } |
130 | 130 | ||
131 | if (nilfs_test_opt(sbi, ERRORS_PANIC)) | 131 | if (nilfs_test_opt(sbi, ERRORS_PANIC)) |
132 | panic("NILFS (device %s): panic forced after error\n", | 132 | panic("NILFS (device %s): panic forced after error\n", |
133 | sb->s_id); | 133 | sb->s_id); |
134 | } | 134 | } |
135 | 135 | ||
136 | void nilfs_warning(struct super_block *sb, const char *function, | 136 | void nilfs_warning(struct super_block *sb, const char *function, |
137 | const char *fmt, ...) | 137 | const char *fmt, ...) |
138 | { | 138 | { |
139 | va_list args; | 139 | va_list args; |
140 | 140 | ||
141 | va_start(args, fmt); | 141 | va_start(args, fmt); |
142 | printk(KERN_WARNING "NILFS warning (device %s): %s: ", | 142 | printk(KERN_WARNING "NILFS warning (device %s): %s: ", |
143 | sb->s_id, function); | 143 | sb->s_id, function); |
144 | vprintk(fmt, args); | 144 | vprintk(fmt, args); |
145 | printk("\n"); | 145 | printk("\n"); |
146 | va_end(args); | 146 | va_end(args); |
147 | } | 147 | } |
148 | 148 | ||
149 | 149 | ||
150 | struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) | 150 | struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) |
151 | { | 151 | { |
152 | struct nilfs_inode_info *ii; | 152 | struct nilfs_inode_info *ii; |
153 | 153 | ||
154 | ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); | 154 | ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); |
155 | if (!ii) | 155 | if (!ii) |
156 | return NULL; | 156 | return NULL; |
157 | ii->i_bh = NULL; | 157 | ii->i_bh = NULL; |
158 | ii->i_state = 0; | 158 | ii->i_state = 0; |
159 | ii->vfs_inode.i_version = 1; | 159 | ii->vfs_inode.i_version = 1; |
160 | nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi); | 160 | nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi); |
161 | return &ii->vfs_inode; | 161 | return &ii->vfs_inode; |
162 | } | 162 | } |
163 | 163 | ||
164 | struct inode *nilfs_alloc_inode(struct super_block *sb) | 164 | struct inode *nilfs_alloc_inode(struct super_block *sb) |
165 | { | 165 | { |
166 | return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs); | 166 | return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs); |
167 | } | 167 | } |
168 | 168 | ||
169 | void nilfs_destroy_inode(struct inode *inode) | 169 | void nilfs_destroy_inode(struct inode *inode) |
170 | { | 170 | { |
171 | kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); | 171 | kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); |
172 | } | 172 | } |
173 | 173 | ||
174 | static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) | 174 | static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) |
175 | { | 175 | { |
176 | struct the_nilfs *nilfs = sbi->s_nilfs; | 176 | struct the_nilfs *nilfs = sbi->s_nilfs; |
177 | int err; | 177 | int err; |
178 | int barrier_done = 0; | ||
179 | 178 | ||
180 | if (nilfs_test_opt(sbi, BARRIER)) { | ||
181 | set_buffer_ordered(nilfs->ns_sbh[0]); | ||
182 | barrier_done = 1; | ||
183 | } | ||
184 | retry: | 179 | retry: |
185 | set_buffer_dirty(nilfs->ns_sbh[0]); | 180 | set_buffer_dirty(nilfs->ns_sbh[0]); |
186 | err = sync_dirty_buffer(nilfs->ns_sbh[0]); | 181 | |
187 | if (err == -EOPNOTSUPP && barrier_done) { | 182 | if (nilfs_test_opt(sbi, BARRIER)) { |
188 | nilfs_warning(sbi->s_super, __func__, | 183 | err = __sync_dirty_buffer(nilfs->ns_sbh[0], |
189 | "barrier-based sync failed. " | 184 | WRITE_SYNC | WRITE_BARRIER); |
190 | "disabling barriers\n"); | 185 | if (err == -EOPNOTSUPP) { |
191 | nilfs_clear_opt(sbi, BARRIER); | 186 | nilfs_warning(sbi->s_super, __func__, |
192 | barrier_done = 0; | 187 | "barrier-based sync failed. " |
193 | clear_buffer_ordered(nilfs->ns_sbh[0]); | 188 | "disabling barriers\n"); |
194 | goto retry; | 189 | nilfs_clear_opt(sbi, BARRIER); |
190 | goto retry; | ||
191 | } | ||
192 | } else { | ||
193 | err = sync_dirty_buffer(nilfs->ns_sbh[0]); | ||
195 | } | 194 | } |
195 | |||
196 | if (unlikely(err)) { | 196 | if (unlikely(err)) { |
197 | printk(KERN_ERR | 197 | printk(KERN_ERR |
198 | "NILFS: unable to write superblock (err=%d)\n", err); | 198 | "NILFS: unable to write superblock (err=%d)\n", err); |
199 | if (err == -EIO && nilfs->ns_sbh[1]) { | 199 | if (err == -EIO && nilfs->ns_sbh[1]) { |
200 | /* | 200 | /* |
201 | * sbp[0] points to newer log than sbp[1], | 201 | * sbp[0] points to newer log than sbp[1], |
202 | * so copy sbp[0] to sbp[1] to take over sbp[0]. | 202 | * so copy sbp[0] to sbp[1] to take over sbp[0]. |
203 | */ | 203 | */ |
204 | memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0], | 204 | memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0], |
205 | nilfs->ns_sbsize); | 205 | nilfs->ns_sbsize); |
206 | nilfs_fall_back_super_block(nilfs); | 206 | nilfs_fall_back_super_block(nilfs); |
207 | goto retry; | 207 | goto retry; |
208 | } | 208 | } |
209 | } else { | 209 | } else { |
210 | struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; | 210 | struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; |
211 | 211 | ||
212 | nilfs->ns_sbwcount++; | 212 | nilfs->ns_sbwcount++; |
213 | 213 | ||
214 | /* | 214 | /* |
215 | * The latest segment becomes trailable from the position | 215 | * The latest segment becomes trailable from the position |
216 | * written in superblock. | 216 | * written in superblock. |
217 | */ | 217 | */ |
218 | clear_nilfs_discontinued(nilfs); | 218 | clear_nilfs_discontinued(nilfs); |
219 | 219 | ||
220 | /* update GC protection for recent segments */ | 220 | /* update GC protection for recent segments */ |
221 | if (nilfs->ns_sbh[1]) { | 221 | if (nilfs->ns_sbh[1]) { |
222 | if (flag == NILFS_SB_COMMIT_ALL) { | 222 | if (flag == NILFS_SB_COMMIT_ALL) { |
223 | set_buffer_dirty(nilfs->ns_sbh[1]); | 223 | set_buffer_dirty(nilfs->ns_sbh[1]); |
224 | if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0) | 224 | if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0) |
225 | goto out; | 225 | goto out; |
226 | } | 226 | } |
227 | if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) < | 227 | if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) < |
228 | le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno)) | 228 | le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno)) |
229 | sbp = nilfs->ns_sbp[1]; | 229 | sbp = nilfs->ns_sbp[1]; |
230 | } | 230 | } |
231 | 231 | ||
232 | spin_lock(&nilfs->ns_last_segment_lock); | 232 | spin_lock(&nilfs->ns_last_segment_lock); |
233 | nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); | 233 | nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); |
234 | spin_unlock(&nilfs->ns_last_segment_lock); | 234 | spin_unlock(&nilfs->ns_last_segment_lock); |
235 | } | 235 | } |
236 | out: | 236 | out: |
237 | return err; | 237 | return err; |
238 | } | 238 | } |
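The retry logic above is the heart of the nilfs2 side of this change: instead of tagging ns_sbh[0] as an ordered buffer, the caller passes the exact write type to __sync_dirty_buffer() and drops back to a plain sync_dirty_buffer() when the device returns -EOPNOTSUPP for barriers. A minimal sketch of that calling convention, assuming a caller that owns a single superblock buffer_head (the helper name and its use_barrier parameter are illustrative):

#include <linux/buffer_head.h>
#include <linux/fs.h>

/*
 * Sketch: the write flag is an argument to __sync_dirty_buffer()
 * instead of a BH_Ordered bit set on the buffer itself.
 */
static int sketch_write_super_bh(struct buffer_head *bh, int use_barrier)
{
	int err;

	set_buffer_dirty(bh);
	if (use_barrier) {
		err = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
		if (err != -EOPNOTSUPP)
			return err;
		/* Device rejected the barrier: redirty and retry plainly. */
		set_buffer_dirty(bh);
	}
	return sync_dirty_buffer(bh);
}

In nilfs_sync_super() the fallback additionally clears the BARRIER mount option, so later superblock writes skip the barrier attempt entirely.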
239 | 239 | ||
240 | void nilfs_set_log_cursor(struct nilfs_super_block *sbp, | 240 | void nilfs_set_log_cursor(struct nilfs_super_block *sbp, |
241 | struct the_nilfs *nilfs) | 241 | struct the_nilfs *nilfs) |
242 | { | 242 | { |
243 | sector_t nfreeblocks; | 243 | sector_t nfreeblocks; |
244 | 244 | ||
245 | /* nilfs->ns_sem must be locked by the caller. */ | 245 | /* nilfs->ns_sem must be locked by the caller. */ |
246 | nilfs_count_free_blocks(nilfs, &nfreeblocks); | 246 | nilfs_count_free_blocks(nilfs, &nfreeblocks); |
247 | sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks); | 247 | sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks); |
248 | 248 | ||
249 | spin_lock(&nilfs->ns_last_segment_lock); | 249 | spin_lock(&nilfs->ns_last_segment_lock); |
250 | sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); | 250 | sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); |
251 | sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); | 251 | sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); |
252 | sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); | 252 | sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); |
253 | spin_unlock(&nilfs->ns_last_segment_lock); | 253 | spin_unlock(&nilfs->ns_last_segment_lock); |
254 | } | 254 | } |
255 | 255 | ||
256 | struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi, | 256 | struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi, |
257 | int flip) | 257 | int flip) |
258 | { | 258 | { |
259 | struct the_nilfs *nilfs = sbi->s_nilfs; | 259 | struct the_nilfs *nilfs = sbi->s_nilfs; |
260 | struct nilfs_super_block **sbp = nilfs->ns_sbp; | 260 | struct nilfs_super_block **sbp = nilfs->ns_sbp; |
261 | 261 | ||
262 | /* nilfs->ns_sem must be locked by the caller. */ | 262 | /* nilfs->ns_sem must be locked by the caller. */ |
263 | if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { | 263 | if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { |
264 | if (sbp[1] && | 264 | if (sbp[1] && |
265 | sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { | 265 | sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { |
266 | memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); | 266 | memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); |
267 | } else { | 267 | } else { |
268 | printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", | 268 | printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", |
269 | sbi->s_super->s_id); | 269 | sbi->s_super->s_id); |
270 | return NULL; | 270 | return NULL; |
271 | } | 271 | } |
272 | } else if (sbp[1] && | 272 | } else if (sbp[1] && |
273 | sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { | 273 | sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { |
274 | memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); | 274 | memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); |
275 | } | 275 | } |
276 | 276 | ||
277 | if (flip && sbp[1]) | 277 | if (flip && sbp[1]) |
278 | nilfs_swap_super_block(nilfs); | 278 | nilfs_swap_super_block(nilfs); |
279 | 279 | ||
280 | return sbp; | 280 | return sbp; |
281 | } | 281 | } |
282 | 282 | ||
283 | int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag) | 283 | int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag) |
284 | { | 284 | { |
285 | struct the_nilfs *nilfs = sbi->s_nilfs; | 285 | struct the_nilfs *nilfs = sbi->s_nilfs; |
286 | struct nilfs_super_block **sbp = nilfs->ns_sbp; | 286 | struct nilfs_super_block **sbp = nilfs->ns_sbp; |
287 | time_t t; | 287 | time_t t; |
288 | 288 | ||
289 | /* nilfs->ns_sem must be locked by the caller. */ | 289 | /* nilfs->ns_sem must be locked by the caller. */ |
290 | t = get_seconds(); | 290 | t = get_seconds(); |
291 | nilfs->ns_sbwtime = t; | 291 | nilfs->ns_sbwtime = t; |
292 | sbp[0]->s_wtime = cpu_to_le64(t); | 292 | sbp[0]->s_wtime = cpu_to_le64(t); |
293 | sbp[0]->s_sum = 0; | 293 | sbp[0]->s_sum = 0; |
294 | sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, | 294 | sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, |
295 | (unsigned char *)sbp[0], | 295 | (unsigned char *)sbp[0], |
296 | nilfs->ns_sbsize)); | 296 | nilfs->ns_sbsize)); |
297 | if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) { | 297 | if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) { |
298 | sbp[1]->s_wtime = sbp[0]->s_wtime; | 298 | sbp[1]->s_wtime = sbp[0]->s_wtime; |
299 | sbp[1]->s_sum = 0; | 299 | sbp[1]->s_sum = 0; |
300 | sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, | 300 | sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, |
301 | (unsigned char *)sbp[1], | 301 | (unsigned char *)sbp[1], |
302 | nilfs->ns_sbsize)); | 302 | nilfs->ns_sbsize)); |
303 | } | 303 | } |
304 | clear_nilfs_sb_dirty(nilfs); | 304 | clear_nilfs_sb_dirty(nilfs); |
305 | return nilfs_sync_super(sbi, flag); | 305 | return nilfs_sync_super(sbi, flag); |
306 | } | 306 | } |
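nilfs_commit_super() above refreshes the write time and checksum before calling nilfs_sync_super(); the s_sum field is zeroed first so that the CRC is computed over a deterministic image of the block and can later be verified the same way. A sketch of just that step, assuming the nilfs2 on-disk superblock definition (struct nilfs_super_block) and the usual byte-order helpers are available; the helper name is illustrative:

#include <linux/crc32.h>
#include <linux/types.h>

/* Sketch: checksum a superblock image with its s_sum field zeroed. */
static void sketch_sb_checksum(struct nilfs_super_block *sbp, u32 seed,
			       unsigned int sbsize)
{
	sbp->s_sum = 0;
	sbp->s_sum = cpu_to_le32(crc32_le(seed, (unsigned char *)sbp,
					  sbsize));
}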
307 | 307 | ||
308 | /** | 308 | /** |
309 | * nilfs_cleanup_super() - write filesystem state for cleanup | 309 | * nilfs_cleanup_super() - write filesystem state for cleanup |
310 | * @sbi: nilfs_sb_info to be unmounted or degraded to read-only | 310 | * @sbi: nilfs_sb_info to be unmounted or degraded to read-only |
311 | * | 311 | * |
312 | * This function restores state flags in the on-disk super block. | 312 | * This function restores state flags in the on-disk super block. |
313 | * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the | 313 | * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the |
314 | * filesystem was not clean previously. | 314 | * filesystem was not clean previously. |
315 | */ | 315 | */ |
316 | int nilfs_cleanup_super(struct nilfs_sb_info *sbi) | 316 | int nilfs_cleanup_super(struct nilfs_sb_info *sbi) |
317 | { | 317 | { |
318 | struct nilfs_super_block **sbp; | 318 | struct nilfs_super_block **sbp; |
319 | int flag = NILFS_SB_COMMIT; | 319 | int flag = NILFS_SB_COMMIT; |
320 | int ret = -EIO; | 320 | int ret = -EIO; |
321 | 321 | ||
322 | sbp = nilfs_prepare_super(sbi, 0); | 322 | sbp = nilfs_prepare_super(sbi, 0); |
323 | if (sbp) { | 323 | if (sbp) { |
324 | sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state); | 324 | sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state); |
325 | nilfs_set_log_cursor(sbp[0], sbi->s_nilfs); | 325 | nilfs_set_log_cursor(sbp[0], sbi->s_nilfs); |
326 | if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { | 326 | if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { |
327 | /* | 327 | /* |
328 | * make the "clean" flag also to the opposite | 328 | * make the "clean" flag also to the opposite |
329 | * super block if both super blocks point to | 329 | * super block if both super blocks point to |
330 | * the same checkpoint. | 330 | * the same checkpoint. |
331 | */ | 331 | */ |
332 | sbp[1]->s_state = sbp[0]->s_state; | 332 | sbp[1]->s_state = sbp[0]->s_state; |
333 | flag = NILFS_SB_COMMIT_ALL; | 333 | flag = NILFS_SB_COMMIT_ALL; |
334 | } | 334 | } |
335 | ret = nilfs_commit_super(sbi, flag); | 335 | ret = nilfs_commit_super(sbi, flag); |
336 | } | 336 | } |
337 | return ret; | 337 | return ret; |
338 | } | 338 | } |
339 | 339 | ||
340 | static void nilfs_put_super(struct super_block *sb) | 340 | static void nilfs_put_super(struct super_block *sb) |
341 | { | 341 | { |
342 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | 342 | struct nilfs_sb_info *sbi = NILFS_SB(sb); |
343 | struct the_nilfs *nilfs = sbi->s_nilfs; | 343 | struct the_nilfs *nilfs = sbi->s_nilfs; |
344 | 344 | ||
345 | lock_kernel(); | 345 | lock_kernel(); |
346 | 346 | ||
347 | nilfs_detach_segment_constructor(sbi); | 347 | nilfs_detach_segment_constructor(sbi); |
348 | 348 | ||
349 | if (!(sb->s_flags & MS_RDONLY)) { | 349 | if (!(sb->s_flags & MS_RDONLY)) { |
350 | down_write(&nilfs->ns_sem); | 350 | down_write(&nilfs->ns_sem); |
351 | nilfs_cleanup_super(sbi); | 351 | nilfs_cleanup_super(sbi); |
352 | up_write(&nilfs->ns_sem); | 352 | up_write(&nilfs->ns_sem); |
353 | } | 353 | } |
354 | down_write(&nilfs->ns_super_sem); | 354 | down_write(&nilfs->ns_super_sem); |
355 | if (nilfs->ns_current == sbi) | 355 | if (nilfs->ns_current == sbi) |
356 | nilfs->ns_current = NULL; | 356 | nilfs->ns_current = NULL; |
357 | up_write(&nilfs->ns_super_sem); | 357 | up_write(&nilfs->ns_super_sem); |
358 | 358 | ||
359 | nilfs_detach_checkpoint(sbi); | 359 | nilfs_detach_checkpoint(sbi); |
360 | put_nilfs(sbi->s_nilfs); | 360 | put_nilfs(sbi->s_nilfs); |
361 | sbi->s_super = NULL; | 361 | sbi->s_super = NULL; |
362 | sb->s_fs_info = NULL; | 362 | sb->s_fs_info = NULL; |
363 | nilfs_put_sbinfo(sbi); | 363 | nilfs_put_sbinfo(sbi); |
364 | 364 | ||
365 | unlock_kernel(); | 365 | unlock_kernel(); |
366 | } | 366 | } |
367 | 367 | ||
368 | static int nilfs_sync_fs(struct super_block *sb, int wait) | 368 | static int nilfs_sync_fs(struct super_block *sb, int wait) |
369 | { | 369 | { |
370 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | 370 | struct nilfs_sb_info *sbi = NILFS_SB(sb); |
371 | struct the_nilfs *nilfs = sbi->s_nilfs; | 371 | struct the_nilfs *nilfs = sbi->s_nilfs; |
372 | struct nilfs_super_block **sbp; | 372 | struct nilfs_super_block **sbp; |
373 | int err = 0; | 373 | int err = 0; |
374 | 374 | ||
375 | /* This function is called when the super block should be written back */ | 375 | /* This function is called when the super block should be written back */ |
376 | if (wait) | 376 | if (wait) |
377 | err = nilfs_construct_segment(sb); | 377 | err = nilfs_construct_segment(sb); |
378 | 378 | ||
379 | down_write(&nilfs->ns_sem); | 379 | down_write(&nilfs->ns_sem); |
380 | if (nilfs_sb_dirty(nilfs)) { | 380 | if (nilfs_sb_dirty(nilfs)) { |
381 | sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs)); | 381 | sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs)); |
382 | if (likely(sbp)) { | 382 | if (likely(sbp)) { |
383 | nilfs_set_log_cursor(sbp[0], nilfs); | 383 | nilfs_set_log_cursor(sbp[0], nilfs); |
384 | nilfs_commit_super(sbi, NILFS_SB_COMMIT); | 384 | nilfs_commit_super(sbi, NILFS_SB_COMMIT); |
385 | } | 385 | } |
386 | } | 386 | } |
387 | up_write(&nilfs->ns_sem); | 387 | up_write(&nilfs->ns_sem); |
388 | 388 | ||
389 | return err; | 389 | return err; |
390 | } | 390 | } |
391 | 391 | ||
392 | int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) | 392 | int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) |
393 | { | 393 | { |
394 | struct the_nilfs *nilfs = sbi->s_nilfs; | 394 | struct the_nilfs *nilfs = sbi->s_nilfs; |
395 | struct nilfs_checkpoint *raw_cp; | 395 | struct nilfs_checkpoint *raw_cp; |
396 | struct buffer_head *bh_cp; | 396 | struct buffer_head *bh_cp; |
397 | int err; | 397 | int err; |
398 | 398 | ||
399 | down_write(&nilfs->ns_super_sem); | 399 | down_write(&nilfs->ns_super_sem); |
400 | list_add(&sbi->s_list, &nilfs->ns_supers); | 400 | list_add(&sbi->s_list, &nilfs->ns_supers); |
401 | up_write(&nilfs->ns_super_sem); | 401 | up_write(&nilfs->ns_super_sem); |
402 | 402 | ||
403 | sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); | 403 | sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); |
404 | if (!sbi->s_ifile) | 404 | if (!sbi->s_ifile) |
405 | return -ENOMEM; | 405 | return -ENOMEM; |
406 | 406 | ||
407 | down_read(&nilfs->ns_segctor_sem); | 407 | down_read(&nilfs->ns_segctor_sem); |
408 | err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, | 408 | err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, |
409 | &bh_cp); | 409 | &bh_cp); |
410 | up_read(&nilfs->ns_segctor_sem); | 410 | up_read(&nilfs->ns_segctor_sem); |
411 | if (unlikely(err)) { | 411 | if (unlikely(err)) { |
412 | if (err == -ENOENT || err == -EINVAL) { | 412 | if (err == -ENOENT || err == -EINVAL) { |
413 | printk(KERN_ERR | 413 | printk(KERN_ERR |
414 | "NILFS: Invalid checkpoint " | 414 | "NILFS: Invalid checkpoint " |
415 | "(checkpoint number=%llu)\n", | 415 | "(checkpoint number=%llu)\n", |
416 | (unsigned long long)cno); | 416 | (unsigned long long)cno); |
417 | err = -EINVAL; | 417 | err = -EINVAL; |
418 | } | 418 | } |
419 | goto failed; | 419 | goto failed; |
420 | } | 420 | } |
421 | err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); | 421 | err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); |
422 | if (unlikely(err)) | 422 | if (unlikely(err)) |
423 | goto failed_bh; | 423 | goto failed_bh; |
424 | atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); | 424 | atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); |
425 | atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); | 425 | atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); |
426 | 426 | ||
427 | nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); | 427 | nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); |
428 | return 0; | 428 | return 0; |
429 | 429 | ||
430 | failed_bh: | 430 | failed_bh: |
431 | nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); | 431 | nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); |
432 | failed: | 432 | failed: |
433 | nilfs_mdt_destroy(sbi->s_ifile); | 433 | nilfs_mdt_destroy(sbi->s_ifile); |
434 | sbi->s_ifile = NULL; | 434 | sbi->s_ifile = NULL; |
435 | 435 | ||
436 | down_write(&nilfs->ns_super_sem); | 436 | down_write(&nilfs->ns_super_sem); |
437 | list_del_init(&sbi->s_list); | 437 | list_del_init(&sbi->s_list); |
438 | up_write(&nilfs->ns_super_sem); | 438 | up_write(&nilfs->ns_super_sem); |
439 | 439 | ||
440 | return err; | 440 | return err; |
441 | } | 441 | } |
442 | 442 | ||
443 | void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) | 443 | void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) |
444 | { | 444 | { |
445 | struct the_nilfs *nilfs = sbi->s_nilfs; | 445 | struct the_nilfs *nilfs = sbi->s_nilfs; |
446 | 446 | ||
447 | nilfs_mdt_destroy(sbi->s_ifile); | 447 | nilfs_mdt_destroy(sbi->s_ifile); |
448 | sbi->s_ifile = NULL; | 448 | sbi->s_ifile = NULL; |
449 | down_write(&nilfs->ns_super_sem); | 449 | down_write(&nilfs->ns_super_sem); |
450 | list_del_init(&sbi->s_list); | 450 | list_del_init(&sbi->s_list); |
451 | up_write(&nilfs->ns_super_sem); | 451 | up_write(&nilfs->ns_super_sem); |
452 | } | 452 | } |
453 | 453 | ||
454 | static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) | 454 | static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) |
455 | { | 455 | { |
456 | struct super_block *sb = dentry->d_sb; | 456 | struct super_block *sb = dentry->d_sb; |
457 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | 457 | struct nilfs_sb_info *sbi = NILFS_SB(sb); |
458 | struct the_nilfs *nilfs = sbi->s_nilfs; | 458 | struct the_nilfs *nilfs = sbi->s_nilfs; |
459 | u64 id = huge_encode_dev(sb->s_bdev->bd_dev); | 459 | u64 id = huge_encode_dev(sb->s_bdev->bd_dev); |
460 | unsigned long long blocks; | 460 | unsigned long long blocks; |
461 | unsigned long overhead; | 461 | unsigned long overhead; |
462 | unsigned long nrsvblocks; | 462 | unsigned long nrsvblocks; |
463 | sector_t nfreeblocks; | 463 | sector_t nfreeblocks; |
464 | int err; | 464 | int err; |
465 | 465 | ||
466 | /* | 466 | /* |
467 | * Compute all of the segment blocks | 467 | * Compute all of the segment blocks |
468 | * | 468 | * |
469 | * The blocks before the first segment and after the last | 469 | * The blocks before the first segment and after the last |
470 | * segment are excluded. | 470 | * segment are excluded. |
471 | */ | 471 | */ |
472 | blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments | 472 | blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments |
473 | - nilfs->ns_first_data_block; | 473 | - nilfs->ns_first_data_block; |
474 | nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; | 474 | nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; |
475 | 475 | ||
476 | /* | 476 | /* |
477 | * Compute the overhead | 477 | * Compute the overhead |
478 | * | 478 | * |
479 | * When metadata blocks are distributed outside the segment | 479 | * When metadata blocks are distributed outside the segment |
480 | * structure, we must count them as overhead. | 480 | * structure, we must count them as overhead. |
481 | */ | 481 | */ |
482 | overhead = 0; | 482 | overhead = 0; |
483 | 483 | ||
484 | err = nilfs_count_free_blocks(nilfs, &nfreeblocks); | 484 | err = nilfs_count_free_blocks(nilfs, &nfreeblocks); |
485 | if (unlikely(err)) | 485 | if (unlikely(err)) |
486 | return err; | 486 | return err; |
487 | 487 | ||
488 | buf->f_type = NILFS_SUPER_MAGIC; | 488 | buf->f_type = NILFS_SUPER_MAGIC; |
489 | buf->f_bsize = sb->s_blocksize; | 489 | buf->f_bsize = sb->s_blocksize; |
490 | buf->f_blocks = blocks - overhead; | 490 | buf->f_blocks = blocks - overhead; |
491 | buf->f_bfree = nfreeblocks; | 491 | buf->f_bfree = nfreeblocks; |
492 | buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? | 492 | buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? |
493 | (buf->f_bfree - nrsvblocks) : 0; | 493 | (buf->f_bfree - nrsvblocks) : 0; |
494 | buf->f_files = atomic_read(&sbi->s_inodes_count); | 494 | buf->f_files = atomic_read(&sbi->s_inodes_count); |
495 | buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ | 495 | buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ |
496 | buf->f_namelen = NILFS_NAME_LEN; | 496 | buf->f_namelen = NILFS_NAME_LEN; |
497 | buf->f_fsid.val[0] = (u32)id; | 497 | buf->f_fsid.val[0] = (u32)id; |
498 | buf->f_fsid.val[1] = (u32)(id >> 32); | 498 | buf->f_fsid.val[1] = (u32)(id >> 32); |
499 | 499 | ||
500 | return 0; | 500 | return 0; |
501 | } | 501 | } |
502 | 502 | ||
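A quick way to sanity-check the statfs arithmetic above is to run the same computation in user space. The sketch below mirrors the f_blocks/f_bfree/f_bavail derivation; all of the geometry numbers (blocks per segment, segment count, reserved segments, free blocks) are invented sample values, not anything read from a real NILFS volume.

#include <stdio.h>

int main(void)
{
	unsigned long long blocks_per_segment = 2048;	/* assumed geometry */
	unsigned long long nsegments = 512;		/* assumed geometry */
	unsigned long long first_data_block = 1;	/* assumed geometry */
	unsigned long long nrsvsegs = 8;		/* assumed reserved segments */
	unsigned long long nfreeblocks = 700000;	/* assumed free block count */

	/* total blocks covered by segments, excluding the leading area */
	unsigned long long blocks =
		blocks_per_segment * nsegments - first_data_block;
	/* blocks held back for the reserved segments */
	unsigned long long nrsvblocks = nrsvsegs * blocks_per_segment;

	/* f_bavail is clamped at zero, as in nilfs_statfs() above */
	unsigned long long bavail =
		nfreeblocks >= nrsvblocks ? nfreeblocks - nrsvblocks : 0;

	printf("f_blocks=%llu f_bfree=%llu f_bavail=%llu\n",
	       blocks, nfreeblocks, bavail);
	return 0;
}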
503 | static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) | 503 | static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) |
504 | { | 504 | { |
505 | struct super_block *sb = vfs->mnt_sb; | 505 | struct super_block *sb = vfs->mnt_sb; |
506 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | 506 | struct nilfs_sb_info *sbi = NILFS_SB(sb); |
507 | 507 | ||
508 | if (!nilfs_test_opt(sbi, BARRIER)) | 508 | if (!nilfs_test_opt(sbi, BARRIER)) |
509 | seq_puts(seq, ",nobarrier"); | 509 | seq_puts(seq, ",nobarrier"); |
510 | if (nilfs_test_opt(sbi, SNAPSHOT)) | 510 | if (nilfs_test_opt(sbi, SNAPSHOT)) |
511 | seq_printf(seq, ",cp=%llu", | 511 | seq_printf(seq, ",cp=%llu", |
512 | (unsigned long long int)sbi->s_snapshot_cno); | 512 | (unsigned long long int)sbi->s_snapshot_cno); |
513 | if (nilfs_test_opt(sbi, ERRORS_PANIC)) | 513 | if (nilfs_test_opt(sbi, ERRORS_PANIC)) |
514 | seq_puts(seq, ",errors=panic"); | 514 | seq_puts(seq, ",errors=panic"); |
515 | if (nilfs_test_opt(sbi, ERRORS_CONT)) | 515 | if (nilfs_test_opt(sbi, ERRORS_CONT)) |
516 | seq_puts(seq, ",errors=continue"); | 516 | seq_puts(seq, ",errors=continue"); |
517 | if (nilfs_test_opt(sbi, STRICT_ORDER)) | 517 | if (nilfs_test_opt(sbi, STRICT_ORDER)) |
518 | seq_puts(seq, ",order=strict"); | 518 | seq_puts(seq, ",order=strict"); |
519 | if (nilfs_test_opt(sbi, NORECOVERY)) | 519 | if (nilfs_test_opt(sbi, NORECOVERY)) |
520 | seq_puts(seq, ",norecovery"); | 520 | seq_puts(seq, ",norecovery"); |
521 | if (nilfs_test_opt(sbi, DISCARD)) | 521 | if (nilfs_test_opt(sbi, DISCARD)) |
522 | seq_puts(seq, ",discard"); | 522 | seq_puts(seq, ",discard"); |
523 | 523 | ||
524 | return 0; | 524 | return 0; |
525 | } | 525 | } |
526 | 526 | ||
527 | static const struct super_operations nilfs_sops = { | 527 | static const struct super_operations nilfs_sops = { |
528 | .alloc_inode = nilfs_alloc_inode, | 528 | .alloc_inode = nilfs_alloc_inode, |
529 | .destroy_inode = nilfs_destroy_inode, | 529 | .destroy_inode = nilfs_destroy_inode, |
530 | .dirty_inode = nilfs_dirty_inode, | 530 | .dirty_inode = nilfs_dirty_inode, |
531 | /* .write_inode = nilfs_write_inode, */ | 531 | /* .write_inode = nilfs_write_inode, */ |
532 | /* .put_inode = nilfs_put_inode, */ | 532 | /* .put_inode = nilfs_put_inode, */ |
533 | /* .drop_inode = nilfs_drop_inode, */ | 533 | /* .drop_inode = nilfs_drop_inode, */ |
534 | .evict_inode = nilfs_evict_inode, | 534 | .evict_inode = nilfs_evict_inode, |
535 | .put_super = nilfs_put_super, | 535 | .put_super = nilfs_put_super, |
536 | /* .write_super = nilfs_write_super, */ | 536 | /* .write_super = nilfs_write_super, */ |
537 | .sync_fs = nilfs_sync_fs, | 537 | .sync_fs = nilfs_sync_fs, |
538 | /* .write_super_lockfs */ | 538 | /* .write_super_lockfs */ |
539 | /* .unlockfs */ | 539 | /* .unlockfs */ |
540 | .statfs = nilfs_statfs, | 540 | .statfs = nilfs_statfs, |
541 | .remount_fs = nilfs_remount, | 541 | .remount_fs = nilfs_remount, |
542 | /* .umount_begin */ | 542 | /* .umount_begin */ |
543 | .show_options = nilfs_show_options | 543 | .show_options = nilfs_show_options |
544 | }; | 544 | }; |
545 | 545 | ||
546 | static struct inode * | 546 | static struct inode * |
547 | nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) | 547 | nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) |
548 | { | 548 | { |
549 | struct inode *inode; | 549 | struct inode *inode; |
550 | 550 | ||
551 | if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO && | 551 | if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO && |
552 | ino != NILFS_SKETCH_INO) | 552 | ino != NILFS_SKETCH_INO) |
553 | return ERR_PTR(-ESTALE); | 553 | return ERR_PTR(-ESTALE); |
554 | 554 | ||
555 | inode = nilfs_iget(sb, ino); | 555 | inode = nilfs_iget(sb, ino); |
556 | if (IS_ERR(inode)) | 556 | if (IS_ERR(inode)) |
557 | return ERR_CAST(inode); | 557 | return ERR_CAST(inode); |
558 | if (generation && inode->i_generation != generation) { | 558 | if (generation && inode->i_generation != generation) { |
559 | iput(inode); | 559 | iput(inode); |
560 | return ERR_PTR(-ESTALE); | 560 | return ERR_PTR(-ESTALE); |
561 | } | 561 | } |
562 | 562 | ||
563 | return inode; | 563 | return inode; |
564 | } | 564 | } |
565 | 565 | ||
566 | static struct dentry * | 566 | static struct dentry * |
567 | nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, | 567 | nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, |
568 | int fh_type) | 568 | int fh_type) |
569 | { | 569 | { |
570 | return generic_fh_to_dentry(sb, fid, fh_len, fh_type, | 570 | return generic_fh_to_dentry(sb, fid, fh_len, fh_type, |
571 | nilfs_nfs_get_inode); | 571 | nilfs_nfs_get_inode); |
572 | } | 572 | } |
573 | 573 | ||
574 | static struct dentry * | 574 | static struct dentry * |
575 | nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, | 575 | nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, |
576 | int fh_type) | 576 | int fh_type) |
577 | { | 577 | { |
578 | return generic_fh_to_parent(sb, fid, fh_len, fh_type, | 578 | return generic_fh_to_parent(sb, fid, fh_len, fh_type, |
579 | nilfs_nfs_get_inode); | 579 | nilfs_nfs_get_inode); |
580 | } | 580 | } |
581 | 581 | ||
582 | static const struct export_operations nilfs_export_ops = { | 582 | static const struct export_operations nilfs_export_ops = { |
583 | .fh_to_dentry = nilfs_fh_to_dentry, | 583 | .fh_to_dentry = nilfs_fh_to_dentry, |
584 | .fh_to_parent = nilfs_fh_to_parent, | 584 | .fh_to_parent = nilfs_fh_to_parent, |
585 | .get_parent = nilfs_get_parent, | 585 | .get_parent = nilfs_get_parent, |
586 | }; | 586 | }; |
587 | 587 | ||
588 | enum { | 588 | enum { |
589 | Opt_err_cont, Opt_err_panic, Opt_err_ro, | 589 | Opt_err_cont, Opt_err_panic, Opt_err_ro, |
590 | Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, | 590 | Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, |
591 | Opt_discard, Opt_nodiscard, Opt_err, | 591 | Opt_discard, Opt_nodiscard, Opt_err, |
592 | }; | 592 | }; |
593 | 593 | ||
594 | static match_table_t tokens = { | 594 | static match_table_t tokens = { |
595 | {Opt_err_cont, "errors=continue"}, | 595 | {Opt_err_cont, "errors=continue"}, |
596 | {Opt_err_panic, "errors=panic"}, | 596 | {Opt_err_panic, "errors=panic"}, |
597 | {Opt_err_ro, "errors=remount-ro"}, | 597 | {Opt_err_ro, "errors=remount-ro"}, |
598 | {Opt_barrier, "barrier"}, | 598 | {Opt_barrier, "barrier"}, |
599 | {Opt_nobarrier, "nobarrier"}, | 599 | {Opt_nobarrier, "nobarrier"}, |
600 | {Opt_snapshot, "cp=%u"}, | 600 | {Opt_snapshot, "cp=%u"}, |
601 | {Opt_order, "order=%s"}, | 601 | {Opt_order, "order=%s"}, |
602 | {Opt_norecovery, "norecovery"}, | 602 | {Opt_norecovery, "norecovery"}, |
603 | {Opt_discard, "discard"}, | 603 | {Opt_discard, "discard"}, |
604 | {Opt_nodiscard, "nodiscard"}, | 604 | {Opt_nodiscard, "nodiscard"}, |
605 | {Opt_err, NULL} | 605 | {Opt_err, NULL} |
606 | }; | 606 | }; |
607 | 607 | ||
608 | static int parse_options(char *options, struct super_block *sb, int is_remount) | 608 | static int parse_options(char *options, struct super_block *sb, int is_remount) |
609 | { | 609 | { |
610 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | 610 | struct nilfs_sb_info *sbi = NILFS_SB(sb); |
611 | char *p; | 611 | char *p; |
612 | substring_t args[MAX_OPT_ARGS]; | 612 | substring_t args[MAX_OPT_ARGS]; |
613 | int option; | 613 | int option; |
614 | 614 | ||
615 | if (!options) | 615 | if (!options) |
616 | return 1; | 616 | return 1; |
617 | 617 | ||
618 | while ((p = strsep(&options, ",")) != NULL) { | 618 | while ((p = strsep(&options, ",")) != NULL) { |
619 | int token; | 619 | int token; |
620 | if (!*p) | 620 | if (!*p) |
621 | continue; | 621 | continue; |
622 | 622 | ||
623 | token = match_token(p, tokens, args); | 623 | token = match_token(p, tokens, args); |
624 | switch (token) { | 624 | switch (token) { |
625 | case Opt_barrier: | 625 | case Opt_barrier: |
626 | nilfs_set_opt(sbi, BARRIER); | 626 | nilfs_set_opt(sbi, BARRIER); |
627 | break; | 627 | break; |
628 | case Opt_nobarrier: | 628 | case Opt_nobarrier: |
629 | nilfs_clear_opt(sbi, BARRIER); | 629 | nilfs_clear_opt(sbi, BARRIER); |
630 | break; | 630 | break; |
631 | case Opt_order: | 631 | case Opt_order: |
632 | if (strcmp(args[0].from, "relaxed") == 0) | 632 | if (strcmp(args[0].from, "relaxed") == 0) |
633 | /* Ordered data semantics */ | 633 | /* Ordered data semantics */ |
634 | nilfs_clear_opt(sbi, STRICT_ORDER); | 634 | nilfs_clear_opt(sbi, STRICT_ORDER); |
635 | else if (strcmp(args[0].from, "strict") == 0) | 635 | else if (strcmp(args[0].from, "strict") == 0) |
636 | /* Strict in-order semantics */ | 636 | /* Strict in-order semantics */ |
637 | nilfs_set_opt(sbi, STRICT_ORDER); | 637 | nilfs_set_opt(sbi, STRICT_ORDER); |
638 | else | 638 | else |
639 | return 0; | 639 | return 0; |
640 | break; | 640 | break; |
641 | case Opt_err_panic: | 641 | case Opt_err_panic: |
642 | nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC); | 642 | nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC); |
643 | break; | 643 | break; |
644 | case Opt_err_ro: | 644 | case Opt_err_ro: |
645 | nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO); | 645 | nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO); |
646 | break; | 646 | break; |
647 | case Opt_err_cont: | 647 | case Opt_err_cont: |
648 | nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); | 648 | nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); |
649 | break; | 649 | break; |
650 | case Opt_snapshot: | 650 | case Opt_snapshot: |
651 | if (match_int(&args[0], &option) || option <= 0) | 651 | if (match_int(&args[0], &option) || option <= 0) |
652 | return 0; | 652 | return 0; |
653 | if (is_remount) { | 653 | if (is_remount) { |
654 | if (!nilfs_test_opt(sbi, SNAPSHOT)) { | 654 | if (!nilfs_test_opt(sbi, SNAPSHOT)) { |
655 | printk(KERN_ERR | 655 | printk(KERN_ERR |
656 | "NILFS: cannot change regular " | 656 | "NILFS: cannot change regular " |
657 | "mount to snapshot.\n"); | 657 | "mount to snapshot.\n"); |
658 | return 0; | 658 | return 0; |
659 | } else if (option != sbi->s_snapshot_cno) { | 659 | } else if (option != sbi->s_snapshot_cno) { |
660 | printk(KERN_ERR | 660 | printk(KERN_ERR |
661 | "NILFS: cannot remount to a " | 661 | "NILFS: cannot remount to a " |
662 | "different snapshot.\n"); | 662 | "different snapshot.\n"); |
663 | return 0; | 663 | return 0; |
664 | } | 664 | } |
665 | break; | 665 | break; |
666 | } | 666 | } |
667 | if (!(sb->s_flags & MS_RDONLY)) { | 667 | if (!(sb->s_flags & MS_RDONLY)) { |
668 | printk(KERN_ERR "NILFS: cannot mount snapshot " | 668 | printk(KERN_ERR "NILFS: cannot mount snapshot " |
669 | "read/write. A read-only option is " | 669 | "read/write. A read-only option is " |
670 | "required.\n"); | 670 | "required.\n"); |
671 | return 0; | 671 | return 0; |
672 | } | 672 | } |
673 | sbi->s_snapshot_cno = option; | 673 | sbi->s_snapshot_cno = option; |
674 | nilfs_set_opt(sbi, SNAPSHOT); | 674 | nilfs_set_opt(sbi, SNAPSHOT); |
675 | break; | 675 | break; |
676 | case Opt_norecovery: | 676 | case Opt_norecovery: |
677 | nilfs_set_opt(sbi, NORECOVERY); | 677 | nilfs_set_opt(sbi, NORECOVERY); |
678 | break; | 678 | break; |
679 | case Opt_discard: | 679 | case Opt_discard: |
680 | nilfs_set_opt(sbi, DISCARD); | 680 | nilfs_set_opt(sbi, DISCARD); |
681 | break; | 681 | break; |
682 | case Opt_nodiscard: | 682 | case Opt_nodiscard: |
683 | nilfs_clear_opt(sbi, DISCARD); | 683 | nilfs_clear_opt(sbi, DISCARD); |
684 | break; | 684 | break; |
685 | default: | 685 | default: |
686 | printk(KERN_ERR | 686 | printk(KERN_ERR |
687 | "NILFS: Unrecognized mount option \"%s\"\n", p); | 687 | "NILFS: Unrecognized mount option \"%s\"\n", p); |
688 | return 0; | 688 | return 0; |
689 | } | 689 | } |
690 | } | 690 | } |
691 | return 1; | 691 | return 1; |
692 | } | 692 | } |
693 | 693 | ||
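For readers who have not seen the strsep()-driven option walk before, here is a minimal user-space sketch of the same loop shape. It deliberately leaves out match_token()/match_int() and only recognizes a few of the strings from the tokens table above; the sample option string is invented.

#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char opts[] = "nobarrier,errors=panic,cp=5";	/* assumed mount options */
	char *options = opts;
	char *p;

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;	/* skip empty tokens such as ",," */
		if (strcmp(p, "nobarrier") == 0)
			printf("clear BARRIER\n");
		else if (strcmp(p, "errors=panic") == 0)
			printf("error mode: panic\n");
		else if (strncmp(p, "cp=", 3) == 0)
			printf("snapshot checkpoint %ld\n",
			       strtol(p + 3, NULL, 10));
		else
			printf("unrecognized option \"%s\"\n", p);
	}
	return 0;
}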
694 | static inline void | 694 | static inline void |
695 | nilfs_set_default_options(struct nilfs_sb_info *sbi, | 695 | nilfs_set_default_options(struct nilfs_sb_info *sbi, |
696 | struct nilfs_super_block *sbp) | 696 | struct nilfs_super_block *sbp) |
697 | { | 697 | { |
698 | sbi->s_mount_opt = | 698 | sbi->s_mount_opt = |
699 | NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; | 699 | NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; |
700 | } | 700 | } |
701 | 701 | ||
702 | static int nilfs_setup_super(struct nilfs_sb_info *sbi) | 702 | static int nilfs_setup_super(struct nilfs_sb_info *sbi) |
703 | { | 703 | { |
704 | struct the_nilfs *nilfs = sbi->s_nilfs; | 704 | struct the_nilfs *nilfs = sbi->s_nilfs; |
705 | struct nilfs_super_block **sbp; | 705 | struct nilfs_super_block **sbp; |
706 | int max_mnt_count; | 706 | int max_mnt_count; |
707 | int mnt_count; | 707 | int mnt_count; |
708 | 708 | ||
709 | /* nilfs->ns_sem must be locked by the caller. */ | 709 | /* nilfs->ns_sem must be locked by the caller. */ |
710 | sbp = nilfs_prepare_super(sbi, 0); | 710 | sbp = nilfs_prepare_super(sbi, 0); |
711 | if (!sbp) | 711 | if (!sbp) |
712 | return -EIO; | 712 | return -EIO; |
713 | 713 | ||
714 | max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); | 714 | max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); |
715 | mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); | 715 | mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); |
716 | 716 | ||
717 | if (nilfs->ns_mount_state & NILFS_ERROR_FS) { | 717 | if (nilfs->ns_mount_state & NILFS_ERROR_FS) { |
718 | printk(KERN_WARNING | 718 | printk(KERN_WARNING |
719 | "NILFS warning: mounting fs with errors\n"); | 719 | "NILFS warning: mounting fs with errors\n"); |
720 | #if 0 | 720 | #if 0 |
721 | } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { | 721 | } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { |
722 | printk(KERN_WARNING | 722 | printk(KERN_WARNING |
723 | "NILFS warning: maximal mount count reached\n"); | 723 | "NILFS warning: maximal mount count reached\n"); |
724 | #endif | 724 | #endif |
725 | } | 725 | } |
726 | if (!max_mnt_count) | 726 | if (!max_mnt_count) |
727 | sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); | 727 | sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); |
728 | 728 | ||
729 | sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); | 729 | sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); |
730 | sbp[0]->s_state = | 730 | sbp[0]->s_state = |
731 | cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); | 731 | cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); |
732 | sbp[0]->s_mtime = cpu_to_le64(get_seconds()); | 732 | sbp[0]->s_mtime = cpu_to_le64(get_seconds()); |
733 | /* synchronize sbp[1] with sbp[0] */ | 733 | /* synchronize sbp[1] with sbp[0] */ |
734 | memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); | 734 | memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); |
735 | return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); | 735 | return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); |
736 | } | 736 | } |
737 | 737 | ||
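nilfs_setup_super() bumps the mount count and drops NILFS_VALID_FS from s_state while the filesystem is mounted read/write; nilfs_cleanup_super() restores the flag at unmount. Below is a tiny user-space sketch of that bit manipulation, using an invented flag value in place of NILFS_VALID_FS and ignoring the le16 conversions:

#include <stdio.h>
#include <stdint.h>

#define SAMPLE_VALID_FS 0x0001	/* stand-in for NILFS_VALID_FS */

int main(void)
{
	uint16_t state = 0x0003;	/* assumed on-disk state bits */
	uint16_t mnt_count = 7;		/* assumed current mount count */

	mnt_count += 1;			/* one more mount */
	state &= ~SAMPLE_VALID_FS;	/* fs is now live, so clear the "clean" bit */

	printf("mnt_count=%u state=0x%04x\n",
	       (unsigned)mnt_count, (unsigned)state);
	return 0;
}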
738 | struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, | 738 | struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, |
739 | u64 pos, int blocksize, | 739 | u64 pos, int blocksize, |
740 | struct buffer_head **pbh) | 740 | struct buffer_head **pbh) |
741 | { | 741 | { |
742 | unsigned long long sb_index = pos; | 742 | unsigned long long sb_index = pos; |
743 | unsigned long offset; | 743 | unsigned long offset; |
744 | 744 | ||
745 | offset = do_div(sb_index, blocksize); | 745 | offset = do_div(sb_index, blocksize); |
746 | *pbh = sb_bread(sb, sb_index); | 746 | *pbh = sb_bread(sb, sb_index); |
747 | if (!*pbh) | 747 | if (!*pbh) |
748 | return NULL; | 748 | return NULL; |
749 | return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); | 749 | return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); |
750 | } | 750 | } |
751 | 751 | ||
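nilfs_read_super_block() turns a byte position into a device block index plus an offset inside that block with do_div(); in user space the same split is plain division and modulo. The position and block size below are invented sample values:

#include <stdio.h>

int main(void)
{
	unsigned long long pos = 4096 + 8;	/* assumed byte offset of a super block copy */
	unsigned int blocksize = 1024;		/* assumed device block size */

	unsigned long long sb_index = pos / blocksize;	/* block handed to sb_bread() */
	unsigned long offset = pos % blocksize;		/* offset within that block */

	printf("block %llu, offset %lu\n", sb_index, offset);
	return 0;
}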
752 | int nilfs_store_magic_and_option(struct super_block *sb, | 752 | int nilfs_store_magic_and_option(struct super_block *sb, |
753 | struct nilfs_super_block *sbp, | 753 | struct nilfs_super_block *sbp, |
754 | char *data) | 754 | char *data) |
755 | { | 755 | { |
756 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | 756 | struct nilfs_sb_info *sbi = NILFS_SB(sb); |
757 | 757 | ||
758 | sb->s_magic = le16_to_cpu(sbp->s_magic); | 758 | sb->s_magic = le16_to_cpu(sbp->s_magic); |
759 | 759 | ||
760 | /* FS independent flags */ | 760 | /* FS independent flags */ |
761 | #ifdef NILFS_ATIME_DISABLE | 761 | #ifdef NILFS_ATIME_DISABLE |
762 | sb->s_flags |= MS_NOATIME; | 762 | sb->s_flags |= MS_NOATIME; |
763 | #endif | 763 | #endif |
764 | 764 | ||
765 | nilfs_set_default_options(sbi, sbp); | 765 | nilfs_set_default_options(sbi, sbp); |
766 | 766 | ||
767 | sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); | 767 | sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); |
768 | sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid); | 768 | sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid); |
769 | sbi->s_interval = le32_to_cpu(sbp->s_c_interval); | 769 | sbi->s_interval = le32_to_cpu(sbp->s_c_interval); |
770 | sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); | 770 | sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); |
771 | 771 | ||
772 | return !parse_options(data, sb, 0) ? -EINVAL : 0 ; | 772 | return !parse_options(data, sb, 0) ? -EINVAL : 0 ; |
773 | } | 773 | } |
774 | 774 | ||
775 | int nilfs_check_feature_compatibility(struct super_block *sb, | 775 | int nilfs_check_feature_compatibility(struct super_block *sb, |
776 | struct nilfs_super_block *sbp) | 776 | struct nilfs_super_block *sbp) |
777 | { | 777 | { |
778 | __u64 features; | 778 | __u64 features; |
779 | 779 | ||
780 | features = le64_to_cpu(sbp->s_feature_incompat) & | 780 | features = le64_to_cpu(sbp->s_feature_incompat) & |
781 | ~NILFS_FEATURE_INCOMPAT_SUPP; | 781 | ~NILFS_FEATURE_INCOMPAT_SUPP; |
782 | if (features) { | 782 | if (features) { |
783 | printk(KERN_ERR "NILFS: couldn't mount because of unsupported " | 783 | printk(KERN_ERR "NILFS: couldn't mount because of unsupported " |
784 | "optional features (%llx)\n", | 784 | "optional features (%llx)\n", |
785 | (unsigned long long)features); | 785 | (unsigned long long)features); |
786 | return -EINVAL; | 786 | return -EINVAL; |
787 | } | 787 | } |
788 | features = le64_to_cpu(sbp->s_feature_compat_ro) & | 788 | features = le64_to_cpu(sbp->s_feature_compat_ro) & |
789 | ~NILFS_FEATURE_COMPAT_RO_SUPP; | 789 | ~NILFS_FEATURE_COMPAT_RO_SUPP; |
790 | if (!(sb->s_flags & MS_RDONLY) && features) { | 790 | if (!(sb->s_flags & MS_RDONLY) && features) { |
791 | printk(KERN_ERR "NILFS: couldn't mount RDWR because of " | 791 | printk(KERN_ERR "NILFS: couldn't mount RDWR because of " |
792 | "unsupported optional features (%llx)\n", | 792 | "unsupported optional features (%llx)\n", |
793 | (unsigned long long)features); | 793 | (unsigned long long)features); |
794 | return -EINVAL; | 794 | return -EINVAL; |
795 | } | 795 | } |
796 | return 0; | 796 | return 0; |
797 | } | 797 | } |
798 | 798 | ||
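The feature compatibility check above is a mask-and-test: any unknown incompat bit makes the mount fail outright, while unknown compat_ro bits only forbid a read/write mount. A minimal user-space sketch with invented bit values (not the real NILFS feature flags):

#include <stdio.h>
#include <stdint.h>

#define SUPP_INCOMPAT	0x0001ULL	/* assumed supported incompat bits */
#define SUPP_COMPAT_RO	0x0001ULL	/* assumed supported compat_ro bits */

int main(void)
{
	uint64_t incompat = 0x0003ULL;	/* assumed on-disk incompat field */
	uint64_t compat_ro = 0x0001ULL;	/* assumed on-disk compat_ro field */
	int rdonly = 0;			/* pretend this is a read/write mount */

	uint64_t unknown = incompat & ~SUPP_INCOMPAT;
	if (unknown) {
		printf("refuse mount: unsupported incompat features %#llx\n",
		       (unsigned long long)unknown);
		return 1;
	}

	unknown = compat_ro & ~SUPP_COMPAT_RO;
	if (!rdonly && unknown) {
		printf("refuse rw mount: unsupported compat_ro features %#llx\n",
		       (unsigned long long)unknown);
		return 1;
	}

	printf("mount allowed\n");
	return 0;
}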
799 | /** | 799 | /** |
800 | * nilfs_fill_super() - initialize a super block instance | 800 | * nilfs_fill_super() - initialize a super block instance |
801 | * @sb: super_block | 801 | * @sb: super_block |
802 | * @data: mount options | 802 | * @data: mount options |
803 | * @silent: silent mode flag | 803 | * @silent: silent mode flag |
804 | * @nilfs: the_nilfs struct | 804 | * @nilfs: the_nilfs struct |
805 | * | 805 | * |
806 | * This function is called with nilfs->ns_mount_mutex held exclusively, | 806 | * This function is called with nilfs->ns_mount_mutex held exclusively, |
807 | * so the recovery process is protected from other simultaneous mounts. | 807 | * so the recovery process is protected from other simultaneous mounts. |
808 | */ | 808 | */ |
809 | static int | 809 | static int |
810 | nilfs_fill_super(struct super_block *sb, void *data, int silent, | 810 | nilfs_fill_super(struct super_block *sb, void *data, int silent, |
811 | struct the_nilfs *nilfs) | 811 | struct the_nilfs *nilfs) |
812 | { | 812 | { |
813 | struct nilfs_sb_info *sbi; | 813 | struct nilfs_sb_info *sbi; |
814 | struct inode *root; | 814 | struct inode *root; |
815 | __u64 cno; | 815 | __u64 cno; |
816 | int err; | 816 | int err; |
817 | 817 | ||
818 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); | 818 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); |
819 | if (!sbi) | 819 | if (!sbi) |
820 | return -ENOMEM; | 820 | return -ENOMEM; |
821 | 821 | ||
822 | sb->s_fs_info = sbi; | 822 | sb->s_fs_info = sbi; |
823 | 823 | ||
824 | get_nilfs(nilfs); | 824 | get_nilfs(nilfs); |
825 | sbi->s_nilfs = nilfs; | 825 | sbi->s_nilfs = nilfs; |
826 | sbi->s_super = sb; | 826 | sbi->s_super = sb; |
827 | atomic_set(&sbi->s_count, 1); | 827 | atomic_set(&sbi->s_count, 1); |
828 | 828 | ||
829 | err = init_nilfs(nilfs, sbi, (char *)data); | 829 | err = init_nilfs(nilfs, sbi, (char *)data); |
830 | if (err) | 830 | if (err) |
831 | goto failed_sbi; | 831 | goto failed_sbi; |
832 | 832 | ||
833 | spin_lock_init(&sbi->s_inode_lock); | 833 | spin_lock_init(&sbi->s_inode_lock); |
834 | INIT_LIST_HEAD(&sbi->s_dirty_files); | 834 | INIT_LIST_HEAD(&sbi->s_dirty_files); |
835 | INIT_LIST_HEAD(&sbi->s_list); | 835 | INIT_LIST_HEAD(&sbi->s_list); |
836 | 836 | ||
837 | /* | 837 | /* |
838 | * The following initialization is redundant because the | 838 | * The following initialization is redundant because the |
839 | * nilfs_sb_info structure was cleared at allocation time. | 839 | * nilfs_sb_info structure was cleared at allocation time. |
840 | * We keep it explicit anyway to document our intent and to | 840 | * We keep it explicit anyway to document our intent and to |
841 | * stay ready for future changes. | 841 | * stay ready for future changes. |
842 | */ | 842 | */ |
843 | get_random_bytes(&sbi->s_next_generation, | 843 | get_random_bytes(&sbi->s_next_generation, |
844 | sizeof(sbi->s_next_generation)); | 844 | sizeof(sbi->s_next_generation)); |
845 | spin_lock_init(&sbi->s_next_gen_lock); | 845 | spin_lock_init(&sbi->s_next_gen_lock); |
846 | 846 | ||
847 | sb->s_op = &nilfs_sops; | 847 | sb->s_op = &nilfs_sops; |
848 | sb->s_export_op = &nilfs_export_ops; | 848 | sb->s_export_op = &nilfs_export_ops; |
849 | sb->s_root = NULL; | 849 | sb->s_root = NULL; |
850 | sb->s_time_gran = 1; | 850 | sb->s_time_gran = 1; |
851 | sb->s_bdi = nilfs->ns_bdi; | 851 | sb->s_bdi = nilfs->ns_bdi; |
852 | 852 | ||
853 | err = load_nilfs(nilfs, sbi); | 853 | err = load_nilfs(nilfs, sbi); |
854 | if (err) | 854 | if (err) |
855 | goto failed_sbi; | 855 | goto failed_sbi; |
856 | 856 | ||
857 | cno = nilfs_last_cno(nilfs); | 857 | cno = nilfs_last_cno(nilfs); |
858 | 858 | ||
859 | if (sb->s_flags & MS_RDONLY) { | 859 | if (sb->s_flags & MS_RDONLY) { |
860 | if (nilfs_test_opt(sbi, SNAPSHOT)) { | 860 | if (nilfs_test_opt(sbi, SNAPSHOT)) { |
861 | down_read(&nilfs->ns_segctor_sem); | 861 | down_read(&nilfs->ns_segctor_sem); |
862 | err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, | 862 | err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, |
863 | sbi->s_snapshot_cno); | 863 | sbi->s_snapshot_cno); |
864 | up_read(&nilfs->ns_segctor_sem); | 864 | up_read(&nilfs->ns_segctor_sem); |
865 | if (err < 0) { | 865 | if (err < 0) { |
866 | if (err == -ENOENT) | 866 | if (err == -ENOENT) |
867 | err = -EINVAL; | 867 | err = -EINVAL; |
868 | goto failed_sbi; | 868 | goto failed_sbi; |
869 | } | 869 | } |
870 | if (!err) { | 870 | if (!err) { |
871 | printk(KERN_ERR | 871 | printk(KERN_ERR |
872 | "NILFS: The specified checkpoint is " | 872 | "NILFS: The specified checkpoint is " |
873 | "not a snapshot " | 873 | "not a snapshot " |
874 | "(checkpoint number=%llu).\n", | 874 | "(checkpoint number=%llu).\n", |
875 | (unsigned long long)sbi->s_snapshot_cno); | 875 | (unsigned long long)sbi->s_snapshot_cno); |
876 | err = -EINVAL; | 876 | err = -EINVAL; |
877 | goto failed_sbi; | 877 | goto failed_sbi; |
878 | } | 878 | } |
879 | cno = sbi->s_snapshot_cno; | 879 | cno = sbi->s_snapshot_cno; |
880 | } | 880 | } |
881 | } | 881 | } |
882 | 882 | ||
883 | err = nilfs_attach_checkpoint(sbi, cno); | 883 | err = nilfs_attach_checkpoint(sbi, cno); |
884 | if (err) { | 884 | if (err) { |
885 | printk(KERN_ERR "NILFS: error loading a checkpoint" | 885 | printk(KERN_ERR "NILFS: error loading a checkpoint" |
886 | " (checkpoint number=%llu).\n", (unsigned long long)cno); | 886 | " (checkpoint number=%llu).\n", (unsigned long long)cno); |
887 | goto failed_sbi; | 887 | goto failed_sbi; |
888 | } | 888 | } |
889 | 889 | ||
890 | if (!(sb->s_flags & MS_RDONLY)) { | 890 | if (!(sb->s_flags & MS_RDONLY)) { |
891 | err = nilfs_attach_segment_constructor(sbi); | 891 | err = nilfs_attach_segment_constructor(sbi); |
892 | if (err) | 892 | if (err) |
893 | goto failed_checkpoint; | 893 | goto failed_checkpoint; |
894 | } | 894 | } |
895 | 895 | ||
896 | root = nilfs_iget(sb, NILFS_ROOT_INO); | 896 | root = nilfs_iget(sb, NILFS_ROOT_INO); |
897 | if (IS_ERR(root)) { | 897 | if (IS_ERR(root)) { |
898 | printk(KERN_ERR "NILFS: get root inode failed\n"); | 898 | printk(KERN_ERR "NILFS: get root inode failed\n"); |
899 | err = PTR_ERR(root); | 899 | err = PTR_ERR(root); |
900 | goto failed_segctor; | 900 | goto failed_segctor; |
901 | } | 901 | } |
902 | if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { | 902 | if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { |
903 | iput(root); | 903 | iput(root); |
904 | printk(KERN_ERR "NILFS: corrupt root inode.\n"); | 904 | printk(KERN_ERR "NILFS: corrupt root inode.\n"); |
905 | err = -EINVAL; | 905 | err = -EINVAL; |
906 | goto failed_segctor; | 906 | goto failed_segctor; |
907 | } | 907 | } |
908 | sb->s_root = d_alloc_root(root); | 908 | sb->s_root = d_alloc_root(root); |
909 | if (!sb->s_root) { | 909 | if (!sb->s_root) { |
910 | iput(root); | 910 | iput(root); |
911 | printk(KERN_ERR "NILFS: get root dentry failed\n"); | 911 | printk(KERN_ERR "NILFS: get root dentry failed\n"); |
912 | err = -ENOMEM; | 912 | err = -ENOMEM; |
913 | goto failed_segctor; | 913 | goto failed_segctor; |
914 | } | 914 | } |
915 | 915 | ||
916 | if (!(sb->s_flags & MS_RDONLY)) { | 916 | if (!(sb->s_flags & MS_RDONLY)) { |
917 | down_write(&nilfs->ns_sem); | 917 | down_write(&nilfs->ns_sem); |
918 | nilfs_setup_super(sbi); | 918 | nilfs_setup_super(sbi); |
919 | up_write(&nilfs->ns_sem); | 919 | up_write(&nilfs->ns_sem); |
920 | } | 920 | } |
921 | 921 | ||
922 | down_write(&nilfs->ns_super_sem); | 922 | down_write(&nilfs->ns_super_sem); |
923 | if (!nilfs_test_opt(sbi, SNAPSHOT)) | 923 | if (!nilfs_test_opt(sbi, SNAPSHOT)) |
924 | nilfs->ns_current = sbi; | 924 | nilfs->ns_current = sbi; |
925 | up_write(&nilfs->ns_super_sem); | 925 | up_write(&nilfs->ns_super_sem); |
926 | 926 | ||
927 | return 0; | 927 | return 0; |
928 | 928 | ||
929 | failed_segctor: | 929 | failed_segctor: |
930 | nilfs_detach_segment_constructor(sbi); | 930 | nilfs_detach_segment_constructor(sbi); |
931 | 931 | ||
932 | failed_checkpoint: | 932 | failed_checkpoint: |
933 | nilfs_detach_checkpoint(sbi); | 933 | nilfs_detach_checkpoint(sbi); |
934 | 934 | ||
935 | failed_sbi: | 935 | failed_sbi: |
936 | put_nilfs(nilfs); | 936 | put_nilfs(nilfs); |
937 | sb->s_fs_info = NULL; | 937 | sb->s_fs_info = NULL; |
938 | nilfs_put_sbinfo(sbi); | 938 | nilfs_put_sbinfo(sbi); |
939 | return err; | 939 | return err; |
940 | } | 940 | } |
941 | 941 | ||
942 | static int nilfs_remount(struct super_block *sb, int *flags, char *data) | 942 | static int nilfs_remount(struct super_block *sb, int *flags, char *data) |
943 | { | 943 | { |
944 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | 944 | struct nilfs_sb_info *sbi = NILFS_SB(sb); |
945 | struct the_nilfs *nilfs = sbi->s_nilfs; | 945 | struct the_nilfs *nilfs = sbi->s_nilfs; |
946 | unsigned long old_sb_flags; | 946 | unsigned long old_sb_flags; |
947 | struct nilfs_mount_options old_opts; | 947 | struct nilfs_mount_options old_opts; |
948 | int was_snapshot, err; | 948 | int was_snapshot, err; |
949 | 949 | ||
950 | lock_kernel(); | 950 | lock_kernel(); |
951 | 951 | ||
952 | down_write(&nilfs->ns_super_sem); | 952 | down_write(&nilfs->ns_super_sem); |
953 | old_sb_flags = sb->s_flags; | 953 | old_sb_flags = sb->s_flags; |
954 | old_opts.mount_opt = sbi->s_mount_opt; | 954 | old_opts.mount_opt = sbi->s_mount_opt; |
955 | old_opts.snapshot_cno = sbi->s_snapshot_cno; | 955 | old_opts.snapshot_cno = sbi->s_snapshot_cno; |
956 | was_snapshot = nilfs_test_opt(sbi, SNAPSHOT); | 956 | was_snapshot = nilfs_test_opt(sbi, SNAPSHOT); |
957 | 957 | ||
958 | if (!parse_options(data, sb, 1)) { | 958 | if (!parse_options(data, sb, 1)) { |
959 | err = -EINVAL; | 959 | err = -EINVAL; |
960 | goto restore_opts; | 960 | goto restore_opts; |
961 | } | 961 | } |
962 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL); | 962 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL); |
963 | 963 | ||
964 | err = -EINVAL; | 964 | err = -EINVAL; |
965 | if (was_snapshot && !(*flags & MS_RDONLY)) { | 965 | if (was_snapshot && !(*flags & MS_RDONLY)) { |
966 | printk(KERN_ERR "NILFS (device %s): cannot remount snapshot " | 966 | printk(KERN_ERR "NILFS (device %s): cannot remount snapshot " |
967 | "read/write.\n", sb->s_id); | 967 | "read/write.\n", sb->s_id); |
968 | goto restore_opts; | 968 | goto restore_opts; |
969 | } | 969 | } |
970 | 970 | ||
971 | if (!nilfs_valid_fs(nilfs)) { | 971 | if (!nilfs_valid_fs(nilfs)) { |
972 | printk(KERN_WARNING "NILFS (device %s): couldn't " | 972 | printk(KERN_WARNING "NILFS (device %s): couldn't " |
973 | "remount because the filesystem is in an " | 973 | "remount because the filesystem is in an " |
974 | "incomplete recovery state.\n", sb->s_id); | 974 | "incomplete recovery state.\n", sb->s_id); |
975 | goto restore_opts; | 975 | goto restore_opts; |
976 | } | 976 | } |
977 | 977 | ||
978 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) | 978 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) |
979 | goto out; | 979 | goto out; |
980 | if (*flags & MS_RDONLY) { | 980 | if (*flags & MS_RDONLY) { |
981 | /* Shutting down the segment constructor */ | 981 | /* Shutting down the segment constructor */ |
982 | nilfs_detach_segment_constructor(sbi); | 982 | nilfs_detach_segment_constructor(sbi); |
983 | sb->s_flags |= MS_RDONLY; | 983 | sb->s_flags |= MS_RDONLY; |
984 | 984 | ||
985 | /* | 985 | /* |
986 | * Remounting a valid RW partition RDONLY, so set | 986 | * Remounting a valid RW partition RDONLY, so set |
987 | * the RDONLY flag and then mark the partition as valid again. | 987 | * the RDONLY flag and then mark the partition as valid again. |
988 | */ | 988 | */ |
989 | down_write(&nilfs->ns_sem); | 989 | down_write(&nilfs->ns_sem); |
990 | nilfs_cleanup_super(sbi); | 990 | nilfs_cleanup_super(sbi); |
991 | up_write(&nilfs->ns_sem); | 991 | up_write(&nilfs->ns_sem); |
992 | } else { | 992 | } else { |
993 | __u64 features; | 993 | __u64 features; |
994 | 994 | ||
995 | /* | 995 | /* |
996 | * Mounting a RDONLY partition read-write, so reread and | 996 | * Mounting a RDONLY partition read-write, so reread and |
997 | * store the current valid flag. (It may have been changed | 997 | * store the current valid flag. (It may have been changed |
998 | * by fsck since we originally mounted the partition.) | 998 | * by fsck since we originally mounted the partition.) |
999 | */ | 999 | */ |
1000 | down_read(&nilfs->ns_sem); | 1000 | down_read(&nilfs->ns_sem); |
1001 | features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & | 1001 | features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & |
1002 | ~NILFS_FEATURE_COMPAT_RO_SUPP; | 1002 | ~NILFS_FEATURE_COMPAT_RO_SUPP; |
1003 | up_read(&nilfs->ns_sem); | 1003 | up_read(&nilfs->ns_sem); |
1004 | if (features) { | 1004 | if (features) { |
1005 | printk(KERN_WARNING "NILFS (device %s): couldn't " | 1005 | printk(KERN_WARNING "NILFS (device %s): couldn't " |
1006 | "remount RDWR because of unsupported optional " | 1006 | "remount RDWR because of unsupported optional " |
1007 | "features (%llx)\n", | 1007 | "features (%llx)\n", |
1008 | sb->s_id, (unsigned long long)features); | 1008 | sb->s_id, (unsigned long long)features); |
1009 | err = -EROFS; | 1009 | err = -EROFS; |
1010 | goto restore_opts; | 1010 | goto restore_opts; |
1011 | } | 1011 | } |
1012 | 1012 | ||
1013 | sb->s_flags &= ~MS_RDONLY; | 1013 | sb->s_flags &= ~MS_RDONLY; |
1014 | 1014 | ||
1015 | err = nilfs_attach_segment_constructor(sbi); | 1015 | err = nilfs_attach_segment_constructor(sbi); |
1016 | if (err) | 1016 | if (err) |
1017 | goto restore_opts; | 1017 | goto restore_opts; |
1018 | 1018 | ||
1019 | down_write(&nilfs->ns_sem); | 1019 | down_write(&nilfs->ns_sem); |
1020 | nilfs_setup_super(sbi); | 1020 | nilfs_setup_super(sbi); |
1021 | up_write(&nilfs->ns_sem); | 1021 | up_write(&nilfs->ns_sem); |
1022 | } | 1022 | } |
1023 | out: | 1023 | out: |
1024 | up_write(&nilfs->ns_super_sem); | 1024 | up_write(&nilfs->ns_super_sem); |
1025 | unlock_kernel(); | 1025 | unlock_kernel(); |
1026 | return 0; | 1026 | return 0; |
1027 | 1027 | ||
1028 | restore_opts: | 1028 | restore_opts: |
1029 | sb->s_flags = old_sb_flags; | 1029 | sb->s_flags = old_sb_flags; |
1030 | sbi->s_mount_opt = old_opts.mount_opt; | 1030 | sbi->s_mount_opt = old_opts.mount_opt; |
1031 | sbi->s_snapshot_cno = old_opts.snapshot_cno; | 1031 | sbi->s_snapshot_cno = old_opts.snapshot_cno; |
1032 | up_write(&nilfs->ns_super_sem); | 1032 | up_write(&nilfs->ns_super_sem); |
1033 | unlock_kernel(); | 1033 | unlock_kernel(); |
1034 | return err; | 1034 | return err; |
1035 | } | 1035 | } |
1036 | 1036 | ||
1037 | struct nilfs_super_data { | 1037 | struct nilfs_super_data { |
1038 | struct block_device *bdev; | 1038 | struct block_device *bdev; |
1039 | struct nilfs_sb_info *sbi; | 1039 | struct nilfs_sb_info *sbi; |
1040 | __u64 cno; | 1040 | __u64 cno; |
1041 | int flags; | 1041 | int flags; |
1042 | }; | 1042 | }; |
1043 | 1043 | ||
1044 | /** | 1044 | /** |
1045 | * nilfs_identify - pre-read mount options needed to identify mount instance | 1045 | * nilfs_identify - pre-read mount options needed to identify mount instance |
1046 | * @data: mount options | 1046 | * @data: mount options |
1047 | * @sd: nilfs_super_data | 1047 | * @sd: nilfs_super_data |
1048 | */ | 1048 | */ |
1049 | static int nilfs_identify(char *data, struct nilfs_super_data *sd) | 1049 | static int nilfs_identify(char *data, struct nilfs_super_data *sd) |
1050 | { | 1050 | { |
1051 | char *p, *options = data; | 1051 | char *p, *options = data; |
1052 | substring_t args[MAX_OPT_ARGS]; | 1052 | substring_t args[MAX_OPT_ARGS]; |
1053 | int option, token; | 1053 | int option, token; |
1054 | int ret = 0; | 1054 | int ret = 0; |
1055 | 1055 | ||
1056 | do { | 1056 | do { |
1057 | p = strsep(&options, ","); | 1057 | p = strsep(&options, ","); |
1058 | if (p != NULL && *p) { | 1058 | if (p != NULL && *p) { |
1059 | token = match_token(p, tokens, args); | 1059 | token = match_token(p, tokens, args); |
1060 | if (token == Opt_snapshot) { | 1060 | if (token == Opt_snapshot) { |
1061 | if (!(sd->flags & MS_RDONLY)) | 1061 | if (!(sd->flags & MS_RDONLY)) |
1062 | ret++; | 1062 | ret++; |
1063 | else { | 1063 | else { |
1064 | ret = match_int(&args[0], &option); | 1064 | ret = match_int(&args[0], &option); |
1065 | if (!ret) { | 1065 | if (!ret) { |
1066 | if (option > 0) | 1066 | if (option > 0) |
1067 | sd->cno = option; | 1067 | sd->cno = option; |
1068 | else | 1068 | else |
1069 | ret++; | 1069 | ret++; |
1070 | } | 1070 | } |
1071 | } | 1071 | } |
1072 | } | 1072 | } |
1073 | if (ret) | 1073 | if (ret) |
1074 | printk(KERN_ERR | 1074 | printk(KERN_ERR |
1075 | "NILFS: invalid mount option: %s\n", p); | 1075 | "NILFS: invalid mount option: %s\n", p); |
1076 | } | 1076 | } |
1077 | if (!options) | 1077 | if (!options) |
1078 | break; | 1078 | break; |
1079 | BUG_ON(options == data); | 1079 | BUG_ON(options == data); |
1080 | *(options - 1) = ','; | 1080 | *(options - 1) = ','; |
1081 | } while (!ret); | 1081 | } while (!ret); |
1082 | return ret; | 1082 | return ret; |
1083 | } | 1083 | } |
1084 | 1084 | ||
1085 | static int nilfs_set_bdev_super(struct super_block *s, void *data) | 1085 | static int nilfs_set_bdev_super(struct super_block *s, void *data) |
1086 | { | 1086 | { |
1087 | struct nilfs_super_data *sd = data; | 1087 | struct nilfs_super_data *sd = data; |
1088 | 1088 | ||
1089 | s->s_bdev = sd->bdev; | 1089 | s->s_bdev = sd->bdev; |
1090 | s->s_dev = s->s_bdev->bd_dev; | 1090 | s->s_dev = s->s_bdev->bd_dev; |
1091 | return 0; | 1091 | return 0; |
1092 | } | 1092 | } |
1093 | 1093 | ||
1094 | static int nilfs_test_bdev_super(struct super_block *s, void *data) | 1094 | static int nilfs_test_bdev_super(struct super_block *s, void *data) |
1095 | { | 1095 | { |
1096 | struct nilfs_super_data *sd = data; | 1096 | struct nilfs_super_data *sd = data; |
1097 | 1097 | ||
1098 | return sd->sbi && s->s_fs_info == (void *)sd->sbi; | 1098 | return sd->sbi && s->s_fs_info == (void *)sd->sbi; |
1099 | } | 1099 | } |
1100 | 1100 | ||
1101 | static int | 1101 | static int |
1102 | nilfs_get_sb(struct file_system_type *fs_type, int flags, | 1102 | nilfs_get_sb(struct file_system_type *fs_type, int flags, |
1103 | const char *dev_name, void *data, struct vfsmount *mnt) | 1103 | const char *dev_name, void *data, struct vfsmount *mnt) |
1104 | { | 1104 | { |
1105 | struct nilfs_super_data sd; | 1105 | struct nilfs_super_data sd; |
1106 | struct super_block *s; | 1106 | struct super_block *s; |
1107 | fmode_t mode = FMODE_READ; | 1107 | fmode_t mode = FMODE_READ; |
1108 | struct the_nilfs *nilfs; | 1108 | struct the_nilfs *nilfs; |
1109 | int err, need_to_close = 1; | 1109 | int err, need_to_close = 1; |
1110 | 1110 | ||
1111 | if (!(flags & MS_RDONLY)) | 1111 | if (!(flags & MS_RDONLY)) |
1112 | mode |= FMODE_WRITE; | 1112 | mode |= FMODE_WRITE; |
1113 | 1113 | ||
1114 | sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); | 1114 | sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); |
1115 | if (IS_ERR(sd.bdev)) | 1115 | if (IS_ERR(sd.bdev)) |
1116 | return PTR_ERR(sd.bdev); | 1116 | return PTR_ERR(sd.bdev); |
1117 | 1117 | ||
1118 | /* | 1118 | /* |
1119 | * To get a mount instance via the sget() VFS routine, NILFS needs | 1119 | * To get a mount instance via the sget() VFS routine, NILFS needs |
1120 | * much more information than normal filesystems to identify a mount | 1120 | * much more information than normal filesystems to identify a mount |
1121 | * instance. For snapshot mounts, not only the mount type (ro-mount | 1121 | * instance. For snapshot mounts, not only the mount type (ro-mount |
1122 | * or rw-mount) but also a checkpoint number is required. | 1122 | * or rw-mount) but also a checkpoint number is required. |
1123 | */ | 1123 | */ |
1124 | sd.cno = 0; | 1124 | sd.cno = 0; |
1125 | sd.flags = flags; | 1125 | sd.flags = flags; |
1126 | if (nilfs_identify((char *)data, &sd)) { | 1126 | if (nilfs_identify((char *)data, &sd)) { |
1127 | err = -EINVAL; | 1127 | err = -EINVAL; |
1128 | goto failed; | 1128 | goto failed; |
1129 | } | 1129 | } |
1130 | 1130 | ||
1131 | nilfs = find_or_create_nilfs(sd.bdev); | 1131 | nilfs = find_or_create_nilfs(sd.bdev); |
1132 | if (!nilfs) { | 1132 | if (!nilfs) { |
1133 | err = -ENOMEM; | 1133 | err = -ENOMEM; |
1134 | goto failed; | 1134 | goto failed; |
1135 | } | 1135 | } |
1136 | 1136 | ||
1137 | mutex_lock(&nilfs->ns_mount_mutex); | 1137 | mutex_lock(&nilfs->ns_mount_mutex); |
1138 | 1138 | ||
1139 | if (!sd.cno) { | 1139 | if (!sd.cno) { |
1140 | /* | 1140 | /* |
1141 | * Check if an exclusive mount exists or not. | 1141 | * Check if an exclusive mount exists or not. |
1142 | * Snapshot mounts coexist with a current mount | 1142 | * Snapshot mounts coexist with a current mount |
1143 | * (i.e. rw-mount or ro-mount), whereas rw-mount and | 1143 | * (i.e. rw-mount or ro-mount), whereas rw-mount and |
1144 | * ro-mount are mutually exclusive. | 1144 | * ro-mount are mutually exclusive. |
1145 | */ | 1145 | */ |
1146 | down_read(&nilfs->ns_super_sem); | 1146 | down_read(&nilfs->ns_super_sem); |
1147 | if (nilfs->ns_current && | 1147 | if (nilfs->ns_current && |
1148 | ((nilfs->ns_current->s_super->s_flags ^ flags) | 1148 | ((nilfs->ns_current->s_super->s_flags ^ flags) |
1149 | & MS_RDONLY)) { | 1149 | & MS_RDONLY)) { |
1150 | up_read(&nilfs->ns_super_sem); | 1150 | up_read(&nilfs->ns_super_sem); |
1151 | err = -EBUSY; | 1151 | err = -EBUSY; |
1152 | goto failed_unlock; | 1152 | goto failed_unlock; |
1153 | } | 1153 | } |
1154 | up_read(&nilfs->ns_super_sem); | 1154 | up_read(&nilfs->ns_super_sem); |
1155 | } | 1155 | } |
1156 | 1156 | ||
1157 | /* | 1157 | /* |
1158 | * Find existing nilfs_sb_info struct | 1158 | * Find existing nilfs_sb_info struct |
1159 | */ | 1159 | */ |
1160 | sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); | 1160 | sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); |
1161 | 1161 | ||
1162 | /* | 1162 | /* |
1163 | * Get super block instance holding the nilfs_sb_info struct. | 1163 | * Get super block instance holding the nilfs_sb_info struct. |
1164 | * A new instance is allocated if no existing mount is present or | 1164 | * A new instance is allocated if no existing mount is present or |
1165 | * the existing instance has been unmounted. | 1165 | * the existing instance has been unmounted. |
1166 | */ | 1166 | */ |
1167 | s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); | 1167 | s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); |
1168 | if (sd.sbi) | 1168 | if (sd.sbi) |
1169 | nilfs_put_sbinfo(sd.sbi); | 1169 | nilfs_put_sbinfo(sd.sbi); |
1170 | 1170 | ||
1171 | if (IS_ERR(s)) { | 1171 | if (IS_ERR(s)) { |
1172 | err = PTR_ERR(s); | 1172 | err = PTR_ERR(s); |
1173 | goto failed_unlock; | 1173 | goto failed_unlock; |
1174 | } | 1174 | } |
1175 | 1175 | ||
1176 | if (!s->s_root) { | 1176 | if (!s->s_root) { |
1177 | char b[BDEVNAME_SIZE]; | 1177 | char b[BDEVNAME_SIZE]; |
1178 | 1178 | ||
1179 | /* New superblock instance created */ | 1179 | /* New superblock instance created */ |
1180 | s->s_flags = flags; | 1180 | s->s_flags = flags; |
1181 | s->s_mode = mode; | 1181 | s->s_mode = mode; |
1182 | strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); | 1182 | strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); |
1183 | sb_set_blocksize(s, block_size(sd.bdev)); | 1183 | sb_set_blocksize(s, block_size(sd.bdev)); |
1184 | 1184 | ||
1185 | err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0, | 1185 | err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0, |
1186 | nilfs); | 1186 | nilfs); |
1187 | if (err) | 1187 | if (err) |
1188 | goto cancel_new; | 1188 | goto cancel_new; |
1189 | 1189 | ||
1190 | s->s_flags |= MS_ACTIVE; | 1190 | s->s_flags |= MS_ACTIVE; |
1191 | need_to_close = 0; | 1191 | need_to_close = 0; |
1192 | } | 1192 | } |
1193 | 1193 | ||
1194 | mutex_unlock(&nilfs->ns_mount_mutex); | 1194 | mutex_unlock(&nilfs->ns_mount_mutex); |
1195 | put_nilfs(nilfs); | 1195 | put_nilfs(nilfs); |
1196 | if (need_to_close) | 1196 | if (need_to_close) |
1197 | close_bdev_exclusive(sd.bdev, mode); | 1197 | close_bdev_exclusive(sd.bdev, mode); |
1198 | simple_set_mnt(mnt, s); | 1198 | simple_set_mnt(mnt, s); |
1199 | return 0; | 1199 | return 0; |
1200 | 1200 | ||
1201 | failed_unlock: | 1201 | failed_unlock: |
1202 | mutex_unlock(&nilfs->ns_mount_mutex); | 1202 | mutex_unlock(&nilfs->ns_mount_mutex); |
1203 | put_nilfs(nilfs); | 1203 | put_nilfs(nilfs); |
1204 | failed: | 1204 | failed: |
1205 | close_bdev_exclusive(sd.bdev, mode); | 1205 | close_bdev_exclusive(sd.bdev, mode); |
1206 | 1206 | ||
1207 | return err; | 1207 | return err; |
1208 | 1208 | ||
1209 | cancel_new: | 1209 | cancel_new: |
1210 | /* Abandoning the newly allocated superblock */ | 1210 | /* Abandoning the newly allocated superblock */ |
1211 | mutex_unlock(&nilfs->ns_mount_mutex); | 1211 | mutex_unlock(&nilfs->ns_mount_mutex); |
1212 | put_nilfs(nilfs); | 1212 | put_nilfs(nilfs); |
1213 | deactivate_locked_super(s); | 1213 | deactivate_locked_super(s); |
1214 | /* | 1214 | /* |
1215 | * deactivate_locked_super() invokes close_bdev_exclusive(). | 1215 | * deactivate_locked_super() invokes close_bdev_exclusive(). |
1216 | * We must finish all cleanup before this call; | 1216 | * We must finish all cleanup before this call; |
1217 | * put_nilfs() needs the block device. | 1217 | * put_nilfs() needs the block device. |
1218 | */ | 1218 | */ |
1219 | return err; | 1219 | return err; |
1220 | } | 1220 | } |
1221 | 1221 | ||
1222 | struct file_system_type nilfs_fs_type = { | 1222 | struct file_system_type nilfs_fs_type = { |
1223 | .owner = THIS_MODULE, | 1223 | .owner = THIS_MODULE, |
1224 | .name = "nilfs2", | 1224 | .name = "nilfs2", |
1225 | .get_sb = nilfs_get_sb, | 1225 | .get_sb = nilfs_get_sb, |
1226 | .kill_sb = kill_block_super, | 1226 | .kill_sb = kill_block_super, |
1227 | .fs_flags = FS_REQUIRES_DEV, | 1227 | .fs_flags = FS_REQUIRES_DEV, |
1228 | }; | 1228 | }; |
1229 | 1229 | ||
1230 | static void nilfs_inode_init_once(void *obj) | 1230 | static void nilfs_inode_init_once(void *obj) |
1231 | { | 1231 | { |
1232 | struct nilfs_inode_info *ii = obj; | 1232 | struct nilfs_inode_info *ii = obj; |
1233 | 1233 | ||
1234 | INIT_LIST_HEAD(&ii->i_dirty); | 1234 | INIT_LIST_HEAD(&ii->i_dirty); |
1235 | #ifdef CONFIG_NILFS_XATTR | 1235 | #ifdef CONFIG_NILFS_XATTR |
1236 | init_rwsem(&ii->xattr_sem); | 1236 | init_rwsem(&ii->xattr_sem); |
1237 | #endif | 1237 | #endif |
1238 | nilfs_btnode_cache_init_once(&ii->i_btnode_cache); | 1238 | nilfs_btnode_cache_init_once(&ii->i_btnode_cache); |
1239 | ii->i_bmap = &ii->i_bmap_data; | 1239 | ii->i_bmap = &ii->i_bmap_data; |
1240 | inode_init_once(&ii->vfs_inode); | 1240 | inode_init_once(&ii->vfs_inode); |
1241 | } | 1241 | } |
1242 | 1242 | ||
1243 | static void nilfs_segbuf_init_once(void *obj) | 1243 | static void nilfs_segbuf_init_once(void *obj) |
1244 | { | 1244 | { |
1245 | memset(obj, 0, sizeof(struct nilfs_segment_buffer)); | 1245 | memset(obj, 0, sizeof(struct nilfs_segment_buffer)); |
1246 | } | 1246 | } |
1247 | 1247 | ||
1248 | static void nilfs_destroy_cachep(void) | 1248 | static void nilfs_destroy_cachep(void) |
1249 | { | 1249 | { |
1250 | if (nilfs_inode_cachep) | 1250 | if (nilfs_inode_cachep) |
1251 | kmem_cache_destroy(nilfs_inode_cachep); | 1251 | kmem_cache_destroy(nilfs_inode_cachep); |
1252 | if (nilfs_transaction_cachep) | 1252 | if (nilfs_transaction_cachep) |
1253 | kmem_cache_destroy(nilfs_transaction_cachep); | 1253 | kmem_cache_destroy(nilfs_transaction_cachep); |
1254 | if (nilfs_segbuf_cachep) | 1254 | if (nilfs_segbuf_cachep) |
1255 | kmem_cache_destroy(nilfs_segbuf_cachep); | 1255 | kmem_cache_destroy(nilfs_segbuf_cachep); |
1256 | if (nilfs_btree_path_cache) | 1256 | if (nilfs_btree_path_cache) |
1257 | kmem_cache_destroy(nilfs_btree_path_cache); | 1257 | kmem_cache_destroy(nilfs_btree_path_cache); |
1258 | } | 1258 | } |
1259 | 1259 | ||
1260 | static int __init nilfs_init_cachep(void) | 1260 | static int __init nilfs_init_cachep(void) |
1261 | { | 1261 | { |
1262 | nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", | 1262 | nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", |
1263 | sizeof(struct nilfs_inode_info), 0, | 1263 | sizeof(struct nilfs_inode_info), 0, |
1264 | SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); | 1264 | SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); |
1265 | if (!nilfs_inode_cachep) | 1265 | if (!nilfs_inode_cachep) |
1266 | goto fail; | 1266 | goto fail; |
1267 | 1267 | ||
1268 | nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache", | 1268 | nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache", |
1269 | sizeof(struct nilfs_transaction_info), 0, | 1269 | sizeof(struct nilfs_transaction_info), 0, |
1270 | SLAB_RECLAIM_ACCOUNT, NULL); | 1270 | SLAB_RECLAIM_ACCOUNT, NULL); |
1271 | if (!nilfs_transaction_cachep) | 1271 | if (!nilfs_transaction_cachep) |
1272 | goto fail; | 1272 | goto fail; |
1273 | 1273 | ||
1274 | nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache", | 1274 | nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache", |
1275 | sizeof(struct nilfs_segment_buffer), 0, | 1275 | sizeof(struct nilfs_segment_buffer), 0, |
1276 | SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once); | 1276 | SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once); |
1277 | if (!nilfs_segbuf_cachep) | 1277 | if (!nilfs_segbuf_cachep) |
1278 | goto fail; | 1278 | goto fail; |
1279 | 1279 | ||
1280 | nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache", | 1280 | nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache", |
1281 | sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX, | 1281 | sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX, |
1282 | 0, 0, NULL); | 1282 | 0, 0, NULL); |
1283 | if (!nilfs_btree_path_cache) | 1283 | if (!nilfs_btree_path_cache) |
1284 | goto fail; | 1284 | goto fail; |
1285 | 1285 | ||
1286 | return 0; | 1286 | return 0; |
1287 | 1287 | ||
1288 | fail: | 1288 | fail: |
1289 | nilfs_destroy_cachep(); | 1289 | nilfs_destroy_cachep(); |
1290 | return -ENOMEM; | 1290 | return -ENOMEM; |
1291 | } | 1291 | } |
1292 | 1292 | ||
1293 | static int __init init_nilfs_fs(void) | 1293 | static int __init init_nilfs_fs(void) |
1294 | { | 1294 | { |
1295 | int err; | 1295 | int err; |
1296 | 1296 | ||
1297 | err = nilfs_init_cachep(); | 1297 | err = nilfs_init_cachep(); |
1298 | if (err) | 1298 | if (err) |
1299 | goto fail; | 1299 | goto fail; |
1300 | 1300 | ||
1301 | err = register_filesystem(&nilfs_fs_type); | 1301 | err = register_filesystem(&nilfs_fs_type); |
1302 | if (err) | 1302 | if (err) |
1303 | goto free_cachep; | 1303 | goto free_cachep; |
1304 | 1304 | ||
1305 | printk(KERN_INFO "NILFS version 2 loaded\n"); | 1305 | printk(KERN_INFO "NILFS version 2 loaded\n"); |
1306 | return 0; | 1306 | return 0; |
1307 | 1307 | ||
1308 | free_cachep: | 1308 | free_cachep: |
1309 | nilfs_destroy_cachep(); | 1309 | nilfs_destroy_cachep(); |
1310 | fail: | 1310 | fail: |
1311 | return err; | 1311 | return err; |
1312 | } | 1312 | } |
1313 | 1313 | ||
1314 | static void __exit exit_nilfs_fs(void) | 1314 | static void __exit exit_nilfs_fs(void) |
1315 | { | 1315 | { |
1316 | nilfs_destroy_cachep(); | 1316 | nilfs_destroy_cachep(); |
1317 | unregister_filesystem(&nilfs_fs_type); | 1317 | unregister_filesystem(&nilfs_fs_type); |
include/linux/buffer_head.h
1 | /* | 1 | /* |
2 | * include/linux/buffer_head.h | 2 | * include/linux/buffer_head.h |
3 | * | 3 | * |
4 | * Everything to do with buffer_heads. | 4 | * Everything to do with buffer_heads. |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #ifndef _LINUX_BUFFER_HEAD_H | 7 | #ifndef _LINUX_BUFFER_HEAD_H |
8 | #define _LINUX_BUFFER_HEAD_H | 8 | #define _LINUX_BUFFER_HEAD_H |
9 | 9 | ||
10 | #include <linux/types.h> | 10 | #include <linux/types.h> |
11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
12 | #include <linux/linkage.h> | 12 | #include <linux/linkage.h> |
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/wait.h> | 14 | #include <linux/wait.h> |
15 | #include <asm/atomic.h> | 15 | #include <asm/atomic.h> |
16 | 16 | ||
17 | #ifdef CONFIG_BLOCK | 17 | #ifdef CONFIG_BLOCK |
18 | 18 | ||
19 | enum bh_state_bits { | 19 | enum bh_state_bits { |
20 | BH_Uptodate, /* Contains valid data */ | 20 | BH_Uptodate, /* Contains valid data */ |
21 | BH_Dirty, /* Is dirty */ | 21 | BH_Dirty, /* Is dirty */ |
22 | BH_Lock, /* Is locked */ | 22 | BH_Lock, /* Is locked */ |
23 | BH_Req, /* Has been submitted for I/O */ | 23 | BH_Req, /* Has been submitted for I/O */ |
24 | BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise | 24 | BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise |
25 | * IO completion of other buffers in the page | 25 | * IO completion of other buffers in the page |
26 | */ | 26 | */ |
27 | 27 | ||
28 | BH_Mapped, /* Has a disk mapping */ | 28 | BH_Mapped, /* Has a disk mapping */ |
29 | BH_New, /* Disk mapping was newly created by get_block */ | 29 | BH_New, /* Disk mapping was newly created by get_block */ |
30 | BH_Async_Read, /* Is under end_buffer_async_read I/O */ | 30 | BH_Async_Read, /* Is under end_buffer_async_read I/O */ |
31 | BH_Async_Write, /* Is under end_buffer_async_write I/O */ | 31 | BH_Async_Write, /* Is under end_buffer_async_write I/O */ |
32 | BH_Delay, /* Buffer is not yet allocated on disk */ | 32 | BH_Delay, /* Buffer is not yet allocated on disk */ |
33 | BH_Boundary, /* Block is followed by a discontiguity */ | 33 | BH_Boundary, /* Block is followed by a discontiguity */ |
34 | BH_Write_EIO, /* I/O error on write */ | 34 | BH_Write_EIO, /* I/O error on write */ |
35 | BH_Ordered, /* ordered write */ | ||
36 | BH_Eopnotsupp, /* operation not supported (barrier) */ | 35 | BH_Eopnotsupp, /* operation not supported (barrier) */ |
37 | BH_Unwritten, /* Buffer is allocated on disk but not written */ | 36 | BH_Unwritten, /* Buffer is allocated on disk but not written */ |
38 | BH_Quiet, /* Buffer Error Printks to be quiet */ | 37 | BH_Quiet, /* Buffer Error Printks to be quiet */ |
39 | 38 | ||
40 | BH_PrivateStart,/* not a state bit, but the first bit available | 39 | BH_PrivateStart,/* not a state bit, but the first bit available |
41 | * for private allocation by other entities | 40 | * for private allocation by other entities |
42 | */ | 41 | */ |
43 | }; | 42 | }; |
44 | 43 | ||
45 | #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) | 44 | #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) |
46 | 45 | ||
47 | struct page; | 46 | struct page; |
48 | struct buffer_head; | 47 | struct buffer_head; |
49 | struct address_space; | 48 | struct address_space; |
50 | typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); | 49 | typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); |
51 | 50 | ||
52 | /* | 51 | /* |
53 | * Historically, a buffer_head was used to map a single block | 52 | * Historically, a buffer_head was used to map a single block |
54 | * within a page, and of course as the unit of I/O through the | 53 | * within a page, and of course as the unit of I/O through the |
55 | * filesystem and block layers. Nowadays the basic I/O unit | 54 | * filesystem and block layers. Nowadays the basic I/O unit |
56 | * is the bio, and buffer_heads are used for extracting block | 55 | * is the bio, and buffer_heads are used for extracting block |
57 | * mappings (via a get_block_t call), for tracking state within | 56 | * mappings (via a get_block_t call), for tracking state within |
58 | * a page (via a page_mapping) and for wrapping bio submission | 57 | * a page (via a page_mapping) and for wrapping bio submission |
59 | * for backward compatibility reasons (e.g. submit_bh). | 58 | * for backward compatibility reasons (e.g. submit_bh). |
60 | */ | 59 | */ |
61 | struct buffer_head { | 60 | struct buffer_head { |
62 | unsigned long b_state; /* buffer state bitmap (see above) */ | 61 | unsigned long b_state; /* buffer state bitmap (see above) */ |
63 | struct buffer_head *b_this_page;/* circular list of page's buffers */ | 62 | struct buffer_head *b_this_page;/* circular list of page's buffers */ |
64 | struct page *b_page; /* the page this bh is mapped to */ | 63 | struct page *b_page; /* the page this bh is mapped to */ |
65 | 64 | ||
66 | sector_t b_blocknr; /* start block number */ | 65 | sector_t b_blocknr; /* start block number */ |
67 | size_t b_size; /* size of mapping */ | 66 | size_t b_size; /* size of mapping */ |
68 | char *b_data; /* pointer to data within the page */ | 67 | char *b_data; /* pointer to data within the page */ |
69 | 68 | ||
70 | struct block_device *b_bdev; | 69 | struct block_device *b_bdev; |
71 | bh_end_io_t *b_end_io; /* I/O completion */ | 70 | bh_end_io_t *b_end_io; /* I/O completion */ |
72 | void *b_private; /* reserved for b_end_io */ | 71 | void *b_private; /* reserved for b_end_io */ |
73 | struct list_head b_assoc_buffers; /* associated with another mapping */ | 72 | struct list_head b_assoc_buffers; /* associated with another mapping */ |
74 | struct address_space *b_assoc_map; /* mapping this buffer is | 73 | struct address_space *b_assoc_map; /* mapping this buffer is |
75 | associated with */ | 74 | associated with */ |
76 | atomic_t b_count; /* users using this buffer_head */ | 75 | atomic_t b_count; /* users using this buffer_head */ |
77 | }; | 76 | }; |
78 | 77 | ||
79 | /* | 78 | /* |
80 | * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() | 79 | * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() |
81 | * and buffer_foo() functions. | 80 | * and buffer_foo() functions. |
82 | */ | 81 | */ |
83 | #define BUFFER_FNS(bit, name) \ | 82 | #define BUFFER_FNS(bit, name) \ |
84 | static inline void set_buffer_##name(struct buffer_head *bh) \ | 83 | static inline void set_buffer_##name(struct buffer_head *bh) \ |
85 | { \ | 84 | { \ |
86 | set_bit(BH_##bit, &(bh)->b_state); \ | 85 | set_bit(BH_##bit, &(bh)->b_state); \ |
87 | } \ | 86 | } \ |
88 | static inline void clear_buffer_##name(struct buffer_head *bh) \ | 87 | static inline void clear_buffer_##name(struct buffer_head *bh) \ |
89 | { \ | 88 | { \ |
90 | clear_bit(BH_##bit, &(bh)->b_state); \ | 89 | clear_bit(BH_##bit, &(bh)->b_state); \ |
91 | } \ | 90 | } \ |
92 | static inline int buffer_##name(const struct buffer_head *bh) \ | 91 | static inline int buffer_##name(const struct buffer_head *bh) \ |
93 | { \ | 92 | { \ |
94 | return test_bit(BH_##bit, &(bh)->b_state); \ | 93 | return test_bit(BH_##bit, &(bh)->b_state); \ |
95 | } | 94 | } |
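For reference, a minimal sketch of what BUFFER_FNS(Dirty, dirty) expands to, derived mechanically from the macro above (the real header simply invokes the macro; nothing here is hand-written in the source):

	/* illustrative expansion of BUFFER_FNS(Dirty, dirty) */
	static inline void set_buffer_dirty(struct buffer_head *bh)
	{
		set_bit(BH_Dirty, &(bh)->b_state);
	}
	static inline void clear_buffer_dirty(struct buffer_head *bh)
	{
		clear_bit(BH_Dirty, &(bh)->b_state);
	}
	static inline int buffer_dirty(const struct buffer_head *bh)
	{
		return test_bit(BH_Dirty, &(bh)->b_state);
	}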
96 | 95 | ||
97 | /* | 96 | /* |
98 | * test_set_buffer_foo() and test_clear_buffer_foo() | 97 | * test_set_buffer_foo() and test_clear_buffer_foo() |
99 | */ | 98 | */ |
100 | #define TAS_BUFFER_FNS(bit, name) \ | 99 | #define TAS_BUFFER_FNS(bit, name) \ |
101 | static inline int test_set_buffer_##name(struct buffer_head *bh) \ | 100 | static inline int test_set_buffer_##name(struct buffer_head *bh) \ |
102 | { \ | 101 | { \ |
103 | return test_and_set_bit(BH_##bit, &(bh)->b_state); \ | 102 | return test_and_set_bit(BH_##bit, &(bh)->b_state); \ |
104 | } \ | 103 | } \ |
105 | static inline int test_clear_buffer_##name(struct buffer_head *bh) \ | 104 | static inline int test_clear_buffer_##name(struct buffer_head *bh) \ |
106 | { \ | 105 | { \ |
107 | return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ | 106 | return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ |
108 | } \ | 107 | } \ |
109 | 108 | ||
110 | /* | 109 | /* |
111 | * Emit the buffer bitops functions. Note that there are also functions | 110 | * Emit the buffer bitops functions. Note that there are also functions |
112 | * of the form "mark_buffer_foo()". These are higher-level functions which | 111 | * of the form "mark_buffer_foo()". These are higher-level functions which |
113 | * do something in addition to setting a b_state bit. | 112 | * do something in addition to setting a b_state bit. |
114 | */ | 113 | */ |
115 | BUFFER_FNS(Uptodate, uptodate) | 114 | BUFFER_FNS(Uptodate, uptodate) |
116 | BUFFER_FNS(Dirty, dirty) | 115 | BUFFER_FNS(Dirty, dirty) |
117 | TAS_BUFFER_FNS(Dirty, dirty) | 116 | TAS_BUFFER_FNS(Dirty, dirty) |
118 | BUFFER_FNS(Lock, locked) | 117 | BUFFER_FNS(Lock, locked) |
119 | BUFFER_FNS(Req, req) | 118 | BUFFER_FNS(Req, req) |
120 | TAS_BUFFER_FNS(Req, req) | 119 | TAS_BUFFER_FNS(Req, req) |
121 | BUFFER_FNS(Mapped, mapped) | 120 | BUFFER_FNS(Mapped, mapped) |
122 | BUFFER_FNS(New, new) | 121 | BUFFER_FNS(New, new) |
123 | BUFFER_FNS(Async_Read, async_read) | 122 | BUFFER_FNS(Async_Read, async_read) |
124 | BUFFER_FNS(Async_Write, async_write) | 123 | BUFFER_FNS(Async_Write, async_write) |
125 | BUFFER_FNS(Delay, delay) | 124 | BUFFER_FNS(Delay, delay) |
126 | BUFFER_FNS(Boundary, boundary) | 125 | BUFFER_FNS(Boundary, boundary) |
127 | BUFFER_FNS(Write_EIO, write_io_error) | 126 | BUFFER_FNS(Write_EIO, write_io_error) |
128 | BUFFER_FNS(Ordered, ordered) | ||
129 | BUFFER_FNS(Eopnotsupp, eopnotsupp) | 127 | BUFFER_FNS(Eopnotsupp, eopnotsupp) |
130 | BUFFER_FNS(Unwritten, unwritten) | 128 | BUFFER_FNS(Unwritten, unwritten) |
131 | 129 | ||
132 | #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) | 130 | #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) |
133 | #define touch_buffer(bh) mark_page_accessed(bh->b_page) | 131 | #define touch_buffer(bh) mark_page_accessed(bh->b_page) |
134 | 132 | ||
135 | /* If we *know* page->private refers to buffer_heads */ | 133 | /* If we *know* page->private refers to buffer_heads */ |
136 | #define page_buffers(page) \ | 134 | #define page_buffers(page) \ |
137 | ({ \ | 135 | ({ \ |
138 | BUG_ON(!PagePrivate(page)); \ | 136 | BUG_ON(!PagePrivate(page)); \ |
139 | ((struct buffer_head *)page_private(page)); \ | 137 | ((struct buffer_head *)page_private(page)); \ |
140 | }) | 138 | }) |
141 | #define page_has_buffers(page) PagePrivate(page) | 139 | #define page_has_buffers(page) PagePrivate(page) |
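The buffers attached to a page form a circular list linked through b_this_page, so the usual traversal checks page_has_buffers() first and then walks until it returns to the head. A hedged sketch (the function name is illustrative, not part of the header):

	#include <linux/buffer_head.h>

	/* Sketch: count the dirty buffer_heads attached to one page. */
	static int example_count_dirty_buffers(struct page *page)
	{
		struct buffer_head *head, *bh;
		int ndirty = 0;

		if (!page_has_buffers(page))
			return 0;
		bh = head = page_buffers(page);
		do {
			if (buffer_dirty(bh))
				ndirty++;
			bh = bh->b_this_page;	/* circular list */
		} while (bh != head);
		return ndirty;
	}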
142 | 140 | ||
143 | /* | 141 | /* |
144 | * Declarations | 142 | * Declarations |
145 | */ | 143 | */ |
146 | 144 | ||
147 | void mark_buffer_dirty(struct buffer_head *bh); | 145 | void mark_buffer_dirty(struct buffer_head *bh); |
148 | void init_buffer(struct buffer_head *, bh_end_io_t *, void *); | 146 | void init_buffer(struct buffer_head *, bh_end_io_t *, void *); |
149 | void set_bh_page(struct buffer_head *bh, | 147 | void set_bh_page(struct buffer_head *bh, |
150 | struct page *page, unsigned long offset); | 148 | struct page *page, unsigned long offset); |
151 | int try_to_free_buffers(struct page *); | 149 | int try_to_free_buffers(struct page *); |
152 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, | 150 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, |
153 | int retry); | 151 | int retry); |
154 | void create_empty_buffers(struct page *, unsigned long, | 152 | void create_empty_buffers(struct page *, unsigned long, |
155 | unsigned long b_state); | 153 | unsigned long b_state); |
156 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate); | 154 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate); |
157 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate); | 155 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate); |
158 | void end_buffer_async_write(struct buffer_head *bh, int uptodate); | 156 | void end_buffer_async_write(struct buffer_head *bh, int uptodate); |
159 | 157 | ||
160 | /* Things to do with buffers at mapping->private_list */ | 158 | /* Things to do with buffers at mapping->private_list */ |
161 | void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); | 159 | void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); |
162 | int inode_has_buffers(struct inode *); | 160 | int inode_has_buffers(struct inode *); |
163 | void invalidate_inode_buffers(struct inode *); | 161 | void invalidate_inode_buffers(struct inode *); |
164 | int remove_inode_buffers(struct inode *inode); | 162 | int remove_inode_buffers(struct inode *inode); |
165 | int sync_mapping_buffers(struct address_space *mapping); | 163 | int sync_mapping_buffers(struct address_space *mapping); |
166 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block); | 164 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block); |
167 | 165 | ||
168 | void mark_buffer_async_write(struct buffer_head *bh); | 166 | void mark_buffer_async_write(struct buffer_head *bh); |
169 | void __wait_on_buffer(struct buffer_head *); | 167 | void __wait_on_buffer(struct buffer_head *); |
170 | wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); | 168 | wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); |
171 | struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, | 169 | struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, |
172 | unsigned size); | 170 | unsigned size); |
173 | struct buffer_head *__getblk(struct block_device *bdev, sector_t block, | 171 | struct buffer_head *__getblk(struct block_device *bdev, sector_t block, |
174 | unsigned size); | 172 | unsigned size); |
175 | void __brelse(struct buffer_head *); | 173 | void __brelse(struct buffer_head *); |
176 | void __bforget(struct buffer_head *); | 174 | void __bforget(struct buffer_head *); |
177 | void __breadahead(struct block_device *, sector_t block, unsigned int size); | 175 | void __breadahead(struct block_device *, sector_t block, unsigned int size); |
178 | struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); | 176 | struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); |
179 | void invalidate_bh_lrus(void); | 177 | void invalidate_bh_lrus(void); |
180 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); | 178 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); |
181 | void free_buffer_head(struct buffer_head * bh); | 179 | void free_buffer_head(struct buffer_head * bh); |
182 | void unlock_buffer(struct buffer_head *bh); | 180 | void unlock_buffer(struct buffer_head *bh); |
183 | void __lock_buffer(struct buffer_head *bh); | 181 | void __lock_buffer(struct buffer_head *bh); |
184 | void ll_rw_block(int, int, struct buffer_head * bh[]); | 182 | void ll_rw_block(int, int, struct buffer_head * bh[]); |
185 | int sync_dirty_buffer(struct buffer_head *bh); | 183 | int sync_dirty_buffer(struct buffer_head *bh); |
184 | int __sync_dirty_buffer(struct buffer_head *bh, int rw); | ||
186 | int submit_bh(int, struct buffer_head *); | 185 | int submit_bh(int, struct buffer_head *); |
187 | void write_boundary_block(struct block_device *bdev, | 186 | void write_boundary_block(struct block_device *bdev, |
188 | sector_t bblock, unsigned blocksize); | 187 | sector_t bblock, unsigned blocksize); |
189 | int bh_uptodate_or_lock(struct buffer_head *bh); | 188 | int bh_uptodate_or_lock(struct buffer_head *bh); |
190 | int bh_submit_read(struct buffer_head *bh); | 189 | int bh_submit_read(struct buffer_head *bh); |
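A minimal sketch of a caller using the __sync_dirty_buffer() variant declared above, assuming WRITE_SYNC is an acceptable rw flag on this kernel; the function name is illustrative only, and plain sync_dirty_buffer(bh) remains the default-flag path:

	#include <linux/buffer_head.h>

	/* Sketch: flush one dirty metadata buffer with an explicit write flag. */
	static int example_flush_meta_bh(struct buffer_head *bh)
	{
		int err = 0;

		if (buffer_dirty(bh))
			err = __sync_dirty_buffer(bh, WRITE_SYNC);
		return err;
	}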
191 | 190 | ||
192 | extern int buffer_heads_over_limit; | 191 | extern int buffer_heads_over_limit; |
193 | 192 | ||
194 | /* | 193 | /* |
195 | * Generic address_space_operations implementations for buffer_head-backed | 194 | * Generic address_space_operations implementations for buffer_head-backed |
196 | * address_spaces. | 195 | * address_spaces. |
197 | */ | 196 | */ |
198 | void block_invalidatepage(struct page *page, unsigned long offset); | 197 | void block_invalidatepage(struct page *page, unsigned long offset); |
199 | int block_write_full_page(struct page *page, get_block_t *get_block, | 198 | int block_write_full_page(struct page *page, get_block_t *get_block, |
200 | struct writeback_control *wbc); | 199 | struct writeback_control *wbc); |
201 | int block_write_full_page_endio(struct page *page, get_block_t *get_block, | 200 | int block_write_full_page_endio(struct page *page, get_block_t *get_block, |
202 | struct writeback_control *wbc, bh_end_io_t *handler); | 201 | struct writeback_control *wbc, bh_end_io_t *handler); |
203 | int block_read_full_page(struct page*, get_block_t*); | 202 | int block_read_full_page(struct page*, get_block_t*); |
204 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | 203 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, |
205 | unsigned long from); | 204 | unsigned long from); |
206 | int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, | 205 | int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, |
207 | unsigned flags, struct page **pagep, get_block_t *get_block); | 206 | unsigned flags, struct page **pagep, get_block_t *get_block); |
208 | int __block_write_begin(struct page *page, loff_t pos, unsigned len, | 207 | int __block_write_begin(struct page *page, loff_t pos, unsigned len, |
209 | get_block_t *get_block); | 208 | get_block_t *get_block); |
210 | int block_write_end(struct file *, struct address_space *, | 209 | int block_write_end(struct file *, struct address_space *, |
211 | loff_t, unsigned, unsigned, | 210 | loff_t, unsigned, unsigned, |
212 | struct page *, void *); | 211 | struct page *, void *); |
213 | int generic_write_end(struct file *, struct address_space *, | 212 | int generic_write_end(struct file *, struct address_space *, |
214 | loff_t, unsigned, unsigned, | 213 | loff_t, unsigned, unsigned, |
215 | struct page *, void *); | 214 | struct page *, void *); |
216 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); | 215 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); |
217 | int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); | 216 | int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); |
218 | int cont_write_begin(struct file *, struct address_space *, loff_t, | 217 | int cont_write_begin(struct file *, struct address_space *, loff_t, |
219 | unsigned, unsigned, struct page **, void **, | 218 | unsigned, unsigned, struct page **, void **, |
220 | get_block_t *, loff_t *); | 219 | get_block_t *, loff_t *); |
221 | int generic_cont_expand_simple(struct inode *inode, loff_t size); | 220 | int generic_cont_expand_simple(struct inode *inode, loff_t size); |
222 | int block_commit_write(struct page *page, unsigned from, unsigned to); | 221 | int block_commit_write(struct page *page, unsigned from, unsigned to); |
223 | int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, | 222 | int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, |
224 | get_block_t get_block); | 223 | get_block_t get_block); |
225 | void block_sync_page(struct page *); | 224 | void block_sync_page(struct page *); |
226 | sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); | 225 | sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); |
227 | int block_truncate_page(struct address_space *, loff_t, get_block_t *); | 226 | int block_truncate_page(struct address_space *, loff_t, get_block_t *); |
228 | int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned, | 227 | int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned, |
229 | struct page **, void **, get_block_t*); | 228 | struct page **, void **, get_block_t*); |
230 | int nobh_write_end(struct file *, struct address_space *, | 229 | int nobh_write_end(struct file *, struct address_space *, |
231 | loff_t, unsigned, unsigned, | 230 | loff_t, unsigned, unsigned, |
232 | struct page *, void *); | 231 | struct page *, void *); |
233 | int nobh_truncate_page(struct address_space *, loff_t, get_block_t *); | 232 | int nobh_truncate_page(struct address_space *, loff_t, get_block_t *); |
234 | int nobh_writepage(struct page *page, get_block_t *get_block, | 233 | int nobh_writepage(struct page *page, get_block_t *get_block, |
235 | struct writeback_control *wbc); | 234 | struct writeback_control *wbc); |
236 | 235 | ||
237 | void buffer_init(void); | 236 | void buffer_init(void); |
238 | 237 | ||
239 | /* | 238 | /* |
240 | * inline definitions | 239 | * inline definitions |
241 | */ | 240 | */ |
242 | 241 | ||
243 | static inline void attach_page_buffers(struct page *page, | 242 | static inline void attach_page_buffers(struct page *page, |
244 | struct buffer_head *head) | 243 | struct buffer_head *head) |
245 | { | 244 | { |
246 | page_cache_get(page); | 245 | page_cache_get(page); |
247 | SetPagePrivate(page); | 246 | SetPagePrivate(page); |
248 | set_page_private(page, (unsigned long)head); | 247 | set_page_private(page, (unsigned long)head); |
249 | } | 248 | } |
250 | 249 | ||
251 | static inline void get_bh(struct buffer_head *bh) | 250 | static inline void get_bh(struct buffer_head *bh) |
252 | { | 251 | { |
253 | atomic_inc(&bh->b_count); | 252 | atomic_inc(&bh->b_count); |
254 | } | 253 | } |
255 | 254 | ||
256 | static inline void put_bh(struct buffer_head *bh) | 255 | static inline void put_bh(struct buffer_head *bh) |
257 | { | 256 | { |
258 | smp_mb__before_atomic_dec(); | 257 | smp_mb__before_atomic_dec(); |
259 | atomic_dec(&bh->b_count); | 258 | atomic_dec(&bh->b_count); |
260 | } | 259 | } |
261 | 260 | ||
262 | static inline void brelse(struct buffer_head *bh) | 261 | static inline void brelse(struct buffer_head *bh) |
263 | { | 262 | { |
264 | if (bh) | 263 | if (bh) |
265 | __brelse(bh); | 264 | __brelse(bh); |
266 | } | 265 | } |
267 | 266 | ||
268 | static inline void bforget(struct buffer_head *bh) | 267 | static inline void bforget(struct buffer_head *bh) |
269 | { | 268 | { |
270 | if (bh) | 269 | if (bh) |
271 | __bforget(bh); | 270 | __bforget(bh); |
272 | } | 271 | } |
273 | 272 | ||
274 | static inline struct buffer_head * | 273 | static inline struct buffer_head * |
275 | sb_bread(struct super_block *sb, sector_t block) | 274 | sb_bread(struct super_block *sb, sector_t block) |
276 | { | 275 | { |
277 | return __bread(sb->s_bdev, block, sb->s_blocksize); | 276 | return __bread(sb->s_bdev, block, sb->s_blocksize); |
278 | } | 277 | } |
279 | 278 | ||
280 | static inline void | 279 | static inline void |
281 | sb_breadahead(struct super_block *sb, sector_t block) | 280 | sb_breadahead(struct super_block *sb, sector_t block) |
282 | { | 281 | { |
283 | __breadahead(sb->s_bdev, block, sb->s_blocksize); | 282 | __breadahead(sb->s_bdev, block, sb->s_blocksize); |
284 | } | 283 | } |
285 | 284 | ||
286 | static inline struct buffer_head * | 285 | static inline struct buffer_head * |
287 | sb_getblk(struct super_block *sb, sector_t block) | 286 | sb_getblk(struct super_block *sb, sector_t block) |
288 | { | 287 | { |
289 | return __getblk(sb->s_bdev, block, sb->s_blocksize); | 288 | return __getblk(sb->s_bdev, block, sb->s_blocksize); |
290 | } | 289 | } |
291 | 290 | ||
292 | static inline struct buffer_head * | 291 | static inline struct buffer_head * |
293 | sb_find_get_block(struct super_block *sb, sector_t block) | 292 | sb_find_get_block(struct super_block *sb, sector_t block) |
294 | { | 293 | { |
295 | return __find_get_block(sb->s_bdev, block, sb->s_blocksize); | 294 | return __find_get_block(sb->s_bdev, block, sb->s_blocksize); |
296 | } | 295 | } |
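A hedged sketch of the read-modify-write pattern these sb_* wrappers support; block number 1 and the single-byte change are placeholders for illustration:

	#include <linux/buffer_head.h>

	/* Sketch: read one metadata block, modify it, write it back synchronously. */
	static int example_touch_block(struct super_block *sb)
	{
		struct buffer_head *bh;
		int err;

		bh = sb_bread(sb, 1);		/* reads one sb->s_blocksize block */
		if (!bh)
			return -EIO;
		bh->b_data[0] ^= 0;		/* placeholder modification */
		mark_buffer_dirty(bh);
		err = sync_dirty_buffer(bh);
		brelse(bh);
		return err;
	}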
297 | 296 | ||
298 | static inline void | 297 | static inline void |
299 | map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) | 298 | map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) |
300 | { | 299 | { |
301 | set_buffer_mapped(bh); | 300 | set_buffer_mapped(bh); |
302 | bh->b_bdev = sb->s_bdev; | 301 | bh->b_bdev = sb->s_bdev; |
303 | bh->b_blocknr = block; | 302 | bh->b_blocknr = block; |
304 | bh->b_size = sb->s_blocksize; | 303 | bh->b_size = sb->s_blocksize; |
305 | } | 304 | } |
306 | 305 | ||
307 | static inline void wait_on_buffer(struct buffer_head *bh) | 306 | static inline void wait_on_buffer(struct buffer_head *bh) |
308 | { | 307 | { |
309 | might_sleep(); | 308 | might_sleep(); |
310 | if (buffer_locked(bh)) | 309 | if (buffer_locked(bh)) |
311 | __wait_on_buffer(bh); | 310 | __wait_on_buffer(bh); |
312 | } | 311 | } |
313 | 312 | ||
314 | static inline int trylock_buffer(struct buffer_head *bh) | 313 | static inline int trylock_buffer(struct buffer_head *bh) |
315 | { | 314 | { |
316 | return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); | 315 | return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); |
317 | } | 316 | } |
318 | 317 | ||
319 | static inline void lock_buffer(struct buffer_head *bh) | 318 | static inline void lock_buffer(struct buffer_head *bh) |
320 | { | 319 | { |
321 | might_sleep(); | 320 | might_sleep(); |
322 | if (!trylock_buffer(bh)) | 321 | if (!trylock_buffer(bh)) |
323 | __lock_buffer(bh); | 322 | __lock_buffer(bh); |
324 | } | 323 | } |
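For context, a sketch of the locked write-out pattern built from these helpers, roughly what sync_dirty_buffer() does internally; WRITE is assumed to be a valid rw flag for submit_bh(), and the function name is illustrative:

	#include <linux/buffer_head.h>

	/* Sketch: write out a single buffer and wait for completion. */
	static int example_write_one_bh(struct buffer_head *bh)
	{
		lock_buffer(bh);
		if (!test_clear_buffer_dirty(bh)) {
			unlock_buffer(bh);	/* nothing to write */
			return 0;
		}
		get_bh(bh);			/* end_buffer_write_sync drops this ref */
		bh->b_end_io = end_buffer_write_sync;
		submit_bh(WRITE, bh);
		wait_on_buffer(bh);
		return buffer_uptodate(bh) ? 0 : -EIO;
	}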
325 | 324 | ||
326 | extern int __set_page_dirty_buffers(struct page *page); | 325 | extern int __set_page_dirty_buffers(struct page *page); |
327 | 326 | ||
328 | #else /* CONFIG_BLOCK */ | 327 | #else /* CONFIG_BLOCK */ |
329 | 328 | ||
330 | static inline void buffer_init(void) {} | 329 | static inline void buffer_init(void) {} |
331 | static inline int try_to_free_buffers(struct page *page) { return 1; } | 330 | static inline int try_to_free_buffers(struct page *page) { return 1; } |
332 | static inline int inode_has_buffers(struct inode *inode) { return 0; } | 331 | static inline int inode_has_buffers(struct inode *inode) { return 0; } |
333 | static inline void invalidate_inode_buffers(struct inode *inode) {} | 332 | static inline void invalidate_inode_buffers(struct inode *inode) {} |
334 | static inline int remove_inode_buffers(struct inode *inode) { return 1; } | 333 | static inline int remove_inode_buffers(struct inode *inode) { return 1; } |
335 | static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } | 334 | static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } |
336 | 335 | ||
337 | #endif /* CONFIG_BLOCK */ | 336 | #endif /* CONFIG_BLOCK */ |
338 | #endif /* _LINUX_BUFFER_HEAD_H */ | 337 | #endif /* _LINUX_BUFFER_HEAD_H */ |