Commit 29a814d2ee0e43c2980f33f91c1311ec06c0aa35
Committed by: Theodore Ts'o
1 parent: 87c89c232c
Exists in: master and in 40 other branches
vfs: add hooks for ext4's delayed allocation support
Export mpage_bio_submit() and __mpage_writepage() for the benefit of ext4's
delayed allocation support. Also change __block_write_full_page so that, for
buffers that have the BH_Delay flag set, it calls get_block() to get the
physical block allocated, just as in the !BH_Mapped case.

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
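To make the description concrete, here are two short sketches. They are illustrative only and are not copied from the patch. The first shows how a filesystem's writepages path can drive the newly exported helpers, mirroring what mpage_writepages() does internally; the ext4-side caller is outside this diff, and the get_block callback name below is made up:

	struct mpage_data mpd = {
		.bio			= NULL,
		.last_block_in_bio	= 0,
		.get_block		= ext4_da_get_block,	/* illustrative callback name */
		.use_writepage		= 1,
	};
	int ret;

	/* Walk the dirty pages, building bios via __mpage_writepage(). */
	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
	if (mpd.bio)
		mpage_bio_submit(WRITE, mpd.bio);	/* flush the last partially built bio */

The second sketches the intended shape of the __block_write_full_page change (the actual hunk sits further down in fs/buffer.c than this excerpt reaches and may differ in detail): a dirty buffer that is either unmapped or carries BH_Delay now has a real block allocated via get_block() before writeout:

	/* Sketch only, assumed shape of the widened allocation test. */
	if (buffer_dirty(bh) && (!buffer_mapped(bh) || buffer_delay(bh))) {
		err = get_block(inode, block, bh, 1);	/* create == 1: allocate on disk */
		if (err)
			goto recover;
		clear_buffer_delay(bh);			/* assumed: bh now maps a real block */
	}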
Showing 3 changed files with 20 additions and 11 deletions (inline diff)
fs/buffer.c
  1 | /*
  2 |  * linux/fs/buffer.c
  3 |  *
  4 |  * Copyright (C) 1991, 1992, 2002 Linus Torvalds
  5 |  */
  6 |
  7 | /*
  8 |  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
  9 |  *
 10 |  * Removed a lot of unnecessary code and simplified things now that
 11 |  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 12 |  *
 13 |  * Speed up hash, lru, and free list operations. Use gfp() for allocating
 14 |  * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
 15 |  *
 16 |  * Added 32k buffer block sizes - these are required older ARM systems. - RMK
 17 |  *
 18 |  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 19 |  */
 20 |
 21 | #include <linux/kernel.h>
 22 | #include <linux/syscalls.h>
 23 | #include <linux/fs.h>
 24 | #include <linux/mm.h>
 25 | #include <linux/percpu.h>
 26 | #include <linux/slab.h>
 27 | #include <linux/capability.h>
 28 | #include <linux/blkdev.h>
 29 | #include <linux/file.h>
 30 | #include <linux/quotaops.h>
 31 | #include <linux/highmem.h>
 32 | #include <linux/module.h>
 33 | #include <linux/writeback.h>
 34 | #include <linux/hash.h>
 35 | #include <linux/suspend.h>
 36 | #include <linux/buffer_head.h>
 37 | #include <linux/task_io_accounting_ops.h>
 38 | #include <linux/bio.h>
 39 | #include <linux/notifier.h>
 40 | #include <linux/cpu.h>
 41 | #include <linux/bitops.h>
 42 | #include <linux/mpage.h>
 43 | #include <linux/bit_spinlock.h>
 44 |
 45 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 46 |
 47 | #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 48 |
 49 | inline void
 50 | init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 51 | {
 52 | 	bh->b_end_io = handler;
 53 | 	bh->b_private = private;
 54 | }
 55 |
 56 | static int sync_buffer(void *word)
 57 | {
 58 | 	struct block_device *bd;
 59 | 	struct buffer_head *bh
 60 | 		= container_of(word, struct buffer_head, b_state);
 61 |
 62 | 	smp_mb();
 63 | 	bd = bh->b_bdev;
 64 | 	if (bd)
 65 | 		blk_run_address_space(bd->bd_inode->i_mapping);
 66 | 	io_schedule();
 67 | 	return 0;
 68 | }
 69 |
 70 | void __lock_buffer(struct buffer_head *bh)
 71 | {
 72 | 	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
 73 | 				TASK_UNINTERRUPTIBLE);
 74 | }
 75 | EXPORT_SYMBOL(__lock_buffer);
 76 |
 77 | void unlock_buffer(struct buffer_head *bh)
 78 | {
 79 | 	smp_mb__before_clear_bit();
 80 | 	clear_buffer_locked(bh);
 81 | 	smp_mb__after_clear_bit();
 82 | 	wake_up_bit(&bh->b_state, BH_Lock);
 83 | }
 84 |
 85 | /*
 86 |  * Block until a buffer comes unlocked. This doesn't stop it
 87 |  * from becoming locked again - you have to lock it yourself
 88 |  * if you want to preserve its state.
 89 |  */
 90 | void __wait_on_buffer(struct buffer_head * bh)
 91 | {
 92 | 	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
 93 | }
 94 |
 95 | static void
 96 | __clear_page_buffers(struct page *page)
 97 | {
 98 | 	ClearPagePrivate(page);
 99 | 	set_page_private(page, 0);
100 | 	page_cache_release(page);
101 | }
102 |
103 | static void buffer_io_error(struct buffer_head *bh)
104 | {
105 | 	char b[BDEVNAME_SIZE];
106 |
107 | 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
108 | 			bdevname(bh->b_bdev, b),
109 | 			(unsigned long long)bh->b_blocknr);
110 | }
111 |
112 | /*
113 |  * End-of-IO handler helper function which does not touch the bh after
114 |  * unlocking it.
115 |  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
116 |  * a race there is benign: unlock_buffer() only use the bh's address for
117 |  * hashing after unlocking the buffer, so it doesn't actually touch the bh
118 |  * itself.
119 |  */
120 | static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
121 | {
122 | 	if (uptodate) {
123 | 		set_buffer_uptodate(bh);
124 | 	} else {
125 | 		/* This happens, due to failed READA attempts. */
126 | 		clear_buffer_uptodate(bh);
127 | 	}
128 | 	unlock_buffer(bh);
129 | }
130 |
131 | /*
132 |  * Default synchronous end-of-IO handler.. Just mark it up-to-date and
133 |  * unlock the buffer. This is what ll_rw_block uses too.
134 |  */
135 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
136 | {
137 | 	__end_buffer_read_notouch(bh, uptodate);
138 | 	put_bh(bh);
139 | }
140 |
141 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
142 | {
143 | 	char b[BDEVNAME_SIZE];
144 |
145 | 	if (uptodate) {
146 | 		set_buffer_uptodate(bh);
147 | 	} else {
148 | 		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
149 | 			buffer_io_error(bh);
150 | 			printk(KERN_WARNING "lost page write due to "
151 | 					"I/O error on %s\n",
152 | 					bdevname(bh->b_bdev, b));
153 | 		}
154 | 		set_buffer_write_io_error(bh);
155 | 		clear_buffer_uptodate(bh);
156 | 	}
157 | 	unlock_buffer(bh);
158 | 	put_bh(bh);
159 | }
160 |
161 | /*
162 |  * Write out and wait upon all the dirty data associated with a block
163 |  * device via its mapping. Does not take the superblock lock.
164 |  */
165 | int sync_blockdev(struct block_device *bdev)
166 | {
167 | 	int ret = 0;
168 |
169 | 	if (bdev)
170 | 		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
171 | 	return ret;
172 | }
173 | EXPORT_SYMBOL(sync_blockdev);
174 |
175 | /*
176 |  * Write out and wait upon all dirty data associated with this
177 |  * device. Filesystem data as well as the underlying block
178 |  * device. Takes the superblock lock.
179 |  */
180 | int fsync_bdev(struct block_device *bdev)
181 | {
182 | 	struct super_block *sb = get_super(bdev);
183 | 	if (sb) {
184 | 		int res = fsync_super(sb);
185 | 		drop_super(sb);
186 | 		return res;
187 | 	}
188 | 	return sync_blockdev(bdev);
189 | }
190 |
191 | /**
192 |  * freeze_bdev -- lock a filesystem and force it into a consistent state
193 |  * @bdev: blockdevice to lock
194 |  *
195 |  * This takes the block device bd_mount_sem to make sure no new mounts
196 |  * happen on bdev until thaw_bdev() is called.
197 |  * If a superblock is found on this device, we take the s_umount semaphore
198 |  * on it to make sure nobody unmounts until the snapshot creation is done.
199 |  */
200 | struct super_block *freeze_bdev(struct block_device *bdev)
201 | {
202 | 	struct super_block *sb;
203 |
204 | 	down(&bdev->bd_mount_sem);
205 | 	sb = get_super(bdev);
206 | 	if (sb && !(sb->s_flags & MS_RDONLY)) {
207 | 		sb->s_frozen = SB_FREEZE_WRITE;
208 | 		smp_wmb();
209 |
210 | 		__fsync_super(sb);
211 |
212 | 		sb->s_frozen = SB_FREEZE_TRANS;
213 | 		smp_wmb();
214 |
215 | 		sync_blockdev(sb->s_bdev);
216 |
217 | 		if (sb->s_op->write_super_lockfs)
218 | 			sb->s_op->write_super_lockfs(sb);
219 | 	}
220 |
221 | 	sync_blockdev(bdev);
222 | 	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
223 | }
224 | EXPORT_SYMBOL(freeze_bdev);
225 |
226 | /**
227 |  * thaw_bdev -- unlock filesystem
228 |  * @bdev: blockdevice to unlock
229 |  * @sb: associated superblock
230 |  *
231 |  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
232 |  */
233 | void thaw_bdev(struct block_device *bdev, struct super_block *sb)
234 | {
235 | 	if (sb) {
236 | 		BUG_ON(sb->s_bdev != bdev);
237 |
238 | 		if (sb->s_op->unlockfs)
239 | 			sb->s_op->unlockfs(sb);
240 | 		sb->s_frozen = SB_UNFROZEN;
241 | 		smp_wmb();
242 | 		wake_up(&sb->s_wait_unfrozen);
243 | 		drop_super(sb);
244 | 	}
245 |
246 | 	up(&bdev->bd_mount_sem);
247 | }
248 | EXPORT_SYMBOL(thaw_bdev);
249 |
250 | /*
251 |  * Various filesystems appear to want __find_get_block to be non-blocking.
252 |  * But it's the page lock which protects the buffers. To get around this,
253 |  * we get exclusion from try_to_free_buffers with the blockdev mapping's
254 |  * private_lock.
255 |  *
256 |  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
257 |  * may be quite high. This code could TryLock the page, and if that
258 |  * succeeds, there is no need to take private_lock. (But if
259 |  * private_lock is contended then so is mapping->tree_lock).
260 |  */
261 | static struct buffer_head *
262 | __find_get_block_slow(struct block_device *bdev, sector_t block)
263 | {
264 | 	struct inode *bd_inode = bdev->bd_inode;
265 | 	struct address_space *bd_mapping = bd_inode->i_mapping;
266 | 	struct buffer_head *ret = NULL;
267 | 	pgoff_t index;
268 | 	struct buffer_head *bh;
269 | 	struct buffer_head *head;
270 | 	struct page *page;
271 | 	int all_mapped = 1;
272 |
273 | 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
274 | 	page = find_get_page(bd_mapping, index);
275 | 	if (!page)
276 | 		goto out;
277 |
278 | 	spin_lock(&bd_mapping->private_lock);
279 | 	if (!page_has_buffers(page))
280 | 		goto out_unlock;
281 | 	head = page_buffers(page);
282 | 	bh = head;
283 | 	do {
284 | 		if (bh->b_blocknr == block) {
285 | 			ret = bh;
286 | 			get_bh(bh);
287 | 			goto out_unlock;
288 | 		}
289 | 		if (!buffer_mapped(bh))
290 | 			all_mapped = 0;
291 | 		bh = bh->b_this_page;
292 | 	} while (bh != head);
293 |
294 | 	/* we might be here because some of the buffers on this page are
295 | 	 * not mapped. This is due to various races between
296 | 	 * file io on the block device and getblk. It gets dealt with
297 | 	 * elsewhere, don't buffer_error if we had some unmapped buffers
298 | 	 */
299 | 	if (all_mapped) {
300 | 		printk("__find_get_block_slow() failed. "
301 | 			"block=%llu, b_blocknr=%llu\n",
302 | 			(unsigned long long)block,
303 | 			(unsigned long long)bh->b_blocknr);
304 | 		printk("b_state=0x%08lx, b_size=%zu\n",
305 | 			bh->b_state, bh->b_size);
306 | 		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
307 | 	}
308 | out_unlock:
309 | 	spin_unlock(&bd_mapping->private_lock);
310 | 	page_cache_release(page);
311 | out:
312 | 	return ret;
313 | }
314 |
315 | /* If invalidate_buffers() will trash dirty buffers, it means some kind
316 |    of fs corruption is going on. Trashing dirty data always imply losing
317 |    information that was supposed to be just stored on the physical layer
318 |    by the user.
319 |
320 |    Thus invalidate_buffers in general usage is not allwowed to trash
321 |    dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
322 |    be preserved. These buffers are simply skipped.
323 |
324 |    We also skip buffers which are still in use. For example this can
325 |    happen if a userspace program is reading the block device.
326 |
327 |    NOTE: In the case where the user removed a removable-media-disk even if
328 |    there's still dirty data not synced on disk (due a bug in the device driver
329 |    or due an error of the user), by not destroying the dirty buffers we could
330 |    generate corruption also on the next media inserted, thus a parameter is
331 |    necessary to handle this case in the most safe way possible (trying
332 |    to not corrupt also the new disk inserted with the data belonging to
333 |    the old now corrupted disk). Also for the ramdisk the natural thing
334 |    to do in order to release the ramdisk memory is to destroy dirty buffers.
335 |
336 |    These are two special cases. Normal usage imply the device driver
337 |    to issue a sync on the device (without waiting I/O completion) and
338 |    then an invalidate_buffers call that doesn't trash dirty buffers.
339 |
340 |    For handling cache coherency with the blkdev pagecache the 'update' case
341 |    is been introduced. It is needed to re-read from disk any pinned
342 |    buffer. NOTE: re-reading from disk is destructive so we can do it only
343 |    when we assume nobody is changing the buffercache under our I/O and when
344 |    we think the disk contains more recent information than the buffercache.
345 |    The update == 1 pass marks the buffers we need to update, the update == 2
346 |    pass does the actual I/O. */
347 | void invalidate_bdev(struct block_device *bdev)
348 | {
349 | 	struct address_space *mapping = bdev->bd_inode->i_mapping;
350 |
351 | 	if (mapping->nrpages == 0)
352 | 		return;
353 |
354 | 	invalidate_bh_lrus();
355 | 	invalidate_mapping_pages(mapping, 0, -1);
356 | }
357 |
358 | /*
359 |  * Kick pdflush then try to free up some ZONE_NORMAL memory.
360 |  */
361 | static void free_more_memory(void)
362 | {
363 | 	struct zone *zone;
364 | 	int nid;
365 |
366 | 	wakeup_pdflush(1024);
367 | 	yield();
368 |
369 | 	for_each_online_node(nid) {
370 | 		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
371 | 						gfp_zone(GFP_NOFS), NULL,
372 | 						&zone);
373 | 		if (zone)
374 | 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
375 | 						GFP_NOFS);
376 | 	}
377 | }
378 |
379 | /*
380 |  * I/O completion handler for block_read_full_page() - pages
381 |  * which come unlocked at the end of I/O.
382 |  */
383 | static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
384 | {
385 | 	unsigned long flags;
386 | 	struct buffer_head *first;
387 | 	struct buffer_head *tmp;
388 | 	struct page *page;
389 | 	int page_uptodate = 1;
390 |
391 | 	BUG_ON(!buffer_async_read(bh));
392 |
393 | 	page = bh->b_page;
394 | 	if (uptodate) {
395 | 		set_buffer_uptodate(bh);
396 | 	} else {
397 | 		clear_buffer_uptodate(bh);
398 | 		if (printk_ratelimit())
399 | 			buffer_io_error(bh);
400 | 		SetPageError(page);
401 | 	}
402 |
403 | 	/*
404 | 	 * Be _very_ careful from here on. Bad things can happen if
405 | 	 * two buffer heads end IO at almost the same time and both
406 | 	 * decide that the page is now completely done.
407 | 	 */
408 | 	first = page_buffers(page);
409 | 	local_irq_save(flags);
410 | 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
411 | 	clear_buffer_async_read(bh);
412 | 	unlock_buffer(bh);
413 | 	tmp = bh;
414 | 	do {
415 | 		if (!buffer_uptodate(tmp))
416 | 			page_uptodate = 0;
417 | 		if (buffer_async_read(tmp)) {
418 | 			BUG_ON(!buffer_locked(tmp));
419 | 			goto still_busy;
420 | 		}
421 | 		tmp = tmp->b_this_page;
422 | 	} while (tmp != bh);
423 | 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
424 | 	local_irq_restore(flags);
425 |
426 | 	/*
427 | 	 * If none of the buffers had errors and they are all
428 | 	 * uptodate then we can set the page uptodate.
429 | 	 */
430 | 	if (page_uptodate && !PageError(page))
431 | 		SetPageUptodate(page);
432 | 	unlock_page(page);
433 | 	return;
434 |
435 | still_busy:
436 | 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
437 | 	local_irq_restore(flags);
438 | 	return;
439 | }
440 |
441 | /*
442 |  * Completion handler for block_write_full_page() - pages which are unlocked
443 |  * during I/O, and which have PageWriteback cleared upon I/O completion.
444 |  */
445 | static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
446 | {
447 | 	char b[BDEVNAME_SIZE];
448 | 	unsigned long flags;
449 | 	struct buffer_head *first;
450 | 	struct buffer_head *tmp;
451 | 	struct page *page;
452 |
453 | 	BUG_ON(!buffer_async_write(bh));
454 |
455 | 	page = bh->b_page;
456 | 	if (uptodate) {
457 | 		set_buffer_uptodate(bh);
458 | 	} else {
459 | 		if (printk_ratelimit()) {
460 | 			buffer_io_error(bh);
461 | 			printk(KERN_WARNING "lost page write due to "
462 | 					"I/O error on %s\n",
463 | 					bdevname(bh->b_bdev, b));
464 | 		}
465 | 		set_bit(AS_EIO, &page->mapping->flags);
466 | 		set_buffer_write_io_error(bh);
467 | 		clear_buffer_uptodate(bh);
468 | 		SetPageError(page);
469 | 	}
470 |
471 | 	first = page_buffers(page);
472 | 	local_irq_save(flags);
473 | 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
474 |
475 | 	clear_buffer_async_write(bh);
476 | 	unlock_buffer(bh);
477 | 	tmp = bh->b_this_page;
478 | 	while (tmp != bh) {
479 | 		if (buffer_async_write(tmp)) {
480 | 			BUG_ON(!buffer_locked(tmp));
481 | 			goto still_busy;
482 | 		}
483 | 		tmp = tmp->b_this_page;
484 | 	}
485 | 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
486 | 	local_irq_restore(flags);
487 | 	end_page_writeback(page);
488 | 	return;
489 |
490 | still_busy:
491 | 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
492 | 	local_irq_restore(flags);
493 | 	return;
494 | }
495 |
496 | /*
497 |  * If a page's buffers are under async readin (end_buffer_async_read
498 |  * completion) then there is a possibility that another thread of
499 |  * control could lock one of the buffers after it has completed
500 |  * but while some of the other buffers have not completed. This
501 |  * locked buffer would confuse end_buffer_async_read() into not unlocking
502 |  * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
503 |  * that this buffer is not under async I/O.
504 |  *
505 |  * The page comes unlocked when it has no locked buffer_async buffers
506 |  * left.
507 |  *
508 |  * PageLocked prevents anyone starting new async I/O reads any of
509 |  * the buffers.
510 |  *
511 |  * PageWriteback is used to prevent simultaneous writeout of the same
512 |  * page.
513 |  *
514 |  * PageLocked prevents anyone from starting writeback of a page which is
515 |  * under read I/O (PageWriteback is only ever set against a locked page).
516 |  */
517 | static void mark_buffer_async_read(struct buffer_head *bh)
518 | {
519 | 	bh->b_end_io = end_buffer_async_read;
520 | 	set_buffer_async_read(bh);
521 | }
522 |
523 | void mark_buffer_async_write(struct buffer_head *bh)
524 | {
525 | 	bh->b_end_io = end_buffer_async_write;
526 | 	set_buffer_async_write(bh);
527 | }
528 | EXPORT_SYMBOL(mark_buffer_async_write);
529 |
530 |
531 | /*
532 |  * fs/buffer.c contains helper functions for buffer-backed address space's
533 |  * fsync functions. A common requirement for buffer-based filesystems is
534 |  * that certain data from the backing blockdev needs to be written out for
535 |  * a successful fsync(). For example, ext2 indirect blocks need to be
536 |  * written back and waited upon before fsync() returns.
537 |  *
538 |  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
539 |  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
540 |  * management of a list of dependent buffers at ->i_mapping->private_list.
541 |  *
542 |  * Locking is a little subtle: try_to_free_buffers() will remove buffers
543 |  * from their controlling inode's queue when they are being freed. But
544 |  * try_to_free_buffers() will be operating against the *blockdev* mapping
545 |  * at the time, not against the S_ISREG file which depends on those buffers.
546 |  * So the locking for private_list is via the private_lock in the address_space
547 |  * which backs the buffers. Which is different from the address_space
548 |  * against which the buffers are listed. So for a particular address_space,
549 |  * mapping->private_lock does *not* protect mapping->private_list! In fact,
550 |  * mapping->private_list will always be protected by the backing blockdev's
551 |  * ->private_lock.
552 |  *
553 |  * Which introduces a requirement: all buffers on an address_space's
554 |  * ->private_list must be from the same address_space: the blockdev's.
555 |  *
556 |  * address_spaces which do not place buffers at ->private_list via these
557 |  * utility functions are free to use private_lock and private_list for
558 |  * whatever they want. The only requirement is that list_empty(private_list)
559 |  * be true at clear_inode() time.
560 |  *
561 |  * FIXME: clear_inode should not call invalidate_inode_buffers(). The
562 |  * filesystems should do that. invalidate_inode_buffers() should just go
563 |  * BUG_ON(!list_empty).
564 |  *
565 |  * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
566 |  * take an address_space, not an inode. And it should be called
567 |  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
568 |  * queued up.
569 |  *
570 |  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
571 |  * list if it is already on a list. Because if the buffer is on a list,
572 |  * it *must* already be on the right one. If not, the filesystem is being
573 |  * silly. This will save a ton of locking. But first we have to ensure
574 |  * that buffers are taken *off* the old inode's list when they are freed
575 |  * (presumably in truncate). That requires careful auditing of all
576 |  * filesystems (do it inside bforget()). It could also be done by bringing
577 |  * b_inode back.
578 |  */
579 |
580 | /*
581 |  * The buffer's backing address_space's private_lock must be held
582 |  */
583 | static inline void __remove_assoc_queue(struct buffer_head *bh)
584 | {
585 | 	list_del_init(&bh->b_assoc_buffers);
586 | 	WARN_ON(!bh->b_assoc_map);
587 | 	if (buffer_write_io_error(bh))
588 | 		set_bit(AS_EIO, &bh->b_assoc_map->flags);
589 | 	bh->b_assoc_map = NULL;
590 | }
591 |
592 | int inode_has_buffers(struct inode *inode)
593 | {
594 | 	return !list_empty(&inode->i_data.private_list);
595 | }
596 |
597 | /*
598 |  * osync is designed to support O_SYNC io. It waits synchronously for
599 |  * all already-submitted IO to complete, but does not queue any new
600 |  * writes to the disk.
601 |  *
602 |  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
603 |  * you dirty the buffers, and then use osync_inode_buffers to wait for
604 |  * completion. Any other dirty buffers which are not yet queued for
605 |  * write will not be flushed to disk by the osync.
606 |  */
607 | static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
608 | {
609 | 	struct buffer_head *bh;
610 | 	struct list_head *p;
611 | 	int err = 0;
612 |
613 | 	spin_lock(lock);
614 | repeat:
615 | 	list_for_each_prev(p, list) {
616 | 		bh = BH_ENTRY(p);
617 | 		if (buffer_locked(bh)) {
618 | 			get_bh(bh);
619 | 			spin_unlock(lock);
620 | 			wait_on_buffer(bh);
621 | 			if (!buffer_uptodate(bh))
622 | 				err = -EIO;
623 | 			brelse(bh);
624 | 			spin_lock(lock);
625 | 			goto repeat;
626 | 		}
627 | 	}
628 | 	spin_unlock(lock);
629 | 	return err;
630 | }
631 |
632 | /**
633 |  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
634 |  * @mapping: the mapping which wants those buffers written
635 |  *
636 |  * Starts I/O against the buffers at mapping->private_list, and waits upon
637 |  * that I/O.
638 |  *
639 |  * Basically, this is a convenience function for fsync().
640 |  * @mapping is a file or directory which needs those buffers to be written for
641 |  * a successful fsync().
642 |  */
643 | int sync_mapping_buffers(struct address_space *mapping)
644 | {
645 | 	struct address_space *buffer_mapping = mapping->assoc_mapping;
646 |
647 | 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
648 | 		return 0;
649 |
650 | 	return fsync_buffers_list(&buffer_mapping->private_lock,
651 | 					&mapping->private_list);
652 | }
653 | EXPORT_SYMBOL(sync_mapping_buffers);
654 |
655 | /*
656 |  * Called when we've recently written block `bblock', and it is known that
657 |  * `bblock' was for a buffer_boundary() buffer. This means that the block at
658 |  * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
659 |  * dirty, schedule it for IO. So that indirects merge nicely with their data.
660 |  */
661 | void write_boundary_block(struct block_device *bdev,
662 | 			sector_t bblock, unsigned blocksize)
663 | {
664 | 	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
665 | 	if (bh) {
666 | 		if (buffer_dirty(bh))
667 | 			ll_rw_block(WRITE, 1, &bh);
668 | 		put_bh(bh);
669 | 	}
670 | }
671 |
672 | void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
673 | {
674 | 	struct address_space *mapping = inode->i_mapping;
675 | 	struct address_space *buffer_mapping = bh->b_page->mapping;
676 |
677 | 	mark_buffer_dirty(bh);
678 | 	if (!mapping->assoc_mapping) {
679 | 		mapping->assoc_mapping = buffer_mapping;
680 | 	} else {
681 | 		BUG_ON(mapping->assoc_mapping != buffer_mapping);
682 | 	}
683 | 	if (!bh->b_assoc_map) {
684 | 		spin_lock(&buffer_mapping->private_lock);
685 | 		list_move_tail(&bh->b_assoc_buffers,
686 | 				&mapping->private_list);
687 | 		bh->b_assoc_map = mapping;
688 | 		spin_unlock(&buffer_mapping->private_lock);
689 | 	}
690 | }
691 | EXPORT_SYMBOL(mark_buffer_dirty_inode);
692 |
693 | /*
694 |  * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
695 |  * dirty.
696 |  *
697 |  * If warn is true, then emit a warning if the page is not uptodate and has
698 |  * not been truncated.
699 |  */
700 | static int __set_page_dirty(struct page *page,
701 | 		struct address_space *mapping, int warn)
702 | {
703 | 	if (unlikely(!mapping))
704 | 		return !TestSetPageDirty(page);
705 |
706 | 	if (TestSetPageDirty(page))
707 | 		return 0;
708 |
709 | 	write_lock_irq(&mapping->tree_lock);
710 | 	if (page->mapping) {	/* Race with truncate? */
711 | 		WARN_ON_ONCE(warn && !PageUptodate(page));
712 |
713 | 		if (mapping_cap_account_dirty(mapping)) {
714 | 			__inc_zone_page_state(page, NR_FILE_DIRTY);
715 | 			__inc_bdi_stat(mapping->backing_dev_info,
716 | 					BDI_RECLAIMABLE);
717 | 			task_io_account_write(PAGE_CACHE_SIZE);
718 | 		}
719 | 		radix_tree_tag_set(&mapping->page_tree,
720 | 				page_index(page), PAGECACHE_TAG_DIRTY);
721 | 	}
722 | 	write_unlock_irq(&mapping->tree_lock);
723 | 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
724 |
725 | 	return 1;
726 | }
727 |
728 | /*
729 |  * Add a page to the dirty page list.
730 |  *
731 |  * It is a sad fact of life that this function is called from several places
732 |  * deeply under spinlocking. It may not sleep.
733 |  *
734 |  * If the page has buffers, the uptodate buffers are set dirty, to preserve
735 |  * dirty-state coherency between the page and the buffers. It the page does
736 |  * not have buffers then when they are later attached they will all be set
737 |  * dirty.
738 |  *
739 |  * The buffers are dirtied before the page is dirtied. There's a small race
740 |  * window in which a writepage caller may see the page cleanness but not the
741 |  * buffer dirtiness. That's fine. If this code were to set the page dirty
742 |  * before the buffers, a concurrent writepage caller could clear the page dirty
743 |  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
744 |  * page on the dirty page list.
745 |  *
746 |  * We use private_lock to lock against try_to_free_buffers while using the
747 |  * page's buffer list. Also use this to protect against clean buffers being
748 |  * added to the page after it was set dirty.
749 |  *
750 |  * FIXME: may need to call ->reservepage here as well. That's rather up to the
751 |  * address_space though.
752 |  */
753 | int __set_page_dirty_buffers(struct page *page)
754 | {
755 | 	struct address_space *mapping = page_mapping(page);
756 |
757 | 	if (unlikely(!mapping))
758 | 		return !TestSetPageDirty(page);
759 |
760 | 	spin_lock(&mapping->private_lock);
761 | 	if (page_has_buffers(page)) {
762 | 		struct buffer_head *head = page_buffers(page);
763 | 		struct buffer_head *bh = head;
764 |
765 | 		do {
766 | 			set_buffer_dirty(bh);
767 | 			bh = bh->b_this_page;
768 | 		} while (bh != head);
769 | 	}
770 | 	spin_unlock(&mapping->private_lock);
771 |
772 | 	return __set_page_dirty(page, mapping, 1);
773 | }
774 | EXPORT_SYMBOL(__set_page_dirty_buffers);
775 |
776 | /*
777 |  * Write out and wait upon a list of buffers.
778 |  *
779 |  * We have conflicting pressures: we want to make sure that all
780 |  * initially dirty buffers get waited on, but that any subsequently
781 |  * dirtied buffers don't. After all, we don't want fsync to last
782 |  * forever if somebody is actively writing to the file.
783 |  *
784 |  * Do this in two main stages: first we copy dirty buffers to a
785 |  * temporary inode list, queueing the writes as we go. Then we clean
786 |  * up, waiting for those writes to complete.
787 |  *
788 |  * During this second stage, any subsequent updates to the file may end
789 |  * up refiling the buffer on the original inode's dirty list again, so
790 |  * there is a chance we will end up with a buffer queued for write but
791 |  * not yet completed on that list. So, as a final cleanup we go through
792 |  * the osync code to catch these locked, dirty buffers without requeuing
793 |  * any newly dirty buffers for write.
794 |  */
795 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
796 | {
797 | 	struct buffer_head *bh;
798 | 	struct list_head tmp;
799 | 	struct address_space *mapping;
800 | 	int err = 0, err2;
801 |
802 | 	INIT_LIST_HEAD(&tmp);
803 |
804 | 	spin_lock(lock);
805 | 	while (!list_empty(list)) {
806 | 		bh = BH_ENTRY(list->next);
807 | 		mapping = bh->b_assoc_map;
808 | 		__remove_assoc_queue(bh);
809 | 		/* Avoid race with mark_buffer_dirty_inode() which does
810 | 		 * a lockless check and we rely on seeing the dirty bit */
811 | 		smp_mb();
812 | 		if (buffer_dirty(bh) || buffer_locked(bh)) {
813 | 			list_add(&bh->b_assoc_buffers, &tmp);
814 | 			bh->b_assoc_map = mapping;
815 | 			if (buffer_dirty(bh)) {
816 | 				get_bh(bh);
817 | 				spin_unlock(lock);
818 | 				/*
819 | 				 * Ensure any pending I/O completes so that
820 | 				 * ll_rw_block() actually writes the current
821 | 				 * contents - it is a noop if I/O is still in
822 | 				 * flight on potentially older contents.
823 | */ | 823 | */ |
824 | ll_rw_block(SWRITE_SYNC, 1, &bh); | 824 | ll_rw_block(SWRITE_SYNC, 1, &bh); |
825 | brelse(bh); | 825 | brelse(bh); |
826 | spin_lock(lock); | 826 | spin_lock(lock); |
827 | } | 827 | } |
828 | } | 828 | } |
829 | } | 829 | } |
830 | 830 | ||
831 | while (!list_empty(&tmp)) { | 831 | while (!list_empty(&tmp)) { |
832 | bh = BH_ENTRY(tmp.prev); | 832 | bh = BH_ENTRY(tmp.prev); |
833 | get_bh(bh); | 833 | get_bh(bh); |
834 | mapping = bh->b_assoc_map; | 834 | mapping = bh->b_assoc_map; |
835 | __remove_assoc_queue(bh); | 835 | __remove_assoc_queue(bh); |
836 | /* Avoid race with mark_buffer_dirty_inode() which does | 836 | /* Avoid race with mark_buffer_dirty_inode() which does |
837 | * a lockless check and we rely on seeing the dirty bit */ | 837 | * a lockless check and we rely on seeing the dirty bit */ |
838 | smp_mb(); | 838 | smp_mb(); |
839 | if (buffer_dirty(bh)) { | 839 | if (buffer_dirty(bh)) { |
840 | list_add(&bh->b_assoc_buffers, | 840 | list_add(&bh->b_assoc_buffers, |
841 | &mapping->private_list); | 841 | &mapping->private_list); |
842 | bh->b_assoc_map = mapping; | 842 | bh->b_assoc_map = mapping; |
843 | } | 843 | } |
844 | spin_unlock(lock); | 844 | spin_unlock(lock); |
845 | wait_on_buffer(bh); | 845 | wait_on_buffer(bh); |
846 | if (!buffer_uptodate(bh)) | 846 | if (!buffer_uptodate(bh)) |
847 | err = -EIO; | 847 | err = -EIO; |
848 | brelse(bh); | 848 | brelse(bh); |
849 | spin_lock(lock); | 849 | spin_lock(lock); |
850 | } | 850 | } |
851 | 851 | ||
852 | spin_unlock(lock); | 852 | spin_unlock(lock); |
853 | err2 = osync_buffers_list(lock, list); | 853 | err2 = osync_buffers_list(lock, list); |
854 | if (err) | 854 | if (err) |
855 | return err; | 855 | return err; |
856 | else | 856 | else |
857 | return err2; | 857 | return err2; |
858 | } | 858 | } |
859 | 859 | ||
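/*
 * [Editorial sketch, not part of this commit.]  The per-inode list that
 * fsync_buffers_list() drains is populated via mark_buffer_dirty_inode().
 * A filesystem that wants fsync() to catch a piece of metadata (say an
 * indirect block) would do roughly this; "foofs" is a hypothetical name.
 */
static void foofs_dirty_metadata(struct inode *inode, struct buffer_head *bh)
{
	mark_buffer_dirty_inode(bh, inode);	/* queue on the inode's private_list */
}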
860 | /* | 860 | /* |
861 | * Invalidate any and all dirty buffers on a given inode. We are | 861 | * Invalidate any and all dirty buffers on a given inode. We are |
862 | * probably unmounting the fs, but that doesn't mean we have already | 862 | * probably unmounting the fs, but that doesn't mean we have already |
863 | * done a sync(). Just drop the buffers from the inode list. | 863 | * done a sync(). Just drop the buffers from the inode list. |
864 | * | 864 | * |
865 | * NOTE: we take the inode's blockdev's mapping's private_lock. Which | 865 | * NOTE: we take the inode's blockdev's mapping's private_lock. Which |
866 | * assumes that all the buffers are against the blockdev. Not true | 866 | * assumes that all the buffers are against the blockdev. Not true |
867 | * for reiserfs. | 867 | * for reiserfs. |
868 | */ | 868 | */ |
869 | void invalidate_inode_buffers(struct inode *inode) | 869 | void invalidate_inode_buffers(struct inode *inode) |
870 | { | 870 | { |
871 | if (inode_has_buffers(inode)) { | 871 | if (inode_has_buffers(inode)) { |
872 | struct address_space *mapping = &inode->i_data; | 872 | struct address_space *mapping = &inode->i_data; |
873 | struct list_head *list = &mapping->private_list; | 873 | struct list_head *list = &mapping->private_list; |
874 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 874 | struct address_space *buffer_mapping = mapping->assoc_mapping; |
875 | 875 | ||
876 | spin_lock(&buffer_mapping->private_lock); | 876 | spin_lock(&buffer_mapping->private_lock); |
877 | while (!list_empty(list)) | 877 | while (!list_empty(list)) |
878 | __remove_assoc_queue(BH_ENTRY(list->next)); | 878 | __remove_assoc_queue(BH_ENTRY(list->next)); |
879 | spin_unlock(&buffer_mapping->private_lock); | 879 | spin_unlock(&buffer_mapping->private_lock); |
880 | } | 880 | } |
881 | } | 881 | } |
882 | 882 | ||
883 | /* | 883 | /* |
884 | * Remove any clean buffers from the inode's buffer list. This is called | 884 | * Remove any clean buffers from the inode's buffer list. This is called |
885 | * when we're trying to free the inode itself. Those buffers can pin it. | 885 | * when we're trying to free the inode itself. Those buffers can pin it. |
886 | * | 886 | * |
887 | * Returns true if all buffers were removed. | 887 | * Returns true if all buffers were removed. |
888 | */ | 888 | */ |
889 | int remove_inode_buffers(struct inode *inode) | 889 | int remove_inode_buffers(struct inode *inode) |
890 | { | 890 | { |
891 | int ret = 1; | 891 | int ret = 1; |
892 | 892 | ||
893 | if (inode_has_buffers(inode)) { | 893 | if (inode_has_buffers(inode)) { |
894 | struct address_space *mapping = &inode->i_data; | 894 | struct address_space *mapping = &inode->i_data; |
895 | struct list_head *list = &mapping->private_list; | 895 | struct list_head *list = &mapping->private_list; |
896 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 896 | struct address_space *buffer_mapping = mapping->assoc_mapping; |
897 | 897 | ||
898 | spin_lock(&buffer_mapping->private_lock); | 898 | spin_lock(&buffer_mapping->private_lock); |
899 | while (!list_empty(list)) { | 899 | while (!list_empty(list)) { |
900 | struct buffer_head *bh = BH_ENTRY(list->next); | 900 | struct buffer_head *bh = BH_ENTRY(list->next); |
901 | if (buffer_dirty(bh)) { | 901 | if (buffer_dirty(bh)) { |
902 | ret = 0; | 902 | ret = 0; |
903 | break; | 903 | break; |
904 | } | 904 | } |
905 | __remove_assoc_queue(bh); | 905 | __remove_assoc_queue(bh); |
906 | } | 906 | } |
907 | spin_unlock(&buffer_mapping->private_lock); | 907 | spin_unlock(&buffer_mapping->private_lock); |
908 | } | 908 | } |
909 | return ret; | 909 | return ret; |
910 | } | 910 | } |
911 | 911 | ||
912 | /* | 912 | /* |
913 | * Create the appropriate buffers when given a page for data area and | 913 | * Create the appropriate buffers when given a page for data area and |
914 | * the size of each buffer.. Use the bh->b_this_page linked list to | 914 | * the size of each buffer.. Use the bh->b_this_page linked list to |
915 | * follow the buffers created. Return NULL if unable to create more | 915 | * follow the buffers created. Return NULL if unable to create more |
916 | * buffers. | 916 | * buffers. |
917 | * | 917 | * |
918 | * The retry flag is used to differentiate async IO (paging, swapping), | 918 | * The retry flag is used to differentiate async IO (paging, swapping), |
919 | * which may not fail, from ordinary buffer allocations. | 919 | * which may not fail, from ordinary buffer allocations. |
920 | */ | 920 | */ |
921 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, | 921 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, |
922 | int retry) | 922 | int retry) |
923 | { | 923 | { |
924 | struct buffer_head *bh, *head; | 924 | struct buffer_head *bh, *head; |
925 | long offset; | 925 | long offset; |
926 | 926 | ||
927 | try_again: | 927 | try_again: |
928 | head = NULL; | 928 | head = NULL; |
929 | offset = PAGE_SIZE; | 929 | offset = PAGE_SIZE; |
930 | while ((offset -= size) >= 0) { | 930 | while ((offset -= size) >= 0) { |
931 | bh = alloc_buffer_head(GFP_NOFS); | 931 | bh = alloc_buffer_head(GFP_NOFS); |
932 | if (!bh) | 932 | if (!bh) |
933 | goto no_grow; | 933 | goto no_grow; |
934 | 934 | ||
935 | bh->b_bdev = NULL; | 935 | bh->b_bdev = NULL; |
936 | bh->b_this_page = head; | 936 | bh->b_this_page = head; |
937 | bh->b_blocknr = -1; | 937 | bh->b_blocknr = -1; |
938 | head = bh; | 938 | head = bh; |
939 | 939 | ||
940 | bh->b_state = 0; | 940 | bh->b_state = 0; |
941 | atomic_set(&bh->b_count, 0); | 941 | atomic_set(&bh->b_count, 0); |
942 | bh->b_private = NULL; | 942 | bh->b_private = NULL; |
943 | bh->b_size = size; | 943 | bh->b_size = size; |
944 | 944 | ||
945 | /* Link the buffer to its page */ | 945 | /* Link the buffer to its page */ |
946 | set_bh_page(bh, page, offset); | 946 | set_bh_page(bh, page, offset); |
947 | 947 | ||
948 | init_buffer(bh, NULL, NULL); | 948 | init_buffer(bh, NULL, NULL); |
949 | } | 949 | } |
950 | return head; | 950 | return head; |
951 | /* | 951 | /* |
952 | * In case anything failed, we just free everything we got. | 952 | * In case anything failed, we just free everything we got. |
953 | */ | 953 | */ |
954 | no_grow: | 954 | no_grow: |
955 | if (head) { | 955 | if (head) { |
956 | do { | 956 | do { |
957 | bh = head; | 957 | bh = head; |
958 | head = head->b_this_page; | 958 | head = head->b_this_page; |
959 | free_buffer_head(bh); | 959 | free_buffer_head(bh); |
960 | } while (head); | 960 | } while (head); |
961 | } | 961 | } |
962 | 962 | ||
963 | /* | 963 | /* |
964 | * Return failure for non-async IO requests. Async IO requests | 964 | * Return failure for non-async IO requests. Async IO requests |
965 | * are not allowed to fail, so we have to wait until buffer heads | 965 | * are not allowed to fail, so we have to wait until buffer heads |
966 | * become available. But we don't want tasks sleeping with | 966 | * become available. But we don't want tasks sleeping with |
967 | * partially complete buffers, so all were released above. | 967 | * partially complete buffers, so all were released above. |
968 | */ | 968 | */ |
969 | if (!retry) | 969 | if (!retry) |
970 | return NULL; | 970 | return NULL; |
971 | 971 | ||
972 | /* We're _really_ low on memory. Now we just | 972 | /* We're _really_ low on memory. Now we just |
973 | * wait for old buffer heads to become free due to | 973 | * wait for old buffer heads to become free due to |
974 | * finishing IO. Since this is an async request and | 974 | * finishing IO. Since this is an async request and |
975 | * the reserve list is empty, we're sure there are | 975 | * the reserve list is empty, we're sure there are |
976 | * async buffer heads in use. | 976 | * async buffer heads in use. |
977 | */ | 977 | */ |
978 | free_more_memory(); | 978 | free_more_memory(); |
979 | goto try_again; | 979 | goto try_again; |
980 | } | 980 | } |
981 | EXPORT_SYMBOL_GPL(alloc_page_buffers); | 981 | EXPORT_SYMBOL_GPL(alloc_page_buffers); |
982 | 982 | ||
983 | static inline void | 983 | static inline void |
984 | link_dev_buffers(struct page *page, struct buffer_head *head) | 984 | link_dev_buffers(struct page *page, struct buffer_head *head) |
985 | { | 985 | { |
986 | struct buffer_head *bh, *tail; | 986 | struct buffer_head *bh, *tail; |
987 | 987 | ||
988 | bh = head; | 988 | bh = head; |
989 | do { | 989 | do { |
990 | tail = bh; | 990 | tail = bh; |
991 | bh = bh->b_this_page; | 991 | bh = bh->b_this_page; |
992 | } while (bh); | 992 | } while (bh); |
993 | tail->b_this_page = head; | 993 | tail->b_this_page = head; |
994 | attach_page_buffers(page, head); | 994 | attach_page_buffers(page, head); |
995 | } | 995 | } |
996 | 996 | ||
997 | /* | 997 | /* |
998 | * Initialise the state of a blockdev page's buffers. | 998 | * Initialise the state of a blockdev page's buffers. |
999 | */ | 999 | */ |
1000 | static void | 1000 | static void |
1001 | init_page_buffers(struct page *page, struct block_device *bdev, | 1001 | init_page_buffers(struct page *page, struct block_device *bdev, |
1002 | sector_t block, int size) | 1002 | sector_t block, int size) |
1003 | { | 1003 | { |
1004 | struct buffer_head *head = page_buffers(page); | 1004 | struct buffer_head *head = page_buffers(page); |
1005 | struct buffer_head *bh = head; | 1005 | struct buffer_head *bh = head; |
1006 | int uptodate = PageUptodate(page); | 1006 | int uptodate = PageUptodate(page); |
1007 | 1007 | ||
1008 | do { | 1008 | do { |
1009 | if (!buffer_mapped(bh)) { | 1009 | if (!buffer_mapped(bh)) { |
1010 | init_buffer(bh, NULL, NULL); | 1010 | init_buffer(bh, NULL, NULL); |
1011 | bh->b_bdev = bdev; | 1011 | bh->b_bdev = bdev; |
1012 | bh->b_blocknr = block; | 1012 | bh->b_blocknr = block; |
1013 | if (uptodate) | 1013 | if (uptodate) |
1014 | set_buffer_uptodate(bh); | 1014 | set_buffer_uptodate(bh); |
1015 | set_buffer_mapped(bh); | 1015 | set_buffer_mapped(bh); |
1016 | } | 1016 | } |
1017 | block++; | 1017 | block++; |
1018 | bh = bh->b_this_page; | 1018 | bh = bh->b_this_page; |
1019 | } while (bh != head); | 1019 | } while (bh != head); |
1020 | } | 1020 | } |
1021 | 1021 | ||
1022 | /* | 1022 | /* |
1023 | * Create the page-cache page that contains the requested block. | 1023 | * Create the page-cache page that contains the requested block. |
1024 | * | 1024 | * |
1025 | * This is used purely for blockdev mappings. | 1025 | * This is used purely for blockdev mappings. |
1026 | */ | 1026 | */ |
1027 | static struct page * | 1027 | static struct page * |
1028 | grow_dev_page(struct block_device *bdev, sector_t block, | 1028 | grow_dev_page(struct block_device *bdev, sector_t block, |
1029 | pgoff_t index, int size) | 1029 | pgoff_t index, int size) |
1030 | { | 1030 | { |
1031 | struct inode *inode = bdev->bd_inode; | 1031 | struct inode *inode = bdev->bd_inode; |
1032 | struct page *page; | 1032 | struct page *page; |
1033 | struct buffer_head *bh; | 1033 | struct buffer_head *bh; |
1034 | 1034 | ||
1035 | page = find_or_create_page(inode->i_mapping, index, | 1035 | page = find_or_create_page(inode->i_mapping, index, |
1036 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); | 1036 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); |
1037 | if (!page) | 1037 | if (!page) |
1038 | return NULL; | 1038 | return NULL; |
1039 | 1039 | ||
1040 | BUG_ON(!PageLocked(page)); | 1040 | BUG_ON(!PageLocked(page)); |
1041 | 1041 | ||
1042 | if (page_has_buffers(page)) { | 1042 | if (page_has_buffers(page)) { |
1043 | bh = page_buffers(page); | 1043 | bh = page_buffers(page); |
1044 | if (bh->b_size == size) { | 1044 | if (bh->b_size == size) { |
1045 | init_page_buffers(page, bdev, block, size); | 1045 | init_page_buffers(page, bdev, block, size); |
1046 | return page; | 1046 | return page; |
1047 | } | 1047 | } |
1048 | if (!try_to_free_buffers(page)) | 1048 | if (!try_to_free_buffers(page)) |
1049 | goto failed; | 1049 | goto failed; |
1050 | } | 1050 | } |
1051 | 1051 | ||
1052 | /* | 1052 | /* |
1053 | * Allocate some buffers for this page | 1053 | * Allocate some buffers for this page |
1054 | */ | 1054 | */ |
1055 | bh = alloc_page_buffers(page, size, 0); | 1055 | bh = alloc_page_buffers(page, size, 0); |
1056 | if (!bh) | 1056 | if (!bh) |
1057 | goto failed; | 1057 | goto failed; |
1058 | 1058 | ||
1059 | /* | 1059 | /* |
1060 | * Link the page to the buffers and initialise them. Take the | 1060 | * Link the page to the buffers and initialise them. Take the |
1061 | * lock to be atomic wrt __find_get_block(), which does not | 1061 | * lock to be atomic wrt __find_get_block(), which does not |
1062 | * run under the page lock. | 1062 | * run under the page lock. |
1063 | */ | 1063 | */ |
1064 | spin_lock(&inode->i_mapping->private_lock); | 1064 | spin_lock(&inode->i_mapping->private_lock); |
1065 | link_dev_buffers(page, bh); | 1065 | link_dev_buffers(page, bh); |
1066 | init_page_buffers(page, bdev, block, size); | 1066 | init_page_buffers(page, bdev, block, size); |
1067 | spin_unlock(&inode->i_mapping->private_lock); | 1067 | spin_unlock(&inode->i_mapping->private_lock); |
1068 | return page; | 1068 | return page; |
1069 | 1069 | ||
1070 | failed: | 1070 | failed: |
1071 | BUG(); | 1071 | BUG(); |
1072 | unlock_page(page); | 1072 | unlock_page(page); |
1073 | page_cache_release(page); | 1073 | page_cache_release(page); |
1074 | return NULL; | 1074 | return NULL; |
1075 | } | 1075 | } |
1076 | 1076 | ||
1077 | /* | 1077 | /* |
1078 | * Create buffers for the specified block device block's page. If | 1078 | * Create buffers for the specified block device block's page. If |
1079 | * that page was dirty, the buffers are set dirty also. | 1079 | * that page was dirty, the buffers are set dirty also. |
1080 | */ | 1080 | */ |
1081 | static int | 1081 | static int |
1082 | grow_buffers(struct block_device *bdev, sector_t block, int size) | 1082 | grow_buffers(struct block_device *bdev, sector_t block, int size) |
1083 | { | 1083 | { |
1084 | struct page *page; | 1084 | struct page *page; |
1085 | pgoff_t index; | 1085 | pgoff_t index; |
1086 | int sizebits; | 1086 | int sizebits; |
1087 | 1087 | ||
1088 | sizebits = -1; | 1088 | sizebits = -1; |
1089 | do { | 1089 | do { |
1090 | sizebits++; | 1090 | sizebits++; |
1091 | } while ((size << sizebits) < PAGE_SIZE); | 1091 | } while ((size << sizebits) < PAGE_SIZE); |
1092 | 1092 | ||
1093 | index = block >> sizebits; | 1093 | index = block >> sizebits; |
1094 | 1094 | ||
1095 | /* | 1095 | /* |
1096 | * Check for a block which wants to lie outside our maximum possible | 1096 | * Check for a block which wants to lie outside our maximum possible |
1097 | * pagecache index. (this comparison is done using sector_t types). | 1097 | * pagecache index. (this comparison is done using sector_t types). |
1098 | */ | 1098 | */ |
1099 | if (unlikely(index != block >> sizebits)) { | 1099 | if (unlikely(index != block >> sizebits)) { |
1100 | char b[BDEVNAME_SIZE]; | 1100 | char b[BDEVNAME_SIZE]; |
1101 | 1101 | ||
1102 | printk(KERN_ERR "%s: requested out-of-range block %llu for " | 1102 | printk(KERN_ERR "%s: requested out-of-range block %llu for " |
1103 | "device %s\n", | 1103 | "device %s\n", |
1104 | __func__, (unsigned long long)block, | 1104 | __func__, (unsigned long long)block, |
1105 | bdevname(bdev, b)); | 1105 | bdevname(bdev, b)); |
1106 | return -EIO; | 1106 | return -EIO; |
1107 | } | 1107 | } |
1108 | block = index << sizebits; | 1108 | block = index << sizebits; |
1109 | /* Create a page with the proper size buffers.. */ | 1109 | /* Create a page with the proper size buffers.. */ |
1110 | page = grow_dev_page(bdev, block, index, size); | 1110 | page = grow_dev_page(bdev, block, index, size); |
1111 | if (!page) | 1111 | if (!page) |
1112 | return 0; | 1112 | return 0; |
1113 | unlock_page(page); | 1113 | unlock_page(page); |
1114 | page_cache_release(page); | 1114 | page_cache_release(page); |
1115 | return 1; | 1115 | return 1; |
1116 | } | 1116 | } |
1117 | 1117 | ||
1118 | static struct buffer_head * | 1118 | static struct buffer_head * |
1119 | __getblk_slow(struct block_device *bdev, sector_t block, int size) | 1119 | __getblk_slow(struct block_device *bdev, sector_t block, int size) |
1120 | { | 1120 | { |
1121 | /* Size must be multiple of hard sectorsize */ | 1121 | /* Size must be multiple of hard sectorsize */ |
1122 | if (unlikely(size & (bdev_hardsect_size(bdev)-1) || | 1122 | if (unlikely(size & (bdev_hardsect_size(bdev)-1) || |
1123 | (size < 512 || size > PAGE_SIZE))) { | 1123 | (size < 512 || size > PAGE_SIZE))) { |
1124 | printk(KERN_ERR "getblk(): invalid block size %d requested\n", | 1124 | printk(KERN_ERR "getblk(): invalid block size %d requested\n", |
1125 | size); | 1125 | size); |
1126 | printk(KERN_ERR "hardsect size: %d\n", | 1126 | printk(KERN_ERR "hardsect size: %d\n", |
1127 | bdev_hardsect_size(bdev)); | 1127 | bdev_hardsect_size(bdev)); |
1128 | 1128 | ||
1129 | dump_stack(); | 1129 | dump_stack(); |
1130 | return NULL; | 1130 | return NULL; |
1131 | } | 1131 | } |
1132 | 1132 | ||
1133 | for (;;) { | 1133 | for (;;) { |
1134 | struct buffer_head * bh; | 1134 | struct buffer_head * bh; |
1135 | int ret; | 1135 | int ret; |
1136 | 1136 | ||
1137 | bh = __find_get_block(bdev, block, size); | 1137 | bh = __find_get_block(bdev, block, size); |
1138 | if (bh) | 1138 | if (bh) |
1139 | return bh; | 1139 | return bh; |
1140 | 1140 | ||
1141 | ret = grow_buffers(bdev, block, size); | 1141 | ret = grow_buffers(bdev, block, size); |
1142 | if (ret < 0) | 1142 | if (ret < 0) |
1143 | return NULL; | 1143 | return NULL; |
1144 | if (ret == 0) | 1144 | if (ret == 0) |
1145 | free_more_memory(); | 1145 | free_more_memory(); |
1146 | } | 1146 | } |
1147 | } | 1147 | } |
1148 | 1148 | ||
1149 | /* | 1149 | /* |
1150 | * The relationship between dirty buffers and dirty pages: | 1150 | * The relationship between dirty buffers and dirty pages: |
1151 | * | 1151 | * |
1152 | * Whenever a page has any dirty buffers, the page's dirty bit is set, and | 1152 | * Whenever a page has any dirty buffers, the page's dirty bit is set, and |
1153 | * the page is tagged dirty in its radix tree. | 1153 | * the page is tagged dirty in its radix tree. |
1154 | * | 1154 | * |
1155 | * At all times, the dirtiness of the buffers represents the dirtiness of | 1155 | * At all times, the dirtiness of the buffers represents the dirtiness of |
1156 | * subsections of the page. If the page has buffers, the page dirty bit is | 1156 | * subsections of the page. If the page has buffers, the page dirty bit is |
1157 | * merely a hint about the true dirty state. | 1157 | * merely a hint about the true dirty state. |
1158 | * | 1158 | * |
1159 | * When a page is set dirty in its entirety, all its buffers are marked dirty | 1159 | * When a page is set dirty in its entirety, all its buffers are marked dirty |
1160 | * (if the page has buffers). | 1160 | * (if the page has buffers). |
1161 | * | 1161 | * |
1162 | * When a buffer is marked dirty, its page is dirtied, but the page's other | 1162 | * When a buffer is marked dirty, its page is dirtied, but the page's other |
1163 | * buffers are not. | 1163 | * buffers are not. |
1164 | * | 1164 | * |
1165 | * Also. When blockdev buffers are explicitly read with bread(), they | 1165 | * Also. When blockdev buffers are explicitly read with bread(), they |
1166 | * individually become uptodate. But their backing page remains not | 1166 | * individually become uptodate. But their backing page remains not |
1167 | * uptodate - even if all of its buffers are uptodate. A subsequent | 1167 | * uptodate - even if all of its buffers are uptodate. A subsequent |
1168 | * block_read_full_page() against that page will discover all the uptodate | 1168 | * block_read_full_page() against that page will discover all the uptodate |
1169 | * buffers, will set the page uptodate and will perform no I/O. | 1169 | * buffers, will set the page uptodate and will perform no I/O. |
1170 | */ | 1170 | */ |
1171 | 1171 | ||
1172 | /** | 1172 | /** |
1173 | * mark_buffer_dirty - mark a buffer_head as needing writeout | 1173 | * mark_buffer_dirty - mark a buffer_head as needing writeout |
1174 | * @bh: the buffer_head to mark dirty | 1174 | * @bh: the buffer_head to mark dirty |
1175 | * | 1175 | * |
1176 | * mark_buffer_dirty() will set the dirty bit against the buffer, then set its | 1176 | * mark_buffer_dirty() will set the dirty bit against the buffer, then set its |
1177 | * backing page dirty, then tag the page as dirty in its address_space's radix | 1177 | * backing page dirty, then tag the page as dirty in its address_space's radix |
1178 | * tree and then attach the address_space's inode to its superblock's dirty | 1178 | * tree and then attach the address_space's inode to its superblock's dirty |
1179 | * inode list. | 1179 | * inode list. |
1180 | * | 1180 | * |
1181 | * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, | 1181 | * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, |
1182 | * mapping->tree_lock and the global inode_lock. | 1182 | * mapping->tree_lock and the global inode_lock. |
1183 | */ | 1183 | */ |
1184 | void mark_buffer_dirty(struct buffer_head *bh) | 1184 | void mark_buffer_dirty(struct buffer_head *bh) |
1185 | { | 1185 | { |
1186 | WARN_ON_ONCE(!buffer_uptodate(bh)); | 1186 | WARN_ON_ONCE(!buffer_uptodate(bh)); |
1187 | 1187 | ||
1188 | /* | 1188 | /* |
1189 | * Very *carefully* optimize the it-is-already-dirty case. | 1189 | * Very *carefully* optimize the it-is-already-dirty case. |
1190 | * | 1190 | * |
1191 | * Don't let the final "is it dirty" escape to before we | 1191 | * Don't let the final "is it dirty" escape to before we |
1192 | * perhaps modified the buffer. | 1192 | * perhaps modified the buffer. |
1193 | */ | 1193 | */ |
1194 | if (buffer_dirty(bh)) { | 1194 | if (buffer_dirty(bh)) { |
1195 | smp_mb(); | 1195 | smp_mb(); |
1196 | if (buffer_dirty(bh)) | 1196 | if (buffer_dirty(bh)) |
1197 | return; | 1197 | return; |
1198 | } | 1198 | } |
1199 | 1199 | ||
1200 | if (!test_set_buffer_dirty(bh)) | 1200 | if (!test_set_buffer_dirty(bh)) |
1201 | __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); | 1201 | __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); |
1202 | } | 1202 | } |
1203 | 1203 | ||
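/*
 * [Editorial example, not part of this commit.]  Typical caller pattern for
 * mark_buffer_dirty(): read a metadata block through the buffer cache,
 * modify it, and dirty just that buffer (which also dirties its page).
 * "sb" and "blocknr" are assumed to be supplied by the filesystem.
 */
static int touch_metadata_block(struct super_block *sb, sector_t blocknr)
{
	struct buffer_head *bh = sb_bread(sb, blocknr);

	if (!bh)
		return -EIO;
	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);	/* stand-in for a real update */
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	brelse(bh);
	return 0;
}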
1204 | /* | 1204 | /* |
1205 | * Decrement a buffer_head's reference count. If all buffers against a page | 1205 | * Decrement a buffer_head's reference count. If all buffers against a page |
1206 | * have zero reference count, are clean and unlocked, and if the page is clean | 1206 | * have zero reference count, are clean and unlocked, and if the page is clean |
1207 | * and unlocked then try_to_free_buffers() may strip the buffers from the page | 1207 | * and unlocked then try_to_free_buffers() may strip the buffers from the page |
1208 | * in preparation for freeing it (sometimes, rarely, buffers are removed from | 1208 | * in preparation for freeing it (sometimes, rarely, buffers are removed from |
1209 | * a page but it ends up not being freed, and buffers may later be reattached). | 1209 | * a page but it ends up not being freed, and buffers may later be reattached). |
1210 | */ | 1210 | */ |
1211 | void __brelse(struct buffer_head * buf) | 1211 | void __brelse(struct buffer_head * buf) |
1212 | { | 1212 | { |
1213 | if (atomic_read(&buf->b_count)) { | 1213 | if (atomic_read(&buf->b_count)) { |
1214 | put_bh(buf); | 1214 | put_bh(buf); |
1215 | return; | 1215 | return; |
1216 | } | 1216 | } |
1217 | printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); | 1217 | printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); |
1218 | WARN_ON(1); | 1218 | WARN_ON(1); |
1219 | } | 1219 | } |
1220 | 1220 | ||
1221 | /* | 1221 | /* |
1222 | * bforget() is like brelse(), except it discards any | 1222 | * bforget() is like brelse(), except it discards any |
1223 | * potentially dirty data. | 1223 | * potentially dirty data. |
1224 | */ | 1224 | */ |
1225 | void __bforget(struct buffer_head *bh) | 1225 | void __bforget(struct buffer_head *bh) |
1226 | { | 1226 | { |
1227 | clear_buffer_dirty(bh); | 1227 | clear_buffer_dirty(bh); |
1228 | if (bh->b_assoc_map) { | 1228 | if (bh->b_assoc_map) { |
1229 | struct address_space *buffer_mapping = bh->b_page->mapping; | 1229 | struct address_space *buffer_mapping = bh->b_page->mapping; |
1230 | 1230 | ||
1231 | spin_lock(&buffer_mapping->private_lock); | 1231 | spin_lock(&buffer_mapping->private_lock); |
1232 | list_del_init(&bh->b_assoc_buffers); | 1232 | list_del_init(&bh->b_assoc_buffers); |
1233 | bh->b_assoc_map = NULL; | 1233 | bh->b_assoc_map = NULL; |
1234 | spin_unlock(&buffer_mapping->private_lock); | 1234 | spin_unlock(&buffer_mapping->private_lock); |
1235 | } | 1235 | } |
1236 | __brelse(bh); | 1236 | __brelse(bh); |
1237 | } | 1237 | } |
1238 | 1238 | ||
1239 | static struct buffer_head *__bread_slow(struct buffer_head *bh) | 1239 | static struct buffer_head *__bread_slow(struct buffer_head *bh) |
1240 | { | 1240 | { |
1241 | lock_buffer(bh); | 1241 | lock_buffer(bh); |
1242 | if (buffer_uptodate(bh)) { | 1242 | if (buffer_uptodate(bh)) { |
1243 | unlock_buffer(bh); | 1243 | unlock_buffer(bh); |
1244 | return bh; | 1244 | return bh; |
1245 | } else { | 1245 | } else { |
1246 | get_bh(bh); | 1246 | get_bh(bh); |
1247 | bh->b_end_io = end_buffer_read_sync; | 1247 | bh->b_end_io = end_buffer_read_sync; |
1248 | submit_bh(READ, bh); | 1248 | submit_bh(READ, bh); |
1249 | wait_on_buffer(bh); | 1249 | wait_on_buffer(bh); |
1250 | if (buffer_uptodate(bh)) | 1250 | if (buffer_uptodate(bh)) |
1251 | return bh; | 1251 | return bh; |
1252 | } | 1252 | } |
1253 | brelse(bh); | 1253 | brelse(bh); |
1254 | return NULL; | 1254 | return NULL; |
1255 | } | 1255 | } |
1256 | 1256 | ||
1257 | /* | 1257 | /* |
1258 | * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). | 1258 | * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). |
1259 | * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their | 1259 | * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their |
1260 | * refcount elevated by one when they're in an LRU. A buffer can only appear | 1260 | * refcount elevated by one when they're in an LRU. A buffer can only appear |
1261 | * once in a particular CPU's LRU. A single buffer can be present in multiple | 1261 | * once in a particular CPU's LRU. A single buffer can be present in multiple |
1262 | * CPU's LRUs at the same time. | 1262 | * CPU's LRUs at the same time. |
1263 | * | 1263 | * |
1264 | * This is a transparent caching front-end to sb_bread(), sb_getblk() and | 1264 | * This is a transparent caching front-end to sb_bread(), sb_getblk() and |
1265 | * sb_find_get_block(). | 1265 | * sb_find_get_block(). |
1266 | * | 1266 | * |
1267 | * The LRUs themselves only need locking against invalidate_bh_lrus. We use | 1267 | * The LRUs themselves only need locking against invalidate_bh_lrus. We use |
1268 | * a local interrupt disable for that. | 1268 | * a local interrupt disable for that. |
1269 | */ | 1269 | */ |
1270 | 1270 | ||
1271 | #define BH_LRU_SIZE 8 | 1271 | #define BH_LRU_SIZE 8 |
1272 | 1272 | ||
1273 | struct bh_lru { | 1273 | struct bh_lru { |
1274 | struct buffer_head *bhs[BH_LRU_SIZE]; | 1274 | struct buffer_head *bhs[BH_LRU_SIZE]; |
1275 | }; | 1275 | }; |
1276 | 1276 | ||
1277 | static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; | 1277 | static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; |
1278 | 1278 | ||
1279 | #ifdef CONFIG_SMP | 1279 | #ifdef CONFIG_SMP |
1280 | #define bh_lru_lock() local_irq_disable() | 1280 | #define bh_lru_lock() local_irq_disable() |
1281 | #define bh_lru_unlock() local_irq_enable() | 1281 | #define bh_lru_unlock() local_irq_enable() |
1282 | #else | 1282 | #else |
1283 | #define bh_lru_lock() preempt_disable() | 1283 | #define bh_lru_lock() preempt_disable() |
1284 | #define bh_lru_unlock() preempt_enable() | 1284 | #define bh_lru_unlock() preempt_enable() |
1285 | #endif | 1285 | #endif |
1286 | 1286 | ||
1287 | static inline void check_irqs_on(void) | 1287 | static inline void check_irqs_on(void) |
1288 | { | 1288 | { |
1289 | #ifdef irqs_disabled | 1289 | #ifdef irqs_disabled |
1290 | BUG_ON(irqs_disabled()); | 1290 | BUG_ON(irqs_disabled()); |
1291 | #endif | 1291 | #endif |
1292 | } | 1292 | } |
1293 | 1293 | ||
1294 | /* | 1294 | /* |
1295 | * The LRU management algorithm is dopey-but-simple. Sorry. | 1295 | * The LRU management algorithm is dopey-but-simple. Sorry. |
1296 | */ | 1296 | */ |
1297 | static void bh_lru_install(struct buffer_head *bh) | 1297 | static void bh_lru_install(struct buffer_head *bh) |
1298 | { | 1298 | { |
1299 | struct buffer_head *evictee = NULL; | 1299 | struct buffer_head *evictee = NULL; |
1300 | struct bh_lru *lru; | 1300 | struct bh_lru *lru; |
1301 | 1301 | ||
1302 | check_irqs_on(); | 1302 | check_irqs_on(); |
1303 | bh_lru_lock(); | 1303 | bh_lru_lock(); |
1304 | lru = &__get_cpu_var(bh_lrus); | 1304 | lru = &__get_cpu_var(bh_lrus); |
1305 | if (lru->bhs[0] != bh) { | 1305 | if (lru->bhs[0] != bh) { |
1306 | struct buffer_head *bhs[BH_LRU_SIZE]; | 1306 | struct buffer_head *bhs[BH_LRU_SIZE]; |
1307 | int in; | 1307 | int in; |
1308 | int out = 0; | 1308 | int out = 0; |
1309 | 1309 | ||
1310 | get_bh(bh); | 1310 | get_bh(bh); |
1311 | bhs[out++] = bh; | 1311 | bhs[out++] = bh; |
1312 | for (in = 0; in < BH_LRU_SIZE; in++) { | 1312 | for (in = 0; in < BH_LRU_SIZE; in++) { |
1313 | struct buffer_head *bh2 = lru->bhs[in]; | 1313 | struct buffer_head *bh2 = lru->bhs[in]; |
1314 | 1314 | ||
1315 | if (bh2 == bh) { | 1315 | if (bh2 == bh) { |
1316 | __brelse(bh2); | 1316 | __brelse(bh2); |
1317 | } else { | 1317 | } else { |
1318 | if (out >= BH_LRU_SIZE) { | 1318 | if (out >= BH_LRU_SIZE) { |
1319 | BUG_ON(evictee != NULL); | 1319 | BUG_ON(evictee != NULL); |
1320 | evictee = bh2; | 1320 | evictee = bh2; |
1321 | } else { | 1321 | } else { |
1322 | bhs[out++] = bh2; | 1322 | bhs[out++] = bh2; |
1323 | } | 1323 | } |
1324 | } | 1324 | } |
1325 | } | 1325 | } |
1326 | while (out < BH_LRU_SIZE) | 1326 | while (out < BH_LRU_SIZE) |
1327 | bhs[out++] = NULL; | 1327 | bhs[out++] = NULL; |
1328 | memcpy(lru->bhs, bhs, sizeof(bhs)); | 1328 | memcpy(lru->bhs, bhs, sizeof(bhs)); |
1329 | } | 1329 | } |
1330 | bh_lru_unlock(); | 1330 | bh_lru_unlock(); |
1331 | 1331 | ||
1332 | if (evictee) | 1332 | if (evictee) |
1333 | __brelse(evictee); | 1333 | __brelse(evictee); |
1334 | } | 1334 | } |
1335 | 1335 | ||
1336 | /* | 1336 | /* |
1337 | * Look up the bh in this cpu's LRU. If it's there, move it to the head. | 1337 | * Look up the bh in this cpu's LRU. If it's there, move it to the head. |
1338 | */ | 1338 | */ |
1339 | static struct buffer_head * | 1339 | static struct buffer_head * |
1340 | lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) | 1340 | lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) |
1341 | { | 1341 | { |
1342 | struct buffer_head *ret = NULL; | 1342 | struct buffer_head *ret = NULL; |
1343 | struct bh_lru *lru; | 1343 | struct bh_lru *lru; |
1344 | unsigned int i; | 1344 | unsigned int i; |
1345 | 1345 | ||
1346 | check_irqs_on(); | 1346 | check_irqs_on(); |
1347 | bh_lru_lock(); | 1347 | bh_lru_lock(); |
1348 | lru = &__get_cpu_var(bh_lrus); | 1348 | lru = &__get_cpu_var(bh_lrus); |
1349 | for (i = 0; i < BH_LRU_SIZE; i++) { | 1349 | for (i = 0; i < BH_LRU_SIZE; i++) { |
1350 | struct buffer_head *bh = lru->bhs[i]; | 1350 | struct buffer_head *bh = lru->bhs[i]; |
1351 | 1351 | ||
1352 | if (bh && bh->b_bdev == bdev && | 1352 | if (bh && bh->b_bdev == bdev && |
1353 | bh->b_blocknr == block && bh->b_size == size) { | 1353 | bh->b_blocknr == block && bh->b_size == size) { |
1354 | if (i) { | 1354 | if (i) { |
1355 | while (i) { | 1355 | while (i) { |
1356 | lru->bhs[i] = lru->bhs[i - 1]; | 1356 | lru->bhs[i] = lru->bhs[i - 1]; |
1357 | i--; | 1357 | i--; |
1358 | } | 1358 | } |
1359 | lru->bhs[0] = bh; | 1359 | lru->bhs[0] = bh; |
1360 | } | 1360 | } |
1361 | get_bh(bh); | 1361 | get_bh(bh); |
1362 | ret = bh; | 1362 | ret = bh; |
1363 | break; | 1363 | break; |
1364 | } | 1364 | } |
1365 | } | 1365 | } |
1366 | bh_lru_unlock(); | 1366 | bh_lru_unlock(); |
1367 | return ret; | 1367 | return ret; |
1368 | } | 1368 | } |
1369 | 1369 | ||
1370 | /* | 1370 | /* |
1371 | * Perform a pagecache lookup for the matching buffer. If it's there, refresh | 1371 | * Perform a pagecache lookup for the matching buffer. If it's there, refresh |
1372 | * it in the LRU and mark it as accessed. If it is not present then return | 1372 | * it in the LRU and mark it as accessed. If it is not present then return |
1373 | * NULL. | 1373 | * NULL. |
1374 | */ | 1374 | */ |
1375 | struct buffer_head * | 1375 | struct buffer_head * |
1376 | __find_get_block(struct block_device *bdev, sector_t block, unsigned size) | 1376 | __find_get_block(struct block_device *bdev, sector_t block, unsigned size) |
1377 | { | 1377 | { |
1378 | struct buffer_head *bh = lookup_bh_lru(bdev, block, size); | 1378 | struct buffer_head *bh = lookup_bh_lru(bdev, block, size); |
1379 | 1379 | ||
1380 | if (bh == NULL) { | 1380 | if (bh == NULL) { |
1381 | bh = __find_get_block_slow(bdev, block); | 1381 | bh = __find_get_block_slow(bdev, block); |
1382 | if (bh) | 1382 | if (bh) |
1383 | bh_lru_install(bh); | 1383 | bh_lru_install(bh); |
1384 | } | 1384 | } |
1385 | if (bh) | 1385 | if (bh) |
1386 | touch_buffer(bh); | 1386 | touch_buffer(bh); |
1387 | return bh; | 1387 | return bh; |
1388 | } | 1388 | } |
1389 | EXPORT_SYMBOL(__find_get_block); | 1389 | EXPORT_SYMBOL(__find_get_block); |
1390 | 1390 | ||
1391 | /* | 1391 | /* |
1392 | * __getblk will locate (and, if necessary, create) the buffer_head | 1392 | * __getblk will locate (and, if necessary, create) the buffer_head |
1393 | * which corresponds to the passed block_device, block and size. The | 1393 | * which corresponds to the passed block_device, block and size. The |
1394 | * returned buffer has its reference count incremented. | 1394 | * returned buffer has its reference count incremented. |
1395 | * | 1395 | * |
1396 | * __getblk() cannot fail - it just keeps trying. If you pass it an | 1396 | * __getblk() cannot fail - it just keeps trying. If you pass it an |
1397 | * illegal block number, __getblk() will happily return a buffer_head | 1397 | * illegal block number, __getblk() will happily return a buffer_head |
1398 | * which represents the non-existent block. Very weird. | 1398 | * which represents the non-existent block. Very weird. |
1399 | * | 1399 | * |
1400 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() | 1400 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() |
1401 | * attempt is failing. FIXME, perhaps? | 1401 | * attempt is failing. FIXME, perhaps? |
1402 | */ | 1402 | */ |
1403 | struct buffer_head * | 1403 | struct buffer_head * |
1404 | __getblk(struct block_device *bdev, sector_t block, unsigned size) | 1404 | __getblk(struct block_device *bdev, sector_t block, unsigned size) |
1405 | { | 1405 | { |
1406 | struct buffer_head *bh = __find_get_block(bdev, block, size); | 1406 | struct buffer_head *bh = __find_get_block(bdev, block, size); |
1407 | 1407 | ||
1408 | might_sleep(); | 1408 | might_sleep(); |
1409 | if (bh == NULL) | 1409 | if (bh == NULL) |
1410 | bh = __getblk_slow(bdev, block, size); | 1410 | bh = __getblk_slow(bdev, block, size); |
1411 | return bh; | 1411 | return bh; |
1412 | } | 1412 | } |
1413 | EXPORT_SYMBOL(__getblk); | 1413 | EXPORT_SYMBOL(__getblk); |
1414 | 1414 | ||
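/*
 * [Editorial sketch, not part of this commit.]  When a caller will overwrite
 * an entire block there is no need to read it first: __getblk() suffices,
 * and the buffer is marked uptodate once it has been filled.  "src" is an
 * assumed caller-supplied buffer of at least "size" bytes.
 */
static struct buffer_head *overwrite_block(struct block_device *bdev,
					   sector_t block, unsigned size,
					   const void *src)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	lock_buffer(bh);
	memcpy(bh->b_data, src, size);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	return bh;			/* caller is expected to brelse() */
}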
1415 | /* | 1415 | /* |
1416 | * Do async read-ahead on a buffer.. | 1416 | * Do async read-ahead on a buffer.. |
1417 | */ | 1417 | */ |
1418 | void __breadahead(struct block_device *bdev, sector_t block, unsigned size) | 1418 | void __breadahead(struct block_device *bdev, sector_t block, unsigned size) |
1419 | { | 1419 | { |
1420 | struct buffer_head *bh = __getblk(bdev, block, size); | 1420 | struct buffer_head *bh = __getblk(bdev, block, size); |
1421 | if (likely(bh)) { | 1421 | if (likely(bh)) { |
1422 | ll_rw_block(READA, 1, &bh); | 1422 | ll_rw_block(READA, 1, &bh); |
1423 | brelse(bh); | 1423 | brelse(bh); |
1424 | } | 1424 | } |
1425 | } | 1425 | } |
1426 | EXPORT_SYMBOL(__breadahead); | 1426 | EXPORT_SYMBOL(__breadahead); |
1427 | 1427 | ||
1428 | /** | 1428 | /** |
1429 | * __bread() - reads a specified block and returns the bh | 1429 | * __bread() - reads a specified block and returns the bh |
1430 | * @bdev: the block_device to read from | 1430 | * @bdev: the block_device to read from |
1431 | * @block: number of block | 1431 | * @block: number of block |
1432 | * @size: size (in bytes) to read | 1432 | * @size: size (in bytes) to read |
1433 | * | 1433 | * |
1434 | * Reads a specified block, and returns buffer head that contains it. | 1434 | * Reads a specified block, and returns buffer head that contains it. |
1435 | * It returns NULL if the block was unreadable. | 1435 | * It returns NULL if the block was unreadable. |
1436 | */ | 1436 | */ |
1437 | struct buffer_head * | 1437 | struct buffer_head * |
1438 | __bread(struct block_device *bdev, sector_t block, unsigned size) | 1438 | __bread(struct block_device *bdev, sector_t block, unsigned size) |
1439 | { | 1439 | { |
1440 | struct buffer_head *bh = __getblk(bdev, block, size); | 1440 | struct buffer_head *bh = __getblk(bdev, block, size); |
1441 | 1441 | ||
1442 | if (likely(bh) && !buffer_uptodate(bh)) | 1442 | if (likely(bh) && !buffer_uptodate(bh)) |
1443 | bh = __bread_slow(bh); | 1443 | bh = __bread_slow(bh); |
1444 | return bh; | 1444 | return bh; |
1445 | } | 1445 | } |
1446 | EXPORT_SYMBOL(__bread); | 1446 | EXPORT_SYMBOL(__bread); |
1447 | 1447 | ||
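/*
 * [Editorial sketch, not part of this commit.]  Combining the read helpers:
 * kick off asynchronous read-ahead for the following block, then read the
 * block that is actually needed.  Returns NULL on I/O error, as __bread()
 * does; "bdev", "block" and "blocksize" are assumed inputs.
 */
static struct buffer_head *bread_with_readahead(struct block_device *bdev,
						sector_t block, unsigned blocksize)
{
	__breadahead(bdev, block + 1, blocksize);	/* best effort, async */
	return __bread(bdev, block, blocksize);
}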
1448 | /* | 1448 | /* |
1449 | * invalidate_bh_lrus() is called rarely - but not only at unmount. | 1449 | * invalidate_bh_lrus() is called rarely - but not only at unmount. |
1450 | * This doesn't race because it runs in each cpu either in irq | 1450 | * This doesn't race because it runs in each cpu either in irq |
1451 | * or with preempt disabled. | 1451 | * or with preempt disabled. |
1452 | */ | 1452 | */ |
1453 | static void invalidate_bh_lru(void *arg) | 1453 | static void invalidate_bh_lru(void *arg) |
1454 | { | 1454 | { |
1455 | struct bh_lru *b = &get_cpu_var(bh_lrus); | 1455 | struct bh_lru *b = &get_cpu_var(bh_lrus); |
1456 | int i; | 1456 | int i; |
1457 | 1457 | ||
1458 | for (i = 0; i < BH_LRU_SIZE; i++) { | 1458 | for (i = 0; i < BH_LRU_SIZE; i++) { |
1459 | brelse(b->bhs[i]); | 1459 | brelse(b->bhs[i]); |
1460 | b->bhs[i] = NULL; | 1460 | b->bhs[i] = NULL; |
1461 | } | 1461 | } |
1462 | put_cpu_var(bh_lrus); | 1462 | put_cpu_var(bh_lrus); |
1463 | } | 1463 | } |
1464 | 1464 | ||
1465 | void invalidate_bh_lrus(void) | 1465 | void invalidate_bh_lrus(void) |
1466 | { | 1466 | { |
1467 | on_each_cpu(invalidate_bh_lru, NULL, 1, 1); | 1467 | on_each_cpu(invalidate_bh_lru, NULL, 1, 1); |
1468 | } | 1468 | } |
1469 | EXPORT_SYMBOL_GPL(invalidate_bh_lrus); | 1469 | EXPORT_SYMBOL_GPL(invalidate_bh_lrus); |
1470 | 1470 | ||
1471 | void set_bh_page(struct buffer_head *bh, | 1471 | void set_bh_page(struct buffer_head *bh, |
1472 | struct page *page, unsigned long offset) | 1472 | struct page *page, unsigned long offset) |
1473 | { | 1473 | { |
1474 | bh->b_page = page; | 1474 | bh->b_page = page; |
1475 | BUG_ON(offset >= PAGE_SIZE); | 1475 | BUG_ON(offset >= PAGE_SIZE); |
1476 | if (PageHighMem(page)) | 1476 | if (PageHighMem(page)) |
1477 | /* | 1477 | /* |
1478 | * This catches illegal uses and preserves the offset: | 1478 | * This catches illegal uses and preserves the offset: |
1479 | */ | 1479 | */ |
1480 | bh->b_data = (char *)(0 + offset); | 1480 | bh->b_data = (char *)(0 + offset); |
1481 | else | 1481 | else |
1482 | bh->b_data = page_address(page) + offset; | 1482 | bh->b_data = page_address(page) + offset; |
1483 | } | 1483 | } |
1484 | EXPORT_SYMBOL(set_bh_page); | 1484 | EXPORT_SYMBOL(set_bh_page); |
1485 | 1485 | ||
1486 | /* | 1486 | /* |
1487 | * Called when truncating a buffer on a page completely. | 1487 | * Called when truncating a buffer on a page completely. |
1488 | */ | 1488 | */ |
1489 | static void discard_buffer(struct buffer_head * bh) | 1489 | static void discard_buffer(struct buffer_head * bh) |
1490 | { | 1490 | { |
1491 | lock_buffer(bh); | 1491 | lock_buffer(bh); |
1492 | clear_buffer_dirty(bh); | 1492 | clear_buffer_dirty(bh); |
1493 | bh->b_bdev = NULL; | 1493 | bh->b_bdev = NULL; |
1494 | clear_buffer_mapped(bh); | 1494 | clear_buffer_mapped(bh); |
1495 | clear_buffer_req(bh); | 1495 | clear_buffer_req(bh); |
1496 | clear_buffer_new(bh); | 1496 | clear_buffer_new(bh); |
1497 | clear_buffer_delay(bh); | 1497 | clear_buffer_delay(bh); |
1498 | clear_buffer_unwritten(bh); | 1498 | clear_buffer_unwritten(bh); |
1499 | unlock_buffer(bh); | 1499 | unlock_buffer(bh); |
1500 | } | 1500 | } |
1501 | 1501 | ||
1502 | /** | 1502 | /** |
1503 | * block_invalidatepage - invalidate part or all of a buffer-backed page | 1503 | * block_invalidatepage - invalidate part or all of a buffer-backed page |
1504 | * | 1504 | * |
1505 | * @page: the page which is affected | 1505 | * @page: the page which is affected |
1506 | * @offset: the index of the truncation point | 1506 | * @offset: the index of the truncation point |
1507 | * | 1507 | * |
1508 | * block_invalidatepage() is called when all or part of the page has become | 1508 | * block_invalidatepage() is called when all or part of the page has become |
1509 | * invalidated by a truncate operation. | 1509 | * invalidated by a truncate operation. |
1510 | * | 1510 | * |
1511 | * block_invalidatepage() does not have to release all buffers, but it must | 1511 | * block_invalidatepage() does not have to release all buffers, but it must |
1512 | * ensure that no dirty buffer is left outside @offset and that no I/O | 1512 | * ensure that no dirty buffer is left outside @offset and that no I/O |
1513 | * is underway against any of the blocks which are outside the truncation | 1513 | * is underway against any of the blocks which are outside the truncation |
1514 | * point. Because the caller is about to free (and possibly reuse) those | 1514 | * point. Because the caller is about to free (and possibly reuse) those |
1515 | * blocks on-disk. | 1515 | * blocks on-disk. |
1516 | */ | 1516 | */ |
1517 | void block_invalidatepage(struct page *page, unsigned long offset) | 1517 | void block_invalidatepage(struct page *page, unsigned long offset) |
1518 | { | 1518 | { |
1519 | struct buffer_head *head, *bh, *next; | 1519 | struct buffer_head *head, *bh, *next; |
1520 | unsigned int curr_off = 0; | 1520 | unsigned int curr_off = 0; |
1521 | 1521 | ||
1522 | BUG_ON(!PageLocked(page)); | 1522 | BUG_ON(!PageLocked(page)); |
1523 | if (!page_has_buffers(page)) | 1523 | if (!page_has_buffers(page)) |
1524 | goto out; | 1524 | goto out; |
1525 | 1525 | ||
1526 | head = page_buffers(page); | 1526 | head = page_buffers(page); |
1527 | bh = head; | 1527 | bh = head; |
1528 | do { | 1528 | do { |
1529 | unsigned int next_off = curr_off + bh->b_size; | 1529 | unsigned int next_off = curr_off + bh->b_size; |
1530 | next = bh->b_this_page; | 1530 | next = bh->b_this_page; |
1531 | 1531 | ||
1532 | /* | 1532 | /* |
1533 | * is this block fully invalidated? | 1533 | * is this block fully invalidated? |
1534 | */ | 1534 | */ |
1535 | if (offset <= curr_off) | 1535 | if (offset <= curr_off) |
1536 | discard_buffer(bh); | 1536 | discard_buffer(bh); |
1537 | curr_off = next_off; | 1537 | curr_off = next_off; |
1538 | bh = next; | 1538 | bh = next; |
1539 | } while (bh != head); | 1539 | } while (bh != head); |
1540 | 1540 | ||
1541 | /* | 1541 | /* |
1542 | * We release buffers only if the entire page is being invalidated. | 1542 | * We release buffers only if the entire page is being invalidated. |
1543 | * The get_block cached value has been unconditionally invalidated, | 1543 | * The get_block cached value has been unconditionally invalidated, |
1544 | * so real IO is not possible anymore. | 1544 | * so real IO is not possible anymore. |
1545 | */ | 1545 | */ |
1546 | if (offset == 0) | 1546 | if (offset == 0) |
1547 | try_to_release_page(page, 0); | 1547 | try_to_release_page(page, 0); |
1548 | out: | 1548 | out: |
1549 | return; | 1549 | return; |
1550 | } | 1550 | } |
1551 | EXPORT_SYMBOL(block_invalidatepage); | 1551 | EXPORT_SYMBOL(block_invalidatepage); |
1552 | 1552 | ||
1553 | /* | 1553 | /* |
1554 | * We attach and possibly dirty the buffers atomically wrt | 1554 | * We attach and possibly dirty the buffers atomically wrt |
1555 | * __set_page_dirty_buffers() via private_lock. try_to_free_buffers | 1555 | * __set_page_dirty_buffers() via private_lock. try_to_free_buffers |
1556 | * is already excluded via the page lock. | 1556 | * is already excluded via the page lock. |
1557 | */ | 1557 | */ |
1558 | void create_empty_buffers(struct page *page, | 1558 | void create_empty_buffers(struct page *page, |
1559 | unsigned long blocksize, unsigned long b_state) | 1559 | unsigned long blocksize, unsigned long b_state) |
1560 | { | 1560 | { |
1561 | struct buffer_head *bh, *head, *tail; | 1561 | struct buffer_head *bh, *head, *tail; |
1562 | 1562 | ||
1563 | head = alloc_page_buffers(page, blocksize, 1); | 1563 | head = alloc_page_buffers(page, blocksize, 1); |
1564 | bh = head; | 1564 | bh = head; |
1565 | do { | 1565 | do { |
1566 | bh->b_state |= b_state; | 1566 | bh->b_state |= b_state; |
1567 | tail = bh; | 1567 | tail = bh; |
1568 | bh = bh->b_this_page; | 1568 | bh = bh->b_this_page; |
1569 | } while (bh); | 1569 | } while (bh); |
1570 | tail->b_this_page = head; | 1570 | tail->b_this_page = head; |
1571 | 1571 | ||
1572 | spin_lock(&page->mapping->private_lock); | 1572 | spin_lock(&page->mapping->private_lock); |
1573 | if (PageUptodate(page) || PageDirty(page)) { | 1573 | if (PageUptodate(page) || PageDirty(page)) { |
1574 | bh = head; | 1574 | bh = head; |
1575 | do { | 1575 | do { |
1576 | if (PageDirty(page)) | 1576 | if (PageDirty(page)) |
1577 | set_buffer_dirty(bh); | 1577 | set_buffer_dirty(bh); |
1578 | if (PageUptodate(page)) | 1578 | if (PageUptodate(page)) |
1579 | set_buffer_uptodate(bh); | 1579 | set_buffer_uptodate(bh); |
1580 | bh = bh->b_this_page; | 1580 | bh = bh->b_this_page; |
1581 | } while (bh != head); | 1581 | } while (bh != head); |
1582 | } | 1582 | } |
1583 | attach_page_buffers(page, head); | 1583 | attach_page_buffers(page, head); |
1584 | spin_unlock(&page->mapping->private_lock); | 1584 | spin_unlock(&page->mapping->private_lock); |
1585 | } | 1585 | } |
1586 | EXPORT_SYMBOL(create_empty_buffers); | 1586 | EXPORT_SYMBOL(create_empty_buffers); |
1587 | 1587 | ||
1588 | /* | 1588 | /* |
1589 | * We are taking a block for data and we don't want any output from any | 1589 | * We are taking a block for data and we don't want any output from any |
1590 | * buffer-cache aliases starting from return from that function and | 1590 | * buffer-cache aliases starting from return from that function and |
1591 | * until the moment when something will explicitly mark the buffer | 1591 | * until the moment when something will explicitly mark the buffer |
1592 | * dirty (hopefully that will not happen until we will free that block ;-) | 1592 | * dirty (hopefully that will not happen until we will free that block ;-) |
1593 | * We don't even need to mark it not-uptodate - nobody can expect | 1593 | * We don't even need to mark it not-uptodate - nobody can expect |
1594 | * anything from a newly allocated buffer anyway. We used to use | 1594 | * anything from a newly allocated buffer anyway. We used to use |
1595 | * unmap_buffer() for such invalidation, but that was wrong. We definitely | 1595 | * unmap_buffer() for such invalidation, but that was wrong. We definitely |
1596 | * don't want to mark the alias unmapped, for example - it would confuse | 1596 | * don't want to mark the alias unmapped, for example - it would confuse |
1597 | * anyone who might pick it with bread() afterwards... | 1597 | * anyone who might pick it with bread() afterwards... |
1598 | * | 1598 | * |
1599 | * Also.. Note that bforget() doesn't lock the buffer. So there can | 1599 | * Also.. Note that bforget() doesn't lock the buffer. So there can |
1600 | * be writeout I/O going on against recently-freed buffers. We don't | 1600 | * be writeout I/O going on against recently-freed buffers. We don't |
1601 | * wait on that I/O in bforget() - it's more efficient to wait on the I/O | 1601 | * wait on that I/O in bforget() - it's more efficient to wait on the I/O |
1602 | * only if we really need to. That happens here. | 1602 | * only if we really need to. That happens here. |
1603 | */ | 1603 | */ |
1604 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block) | 1604 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block) |
1605 | { | 1605 | { |
1606 | struct buffer_head *old_bh; | 1606 | struct buffer_head *old_bh; |
1607 | 1607 | ||
1608 | might_sleep(); | 1608 | might_sleep(); |
1609 | 1609 | ||
1610 | old_bh = __find_get_block_slow(bdev, block); | 1610 | old_bh = __find_get_block_slow(bdev, block); |
1611 | if (old_bh) { | 1611 | if (old_bh) { |
1612 | clear_buffer_dirty(old_bh); | 1612 | clear_buffer_dirty(old_bh); |
1613 | wait_on_buffer(old_bh); | 1613 | wait_on_buffer(old_bh); |
1614 | clear_buffer_req(old_bh); | 1614 | clear_buffer_req(old_bh); |
1615 | __brelse(old_bh); | 1615 | __brelse(old_bh); |
1616 | } | 1616 | } |
1617 | } | 1617 | } |
1618 | EXPORT_SYMBOL(unmap_underlying_metadata); | 1618 | EXPORT_SYMBOL(unmap_underlying_metadata); |
1619 | 1619 | ||
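/*
 * [Editorial sketch, not part of this commit.]  The usual caller pattern:
 * once get_block() reports a freshly allocated block (buffer_new), any stale
 * alias of that block in the blockdev's cache must be shot down before new
 * data is written out.
 */
static void settle_new_block(struct buffer_head *bh)
{
	if (buffer_new(bh)) {
		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
		clear_buffer_new(bh);
	}
}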
1620 | /* | 1620 | /* |
1621 | * NOTE! All mapped/uptodate combinations are valid: | 1621 | * NOTE! All mapped/uptodate combinations are valid: |
1622 | * | 1622 | * |
1623 | * Mapped Uptodate Meaning | 1623 | * Mapped Uptodate Meaning |
1624 | * | 1624 | * |
1625 | * No No "unknown" - must do get_block() | 1625 | * No No "unknown" - must do get_block() |
1626 | * No Yes "hole" - zero-filled | 1626 | * No Yes "hole" - zero-filled |
1627 | * Yes No "allocated" - allocated on disk, not read in | 1627 | * Yes No "allocated" - allocated on disk, not read in |
1628 | * Yes Yes "valid" - allocated and up-to-date in memory. | 1628 | * Yes Yes "valid" - allocated and up-to-date in memory. |
1629 | * | 1629 | * |
1630 | * "Dirty" is valid only with the last case (mapped+uptodate). | 1630 | * "Dirty" is valid only with the last case (mapped+uptodate). |
1631 | */ | 1631 | */ |
1632 | 1632 | ||
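/*
 * [Editorial helper, not part of this commit.]  Reading the state table
 * above: a "hole" is a buffer with no disk mapping whose contents are
 * nevertheless valid (implicitly zero-filled).
 */
static inline int buffer_is_hole(struct buffer_head *bh)
{
	return !buffer_mapped(bh) && buffer_uptodate(bh);
}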
1633 | /* | 1633 | /* |
1634 | * While block_write_full_page is writing back the dirty buffers under | 1634 | * While block_write_full_page is writing back the dirty buffers under |
1635 | * the page lock, whoever dirtied the buffers may decide to clean them | 1635 | * the page lock, whoever dirtied the buffers may decide to clean them |
1636 | * again at any time. We handle that by only looking at the buffer | 1636 | * again at any time. We handle that by only looking at the buffer |
1637 | * state inside lock_buffer(). | 1637 | * state inside lock_buffer(). |
1638 | * | 1638 | * |
1639 | * If block_write_full_page() is called for regular writeback | 1639 | * If block_write_full_page() is called for regular writeback |
1640 | * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a | 1640 | * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a |
1641 | * locked buffer. This only can happen if someone has written the buffer | 1641 | * locked buffer. This only can happen if someone has written the buffer |
1642 | * directly, with submit_bh(). At the address_space level PageWriteback | 1642 | * directly, with submit_bh(). At the address_space level PageWriteback |
1643 | * prevents this contention from occurring. | 1643 | * prevents this contention from occurring. |
1644 | */ | 1644 | */ |
1645 | static int __block_write_full_page(struct inode *inode, struct page *page, | 1645 | static int __block_write_full_page(struct inode *inode, struct page *page, |
1646 | get_block_t *get_block, struct writeback_control *wbc) | 1646 | get_block_t *get_block, struct writeback_control *wbc) |
1647 | { | 1647 | { |
1648 | int err; | 1648 | int err; |
1649 | sector_t block; | 1649 | sector_t block; |
1650 | sector_t last_block; | 1650 | sector_t last_block; |
1651 | struct buffer_head *bh, *head; | 1651 | struct buffer_head *bh, *head; |
1652 | const unsigned blocksize = 1 << inode->i_blkbits; | 1652 | const unsigned blocksize = 1 << inode->i_blkbits; |
1653 | int nr_underway = 0; | 1653 | int nr_underway = 0; |
1654 | 1654 | ||
1655 | BUG_ON(!PageLocked(page)); | 1655 | BUG_ON(!PageLocked(page)); |
1656 | 1656 | ||
1657 | last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; | 1657 | last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; |
1658 | 1658 | ||
1659 | if (!page_has_buffers(page)) { | 1659 | if (!page_has_buffers(page)) { |
1660 | create_empty_buffers(page, blocksize, | 1660 | create_empty_buffers(page, blocksize, |
1661 | (1 << BH_Dirty)|(1 << BH_Uptodate)); | 1661 | (1 << BH_Dirty)|(1 << BH_Uptodate)); |
1662 | } | 1662 | } |
1663 | 1663 | ||
1664 | /* | 1664 | /* |
1665 | * Be very careful. We have no exclusion from __set_page_dirty_buffers | 1665 | * Be very careful. We have no exclusion from __set_page_dirty_buffers |
1666 | * here, and the (potentially unmapped) buffers may become dirty at | 1666 | * here, and the (potentially unmapped) buffers may become dirty at |
1667 | * any time. If a buffer becomes dirty here after we've inspected it | 1667 | * any time. If a buffer becomes dirty here after we've inspected it |
1668 | * then we just miss that fact, and the page stays dirty. | 1668 | * then we just miss that fact, and the page stays dirty. |
1669 | * | 1669 | * |
1670 | * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; | 1670 | * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; |
1671 | * handle that here by just cleaning them. | 1671 | * handle that here by just cleaning them. |
1672 | */ | 1672 | */ |
1673 | 1673 | ||
1674 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1674 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
1675 | head = page_buffers(page); | 1675 | head = page_buffers(page); |
1676 | bh = head; | 1676 | bh = head; |
1677 | 1677 | ||
1678 | /* | 1678 | /* |
1679 | * Get all the dirty buffers mapped to disk addresses and | 1679 | * Get all the dirty buffers mapped to disk addresses and |
1680 | * handle any aliases from the underlying blockdev's mapping. | 1680 | * handle any aliases from the underlying blockdev's mapping. |
1681 | */ | 1681 | */ |
1682 | do { | 1682 | do { |
1683 | if (block > last_block) { | 1683 | if (block > last_block) { |
1684 | /* | 1684 | /* |
1685 | * mapped buffers outside i_size will occur, because | 1685 | * mapped buffers outside i_size will occur, because |
1686 | * this page can be outside i_size when there is a | 1686 | * this page can be outside i_size when there is a |
1687 | * truncate in progress. | 1687 | * truncate in progress. |
1688 | */ | 1688 | */ |
1689 | /* | 1689 | /* |
1690 | * The buffer was zeroed by block_write_full_page() | 1690 | * The buffer was zeroed by block_write_full_page() |
1691 | */ | 1691 | */ |
1692 | clear_buffer_dirty(bh); | 1692 | clear_buffer_dirty(bh); |
1693 | set_buffer_uptodate(bh); | 1693 | set_buffer_uptodate(bh); |
1694 | } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { | 1694 | } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && |
1695 | buffer_dirty(bh)) { | ||
1695 | WARN_ON(bh->b_size != blocksize); | 1696 | WARN_ON(bh->b_size != blocksize); |
1696 | err = get_block(inode, block, bh, 1); | 1697 | err = get_block(inode, block, bh, 1); |
1697 | if (err) | 1698 | if (err) |
1698 | goto recover; | 1699 | goto recover; |
1700 | clear_buffer_delay(bh); | ||
1699 | if (buffer_new(bh)) { | 1701 | if (buffer_new(bh)) { |
1700 | /* blockdev mappings never come here */ | 1702 | /* blockdev mappings never come here */ |
1701 | clear_buffer_new(bh); | 1703 | clear_buffer_new(bh); |
1702 | unmap_underlying_metadata(bh->b_bdev, | 1704 | unmap_underlying_metadata(bh->b_bdev, |
1703 | bh->b_blocknr); | 1705 | bh->b_blocknr); |
1704 | } | 1706 | } |
1705 | } | 1707 | } |
1706 | bh = bh->b_this_page; | 1708 | bh = bh->b_this_page; |
1707 | block++; | 1709 | block++; |
1708 | } while (bh != head); | 1710 | } while (bh != head); |
1709 | 1711 | ||
1710 | do { | 1712 | do { |
1711 | if (!buffer_mapped(bh)) | 1713 | if (!buffer_mapped(bh)) |
1712 | continue; | 1714 | continue; |
1713 | /* | 1715 | /* |
1714 | * If it's a fully non-blocking write attempt and we cannot | 1716 | * If it's a fully non-blocking write attempt and we cannot |
1715 | * lock the buffer then redirty the page. Note that this can | 1717 | * lock the buffer then redirty the page. Note that this can |
1716 | * potentially cause a busy-wait loop from pdflush and kswapd | 1718 | * potentially cause a busy-wait loop from pdflush and kswapd |
1717 | * activity, but those code paths have their own higher-level | 1719 | * activity, but those code paths have their own higher-level |
1718 | * throttling. | 1720 | * throttling. |
1719 | */ | 1721 | */ |
1720 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { | 1722 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { |
1721 | lock_buffer(bh); | 1723 | lock_buffer(bh); |
1722 | } else if (test_set_buffer_locked(bh)) { | 1724 | } else if (test_set_buffer_locked(bh)) { |
1723 | redirty_page_for_writepage(wbc, page); | 1725 | redirty_page_for_writepage(wbc, page); |
1724 | continue; | 1726 | continue; |
1725 | } | 1727 | } |
1726 | if (test_clear_buffer_dirty(bh)) { | 1728 | if (test_clear_buffer_dirty(bh)) { |
1727 | mark_buffer_async_write(bh); | 1729 | mark_buffer_async_write(bh); |
1728 | } else { | 1730 | } else { |
1729 | unlock_buffer(bh); | 1731 | unlock_buffer(bh); |
1730 | } | 1732 | } |
1731 | } while ((bh = bh->b_this_page) != head); | 1733 | } while ((bh = bh->b_this_page) != head); |
1732 | 1734 | ||
1733 | /* | 1735 | /* |
1734 | * The page and its buffers are protected by PageWriteback(), so we can | 1736 | * The page and its buffers are protected by PageWriteback(), so we can |
1735 | * drop the bh refcounts early. | 1737 | * drop the bh refcounts early. |
1736 | */ | 1738 | */ |
1737 | BUG_ON(PageWriteback(page)); | 1739 | BUG_ON(PageWriteback(page)); |
1738 | set_page_writeback(page); | 1740 | set_page_writeback(page); |
1739 | 1741 | ||
1740 | do { | 1742 | do { |
1741 | struct buffer_head *next = bh->b_this_page; | 1743 | struct buffer_head *next = bh->b_this_page; |
1742 | if (buffer_async_write(bh)) { | 1744 | if (buffer_async_write(bh)) { |
1743 | submit_bh(WRITE, bh); | 1745 | submit_bh(WRITE, bh); |
1744 | nr_underway++; | 1746 | nr_underway++; |
1745 | } | 1747 | } |
1746 | bh = next; | 1748 | bh = next; |
1747 | } while (bh != head); | 1749 | } while (bh != head); |
1748 | unlock_page(page); | 1750 | unlock_page(page); |
1749 | 1751 | ||
1750 | err = 0; | 1752 | err = 0; |
1751 | done: | 1753 | done: |
1752 | if (nr_underway == 0) { | 1754 | if (nr_underway == 0) { |
1753 | /* | 1755 | /* |
1754 | * The page was marked dirty, but the buffers were | 1756 | * The page was marked dirty, but the buffers were |
1755 | * clean. Someone wrote them back by hand with | 1757 | * clean. Someone wrote them back by hand with |
1756 | * ll_rw_block/submit_bh. A rare case. | 1758 | * ll_rw_block/submit_bh. A rare case. |
1757 | */ | 1759 | */ |
1758 | end_page_writeback(page); | 1760 | end_page_writeback(page); |
1759 | 1761 | ||
1760 | /* | 1762 | /* |
1761 | * The page and buffer_heads can be released at any time from | 1763 | * The page and buffer_heads can be released at any time from |
1762 | * here on. | 1764 | * here on. |
1763 | */ | 1765 | */ |
1764 | } | 1766 | } |
1765 | return err; | 1767 | return err; |
1766 | 1768 | ||
1767 | recover: | 1769 | recover: |
1768 | /* | 1770 | /* |
1769 | * ENOSPC, or some other error. We may already have added some | 1771 | * ENOSPC, or some other error. We may already have added some |
1770 | * blocks to the file, so we need to write these out to avoid | 1772 | * blocks to the file, so we need to write these out to avoid |
1771 | * exposing stale data. | 1773 | * exposing stale data. |
1772 | * The page is currently locked and not marked for writeback | 1774 | * The page is currently locked and not marked for writeback |
1773 | */ | 1775 | */ |
1774 | bh = head; | 1776 | bh = head; |
1775 | /* Recovery: lock and submit the mapped buffers */ | 1777 | /* Recovery: lock and submit the mapped buffers */ |
1776 | do { | 1778 | do { |
1777 | if (buffer_mapped(bh) && buffer_dirty(bh)) { | 1779 | if (buffer_mapped(bh) && buffer_dirty(bh) && |
1780 | !buffer_delay(bh)) { | ||
1778 | lock_buffer(bh); | 1781 | lock_buffer(bh); |
1779 | mark_buffer_async_write(bh); | 1782 | mark_buffer_async_write(bh); |
1780 | } else { | 1783 | } else { |
1781 | /* | 1784 | /* |
1782 | * The buffer may have been set dirty during | 1785 | * The buffer may have been set dirty during |
1783 | * attachment to a dirty page. | 1786 | * attachment to a dirty page. |
1784 | */ | 1787 | */ |
1785 | clear_buffer_dirty(bh); | 1788 | clear_buffer_dirty(bh); |
1786 | } | 1789 | } |
1787 | } while ((bh = bh->b_this_page) != head); | 1790 | } while ((bh = bh->b_this_page) != head); |
1788 | SetPageError(page); | 1791 | SetPageError(page); |
1789 | BUG_ON(PageWriteback(page)); | 1792 | BUG_ON(PageWriteback(page)); |
1790 | mapping_set_error(page->mapping, err); | 1793 | mapping_set_error(page->mapping, err); |
1791 | set_page_writeback(page); | 1794 | set_page_writeback(page); |
1792 | do { | 1795 | do { |
1793 | struct buffer_head *next = bh->b_this_page; | 1796 | struct buffer_head *next = bh->b_this_page; |
1794 | if (buffer_async_write(bh)) { | 1797 | if (buffer_async_write(bh)) { |
1795 | clear_buffer_dirty(bh); | 1798 | clear_buffer_dirty(bh); |
1796 | submit_bh(WRITE, bh); | 1799 | submit_bh(WRITE, bh); |
1797 | nr_underway++; | 1800 | nr_underway++; |
1798 | } | 1801 | } |
1799 | bh = next; | 1802 | bh = next; |
1800 | } while (bh != head); | 1803 | } while (bh != head); |
1801 | unlock_page(page); | 1804 | unlock_page(page); |
1802 | goto done; | 1805 | goto done; |
1803 | } | 1806 | } |
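With this patch, a dirty buffer carrying BH_Delay now reaches get_block(..., 1) from __block_write_full_page() just like an unmapped one, and BH_Delay is cleared once the mapping succeeds. Below is a minimal sketch of the kind of callback that path expects; it is not part of this patch, and myfs_alloc_or_lookup_block() is an assumed per-filesystem helper.

        static int myfs_get_block(struct inode *inode, sector_t iblock,
                                  struct buffer_head *bh, int create)
        {
                sector_t phys;

                /* assumed helper: allocates on create, otherwise just looks up */
                phys = myfs_alloc_or_lookup_block(inode, iblock, create);
                if (!phys)
                        return create ? -ENOSPC : 0;    /* 0 == hole on the read path */

                map_bh(bh, inode->i_sb, phys);          /* sets BH_Mapped and b_blocknr */
                if (create)
                        set_buffer_new(bh);             /* a real fs would set this only
                                                         * for genuinely new blocks */
                return 0;
        }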
1804 | 1807 | ||
1805 | /* | 1808 | /* |
1806 | * If a page has any new buffers, zero them out here, and mark them uptodate | 1809 | * If a page has any new buffers, zero them out here, and mark them uptodate |
1807 | * and dirty so they'll be written out (in order to prevent uninitialised | 1810 | * and dirty so they'll be written out (in order to prevent uninitialised |
1808 | * block data from leaking). And clear the new bit. | 1811 | * block data from leaking). And clear the new bit. |
1809 | */ | 1812 | */ |
1810 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) | 1813 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) |
1811 | { | 1814 | { |
1812 | unsigned int block_start, block_end; | 1815 | unsigned int block_start, block_end; |
1813 | struct buffer_head *head, *bh; | 1816 | struct buffer_head *head, *bh; |
1814 | 1817 | ||
1815 | BUG_ON(!PageLocked(page)); | 1818 | BUG_ON(!PageLocked(page)); |
1816 | if (!page_has_buffers(page)) | 1819 | if (!page_has_buffers(page)) |
1817 | return; | 1820 | return; |
1818 | 1821 | ||
1819 | bh = head = page_buffers(page); | 1822 | bh = head = page_buffers(page); |
1820 | block_start = 0; | 1823 | block_start = 0; |
1821 | do { | 1824 | do { |
1822 | block_end = block_start + bh->b_size; | 1825 | block_end = block_start + bh->b_size; |
1823 | 1826 | ||
1824 | if (buffer_new(bh)) { | 1827 | if (buffer_new(bh)) { |
1825 | if (block_end > from && block_start < to) { | 1828 | if (block_end > from && block_start < to) { |
1826 | if (!PageUptodate(page)) { | 1829 | if (!PageUptodate(page)) { |
1827 | unsigned start, size; | 1830 | unsigned start, size; |
1828 | 1831 | ||
1829 | start = max(from, block_start); | 1832 | start = max(from, block_start); |
1830 | size = min(to, block_end) - start; | 1833 | size = min(to, block_end) - start; |
1831 | 1834 | ||
1832 | zero_user(page, start, size); | 1835 | zero_user(page, start, size); |
1833 | set_buffer_uptodate(bh); | 1836 | set_buffer_uptodate(bh); |
1834 | } | 1837 | } |
1835 | 1838 | ||
1836 | clear_buffer_new(bh); | 1839 | clear_buffer_new(bh); |
1837 | mark_buffer_dirty(bh); | 1840 | mark_buffer_dirty(bh); |
1838 | } | 1841 | } |
1839 | } | 1842 | } |
1840 | 1843 | ||
1841 | block_start = block_end; | 1844 | block_start = block_end; |
1842 | bh = bh->b_this_page; | 1845 | bh = bh->b_this_page; |
1843 | } while (bh != head); | 1846 | } while (bh != head); |
1844 | } | 1847 | } |
1845 | EXPORT_SYMBOL(page_zero_new_buffers); | 1848 | EXPORT_SYMBOL(page_zero_new_buffers); |
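A hedged usage sketch (hypothetical helper, not from this file): the same zeroing is what block_write_end() below does when a copy from userspace comes up short, so that freshly allocated blocks never reach disk with uninitialised contents.

        static void myfs_zero_short_write(struct page *page, unsigned from,
                                          unsigned to, unsigned copied)
        {
                if (copied < to - from)
                        page_zero_new_buffers(page, from + copied, to);
        }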
1846 | 1849 | ||
1847 | static int __block_prepare_write(struct inode *inode, struct page *page, | 1850 | static int __block_prepare_write(struct inode *inode, struct page *page, |
1848 | unsigned from, unsigned to, get_block_t *get_block) | 1851 | unsigned from, unsigned to, get_block_t *get_block) |
1849 | { | 1852 | { |
1850 | unsigned block_start, block_end; | 1853 | unsigned block_start, block_end; |
1851 | sector_t block; | 1854 | sector_t block; |
1852 | int err = 0; | 1855 | int err = 0; |
1853 | unsigned blocksize, bbits; | 1856 | unsigned blocksize, bbits; |
1854 | struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; | 1857 | struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; |
1855 | 1858 | ||
1856 | BUG_ON(!PageLocked(page)); | 1859 | BUG_ON(!PageLocked(page)); |
1857 | BUG_ON(from > PAGE_CACHE_SIZE); | 1860 | BUG_ON(from > PAGE_CACHE_SIZE); |
1858 | BUG_ON(to > PAGE_CACHE_SIZE); | 1861 | BUG_ON(to > PAGE_CACHE_SIZE); |
1859 | BUG_ON(from > to); | 1862 | BUG_ON(from > to); |
1860 | 1863 | ||
1861 | blocksize = 1 << inode->i_blkbits; | 1864 | blocksize = 1 << inode->i_blkbits; |
1862 | if (!page_has_buffers(page)) | 1865 | if (!page_has_buffers(page)) |
1863 | create_empty_buffers(page, blocksize, 0); | 1866 | create_empty_buffers(page, blocksize, 0); |
1864 | head = page_buffers(page); | 1867 | head = page_buffers(page); |
1865 | 1868 | ||
1866 | bbits = inode->i_blkbits; | 1869 | bbits = inode->i_blkbits; |
1867 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); | 1870 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); |
1868 | 1871 | ||
1869 | for(bh = head, block_start = 0; bh != head || !block_start; | 1872 | for(bh = head, block_start = 0; bh != head || !block_start; |
1870 | block++, block_start=block_end, bh = bh->b_this_page) { | 1873 | block++, block_start=block_end, bh = bh->b_this_page) { |
1871 | block_end = block_start + blocksize; | 1874 | block_end = block_start + blocksize; |
1872 | if (block_end <= from || block_start >= to) { | 1875 | if (block_end <= from || block_start >= to) { |
1873 | if (PageUptodate(page)) { | 1876 | if (PageUptodate(page)) { |
1874 | if (!buffer_uptodate(bh)) | 1877 | if (!buffer_uptodate(bh)) |
1875 | set_buffer_uptodate(bh); | 1878 | set_buffer_uptodate(bh); |
1876 | } | 1879 | } |
1877 | continue; | 1880 | continue; |
1878 | } | 1881 | } |
1879 | if (buffer_new(bh)) | 1882 | if (buffer_new(bh)) |
1880 | clear_buffer_new(bh); | 1883 | clear_buffer_new(bh); |
1881 | if (!buffer_mapped(bh)) { | 1884 | if (!buffer_mapped(bh)) { |
1882 | WARN_ON(bh->b_size != blocksize); | 1885 | WARN_ON(bh->b_size != blocksize); |
1883 | err = get_block(inode, block, bh, 1); | 1886 | err = get_block(inode, block, bh, 1); |
1884 | if (err) | 1887 | if (err) |
1885 | break; | 1888 | break; |
1886 | if (buffer_new(bh)) { | 1889 | if (buffer_new(bh)) { |
1887 | unmap_underlying_metadata(bh->b_bdev, | 1890 | unmap_underlying_metadata(bh->b_bdev, |
1888 | bh->b_blocknr); | 1891 | bh->b_blocknr); |
1889 | if (PageUptodate(page)) { | 1892 | if (PageUptodate(page)) { |
1890 | clear_buffer_new(bh); | 1893 | clear_buffer_new(bh); |
1891 | set_buffer_uptodate(bh); | 1894 | set_buffer_uptodate(bh); |
1892 | mark_buffer_dirty(bh); | 1895 | mark_buffer_dirty(bh); |
1893 | continue; | 1896 | continue; |
1894 | } | 1897 | } |
1895 | if (block_end > to || block_start < from) | 1898 | if (block_end > to || block_start < from) |
1896 | zero_user_segments(page, | 1899 | zero_user_segments(page, |
1897 | to, block_end, | 1900 | to, block_end, |
1898 | block_start, from); | 1901 | block_start, from); |
1899 | continue; | 1902 | continue; |
1900 | } | 1903 | } |
1901 | } | 1904 | } |
1902 | if (PageUptodate(page)) { | 1905 | if (PageUptodate(page)) { |
1903 | if (!buffer_uptodate(bh)) | 1906 | if (!buffer_uptodate(bh)) |
1904 | set_buffer_uptodate(bh); | 1907 | set_buffer_uptodate(bh); |
1905 | continue; | 1908 | continue; |
1906 | } | 1909 | } |
1907 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && | 1910 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && |
1908 | !buffer_unwritten(bh) && | 1911 | !buffer_unwritten(bh) && |
1909 | (block_start < from || block_end > to)) { | 1912 | (block_start < from || block_end > to)) { |
1910 | ll_rw_block(READ, 1, &bh); | 1913 | ll_rw_block(READ, 1, &bh); |
1911 | *wait_bh++=bh; | 1914 | *wait_bh++=bh; |
1912 | } | 1915 | } |
1913 | } | 1916 | } |
1914 | /* | 1917 | /* |
1915 | * If we issued read requests - let them complete. | 1918 | * If we issued read requests - let them complete. |
1916 | */ | 1919 | */ |
1917 | while(wait_bh > wait) { | 1920 | while(wait_bh > wait) { |
1918 | wait_on_buffer(*--wait_bh); | 1921 | wait_on_buffer(*--wait_bh); |
1919 | if (!buffer_uptodate(*wait_bh)) | 1922 | if (!buffer_uptodate(*wait_bh)) |
1920 | err = -EIO; | 1923 | err = -EIO; |
1921 | } | 1924 | } |
1922 | if (unlikely(err)) | 1925 | if (unlikely(err)) |
1923 | page_zero_new_buffers(page, from, to); | 1926 | page_zero_new_buffers(page, from, to); |
1924 | return err; | 1927 | return err; |
1925 | } | 1928 | } |
1926 | 1929 | ||
1927 | static int __block_commit_write(struct inode *inode, struct page *page, | 1930 | static int __block_commit_write(struct inode *inode, struct page *page, |
1928 | unsigned from, unsigned to) | 1931 | unsigned from, unsigned to) |
1929 | { | 1932 | { |
1930 | unsigned block_start, block_end; | 1933 | unsigned block_start, block_end; |
1931 | int partial = 0; | 1934 | int partial = 0; |
1932 | unsigned blocksize; | 1935 | unsigned blocksize; |
1933 | struct buffer_head *bh, *head; | 1936 | struct buffer_head *bh, *head; |
1934 | 1937 | ||
1935 | blocksize = 1 << inode->i_blkbits; | 1938 | blocksize = 1 << inode->i_blkbits; |
1936 | 1939 | ||
1937 | for(bh = head = page_buffers(page), block_start = 0; | 1940 | for(bh = head = page_buffers(page), block_start = 0; |
1938 | bh != head || !block_start; | 1941 | bh != head || !block_start; |
1939 | block_start=block_end, bh = bh->b_this_page) { | 1942 | block_start=block_end, bh = bh->b_this_page) { |
1940 | block_end = block_start + blocksize; | 1943 | block_end = block_start + blocksize; |
1941 | if (block_end <= from || block_start >= to) { | 1944 | if (block_end <= from || block_start >= to) { |
1942 | if (!buffer_uptodate(bh)) | 1945 | if (!buffer_uptodate(bh)) |
1943 | partial = 1; | 1946 | partial = 1; |
1944 | } else { | 1947 | } else { |
1945 | set_buffer_uptodate(bh); | 1948 | set_buffer_uptodate(bh); |
1946 | mark_buffer_dirty(bh); | 1949 | mark_buffer_dirty(bh); |
1947 | } | 1950 | } |
1948 | clear_buffer_new(bh); | 1951 | clear_buffer_new(bh); |
1949 | } | 1952 | } |
1950 | 1953 | ||
1951 | /* | 1954 | /* |
1952 | * If this is a partial write which happened to make all buffers | 1955 | * If this is a partial write which happened to make all buffers |
1953 | * uptodate then we can optimize away a bogus readpage() for | 1956 | * uptodate then we can optimize away a bogus readpage() for |
1954 | * the next read(). Here we 'discover' whether the page went | 1957 | * the next read(). Here we 'discover' whether the page went |
1955 | * uptodate as a result of this (potentially partial) write. | 1958 | * uptodate as a result of this (potentially partial) write. |
1956 | */ | 1959 | */ |
1957 | if (!partial) | 1960 | if (!partial) |
1958 | SetPageUptodate(page); | 1961 | SetPageUptodate(page); |
1959 | return 0; | 1962 | return 0; |
1960 | } | 1963 | } |
1961 | 1964 | ||
1962 | /* | 1965 | /* |
1963 | * block_write_begin takes care of the basic task of block allocation and | 1966 | * block_write_begin takes care of the basic task of block allocation and |
1964 | * bringing partial write blocks uptodate first. | 1967 | * bringing partial write blocks uptodate first. |
1965 | * | 1968 | * |
1966 | * If *pagep is not NULL, then block_write_begin uses the locked page | 1969 | * If *pagep is not NULL, then block_write_begin uses the locked page |
1967 | * at *pagep rather than allocating its own. In this case, the page will | 1970 | * at *pagep rather than allocating its own. In this case, the page will |
1968 | * not be unlocked or deallocated on failure. | 1971 | * not be unlocked or deallocated on failure. |
1969 | */ | 1972 | */ |
1970 | int block_write_begin(struct file *file, struct address_space *mapping, | 1973 | int block_write_begin(struct file *file, struct address_space *mapping, |
1971 | loff_t pos, unsigned len, unsigned flags, | 1974 | loff_t pos, unsigned len, unsigned flags, |
1972 | struct page **pagep, void **fsdata, | 1975 | struct page **pagep, void **fsdata, |
1973 | get_block_t *get_block) | 1976 | get_block_t *get_block) |
1974 | { | 1977 | { |
1975 | struct inode *inode = mapping->host; | 1978 | struct inode *inode = mapping->host; |
1976 | int status = 0; | 1979 | int status = 0; |
1977 | struct page *page; | 1980 | struct page *page; |
1978 | pgoff_t index; | 1981 | pgoff_t index; |
1979 | unsigned start, end; | 1982 | unsigned start, end; |
1980 | int ownpage = 0; | 1983 | int ownpage = 0; |
1981 | 1984 | ||
1982 | index = pos >> PAGE_CACHE_SHIFT; | 1985 | index = pos >> PAGE_CACHE_SHIFT; |
1983 | start = pos & (PAGE_CACHE_SIZE - 1); | 1986 | start = pos & (PAGE_CACHE_SIZE - 1); |
1984 | end = start + len; | 1987 | end = start + len; |
1985 | 1988 | ||
1986 | page = *pagep; | 1989 | page = *pagep; |
1987 | if (page == NULL) { | 1990 | if (page == NULL) { |
1988 | ownpage = 1; | 1991 | ownpage = 1; |
1989 | page = __grab_cache_page(mapping, index); | 1992 | page = __grab_cache_page(mapping, index); |
1990 | if (!page) { | 1993 | if (!page) { |
1991 | status = -ENOMEM; | 1994 | status = -ENOMEM; |
1992 | goto out; | 1995 | goto out; |
1993 | } | 1996 | } |
1994 | *pagep = page; | 1997 | *pagep = page; |
1995 | } else | 1998 | } else |
1996 | BUG_ON(!PageLocked(page)); | 1999 | BUG_ON(!PageLocked(page)); |
1997 | 2000 | ||
1998 | status = __block_prepare_write(inode, page, start, end, get_block); | 2001 | status = __block_prepare_write(inode, page, start, end, get_block); |
1999 | if (unlikely(status)) { | 2002 | if (unlikely(status)) { |
2000 | ClearPageUptodate(page); | 2003 | ClearPageUptodate(page); |
2001 | 2004 | ||
2002 | if (ownpage) { | 2005 | if (ownpage) { |
2003 | unlock_page(page); | 2006 | unlock_page(page); |
2004 | page_cache_release(page); | 2007 | page_cache_release(page); |
2005 | *pagep = NULL; | 2008 | *pagep = NULL; |
2006 | 2009 | ||
2007 | /* | 2010 | /* |
2008 | * prepare_write() may have instantiated a few blocks | 2011 | * prepare_write() may have instantiated a few blocks |
2009 | * outside i_size. Trim these off again. Don't need | 2012 | * outside i_size. Trim these off again. Don't need |
2010 | * i_size_read because we hold i_mutex. | 2013 | * i_size_read because we hold i_mutex. |
2011 | */ | 2014 | */ |
2012 | if (pos + len > inode->i_size) | 2015 | if (pos + len > inode->i_size) |
2013 | vmtruncate(inode, inode->i_size); | 2016 | vmtruncate(inode, inode->i_size); |
2014 | } | 2017 | } |
2015 | goto out; | 2018 | goto out; |
2016 | } | 2019 | } |
2017 | 2020 | ||
2018 | out: | 2021 | out: |
2019 | return status; | 2022 | return status; |
2020 | } | 2023 | } |
2021 | EXPORT_SYMBOL(block_write_begin); | 2024 | EXPORT_SYMBOL(block_write_begin); |
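For context, a plausible ->write_begin for a simple filesystem would look roughly like the sketch below (myfs_get_block is the hypothetical callback sketched earlier; this is not code from the patch). Passing *pagep == NULL lets block_write_begin() grab and lock the page itself.

        static int myfs_write_begin(struct file *file, struct address_space *mapping,
                                    loff_t pos, unsigned len, unsigned flags,
                                    struct page **pagep, void **fsdata)
        {
                *pagep = NULL;          /* let block_write_begin() allocate the page */
                return block_write_begin(file, mapping, pos, len, flags,
                                         pagep, fsdata, myfs_get_block);
        }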
2022 | 2025 | ||
2023 | int block_write_end(struct file *file, struct address_space *mapping, | 2026 | int block_write_end(struct file *file, struct address_space *mapping, |
2024 | loff_t pos, unsigned len, unsigned copied, | 2027 | loff_t pos, unsigned len, unsigned copied, |
2025 | struct page *page, void *fsdata) | 2028 | struct page *page, void *fsdata) |
2026 | { | 2029 | { |
2027 | struct inode *inode = mapping->host; | 2030 | struct inode *inode = mapping->host; |
2028 | unsigned start; | 2031 | unsigned start; |
2029 | 2032 | ||
2030 | start = pos & (PAGE_CACHE_SIZE - 1); | 2033 | start = pos & (PAGE_CACHE_SIZE - 1); |
2031 | 2034 | ||
2032 | if (unlikely(copied < len)) { | 2035 | if (unlikely(copied < len)) { |
2033 | /* | 2036 | /* |
2034 | * The buffers that were written will now be uptodate, so we | 2037 | * The buffers that were written will now be uptodate, so we |
2035 | * don't have to worry about a readpage reading them and | 2038 | * don't have to worry about a readpage reading them and |
2036 | * overwriting a partial write. However if we have encountered | 2039 | * overwriting a partial write. However if we have encountered |
2037 | * a short write and only partially written into a buffer, it | 2040 | * a short write and only partially written into a buffer, it |
2038 | * will not be marked uptodate, so a readpage might come in and | 2041 | * will not be marked uptodate, so a readpage might come in and |
2039 | * destroy our partial write. | 2042 | * destroy our partial write. |
2040 | * | 2043 | * |
2041 | * Do the simplest thing, and just treat any short write to a | 2044 | * Do the simplest thing, and just treat any short write to a |
2042 | * non uptodate page as a zero-length write, and force the | 2045 | * non uptodate page as a zero-length write, and force the |
2043 | * caller to redo the whole thing. | 2046 | * caller to redo the whole thing. |
2044 | */ | 2047 | */ |
2045 | if (!PageUptodate(page)) | 2048 | if (!PageUptodate(page)) |
2046 | copied = 0; | 2049 | copied = 0; |
2047 | 2050 | ||
2048 | page_zero_new_buffers(page, start+copied, start+len); | 2051 | page_zero_new_buffers(page, start+copied, start+len); |
2049 | } | 2052 | } |
2050 | flush_dcache_page(page); | 2053 | flush_dcache_page(page); |
2051 | 2054 | ||
2052 | /* This could be a short (even 0-length) commit */ | 2055 | /* This could be a short (even 0-length) commit */ |
2053 | __block_commit_write(inode, page, start, start+copied); | 2056 | __block_commit_write(inode, page, start, start+copied); |
2054 | 2057 | ||
2055 | return copied; | 2058 | return copied; |
2056 | } | 2059 | } |
2057 | EXPORT_SYMBOL(block_write_end); | 2060 | EXPORT_SYMBOL(block_write_end); |
2058 | 2061 | ||
2059 | int generic_write_end(struct file *file, struct address_space *mapping, | 2062 | int generic_write_end(struct file *file, struct address_space *mapping, |
2060 | loff_t pos, unsigned len, unsigned copied, | 2063 | loff_t pos, unsigned len, unsigned copied, |
2061 | struct page *page, void *fsdata) | 2064 | struct page *page, void *fsdata) |
2062 | { | 2065 | { |
2063 | struct inode *inode = mapping->host; | 2066 | struct inode *inode = mapping->host; |
2064 | int i_size_changed = 0; | 2067 | int i_size_changed = 0; |
2065 | 2068 | ||
2066 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | 2069 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); |
2067 | 2070 | ||
2068 | /* | 2071 | /* |
2069 | * No need to use i_size_read() here, the i_size | 2072 | * No need to use i_size_read() here, the i_size |
2070 | * cannot change under us because we hold i_mutex. | 2073 | * cannot change under us because we hold i_mutex. |
2071 | * | 2074 | * |
2072 | * But it's important to update i_size while still holding page lock: | 2075 | * But it's important to update i_size while still holding page lock: |
2073 | * page writeout could otherwise come in and zero beyond i_size. | 2076 | * page writeout could otherwise come in and zero beyond i_size. |
2074 | */ | 2077 | */ |
2075 | if (pos+copied > inode->i_size) { | 2078 | if (pos+copied > inode->i_size) { |
2076 | i_size_write(inode, pos+copied); | 2079 | i_size_write(inode, pos+copied); |
2077 | i_size_changed = 1; | 2080 | i_size_changed = 1; |
2078 | } | 2081 | } |
2079 | 2082 | ||
2080 | unlock_page(page); | 2083 | unlock_page(page); |
2081 | page_cache_release(page); | 2084 | page_cache_release(page); |
2082 | 2085 | ||
2083 | /* | 2086 | /* |
2084 | * Don't mark the inode dirty under page lock. First, it unnecessarily | 2087 | * Don't mark the inode dirty under page lock. First, it unnecessarily |
2085 | * makes the holding time of page lock longer. Second, it forces lock | 2088 | * makes the holding time of page lock longer. Second, it forces lock |
2086 | * ordering of page lock and transaction start for journaling | 2089 | * ordering of page lock and transaction start for journaling |
2087 | * filesystems. | 2090 | * filesystems. |
2088 | */ | 2091 | */ |
2089 | if (i_size_changed) | 2092 | if (i_size_changed) |
2090 | mark_inode_dirty(inode); | 2093 | mark_inode_dirty(inode); |
2091 | 2094 | ||
2092 | return copied; | 2095 | return copied; |
2093 | } | 2096 | } |
2094 | EXPORT_SYMBOL(generic_write_end); | 2097 | EXPORT_SYMBOL(generic_write_end); |
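A filesystem that needs no extra bookkeeping can point its ->write_end at a thin wrapper like this hypothetical sketch, which is simply how generic_write_end() is meant to be consumed:

        static int myfs_write_end(struct file *file, struct address_space *mapping,
                                  loff_t pos, unsigned len, unsigned copied,
                                  struct page *page, void *fsdata)
        {
                return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
        }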
2095 | 2098 | ||
2096 | /* | 2099 | /* |
2097 | * Generic "read page" function for block devices that have the normal | 2100 | * Generic "read page" function for block devices that have the normal |
2098 | * get_block functionality. This is most of the block device filesystems. | 2101 | * get_block functionality. This is most of the block device filesystems. |
2099 | * Reads the page asynchronously --- the unlock_buffer() and | 2102 | * Reads the page asynchronously --- the unlock_buffer() and |
2100 | * set/clear_buffer_uptodate() functions propagate buffer state into the | 2103 | * set/clear_buffer_uptodate() functions propagate buffer state into the |
2101 | * page struct once IO has completed. | 2104 | * page struct once IO has completed. |
2102 | */ | 2105 | */ |
2103 | int block_read_full_page(struct page *page, get_block_t *get_block) | 2106 | int block_read_full_page(struct page *page, get_block_t *get_block) |
2104 | { | 2107 | { |
2105 | struct inode *inode = page->mapping->host; | 2108 | struct inode *inode = page->mapping->host; |
2106 | sector_t iblock, lblock; | 2109 | sector_t iblock, lblock; |
2107 | struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; | 2110 | struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; |
2108 | unsigned int blocksize; | 2111 | unsigned int blocksize; |
2109 | int nr, i; | 2112 | int nr, i; |
2110 | int fully_mapped = 1; | 2113 | int fully_mapped = 1; |
2111 | 2114 | ||
2112 | BUG_ON(!PageLocked(page)); | 2115 | BUG_ON(!PageLocked(page)); |
2113 | blocksize = 1 << inode->i_blkbits; | 2116 | blocksize = 1 << inode->i_blkbits; |
2114 | if (!page_has_buffers(page)) | 2117 | if (!page_has_buffers(page)) |
2115 | create_empty_buffers(page, blocksize, 0); | 2118 | create_empty_buffers(page, blocksize, 0); |
2116 | head = page_buffers(page); | 2119 | head = page_buffers(page); |
2117 | 2120 | ||
2118 | iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 2121 | iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2119 | lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; | 2122 | lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; |
2120 | bh = head; | 2123 | bh = head; |
2121 | nr = 0; | 2124 | nr = 0; |
2122 | i = 0; | 2125 | i = 0; |
2123 | 2126 | ||
2124 | do { | 2127 | do { |
2125 | if (buffer_uptodate(bh)) | 2128 | if (buffer_uptodate(bh)) |
2126 | continue; | 2129 | continue; |
2127 | 2130 | ||
2128 | if (!buffer_mapped(bh)) { | 2131 | if (!buffer_mapped(bh)) { |
2129 | int err = 0; | 2132 | int err = 0; |
2130 | 2133 | ||
2131 | fully_mapped = 0; | 2134 | fully_mapped = 0; |
2132 | if (iblock < lblock) { | 2135 | if (iblock < lblock) { |
2133 | WARN_ON(bh->b_size != blocksize); | 2136 | WARN_ON(bh->b_size != blocksize); |
2134 | err = get_block(inode, iblock, bh, 0); | 2137 | err = get_block(inode, iblock, bh, 0); |
2135 | if (err) | 2138 | if (err) |
2136 | SetPageError(page); | 2139 | SetPageError(page); |
2137 | } | 2140 | } |
2138 | if (!buffer_mapped(bh)) { | 2141 | if (!buffer_mapped(bh)) { |
2139 | zero_user(page, i * blocksize, blocksize); | 2142 | zero_user(page, i * blocksize, blocksize); |
2140 | if (!err) | 2143 | if (!err) |
2141 | set_buffer_uptodate(bh); | 2144 | set_buffer_uptodate(bh); |
2142 | continue; | 2145 | continue; |
2143 | } | 2146 | } |
2144 | /* | 2147 | /* |
2145 | * get_block() might have updated the buffer | 2148 | * get_block() might have updated the buffer |
2146 | * synchronously | 2149 | * synchronously |
2147 | */ | 2150 | */ |
2148 | if (buffer_uptodate(bh)) | 2151 | if (buffer_uptodate(bh)) |
2149 | continue; | 2152 | continue; |
2150 | } | 2153 | } |
2151 | arr[nr++] = bh; | 2154 | arr[nr++] = bh; |
2152 | } while (i++, iblock++, (bh = bh->b_this_page) != head); | 2155 | } while (i++, iblock++, (bh = bh->b_this_page) != head); |
2153 | 2156 | ||
2154 | if (fully_mapped) | 2157 | if (fully_mapped) |
2155 | SetPageMappedToDisk(page); | 2158 | SetPageMappedToDisk(page); |
2156 | 2159 | ||
2157 | if (!nr) { | 2160 | if (!nr) { |
2158 | /* | 2161 | /* |
2159 | * All buffers are uptodate - we can set the page uptodate | 2162 | * All buffers are uptodate - we can set the page uptodate |
2160 | * as well. But not if get_block() returned an error. | 2163 | * as well. But not if get_block() returned an error. |
2161 | */ | 2164 | */ |
2162 | if (!PageError(page)) | 2165 | if (!PageError(page)) |
2163 | SetPageUptodate(page); | 2166 | SetPageUptodate(page); |
2164 | unlock_page(page); | 2167 | unlock_page(page); |
2165 | return 0; | 2168 | return 0; |
2166 | } | 2169 | } |
2167 | 2170 | ||
2168 | /* Stage two: lock the buffers */ | 2171 | /* Stage two: lock the buffers */ |
2169 | for (i = 0; i < nr; i++) { | 2172 | for (i = 0; i < nr; i++) { |
2170 | bh = arr[i]; | 2173 | bh = arr[i]; |
2171 | lock_buffer(bh); | 2174 | lock_buffer(bh); |
2172 | mark_buffer_async_read(bh); | 2175 | mark_buffer_async_read(bh); |
2173 | } | 2176 | } |
2174 | 2177 | ||
2175 | /* | 2178 | /* |
2176 | * Stage 3: start the IO. Check for uptodateness | 2179 | * Stage 3: start the IO. Check for uptodateness |
2177 | * inside the buffer lock in case another process reading | 2180 | * inside the buffer lock in case another process reading |
2178 | * the underlying blockdev brought it uptodate (the sct fix). | 2181 | * the underlying blockdev brought it uptodate (the sct fix). |
2179 | */ | 2182 | */ |
2180 | for (i = 0; i < nr; i++) { | 2183 | for (i = 0; i < nr; i++) { |
2181 | bh = arr[i]; | 2184 | bh = arr[i]; |
2182 | if (buffer_uptodate(bh)) | 2185 | if (buffer_uptodate(bh)) |
2183 | end_buffer_async_read(bh, 1); | 2186 | end_buffer_async_read(bh, 1); |
2184 | else | 2187 | else |
2185 | submit_bh(READ, bh); | 2188 | submit_bh(READ, bh); |
2186 | } | 2189 | } |
2187 | return 0; | 2190 | return 0; |
2188 | } | 2191 | } |
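The usual consumer is a one-line ->readpage, roughly like this hypothetical sketch (myfs_get_block as before):

        static int myfs_readpage(struct file *file, struct page *page)
        {
                return block_read_full_page(page, myfs_get_block);
        }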
2189 | 2192 | ||
2190 | /* utility function for filesystems that need to do work on expanding | 2193 | /* utility function for filesystems that need to do work on expanding |
2191 | * truncates. Uses filesystem pagecache writes to allow the filesystem to | 2194 | * truncates. Uses filesystem pagecache writes to allow the filesystem to |
2192 | * deal with the hole. | 2195 | * deal with the hole. |
2193 | */ | 2196 | */ |
2194 | int generic_cont_expand_simple(struct inode *inode, loff_t size) | 2197 | int generic_cont_expand_simple(struct inode *inode, loff_t size) |
2195 | { | 2198 | { |
2196 | struct address_space *mapping = inode->i_mapping; | 2199 | struct address_space *mapping = inode->i_mapping; |
2197 | struct page *page; | 2200 | struct page *page; |
2198 | void *fsdata; | 2201 | void *fsdata; |
2199 | unsigned long limit; | 2202 | unsigned long limit; |
2200 | int err; | 2203 | int err; |
2201 | 2204 | ||
2202 | err = -EFBIG; | 2205 | err = -EFBIG; |
2203 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 2206 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; |
2204 | if (limit != RLIM_INFINITY && size > (loff_t)limit) { | 2207 | if (limit != RLIM_INFINITY && size > (loff_t)limit) { |
2205 | send_sig(SIGXFSZ, current, 0); | 2208 | send_sig(SIGXFSZ, current, 0); |
2206 | goto out; | 2209 | goto out; |
2207 | } | 2210 | } |
2208 | if (size > inode->i_sb->s_maxbytes) | 2211 | if (size > inode->i_sb->s_maxbytes) |
2209 | goto out; | 2212 | goto out; |
2210 | 2213 | ||
2211 | err = pagecache_write_begin(NULL, mapping, size, 0, | 2214 | err = pagecache_write_begin(NULL, mapping, size, 0, |
2212 | AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, | 2215 | AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, |
2213 | &page, &fsdata); | 2216 | &page, &fsdata); |
2214 | if (err) | 2217 | if (err) |
2215 | goto out; | 2218 | goto out; |
2216 | 2219 | ||
2217 | err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); | 2220 | err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); |
2218 | BUG_ON(err > 0); | 2221 | BUG_ON(err > 0); |
2219 | 2222 | ||
2220 | out: | 2223 | out: |
2221 | return err; | 2224 | return err; |
2222 | } | 2225 | } |
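One plausible caller, sketched with hypothetical names and under the assumption that the filesystem handles expanding truncates in its ->setattr, would be:

        static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
        {
                struct inode *inode = dentry->d_inode;

                if ((attr->ia_valid & ATTR_SIZE) &&
                    attr->ia_size > i_size_read(inode)) {
                        int err = generic_cont_expand_simple(inode, attr->ia_size);
                        if (err)
                                return err;
                }
                return inode_setattr(inode, attr);
        }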
2223 | 2226 | ||
2224 | static int cont_expand_zero(struct file *file, struct address_space *mapping, | 2227 | static int cont_expand_zero(struct file *file, struct address_space *mapping, |
2225 | loff_t pos, loff_t *bytes) | 2228 | loff_t pos, loff_t *bytes) |
2226 | { | 2229 | { |
2227 | struct inode *inode = mapping->host; | 2230 | struct inode *inode = mapping->host; |
2228 | unsigned blocksize = 1 << inode->i_blkbits; | 2231 | unsigned blocksize = 1 << inode->i_blkbits; |
2229 | struct page *page; | 2232 | struct page *page; |
2230 | void *fsdata; | 2233 | void *fsdata; |
2231 | pgoff_t index, curidx; | 2234 | pgoff_t index, curidx; |
2232 | loff_t curpos; | 2235 | loff_t curpos; |
2233 | unsigned zerofrom, offset, len; | 2236 | unsigned zerofrom, offset, len; |
2234 | int err = 0; | 2237 | int err = 0; |
2235 | 2238 | ||
2236 | index = pos >> PAGE_CACHE_SHIFT; | 2239 | index = pos >> PAGE_CACHE_SHIFT; |
2237 | offset = pos & ~PAGE_CACHE_MASK; | 2240 | offset = pos & ~PAGE_CACHE_MASK; |
2238 | 2241 | ||
2239 | while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { | 2242 | while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { |
2240 | zerofrom = curpos & ~PAGE_CACHE_MASK; | 2243 | zerofrom = curpos & ~PAGE_CACHE_MASK; |
2241 | if (zerofrom & (blocksize-1)) { | 2244 | if (zerofrom & (blocksize-1)) { |
2242 | *bytes |= (blocksize-1); | 2245 | *bytes |= (blocksize-1); |
2243 | (*bytes)++; | 2246 | (*bytes)++; |
2244 | } | 2247 | } |
2245 | len = PAGE_CACHE_SIZE - zerofrom; | 2248 | len = PAGE_CACHE_SIZE - zerofrom; |
2246 | 2249 | ||
2247 | err = pagecache_write_begin(file, mapping, curpos, len, | 2250 | err = pagecache_write_begin(file, mapping, curpos, len, |
2248 | AOP_FLAG_UNINTERRUPTIBLE, | 2251 | AOP_FLAG_UNINTERRUPTIBLE, |
2249 | &page, &fsdata); | 2252 | &page, &fsdata); |
2250 | if (err) | 2253 | if (err) |
2251 | goto out; | 2254 | goto out; |
2252 | zero_user(page, zerofrom, len); | 2255 | zero_user(page, zerofrom, len); |
2253 | err = pagecache_write_end(file, mapping, curpos, len, len, | 2256 | err = pagecache_write_end(file, mapping, curpos, len, len, |
2254 | page, fsdata); | 2257 | page, fsdata); |
2255 | if (err < 0) | 2258 | if (err < 0) |
2256 | goto out; | 2259 | goto out; |
2257 | BUG_ON(err != len); | 2260 | BUG_ON(err != len); |
2258 | err = 0; | 2261 | err = 0; |
2259 | 2262 | ||
2260 | balance_dirty_pages_ratelimited(mapping); | 2263 | balance_dirty_pages_ratelimited(mapping); |
2261 | } | 2264 | } |
2262 | 2265 | ||
2263 | /* page covers the boundary, find the boundary offset */ | 2266 | /* page covers the boundary, find the boundary offset */ |
2264 | if (index == curidx) { | 2267 | if (index == curidx) { |
2265 | zerofrom = curpos & ~PAGE_CACHE_MASK; | 2268 | zerofrom = curpos & ~PAGE_CACHE_MASK; |
2266 | /* if we are expanding the file, the last block will be filled */ | 2269 | /* if we are expanding the file, the last block will be filled */
2267 | if (offset <= zerofrom) { | 2270 | if (offset <= zerofrom) { |
2268 | goto out; | 2271 | goto out; |
2269 | } | 2272 | } |
2270 | if (zerofrom & (blocksize-1)) { | 2273 | if (zerofrom & (blocksize-1)) { |
2271 | *bytes |= (blocksize-1); | 2274 | *bytes |= (blocksize-1); |
2272 | (*bytes)++; | 2275 | (*bytes)++; |
2273 | } | 2276 | } |
2274 | len = offset - zerofrom; | 2277 | len = offset - zerofrom; |
2275 | 2278 | ||
2276 | err = pagecache_write_begin(file, mapping, curpos, len, | 2279 | err = pagecache_write_begin(file, mapping, curpos, len, |
2277 | AOP_FLAG_UNINTERRUPTIBLE, | 2280 | AOP_FLAG_UNINTERRUPTIBLE, |
2278 | &page, &fsdata); | 2281 | &page, &fsdata); |
2279 | if (err) | 2282 | if (err) |
2280 | goto out; | 2283 | goto out; |
2281 | zero_user(page, zerofrom, len); | 2284 | zero_user(page, zerofrom, len); |
2282 | err = pagecache_write_end(file, mapping, curpos, len, len, | 2285 | err = pagecache_write_end(file, mapping, curpos, len, len, |
2283 | page, fsdata); | 2286 | page, fsdata); |
2284 | if (err < 0) | 2287 | if (err < 0) |
2285 | goto out; | 2288 | goto out; |
2286 | BUG_ON(err != len); | 2289 | BUG_ON(err != len); |
2287 | err = 0; | 2290 | err = 0; |
2288 | } | 2291 | } |
2289 | out: | 2292 | out: |
2290 | return err; | 2293 | return err; |
2291 | } | 2294 | } |
2292 | 2295 | ||
2293 | /* | 2296 | /* |
2294 | * For moronic filesystems that do not allow holes in a file. | 2297 | * For moronic filesystems that do not allow holes in a file.
2295 | * We may have to extend the file. | 2298 | * We may have to extend the file. |
2296 | */ | 2299 | */ |
2297 | int cont_write_begin(struct file *file, struct address_space *mapping, | 2300 | int cont_write_begin(struct file *file, struct address_space *mapping, |
2298 | loff_t pos, unsigned len, unsigned flags, | 2301 | loff_t pos, unsigned len, unsigned flags, |
2299 | struct page **pagep, void **fsdata, | 2302 | struct page **pagep, void **fsdata, |
2300 | get_block_t *get_block, loff_t *bytes) | 2303 | get_block_t *get_block, loff_t *bytes) |
2301 | { | 2304 | { |
2302 | struct inode *inode = mapping->host; | 2305 | struct inode *inode = mapping->host; |
2303 | unsigned blocksize = 1 << inode->i_blkbits; | 2306 | unsigned blocksize = 1 << inode->i_blkbits; |
2304 | unsigned zerofrom; | 2307 | unsigned zerofrom; |
2305 | int err; | 2308 | int err; |
2306 | 2309 | ||
2307 | err = cont_expand_zero(file, mapping, pos, bytes); | 2310 | err = cont_expand_zero(file, mapping, pos, bytes); |
2308 | if (err) | 2311 | if (err) |
2309 | goto out; | 2312 | goto out; |
2310 | 2313 | ||
2311 | zerofrom = *bytes & ~PAGE_CACHE_MASK; | 2314 | zerofrom = *bytes & ~PAGE_CACHE_MASK; |
2312 | if (pos+len > *bytes && zerofrom & (blocksize-1)) { | 2315 | if (pos+len > *bytes && zerofrom & (blocksize-1)) { |
2313 | *bytes |= (blocksize-1); | 2316 | *bytes |= (blocksize-1); |
2314 | (*bytes)++; | 2317 | (*bytes)++; |
2315 | } | 2318 | } |
2316 | 2319 | ||
2317 | *pagep = NULL; | 2320 | *pagep = NULL; |
2318 | err = block_write_begin(file, mapping, pos, len, | 2321 | err = block_write_begin(file, mapping, pos, len, |
2319 | flags, pagep, fsdata, get_block); | 2322 | flags, pagep, fsdata, get_block); |
2320 | out: | 2323 | out: |
2321 | return err; | 2324 | return err; |
2322 | } | 2325 | } |
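A hedged sketch of how such a filesystem might wire this up; myfs_zeroed_upto() is an assumed accessor returning a pointer to the per-inode "bytes initialised so far" counter that cont_write_begin() advances:

        static int myfs_cont_write_begin(struct file *file, struct address_space *mapping,
                                         loff_t pos, unsigned len, unsigned flags,
                                         struct page **pagep, void **fsdata)
        {
                *pagep = NULL;
                return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                        myfs_get_block, myfs_zeroed_upto(mapping->host));
        }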
2323 | 2326 | ||
2324 | int block_prepare_write(struct page *page, unsigned from, unsigned to, | 2327 | int block_prepare_write(struct page *page, unsigned from, unsigned to, |
2325 | get_block_t *get_block) | 2328 | get_block_t *get_block) |
2326 | { | 2329 | { |
2327 | struct inode *inode = page->mapping->host; | 2330 | struct inode *inode = page->mapping->host; |
2328 | int err = __block_prepare_write(inode, page, from, to, get_block); | 2331 | int err = __block_prepare_write(inode, page, from, to, get_block); |
2329 | if (err) | 2332 | if (err) |
2330 | ClearPageUptodate(page); | 2333 | ClearPageUptodate(page); |
2331 | return err; | 2334 | return err; |
2332 | } | 2335 | } |
2333 | 2336 | ||
2334 | int block_commit_write(struct page *page, unsigned from, unsigned to) | 2337 | int block_commit_write(struct page *page, unsigned from, unsigned to) |
2335 | { | 2338 | { |
2336 | struct inode *inode = page->mapping->host; | 2339 | struct inode *inode = page->mapping->host; |
2337 | __block_commit_write(inode,page,from,to); | 2340 | __block_commit_write(inode,page,from,to); |
2338 | return 0; | 2341 | return 0; |
2339 | } | 2342 | } |
2340 | 2343 | ||
2341 | /* | 2344 | /* |
2342 | * block_page_mkwrite() is not allowed to change the file size as it gets | 2345 | * block_page_mkwrite() is not allowed to change the file size as it gets |
2343 | * called from a page fault handler when a page is first dirtied. Hence we must | 2346 | * called from a page fault handler when a page is first dirtied. Hence we must |
2344 | * be careful to check for EOF conditions here. We set the page up correctly | 2347 | * be careful to check for EOF conditions here. We set the page up correctly |
2345 | * for a written page which means we get ENOSPC checking when writing into | 2348 | * for a written page which means we get ENOSPC checking when writing into |
2346 | * holes and correct delalloc and unwritten extent mapping on filesystems that | 2349 | * holes and correct delalloc and unwritten extent mapping on filesystems that |
2347 | * support these features. | 2350 | * support these features. |
2348 | * | 2351 | * |
2349 | * We are not allowed to take the i_mutex here so we have to play games to | 2352 | * We are not allowed to take the i_mutex here so we have to play games to |
2350 | * protect against truncate races as the page could now be beyond EOF. Because | 2353 | * protect against truncate races as the page could now be beyond EOF. Because |
2351 | * vmtruncate() writes the inode size before removing pages, once we have the | 2354 | * vmtruncate() writes the inode size before removing pages, once we have the |
2352 | * page lock we can determine safely if the page is beyond EOF. If it is not | 2355 | * page lock we can determine safely if the page is beyond EOF. If it is not |
2353 | * beyond EOF, then the page is guaranteed safe against truncation until we | 2356 | * beyond EOF, then the page is guaranteed safe against truncation until we |
2354 | * unlock the page. | 2357 | * unlock the page. |
2355 | */ | 2358 | */ |
2356 | int | 2359 | int |
2357 | block_page_mkwrite(struct vm_area_struct *vma, struct page *page, | 2360 | block_page_mkwrite(struct vm_area_struct *vma, struct page *page, |
2358 | get_block_t get_block) | 2361 | get_block_t get_block) |
2359 | { | 2362 | { |
2360 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 2363 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
2361 | unsigned long end; | 2364 | unsigned long end; |
2362 | loff_t size; | 2365 | loff_t size; |
2363 | int ret = -EINVAL; | 2366 | int ret = -EINVAL; |
2364 | 2367 | ||
2365 | lock_page(page); | 2368 | lock_page(page); |
2366 | size = i_size_read(inode); | 2369 | size = i_size_read(inode); |
2367 | if ((page->mapping != inode->i_mapping) || | 2370 | if ((page->mapping != inode->i_mapping) || |
2368 | (page_offset(page) > size)) { | 2371 | (page_offset(page) > size)) { |
2369 | /* page got truncated out from underneath us */ | 2372 | /* page got truncated out from underneath us */ |
2370 | goto out_unlock; | 2373 | goto out_unlock; |
2371 | } | 2374 | } |
2372 | 2375 | ||
2373 | /* page is wholly or partially inside EOF */ | 2376 | /* page is wholly or partially inside EOF */ |
2374 | if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) | 2377 | if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) |
2375 | end = size & ~PAGE_CACHE_MASK; | 2378 | end = size & ~PAGE_CACHE_MASK; |
2376 | else | 2379 | else |
2377 | end = PAGE_CACHE_SIZE; | 2380 | end = PAGE_CACHE_SIZE; |
2378 | 2381 | ||
2379 | ret = block_prepare_write(page, 0, end, get_block); | 2382 | ret = block_prepare_write(page, 0, end, get_block); |
2380 | if (!ret) | 2383 | if (!ret) |
2381 | ret = block_commit_write(page, 0, end); | 2384 | ret = block_commit_write(page, 0, end); |
2382 | 2385 | ||
2383 | out_unlock: | 2386 | out_unlock: |
2384 | unlock_page(page); | 2387 | unlock_page(page); |
2385 | return ret; | 2388 | return ret; |
2386 | } | 2389 | } |
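Wiring this into a filesystem's mmap path would, at the time of this commit, look roughly like the hypothetical sketch below (the vm_ops would be installed from the filesystem's ->mmap; all myfs_ names are assumptions):

        static int myfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        {
                return block_page_mkwrite(vma, page, myfs_get_block);
        }

        static struct vm_operations_struct myfs_file_vm_ops = {
                .fault          = filemap_fault,
                .page_mkwrite   = myfs_page_mkwrite,
        };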
2387 | 2390 | ||
2388 | /* | 2391 | /* |
2389 | * nobh_write_begin()'s prereads are special: the buffer_heads are freed | 2392 | * nobh_write_begin()'s prereads are special: the buffer_heads are freed |
2390 | * immediately, while under the page lock. So it needs a special end_io | 2393 | * immediately, while under the page lock. So it needs a special end_io |
2391 | * handler which does not touch the bh after unlocking it. | 2394 | * handler which does not touch the bh after unlocking it. |
2392 | */ | 2395 | */ |
2393 | static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) | 2396 | static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) |
2394 | { | 2397 | { |
2395 | __end_buffer_read_notouch(bh, uptodate); | 2398 | __end_buffer_read_notouch(bh, uptodate); |
2396 | } | 2399 | } |
2397 | 2400 | ||
2398 | /* | 2401 | /* |
2399 | * Attach the singly-linked list of buffers created by nobh_write_begin, to | 2402 | * Attach the singly-linked list of buffers created by nobh_write_begin, to |
2400 | * the page (converting it to circular linked list and taking care of page | 2403 | * the page (converting it to circular linked list and taking care of page |
2401 | * dirty races). | 2404 | * dirty races). |
2402 | */ | 2405 | */ |
2403 | static void attach_nobh_buffers(struct page *page, struct buffer_head *head) | 2406 | static void attach_nobh_buffers(struct page *page, struct buffer_head *head) |
2404 | { | 2407 | { |
2405 | struct buffer_head *bh; | 2408 | struct buffer_head *bh; |
2406 | 2409 | ||
2407 | BUG_ON(!PageLocked(page)); | 2410 | BUG_ON(!PageLocked(page)); |
2408 | 2411 | ||
2409 | spin_lock(&page->mapping->private_lock); | 2412 | spin_lock(&page->mapping->private_lock); |
2410 | bh = head; | 2413 | bh = head; |
2411 | do { | 2414 | do { |
2412 | if (PageDirty(page)) | 2415 | if (PageDirty(page)) |
2413 | set_buffer_dirty(bh); | 2416 | set_buffer_dirty(bh); |
2414 | if (!bh->b_this_page) | 2417 | if (!bh->b_this_page) |
2415 | bh->b_this_page = head; | 2418 | bh->b_this_page = head; |
2416 | bh = bh->b_this_page; | 2419 | bh = bh->b_this_page; |
2417 | } while (bh != head); | 2420 | } while (bh != head); |
2418 | attach_page_buffers(page, head); | 2421 | attach_page_buffers(page, head); |
2419 | spin_unlock(&page->mapping->private_lock); | 2422 | spin_unlock(&page->mapping->private_lock); |
2420 | } | 2423 | } |
2421 | 2424 | ||
2422 | /* | 2425 | /* |
2423 | * On entry, the page is fully not uptodate. | 2426 | * On entry, the page is fully not uptodate. |
2424 | * On exit the page is fully uptodate in the areas outside (from,to) | 2427 | * On exit the page is fully uptodate in the areas outside (from,to) |
2425 | */ | 2428 | */ |
2426 | int nobh_write_begin(struct file *file, struct address_space *mapping, | 2429 | int nobh_write_begin(struct file *file, struct address_space *mapping, |
2427 | loff_t pos, unsigned len, unsigned flags, | 2430 | loff_t pos, unsigned len, unsigned flags, |
2428 | struct page **pagep, void **fsdata, | 2431 | struct page **pagep, void **fsdata, |
2429 | get_block_t *get_block) | 2432 | get_block_t *get_block) |
2430 | { | 2433 | { |
2431 | struct inode *inode = mapping->host; | 2434 | struct inode *inode = mapping->host; |
2432 | const unsigned blkbits = inode->i_blkbits; | 2435 | const unsigned blkbits = inode->i_blkbits; |
2433 | const unsigned blocksize = 1 << blkbits; | 2436 | const unsigned blocksize = 1 << blkbits; |
2434 | struct buffer_head *head, *bh; | 2437 | struct buffer_head *head, *bh; |
2435 | struct page *page; | 2438 | struct page *page; |
2436 | pgoff_t index; | 2439 | pgoff_t index; |
2437 | unsigned from, to; | 2440 | unsigned from, to; |
2438 | unsigned block_in_page; | 2441 | unsigned block_in_page; |
2439 | unsigned block_start, block_end; | 2442 | unsigned block_start, block_end; |
2440 | sector_t block_in_file; | 2443 | sector_t block_in_file; |
2441 | int nr_reads = 0; | 2444 | int nr_reads = 0; |
2442 | int ret = 0; | 2445 | int ret = 0; |
2443 | int is_mapped_to_disk = 1; | 2446 | int is_mapped_to_disk = 1; |
2444 | 2447 | ||
2445 | index = pos >> PAGE_CACHE_SHIFT; | 2448 | index = pos >> PAGE_CACHE_SHIFT; |
2446 | from = pos & (PAGE_CACHE_SIZE - 1); | 2449 | from = pos & (PAGE_CACHE_SIZE - 1); |
2447 | to = from + len; | 2450 | to = from + len; |
2448 | 2451 | ||
2449 | page = __grab_cache_page(mapping, index); | 2452 | page = __grab_cache_page(mapping, index); |
2450 | if (!page) | 2453 | if (!page) |
2451 | return -ENOMEM; | 2454 | return -ENOMEM; |
2452 | *pagep = page; | 2455 | *pagep = page; |
2453 | *fsdata = NULL; | 2456 | *fsdata = NULL; |
2454 | 2457 | ||
2455 | if (page_has_buffers(page)) { | 2458 | if (page_has_buffers(page)) { |
2456 | unlock_page(page); | 2459 | unlock_page(page); |
2457 | page_cache_release(page); | 2460 | page_cache_release(page); |
2458 | *pagep = NULL; | 2461 | *pagep = NULL; |
2459 | return block_write_begin(file, mapping, pos, len, flags, pagep, | 2462 | return block_write_begin(file, mapping, pos, len, flags, pagep, |
2460 | fsdata, get_block); | 2463 | fsdata, get_block); |
2461 | } | 2464 | } |
2462 | 2465 | ||
2463 | if (PageMappedToDisk(page)) | 2466 | if (PageMappedToDisk(page)) |
2464 | return 0; | 2467 | return 0; |
2465 | 2468 | ||
2466 | /* | 2469 | /* |
2467 | * Allocate buffers so that we can keep track of state, and potentially | 2470 | * Allocate buffers so that we can keep track of state, and potentially |
2468 | * attach them to the page if an error occurs. In the common case of | 2471 | * attach them to the page if an error occurs. In the common case of |
2469 | * no error, they will just be freed again without ever being attached | 2472 | * no error, they will just be freed again without ever being attached |
2470 | * to the page (which is all OK, because we're under the page lock). | 2473 | * to the page (which is all OK, because we're under the page lock). |
2471 | * | 2474 | * |
2472 | * Be careful: the buffer linked list is a NULL terminated one, rather | 2475 | * Be careful: the buffer linked list is a NULL terminated one, rather |
2473 | * than the circular one we're used to. | 2476 | * than the circular one we're used to. |
2474 | */ | 2477 | */ |
2475 | head = alloc_page_buffers(page, blocksize, 0); | 2478 | head = alloc_page_buffers(page, blocksize, 0); |
2476 | if (!head) { | 2479 | if (!head) { |
2477 | ret = -ENOMEM; | 2480 | ret = -ENOMEM; |
2478 | goto out_release; | 2481 | goto out_release; |
2479 | } | 2482 | } |
2480 | 2483 | ||
2481 | block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); | 2484 | block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); |
2482 | 2485 | ||
2483 | /* | 2486 | /* |
2484 | * We loop across all blocks in the page, whether or not they are | 2487 | * We loop across all blocks in the page, whether or not they are |
2485 | * part of the affected region. This is so we can discover if the | 2488 | * part of the affected region. This is so we can discover if the |
2486 | * page is fully mapped-to-disk. | 2489 | * page is fully mapped-to-disk. |
2487 | */ | 2490 | */ |
2488 | for (block_start = 0, block_in_page = 0, bh = head; | 2491 | for (block_start = 0, block_in_page = 0, bh = head; |
2489 | block_start < PAGE_CACHE_SIZE; | 2492 | block_start < PAGE_CACHE_SIZE; |
2490 | block_in_page++, block_start += blocksize, bh = bh->b_this_page) { | 2493 | block_in_page++, block_start += blocksize, bh = bh->b_this_page) { |
2491 | int create; | 2494 | int create; |
2492 | 2495 | ||
2493 | block_end = block_start + blocksize; | 2496 | block_end = block_start + blocksize; |
2494 | bh->b_state = 0; | 2497 | bh->b_state = 0; |
2495 | create = 1; | 2498 | create = 1; |
2496 | if (block_start >= to) | 2499 | if (block_start >= to) |
2497 | create = 0; | 2500 | create = 0; |
2498 | ret = get_block(inode, block_in_file + block_in_page, | 2501 | ret = get_block(inode, block_in_file + block_in_page, |
2499 | bh, create); | 2502 | bh, create); |
2500 | if (ret) | 2503 | if (ret) |
2501 | goto failed; | 2504 | goto failed; |
2502 | if (!buffer_mapped(bh)) | 2505 | if (!buffer_mapped(bh)) |
2503 | is_mapped_to_disk = 0; | 2506 | is_mapped_to_disk = 0; |
2504 | if (buffer_new(bh)) | 2507 | if (buffer_new(bh)) |
2505 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | 2508 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); |
2506 | if (PageUptodate(page)) { | 2509 | if (PageUptodate(page)) { |
2507 | set_buffer_uptodate(bh); | 2510 | set_buffer_uptodate(bh); |
2508 | continue; | 2511 | continue; |
2509 | } | 2512 | } |
2510 | if (buffer_new(bh) || !buffer_mapped(bh)) { | 2513 | if (buffer_new(bh) || !buffer_mapped(bh)) { |
2511 | zero_user_segments(page, block_start, from, | 2514 | zero_user_segments(page, block_start, from, |
2512 | to, block_end); | 2515 | to, block_end); |
2513 | continue; | 2516 | continue; |
2514 | } | 2517 | } |
2515 | if (buffer_uptodate(bh)) | 2518 | if (buffer_uptodate(bh)) |
2516 | continue; /* reiserfs does this */ | 2519 | continue; /* reiserfs does this */ |
2517 | if (block_start < from || block_end > to) { | 2520 | if (block_start < from || block_end > to) { |
2518 | lock_buffer(bh); | 2521 | lock_buffer(bh); |
2519 | bh->b_end_io = end_buffer_read_nobh; | 2522 | bh->b_end_io = end_buffer_read_nobh; |
2520 | submit_bh(READ, bh); | 2523 | submit_bh(READ, bh); |
2521 | nr_reads++; | 2524 | nr_reads++; |
2522 | } | 2525 | } |
2523 | } | 2526 | } |
2524 | 2527 | ||
2525 | if (nr_reads) { | 2528 | if (nr_reads) { |
2526 | /* | 2529 | /* |
2527 | * The page is locked, so these buffers are protected from | 2530 | * The page is locked, so these buffers are protected from |
2528 | * any VM or truncate activity. Hence we don't need to care | 2531 | * any VM or truncate activity. Hence we don't need to care |
2529 | * for the buffer_head refcounts. | 2532 | * for the buffer_head refcounts. |
2530 | */ | 2533 | */ |
2531 | for (bh = head; bh; bh = bh->b_this_page) { | 2534 | for (bh = head; bh; bh = bh->b_this_page) { |
2532 | wait_on_buffer(bh); | 2535 | wait_on_buffer(bh); |
2533 | if (!buffer_uptodate(bh)) | 2536 | if (!buffer_uptodate(bh)) |
2534 | ret = -EIO; | 2537 | ret = -EIO; |
2535 | } | 2538 | } |
2536 | if (ret) | 2539 | if (ret) |
2537 | goto failed; | 2540 | goto failed; |
2538 | } | 2541 | } |
2539 | 2542 | ||
2540 | if (is_mapped_to_disk) | 2543 | if (is_mapped_to_disk) |
2541 | SetPageMappedToDisk(page); | 2544 | SetPageMappedToDisk(page); |
2542 | 2545 | ||
2543 | *fsdata = head; /* to be released by nobh_write_end */ | 2546 | *fsdata = head; /* to be released by nobh_write_end */ |
2544 | 2547 | ||
2545 | return 0; | 2548 | return 0; |
2546 | 2549 | ||
2547 | failed: | 2550 | failed: |
2548 | BUG_ON(!ret); | 2551 | BUG_ON(!ret); |
2549 | /* | 2552 | /* |
2550 | * Error recovery is a bit difficult. We need to zero out blocks that | 2553 | * Error recovery is a bit difficult. We need to zero out blocks that |
2551 | * were newly allocated, and dirty them to ensure they get written out. | 2554 | * were newly allocated, and dirty them to ensure they get written out. |
2552 | * Buffers need to be attached to the page at this point, otherwise | 2555 | * Buffers need to be attached to the page at this point, otherwise |
2553 | * the handling of potential IO errors during writeout would be hard | 2556 | * the handling of potential IO errors during writeout would be hard |
2554 | * (could try doing synchronous writeout, but what if that fails too?) | 2557 | * (could try doing synchronous writeout, but what if that fails too?) |
2555 | */ | 2558 | */ |
2556 | attach_nobh_buffers(page, head); | 2559 | attach_nobh_buffers(page, head); |
2557 | page_zero_new_buffers(page, from, to); | 2560 | page_zero_new_buffers(page, from, to); |
2558 | 2561 | ||
2559 | out_release: | 2562 | out_release: |
2560 | unlock_page(page); | 2563 | unlock_page(page); |
2561 | page_cache_release(page); | 2564 | page_cache_release(page); |
2562 | *pagep = NULL; | 2565 | *pagep = NULL; |
2563 | 2566 | ||
2564 | if (pos + len > inode->i_size) | 2567 | if (pos + len > inode->i_size) |
2565 | vmtruncate(inode, inode->i_size); | 2568 | vmtruncate(inode, inode->i_size); |
2566 | 2569 | ||
2567 | return ret; | 2570 | return ret; |
2568 | } | 2571 | } |
2569 | EXPORT_SYMBOL(nobh_write_begin); | 2572 | EXPORT_SYMBOL(nobh_write_begin); |
2570 | 2573 | ||
2571 | int nobh_write_end(struct file *file, struct address_space *mapping, | 2574 | int nobh_write_end(struct file *file, struct address_space *mapping, |
2572 | loff_t pos, unsigned len, unsigned copied, | 2575 | loff_t pos, unsigned len, unsigned copied, |
2573 | struct page *page, void *fsdata) | 2576 | struct page *page, void *fsdata) |
2574 | { | 2577 | { |
2575 | struct inode *inode = page->mapping->host; | 2578 | struct inode *inode = page->mapping->host; |
2576 | struct buffer_head *head = fsdata; | 2579 | struct buffer_head *head = fsdata; |
2577 | struct buffer_head *bh; | 2580 | struct buffer_head *bh; |
2578 | BUG_ON(fsdata != NULL && page_has_buffers(page)); | 2581 | BUG_ON(fsdata != NULL && page_has_buffers(page)); |
2579 | 2582 | ||
2580 | if (unlikely(copied < len) && !page_has_buffers(page)) | 2583 | if (unlikely(copied < len) && !page_has_buffers(page)) |
2581 | attach_nobh_buffers(page, head); | 2584 | attach_nobh_buffers(page, head); |
2582 | if (page_has_buffers(page)) | 2585 | if (page_has_buffers(page)) |
2583 | return generic_write_end(file, mapping, pos, len, | 2586 | return generic_write_end(file, mapping, pos, len, |
2584 | copied, page, fsdata); | 2587 | copied, page, fsdata); |
2585 | 2588 | ||
2586 | SetPageUptodate(page); | 2589 | SetPageUptodate(page); |
2587 | set_page_dirty(page); | 2590 | set_page_dirty(page); |
2588 | if (pos+copied > inode->i_size) { | 2591 | if (pos+copied > inode->i_size) { |
2589 | i_size_write(inode, pos+copied); | 2592 | i_size_write(inode, pos+copied); |
2590 | mark_inode_dirty(inode); | 2593 | mark_inode_dirty(inode); |
2591 | } | 2594 | } |
2592 | 2595 | ||
2593 | unlock_page(page); | 2596 | unlock_page(page); |
2594 | page_cache_release(page); | 2597 | page_cache_release(page); |
2595 | 2598 | ||
2596 | while (head) { | 2599 | while (head) { |
2597 | bh = head; | 2600 | bh = head; |
2598 | head = head->b_this_page; | 2601 | head = head->b_this_page; |
2599 | free_buffer_head(bh); | 2602 | free_buffer_head(bh); |
2600 | } | 2603 | } |
2601 | 2604 | ||
2602 | return copied; | 2605 | return copied; |
2603 | } | 2606 | } |
2604 | EXPORT_SYMBOL(nobh_write_end); | 2607 | EXPORT_SYMBOL(nobh_write_end); |
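As a usage illustration only (not part of this diff), a filesystem opting into the nobh helpers simply wraps them with its own get_block callback. The foo_ names below are hypothetical; the stub foo_get_block() stands in for real block-mapping logic.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Hypothetical block-mapping callback; a real one maps iblock to a disk
 * block and calls map_bh(). This stub just reports "unmapped" so the
 * sketch stays self-contained. */
static int foo_get_block(struct inode *inode, sector_t iblock,
			 struct buffer_head *bh_result, int create)
{
	return 0;
}

static int foo_write_begin(struct file *file, struct address_space *mapping,
			   loff_t pos, unsigned len, unsigned flags,
			   struct page **pagep, void **fsdata)
{
	return nobh_write_begin(file, mapping, pos, len, flags,
				pagep, fsdata, foo_get_block);
}

nobh_write_end() already has the ->write_end prototype, so it can be plugged into the address_space_operations directly; see the aops sketch after nobh_writepage() below.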
2605 | 2608 | ||
2606 | /* | 2609 | /* |
2607 | * nobh_writepage() - based on block_write_full_page() except | 2610 | * nobh_writepage() - based on block_write_full_page() except |
2608 | * that it tries to operate without attaching bufferheads to | 2611 | * that it tries to operate without attaching bufferheads to |
2609 | * the page. | 2612 | * the page. |
2610 | */ | 2613 | */ |
2611 | int nobh_writepage(struct page *page, get_block_t *get_block, | 2614 | int nobh_writepage(struct page *page, get_block_t *get_block, |
2612 | struct writeback_control *wbc) | 2615 | struct writeback_control *wbc) |
2613 | { | 2616 | { |
2614 | struct inode * const inode = page->mapping->host; | 2617 | struct inode * const inode = page->mapping->host; |
2615 | loff_t i_size = i_size_read(inode); | 2618 | loff_t i_size = i_size_read(inode); |
2616 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; | 2619 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; |
2617 | unsigned offset; | 2620 | unsigned offset; |
2618 | int ret; | 2621 | int ret; |
2619 | 2622 | ||
2620 | /* Is the page fully inside i_size? */ | 2623 | /* Is the page fully inside i_size? */ |
2621 | if (page->index < end_index) | 2624 | if (page->index < end_index) |
2622 | goto out; | 2625 | goto out; |
2623 | 2626 | ||
2624 | /* Is the page fully outside i_size? (truncate in progress) */ | 2627 | /* Is the page fully outside i_size? (truncate in progress) */ |
2625 | offset = i_size & (PAGE_CACHE_SIZE-1); | 2628 | offset = i_size & (PAGE_CACHE_SIZE-1); |
2626 | if (page->index >= end_index+1 || !offset) { | 2629 | if (page->index >= end_index+1 || !offset) { |
2627 | /* | 2630 | /* |
2628 | * The page may have dirty, unmapped buffers. For example, | 2631 | * The page may have dirty, unmapped buffers. For example, |
2629 | * they may have been added in ext3_writepage(). Make them | 2632 | * they may have been added in ext3_writepage(). Make them |
2630 | * freeable here, so the page does not leak. | 2633 | * freeable here, so the page does not leak. |
2631 | */ | 2634 | */ |
2632 | #if 0 | 2635 | #if 0 |
2633 | /* Not really sure about this - do we need this ? */ | 2636 | /* Not really sure about this - do we need this ? */ |
2634 | if (page->mapping->a_ops->invalidatepage) | 2637 | if (page->mapping->a_ops->invalidatepage) |
2635 | page->mapping->a_ops->invalidatepage(page, offset); | 2638 | page->mapping->a_ops->invalidatepage(page, offset); |
2636 | #endif | 2639 | #endif |
2637 | unlock_page(page); | 2640 | unlock_page(page); |
2638 | return 0; /* don't care */ | 2641 | return 0; /* don't care */ |
2639 | } | 2642 | } |
2640 | 2643 | ||
2641 | /* | 2644 | /* |
2642 | * The page straddles i_size. It must be zeroed out on each and every | 2645 | * The page straddles i_size. It must be zeroed out on each and every |
2643 | * writepage invocation because it may be mmapped. "A file is mapped | 2646 | * writepage invocation because it may be mmapped. "A file is mapped |
2644 | * in multiples of the page size. For a file that is not a multiple of | 2647 | * in multiples of the page size. For a file that is not a multiple of |
2645 | * the page size, the remaining memory is zeroed when mapped, and | 2648 | * the page size, the remaining memory is zeroed when mapped, and |
2646 | * writes to that region are not written out to the file." | 2649 | * writes to that region are not written out to the file." |
2647 | */ | 2650 | */ |
2648 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); | 2651 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); |
2649 | out: | 2652 | out: |
2650 | ret = mpage_writepage(page, get_block, wbc); | 2653 | ret = mpage_writepage(page, get_block, wbc); |
2651 | if (ret == -EAGAIN) | 2654 | if (ret == -EAGAIN) |
2652 | ret = __block_write_full_page(inode, page, get_block, wbc); | 2655 | ret = __block_write_full_page(inode, page, get_block, wbc); |
2653 | return ret; | 2656 | return ret; |
2654 | } | 2657 | } |
2655 | EXPORT_SYMBOL(nobh_writepage); | 2658 | EXPORT_SYMBOL(nobh_writepage); |
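Continuing the hypothetical foo_fs sketch from above (again, not part of this patch), the writepage side and the resulting address_space_operations might look like this; foo_get_block() and foo_write_begin() are the assumed helpers defined earlier.

static int foo_writepage(struct page *page, struct writeback_control *wbc)
{
	return nobh_writepage(page, foo_get_block, wbc);
}

static const struct address_space_operations foo_aops = {
	.write_begin	= foo_write_begin,
	.write_end	= nobh_write_end,	/* used directly, as noted above */
	.writepage	= foo_writepage,
};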
2656 | 2659 | ||
2657 | int nobh_truncate_page(struct address_space *mapping, | 2660 | int nobh_truncate_page(struct address_space *mapping, |
2658 | loff_t from, get_block_t *get_block) | 2661 | loff_t from, get_block_t *get_block) |
2659 | { | 2662 | { |
2660 | pgoff_t index = from >> PAGE_CACHE_SHIFT; | 2663 | pgoff_t index = from >> PAGE_CACHE_SHIFT; |
2661 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 2664 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
2662 | unsigned blocksize; | 2665 | unsigned blocksize; |
2663 | sector_t iblock; | 2666 | sector_t iblock; |
2664 | unsigned length, pos; | 2667 | unsigned length, pos; |
2665 | struct inode *inode = mapping->host; | 2668 | struct inode *inode = mapping->host; |
2666 | struct page *page; | 2669 | struct page *page; |
2667 | struct buffer_head map_bh; | 2670 | struct buffer_head map_bh; |
2668 | int err; | 2671 | int err; |
2669 | 2672 | ||
2670 | blocksize = 1 << inode->i_blkbits; | 2673 | blocksize = 1 << inode->i_blkbits; |
2671 | length = offset & (blocksize - 1); | 2674 | length = offset & (blocksize - 1); |
2672 | 2675 | ||
2673 | /* Block boundary? Nothing to do */ | 2676 | /* Block boundary? Nothing to do */ |
2674 | if (!length) | 2677 | if (!length) |
2675 | return 0; | 2678 | return 0; |
2676 | 2679 | ||
2677 | length = blocksize - length; | 2680 | length = blocksize - length; |
2678 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 2681 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2679 | 2682 | ||
2680 | page = grab_cache_page(mapping, index); | 2683 | page = grab_cache_page(mapping, index); |
2681 | err = -ENOMEM; | 2684 | err = -ENOMEM; |
2682 | if (!page) | 2685 | if (!page) |
2683 | goto out; | 2686 | goto out; |
2684 | 2687 | ||
2685 | if (page_has_buffers(page)) { | 2688 | if (page_has_buffers(page)) { |
2686 | has_buffers: | 2689 | has_buffers: |
2687 | unlock_page(page); | 2690 | unlock_page(page); |
2688 | page_cache_release(page); | 2691 | page_cache_release(page); |
2689 | return block_truncate_page(mapping, from, get_block); | 2692 | return block_truncate_page(mapping, from, get_block); |
2690 | } | 2693 | } |
2691 | 2694 | ||
2692 | /* Find the buffer that contains "offset" */ | 2695 | /* Find the buffer that contains "offset" */ |
2693 | pos = blocksize; | 2696 | pos = blocksize; |
2694 | while (offset >= pos) { | 2697 | while (offset >= pos) { |
2695 | iblock++; | 2698 | iblock++; |
2696 | pos += blocksize; | 2699 | pos += blocksize; |
2697 | } | 2700 | } |
2698 | 2701 | ||
2699 | err = get_block(inode, iblock, &map_bh, 0); | 2702 | err = get_block(inode, iblock, &map_bh, 0); |
2700 | if (err) | 2703 | if (err) |
2701 | goto unlock; | 2704 | goto unlock; |
2702 | /* unmapped? It's a hole - nothing to do */ | 2705 | /* unmapped? It's a hole - nothing to do */ |
2703 | if (!buffer_mapped(&map_bh)) | 2706 | if (!buffer_mapped(&map_bh)) |
2704 | goto unlock; | 2707 | goto unlock; |
2705 | 2708 | ||
2706 | /* Ok, it's mapped. Make sure it's up-to-date */ | 2709 | /* Ok, it's mapped. Make sure it's up-to-date */ |
2707 | if (!PageUptodate(page)) { | 2710 | if (!PageUptodate(page)) { |
2708 | err = mapping->a_ops->readpage(NULL, page); | 2711 | err = mapping->a_ops->readpage(NULL, page); |
2709 | if (err) { | 2712 | if (err) { |
2710 | page_cache_release(page); | 2713 | page_cache_release(page); |
2711 | goto out; | 2714 | goto out; |
2712 | } | 2715 | } |
2713 | lock_page(page); | 2716 | lock_page(page); |
2714 | if (!PageUptodate(page)) { | 2717 | if (!PageUptodate(page)) { |
2715 | err = -EIO; | 2718 | err = -EIO; |
2716 | goto unlock; | 2719 | goto unlock; |
2717 | } | 2720 | } |
2718 | if (page_has_buffers(page)) | 2721 | if (page_has_buffers(page)) |
2719 | goto has_buffers; | 2722 | goto has_buffers; |
2720 | } | 2723 | } |
2721 | zero_user(page, offset, length); | 2724 | zero_user(page, offset, length); |
2722 | set_page_dirty(page); | 2725 | set_page_dirty(page); |
2723 | err = 0; | 2726 | err = 0; |
2724 | 2727 | ||
2725 | unlock: | 2728 | unlock: |
2726 | unlock_page(page); | 2729 | unlock_page(page); |
2727 | page_cache_release(page); | 2730 | page_cache_release(page); |
2728 | out: | 2731 | out: |
2729 | return err; | 2732 | return err; |
2730 | } | 2733 | } |
2731 | EXPORT_SYMBOL(nobh_truncate_page); | 2734 | EXPORT_SYMBOL(nobh_truncate_page); |
2732 | 2735 | ||
2733 | int block_truncate_page(struct address_space *mapping, | 2736 | int block_truncate_page(struct address_space *mapping, |
2734 | loff_t from, get_block_t *get_block) | 2737 | loff_t from, get_block_t *get_block) |
2735 | { | 2738 | { |
2736 | pgoff_t index = from >> PAGE_CACHE_SHIFT; | 2739 | pgoff_t index = from >> PAGE_CACHE_SHIFT; |
2737 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 2740 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
2738 | unsigned blocksize; | 2741 | unsigned blocksize; |
2739 | sector_t iblock; | 2742 | sector_t iblock; |
2740 | unsigned length, pos; | 2743 | unsigned length, pos; |
2741 | struct inode *inode = mapping->host; | 2744 | struct inode *inode = mapping->host; |
2742 | struct page *page; | 2745 | struct page *page; |
2743 | struct buffer_head *bh; | 2746 | struct buffer_head *bh; |
2744 | int err; | 2747 | int err; |
2745 | 2748 | ||
2746 | blocksize = 1 << inode->i_blkbits; | 2749 | blocksize = 1 << inode->i_blkbits; |
2747 | length = offset & (blocksize - 1); | 2750 | length = offset & (blocksize - 1); |
2748 | 2751 | ||
2749 | /* Block boundary? Nothing to do */ | 2752 | /* Block boundary? Nothing to do */ |
2750 | if (!length) | 2753 | if (!length) |
2751 | return 0; | 2754 | return 0; |
2752 | 2755 | ||
2753 | length = blocksize - length; | 2756 | length = blocksize - length; |
2754 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 2757 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2755 | 2758 | ||
2756 | page = grab_cache_page(mapping, index); | 2759 | page = grab_cache_page(mapping, index); |
2757 | err = -ENOMEM; | 2760 | err = -ENOMEM; |
2758 | if (!page) | 2761 | if (!page) |
2759 | goto out; | 2762 | goto out; |
2760 | 2763 | ||
2761 | if (!page_has_buffers(page)) | 2764 | if (!page_has_buffers(page)) |
2762 | create_empty_buffers(page, blocksize, 0); | 2765 | create_empty_buffers(page, blocksize, 0); |
2763 | 2766 | ||
2764 | /* Find the buffer that contains "offset" */ | 2767 | /* Find the buffer that contains "offset" */ |
2765 | bh = page_buffers(page); | 2768 | bh = page_buffers(page); |
2766 | pos = blocksize; | 2769 | pos = blocksize; |
2767 | while (offset >= pos) { | 2770 | while (offset >= pos) { |
2768 | bh = bh->b_this_page; | 2771 | bh = bh->b_this_page; |
2769 | iblock++; | 2772 | iblock++; |
2770 | pos += blocksize; | 2773 | pos += blocksize; |
2771 | } | 2774 | } |
2772 | 2775 | ||
2773 | err = 0; | 2776 | err = 0; |
2774 | if (!buffer_mapped(bh)) { | 2777 | if (!buffer_mapped(bh)) { |
2775 | WARN_ON(bh->b_size != blocksize); | 2778 | WARN_ON(bh->b_size != blocksize); |
2776 | err = get_block(inode, iblock, bh, 0); | 2779 | err = get_block(inode, iblock, bh, 0); |
2777 | if (err) | 2780 | if (err) |
2778 | goto unlock; | 2781 | goto unlock; |
2779 | /* unmapped? It's a hole - nothing to do */ | 2782 | /* unmapped? It's a hole - nothing to do */ |
2780 | if (!buffer_mapped(bh)) | 2783 | if (!buffer_mapped(bh)) |
2781 | goto unlock; | 2784 | goto unlock; |
2782 | } | 2785 | } |
2783 | 2786 | ||
2784 | /* Ok, it's mapped. Make sure it's up-to-date */ | 2787 | /* Ok, it's mapped. Make sure it's up-to-date */ |
2785 | if (PageUptodate(page)) | 2788 | if (PageUptodate(page)) |
2786 | set_buffer_uptodate(bh); | 2789 | set_buffer_uptodate(bh); |
2787 | 2790 | ||
2788 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { | 2791 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { |
2789 | err = -EIO; | 2792 | err = -EIO; |
2790 | ll_rw_block(READ, 1, &bh); | 2793 | ll_rw_block(READ, 1, &bh); |
2791 | wait_on_buffer(bh); | 2794 | wait_on_buffer(bh); |
2792 | /* Uhhuh. Read error. Complain and punt. */ | 2795 | /* Uhhuh. Read error. Complain and punt. */ |
2793 | if (!buffer_uptodate(bh)) | 2796 | if (!buffer_uptodate(bh)) |
2794 | goto unlock; | 2797 | goto unlock; |
2795 | } | 2798 | } |
2796 | 2799 | ||
2797 | zero_user(page, offset, length); | 2800 | zero_user(page, offset, length); |
2798 | mark_buffer_dirty(bh); | 2801 | mark_buffer_dirty(bh); |
2799 | err = 0; | 2802 | err = 0; |
2800 | 2803 | ||
2801 | unlock: | 2804 | unlock: |
2802 | unlock_page(page); | 2805 | unlock_page(page); |
2803 | page_cache_release(page); | 2806 | page_cache_release(page); |
2804 | out: | 2807 | out: |
2805 | return err; | 2808 | return err; |
2806 | } | 2809 | } |
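For context, a filesystem's truncate path typically zeroes the tail of the last block with one of the two helpers above; nobh_truncate_page() itself falls back to block_truncate_page() once a page has buffers attached. The sketch below is illustrative, reusing the hypothetical foo_get_block() stub.

static void foo_truncate(struct inode *inode)
{
	if (nobh_truncate_page(inode->i_mapping, inode->i_size,
			       foo_get_block))
		return;
	/* ...then release the filesystem's own block mappings past i_size... */
}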
2807 | 2810 | ||
2808 | /* | 2811 | /* |
2809 | * The generic ->writepage function for buffer-backed address_spaces | 2812 | * The generic ->writepage function for buffer-backed address_spaces |
2810 | */ | 2813 | */ |
2811 | int block_write_full_page(struct page *page, get_block_t *get_block, | 2814 | int block_write_full_page(struct page *page, get_block_t *get_block, |
2812 | struct writeback_control *wbc) | 2815 | struct writeback_control *wbc) |
2813 | { | 2816 | { |
2814 | struct inode * const inode = page->mapping->host; | 2817 | struct inode * const inode = page->mapping->host; |
2815 | loff_t i_size = i_size_read(inode); | 2818 | loff_t i_size = i_size_read(inode); |
2816 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; | 2819 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; |
2817 | unsigned offset; | 2820 | unsigned offset; |
2818 | 2821 | ||
2819 | /* Is the page fully inside i_size? */ | 2822 | /* Is the page fully inside i_size? */ |
2820 | if (page->index < end_index) | 2823 | if (page->index < end_index) |
2821 | return __block_write_full_page(inode, page, get_block, wbc); | 2824 | return __block_write_full_page(inode, page, get_block, wbc); |
2822 | 2825 | ||
2823 | /* Is the page fully outside i_size? (truncate in progress) */ | 2826 | /* Is the page fully outside i_size? (truncate in progress) */ |
2824 | offset = i_size & (PAGE_CACHE_SIZE-1); | 2827 | offset = i_size & (PAGE_CACHE_SIZE-1); |
2825 | if (page->index >= end_index+1 || !offset) { | 2828 | if (page->index >= end_index+1 || !offset) { |
2826 | /* | 2829 | /* |
2827 | * The page may have dirty, unmapped buffers. For example, | 2830 | * The page may have dirty, unmapped buffers. For example, |
2828 | * they may have been added in ext3_writepage(). Make them | 2831 | * they may have been added in ext3_writepage(). Make them |
2829 | * freeable here, so the page does not leak. | 2832 | * freeable here, so the page does not leak. |
2830 | */ | 2833 | */ |
2831 | do_invalidatepage(page, 0); | 2834 | do_invalidatepage(page, 0); |
2832 | unlock_page(page); | 2835 | unlock_page(page); |
2833 | return 0; /* don't care */ | 2836 | return 0; /* don't care */ |
2834 | } | 2837 | } |
2835 | 2838 | ||
2836 | /* | 2839 | /* |
2837 | * The page straddles i_size. It must be zeroed out on each and every | 2840 | * The page straddles i_size. It must be zeroed out on each and every |
2838 | * writepage invocation because it may be mmapped. "A file is mapped | 2841 | * writepage invocation because it may be mmapped. "A file is mapped |
2839 | * in multiples of the page size. For a file that is not a multiple of | 2842 | * in multiples of the page size. For a file that is not a multiple of |
2840 | * the page size, the remaining memory is zeroed when mapped, and | 2843 | * the page size, the remaining memory is zeroed when mapped, and |
2841 | * writes to that region are not written out to the file." | 2844 | * writes to that region are not written out to the file." |
2842 | */ | 2845 | */ |
2843 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); | 2846 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); |
2844 | return __block_write_full_page(inode, page, get_block, wbc); | 2847 | return __block_write_full_page(inode, page, get_block, wbc); |
2845 | } | 2848 | } |
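For comparison with the nobh variant earlier, a filesystem that keeps buffer_heads attached routes its ->writepage through block_write_full_page(). The wrapper below is a sketch with the same assumed foo_get_block() callback, named foo_bh_writepage only to keep it distinct from the nobh example.

static int foo_bh_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, foo_get_block, wbc);
}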
2846 | 2849 | ||
2847 | sector_t generic_block_bmap(struct address_space *mapping, sector_t block, | 2850 | sector_t generic_block_bmap(struct address_space *mapping, sector_t block, |
2848 | get_block_t *get_block) | 2851 | get_block_t *get_block) |
2849 | { | 2852 | { |
2850 | struct buffer_head tmp; | 2853 | struct buffer_head tmp; |
2851 | struct inode *inode = mapping->host; | 2854 | struct inode *inode = mapping->host; |
2852 | tmp.b_state = 0; | 2855 | tmp.b_state = 0; |
2853 | tmp.b_blocknr = 0; | 2856 | tmp.b_blocknr = 0; |
2854 | tmp.b_size = 1 << inode->i_blkbits; | 2857 | tmp.b_size = 1 << inode->i_blkbits; |
2855 | get_block(inode, block, &tmp, 0); | 2858 | get_block(inode, block, &tmp, 0); |
2856 | return tmp.b_blocknr; | 2859 | return tmp.b_blocknr; |
2857 | } | 2860 | } |
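generic_block_bmap() calls get_block() with create == 0, so it only reports blocks that already have a physical mapping; a delayed-allocation block with no disk block yet therefore shows up as 0. A filesystem's ->bmap hook is typically just a thin wrapper, sketched here with the hypothetical foo_get_block():

static sector_t foo_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, foo_get_block);
}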
2858 | 2861 | ||
2859 | static void end_bio_bh_io_sync(struct bio *bio, int err) | 2862 | static void end_bio_bh_io_sync(struct bio *bio, int err) |
2860 | { | 2863 | { |
2861 | struct buffer_head *bh = bio->bi_private; | 2864 | struct buffer_head *bh = bio->bi_private; |
2862 | 2865 | ||
2863 | if (err == -EOPNOTSUPP) { | 2866 | if (err == -EOPNOTSUPP) { |
2864 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | 2867 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); |
2865 | set_bit(BH_Eopnotsupp, &bh->b_state); | 2868 | set_bit(BH_Eopnotsupp, &bh->b_state); |
2866 | } | 2869 | } |
2867 | 2870 | ||
2868 | bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); | 2871 | bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); |
2869 | bio_put(bio); | 2872 | bio_put(bio); |
2870 | } | 2873 | } |
2871 | 2874 | ||
2872 | int submit_bh(int rw, struct buffer_head * bh) | 2875 | int submit_bh(int rw, struct buffer_head * bh) |
2873 | { | 2876 | { |
2874 | struct bio *bio; | 2877 | struct bio *bio; |
2875 | int ret = 0; | 2878 | int ret = 0; |
2876 | 2879 | ||
2877 | BUG_ON(!buffer_locked(bh)); | 2880 | BUG_ON(!buffer_locked(bh)); |
2878 | BUG_ON(!buffer_mapped(bh)); | 2881 | BUG_ON(!buffer_mapped(bh)); |
2879 | BUG_ON(!bh->b_end_io); | 2882 | BUG_ON(!bh->b_end_io); |
2880 | 2883 | ||
2881 | if (buffer_ordered(bh) && (rw == WRITE)) | 2884 | if (buffer_ordered(bh) && (rw == WRITE)) |
2882 | rw = WRITE_BARRIER; | 2885 | rw = WRITE_BARRIER; |
2883 | 2886 | ||
2884 | /* | 2887 | /* |
2885 | * Only clear out a write error when rewriting, should this | 2888 | * Only clear out a write error when rewriting, should this |
2886 | * include WRITE_SYNC as well? | 2889 | * include WRITE_SYNC as well? |
2887 | */ | 2890 | */ |
2888 | if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) | 2891 | if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) |
2889 | clear_buffer_write_io_error(bh); | 2892 | clear_buffer_write_io_error(bh); |
2890 | 2893 | ||
2891 | /* | 2894 | /* |
2892 | * from here on down, it's all bio -- do the initial mapping, | 2895 | * from here on down, it's all bio -- do the initial mapping, |
2893 | * submit_bio -> generic_make_request may further map this bio around | 2896 | * submit_bio -> generic_make_request may further map this bio around |
2894 | */ | 2897 | */ |
2895 | bio = bio_alloc(GFP_NOIO, 1); | 2898 | bio = bio_alloc(GFP_NOIO, 1); |
2896 | 2899 | ||
2897 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 2900 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
2898 | bio->bi_bdev = bh->b_bdev; | 2901 | bio->bi_bdev = bh->b_bdev; |
2899 | bio->bi_io_vec[0].bv_page = bh->b_page; | 2902 | bio->bi_io_vec[0].bv_page = bh->b_page; |
2900 | bio->bi_io_vec[0].bv_len = bh->b_size; | 2903 | bio->bi_io_vec[0].bv_len = bh->b_size; |
2901 | bio->bi_io_vec[0].bv_offset = bh_offset(bh); | 2904 | bio->bi_io_vec[0].bv_offset = bh_offset(bh); |
2902 | 2905 | ||
2903 | bio->bi_vcnt = 1; | 2906 | bio->bi_vcnt = 1; |
2904 | bio->bi_idx = 0; | 2907 | bio->bi_idx = 0; |
2905 | bio->bi_size = bh->b_size; | 2908 | bio->bi_size = bh->b_size; |
2906 | 2909 | ||
2907 | bio->bi_end_io = end_bio_bh_io_sync; | 2910 | bio->bi_end_io = end_bio_bh_io_sync; |
2908 | bio->bi_private = bh; | 2911 | bio->bi_private = bh; |
2909 | 2912 | ||
2910 | bio_get(bio); | 2913 | bio_get(bio); |
2911 | submit_bio(rw, bio); | 2914 | submit_bio(rw, bio); |
2912 | 2915 | ||
2913 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 2916 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) |
2914 | ret = -EOPNOTSUPP; | 2917 | ret = -EOPNOTSUPP; |
2915 | 2918 | ||
2916 | bio_put(bio); | 2919 | bio_put(bio); |
2917 | return ret; | 2920 | return ret; |
2918 | } | 2921 | } |
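To illustrate the submit_bh() contract (buffer locked, mapped, b_end_io set), here is a sketch, not taken from this diff, of the usual synchronous single-block read; sb and blocknr are assumed inputs, and end_buffer_read_sync() is the completion handler exported at the bottom of this file.

static struct buffer_head *foo_read_block(struct super_block *sb,
					  sector_t blocknr)
{
	struct buffer_head *bh = sb_getblk(sb, blocknr);

	if (!bh)
		return NULL;
	if (buffer_uptodate(bh))
		return bh;			/* already cached */

	lock_buffer(bh);
	if (buffer_uptodate(bh)) {		/* raced with another reader */
		unlock_buffer(bh);
		return bh;
	}
	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;	/* drops the extra reference */
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);				/* I/O error */
	return NULL;
}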
2919 | 2922 | ||
2920 | /** | 2923 | /** |
2921 | * ll_rw_block: low-level access to block devices (DEPRECATED) | 2924 | * ll_rw_block: low-level access to block devices (DEPRECATED) |
2922 | * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) | 2925 | * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) |
2923 | * @nr: number of &struct buffer_heads in the array | 2926 | * @nr: number of &struct buffer_heads in the array |
2924 | * @bhs: array of pointers to &struct buffer_head | 2927 | * @bhs: array of pointers to &struct buffer_head |
2925 | * | 2928 | * |
2926 | * ll_rw_block() takes an array of pointers to &struct buffer_heads, and | 2929 | * ll_rw_block() takes an array of pointers to &struct buffer_heads, and |
2927 | * requests an I/O operation on them, either a %READ or a %WRITE. The third | 2930 | * requests an I/O operation on them, either a %READ or a %WRITE. The third |
2928 | * %SWRITE is like %WRITE, except that we make sure the *current* data in buffers | 2931 | * %SWRITE is like %WRITE, except that we make sure the *current* data in buffers |
2929 | * are sent to disk. The fourth %READA option is described in the documentation | 2932 | * are sent to disk. The fourth %READA option is described in the documentation |
2930 | * for generic_make_request() which ll_rw_block() calls. | 2933 | * for generic_make_request() which ll_rw_block() calls. |
2931 | * | 2934 | * |
2932 | * This function drops any buffer that it cannot get a lock on (with the | 2935 | * This function drops any buffer that it cannot get a lock on (with the |
2933 | * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be | 2936 | * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be |
2934 | * clean when doing a write request, and any buffer that appears to be | 2937 | * clean when doing a write request, and any buffer that appears to be |
2935 | * up-to-date when doing a read request. Further, it marks as clean any buffers that | 2938 | * up-to-date when doing a read request. Further, it marks as clean any buffers that |
2936 | * are processed for writing (the buffer cache won't assume that they are | 2939 | * are processed for writing (the buffer cache won't assume that they are |
2937 | * actually clean until the buffer gets unlocked). | 2940 | * actually clean until the buffer gets unlocked). |
2938 | * | 2941 | * |
2939 | * ll_rw_block sets b_end_io to a simple completion handler that marks | 2942 | * ll_rw_block sets b_end_io to a simple completion handler that marks |
2940 | * the buffer up-to-date (if appropriate), unlocks the buffer and wakes | 2943 | * the buffer up-to-date (if appropriate), unlocks the buffer and wakes |
2941 | * any waiters. | 2944 | * any waiters. |
2942 | * | 2945 | * |
2943 | * All of the buffers must be for the same device, and must also be a | 2946 | * All of the buffers must be for the same device, and must also be a |
2944 | * multiple of the current approved size for the device. | 2947 | * multiple of the current approved size for the device. |
2945 | */ | 2948 | */ |
2946 | void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) | 2949 | void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) |
2947 | { | 2950 | { |
2948 | int i; | 2951 | int i; |
2949 | 2952 | ||
2950 | for (i = 0; i < nr; i++) { | 2953 | for (i = 0; i < nr; i++) { |
2951 | struct buffer_head *bh = bhs[i]; | 2954 | struct buffer_head *bh = bhs[i]; |
2952 | 2955 | ||
2953 | if (rw == SWRITE || rw == SWRITE_SYNC) | 2956 | if (rw == SWRITE || rw == SWRITE_SYNC) |
2954 | lock_buffer(bh); | 2957 | lock_buffer(bh); |
2955 | else if (test_set_buffer_locked(bh)) | 2958 | else if (test_set_buffer_locked(bh)) |
2956 | continue; | 2959 | continue; |
2957 | 2960 | ||
2958 | if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { | 2961 | if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { |
2959 | if (test_clear_buffer_dirty(bh)) { | 2962 | if (test_clear_buffer_dirty(bh)) { |
2960 | bh->b_end_io = end_buffer_write_sync; | 2963 | bh->b_end_io = end_buffer_write_sync; |
2961 | get_bh(bh); | 2964 | get_bh(bh); |
2962 | if (rw == SWRITE_SYNC) | 2965 | if (rw == SWRITE_SYNC) |
2963 | submit_bh(WRITE_SYNC, bh); | 2966 | submit_bh(WRITE_SYNC, bh); |
2964 | else | 2967 | else |
2965 | submit_bh(WRITE, bh); | 2968 | submit_bh(WRITE, bh); |
2966 | continue; | 2969 | continue; |
2967 | } | 2970 | } |
2968 | } else { | 2971 | } else { |
2969 | if (!buffer_uptodate(bh)) { | 2972 | if (!buffer_uptodate(bh)) { |
2970 | bh->b_end_io = end_buffer_read_sync; | 2973 | bh->b_end_io = end_buffer_read_sync; |
2971 | get_bh(bh); | 2974 | get_bh(bh); |
2972 | submit_bh(rw, bh); | 2975 | submit_bh(rw, bh); |
2973 | continue; | 2976 | continue; |
2974 | } | 2977 | } |
2975 | } | 2978 | } |
2976 | unlock_buffer(bh); | 2979 | unlock_buffer(bh); |
2977 | } | 2980 | } |
2978 | } | 2981 | } |
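A minimal caller-side sketch of ll_rw_block(): batch-submit a set of already-mapped buffer_heads, then wait on each one. The bhs array and nr are assumed to have been filled in by the caller (for example via sb_getblk()).

static int foo_read_blocks(struct buffer_head *bhs[], int nr)
{
	int i, err = 0;

	ll_rw_block(READ, nr, bhs);	/* skips buffers that are already uptodate */
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			err = -EIO;
	}
	return err;
}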
2979 | 2982 | ||
2980 | /* | 2983 | /* |
2981 | * For a data-integrity writeout, we need to wait upon any in-progress I/O | 2984 | * For a data-integrity writeout, we need to wait upon any in-progress I/O |
2982 | * and then start new I/O and then wait upon it. The caller must have a ref on | 2985 | * and then start new I/O and then wait upon it. The caller must have a ref on |
2983 | * the buffer_head. | 2986 | * the buffer_head. |
2984 | */ | 2987 | */ |
2985 | int sync_dirty_buffer(struct buffer_head *bh) | 2988 | int sync_dirty_buffer(struct buffer_head *bh) |
2986 | { | 2989 | { |
2987 | int ret = 0; | 2990 | int ret = 0; |
2988 | 2991 | ||
2989 | WARN_ON(atomic_read(&bh->b_count) < 1); | 2992 | WARN_ON(atomic_read(&bh->b_count) < 1); |
2990 | lock_buffer(bh); | 2993 | lock_buffer(bh); |
2991 | if (test_clear_buffer_dirty(bh)) { | 2994 | if (test_clear_buffer_dirty(bh)) { |
2992 | get_bh(bh); | 2995 | get_bh(bh); |
2993 | bh->b_end_io = end_buffer_write_sync; | 2996 | bh->b_end_io = end_buffer_write_sync; |
2994 | ret = submit_bh(WRITE_SYNC, bh); | 2997 | ret = submit_bh(WRITE_SYNC, bh); |
2995 | wait_on_buffer(bh); | 2998 | wait_on_buffer(bh); |
2996 | if (buffer_eopnotsupp(bh)) { | 2999 | if (buffer_eopnotsupp(bh)) { |
2997 | clear_buffer_eopnotsupp(bh); | 3000 | clear_buffer_eopnotsupp(bh); |
2998 | ret = -EOPNOTSUPP; | 3001 | ret = -EOPNOTSUPP; |
2999 | } | 3002 | } |
3000 | if (!ret && !buffer_uptodate(bh)) | 3003 | if (!ret && !buffer_uptodate(bh)) |
3001 | ret = -EIO; | 3004 | ret = -EIO; |
3002 | } else { | 3005 | } else { |
3003 | unlock_buffer(bh); | 3006 | unlock_buffer(bh); |
3004 | } | 3007 | } |
3005 | return ret; | 3008 | return ret; |
3006 | } | 3009 | } |
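A hedged example of the sync_dirty_buffer() pattern (not from this patch): read a metadata block, modify it, and wait for the write to complete. foo_write_super_block(), its parameters, and the len <= blocksize guarantee are assumptions.

static int foo_write_super_block(struct super_block *sb, sector_t blocknr,
				 const void *data, size_t len)
{
	struct buffer_head *bh = sb_bread(sb, blocknr);
	int err;

	if (!bh)
		return -EIO;
	lock_buffer(bh);
	memcpy(bh->b_data, data, len);		/* caller guarantees len <= blocksize */
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	err = sync_dirty_buffer(bh);		/* submits WRITE_SYNC and waits */
	brelse(bh);
	return err;
}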
3007 | 3010 | ||
3008 | /* | 3011 | /* |
3009 | * try_to_free_buffers() checks if all the buffers on this particular page | 3012 | * try_to_free_buffers() checks if all the buffers on this particular page |
3010 | * are unused, and releases them if so. | 3013 | * are unused, and releases them if so. |
3011 | * | 3014 | * |
3012 | * Exclusion against try_to_free_buffers may be obtained by either | 3015 | * Exclusion against try_to_free_buffers may be obtained by either |
3013 | * locking the page or by holding its mapping's private_lock. | 3016 | * locking the page or by holding its mapping's private_lock. |
3014 | * | 3017 | * |
3015 | * If the page is dirty but all the buffers are clean then we need to | 3018 | * If the page is dirty but all the buffers are clean then we need to |
3016 | * be sure to mark the page clean as well. This is because the page | 3019 | * be sure to mark the page clean as well. This is because the page |
3017 | * may be against a block device, and a later reattachment of buffers | 3020 | * may be against a block device, and a later reattachment of buffers |
3018 | * to a dirty page will set *all* buffers dirty, which would corrupt | 3021 | * to a dirty page will set *all* buffers dirty, which would corrupt |
3019 | * filesystem data on the same device. | 3022 | * filesystem data on the same device. |
3020 | * | 3023 | * |
3021 | * The same applies to regular filesystem pages: if all the buffers are | 3024 | * The same applies to regular filesystem pages: if all the buffers are |
3022 | * clean then we set the page clean and proceed. To do that, we require | 3025 | * clean then we set the page clean and proceed. To do that, we require |
3023 | * total exclusion from __set_page_dirty_buffers(). That is obtained with | 3026 | * total exclusion from __set_page_dirty_buffers(). That is obtained with |
3024 | * private_lock. | 3027 | * private_lock. |
3025 | * | 3028 | * |
3026 | * try_to_free_buffers() is non-blocking. | 3029 | * try_to_free_buffers() is non-blocking. |
3027 | */ | 3030 | */ |
3028 | static inline int buffer_busy(struct buffer_head *bh) | 3031 | static inline int buffer_busy(struct buffer_head *bh) |
3029 | { | 3032 | { |
3030 | return atomic_read(&bh->b_count) | | 3033 | return atomic_read(&bh->b_count) | |
3031 | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); | 3034 | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); |
3032 | } | 3035 | } |
3033 | 3036 | ||
3034 | static int | 3037 | static int |
3035 | drop_buffers(struct page *page, struct buffer_head **buffers_to_free) | 3038 | drop_buffers(struct page *page, struct buffer_head **buffers_to_free) |
3036 | { | 3039 | { |
3037 | struct buffer_head *head = page_buffers(page); | 3040 | struct buffer_head *head = page_buffers(page); |
3038 | struct buffer_head *bh; | 3041 | struct buffer_head *bh; |
3039 | 3042 | ||
3040 | bh = head; | 3043 | bh = head; |
3041 | do { | 3044 | do { |
3042 | if (buffer_write_io_error(bh) && page->mapping) | 3045 | if (buffer_write_io_error(bh) && page->mapping) |
3043 | set_bit(AS_EIO, &page->mapping->flags); | 3046 | set_bit(AS_EIO, &page->mapping->flags); |
3044 | if (buffer_busy(bh)) | 3047 | if (buffer_busy(bh)) |
3045 | goto failed; | 3048 | goto failed; |
3046 | bh = bh->b_this_page; | 3049 | bh = bh->b_this_page; |
3047 | } while (bh != head); | 3050 | } while (bh != head); |
3048 | 3051 | ||
3049 | do { | 3052 | do { |
3050 | struct buffer_head *next = bh->b_this_page; | 3053 | struct buffer_head *next = bh->b_this_page; |
3051 | 3054 | ||
3052 | if (bh->b_assoc_map) | 3055 | if (bh->b_assoc_map) |
3053 | __remove_assoc_queue(bh); | 3056 | __remove_assoc_queue(bh); |
3054 | bh = next; | 3057 | bh = next; |
3055 | } while (bh != head); | 3058 | } while (bh != head); |
3056 | *buffers_to_free = head; | 3059 | *buffers_to_free = head; |
3057 | __clear_page_buffers(page); | 3060 | __clear_page_buffers(page); |
3058 | return 1; | 3061 | return 1; |
3059 | failed: | 3062 | failed: |
3060 | return 0; | 3063 | return 0; |
3061 | } | 3064 | } |
3062 | 3065 | ||
3063 | int try_to_free_buffers(struct page *page) | 3066 | int try_to_free_buffers(struct page *page) |
3064 | { | 3067 | { |
3065 | struct address_space * const mapping = page->mapping; | 3068 | struct address_space * const mapping = page->mapping; |
3066 | struct buffer_head *buffers_to_free = NULL; | 3069 | struct buffer_head *buffers_to_free = NULL; |
3067 | int ret = 0; | 3070 | int ret = 0; |
3068 | 3071 | ||
3069 | BUG_ON(!PageLocked(page)); | 3072 | BUG_ON(!PageLocked(page)); |
3070 | if (PageWriteback(page)) | 3073 | if (PageWriteback(page)) |
3071 | return 0; | 3074 | return 0; |
3072 | 3075 | ||
3073 | if (mapping == NULL) { /* can this still happen? */ | 3076 | if (mapping == NULL) { /* can this still happen? */ |
3074 | ret = drop_buffers(page, &buffers_to_free); | 3077 | ret = drop_buffers(page, &buffers_to_free); |
3075 | goto out; | 3078 | goto out; |
3076 | } | 3079 | } |
3077 | 3080 | ||
3078 | spin_lock(&mapping->private_lock); | 3081 | spin_lock(&mapping->private_lock); |
3079 | ret = drop_buffers(page, &buffers_to_free); | 3082 | ret = drop_buffers(page, &buffers_to_free); |
3080 | 3083 | ||
3081 | /* | 3084 | /* |
3082 | * If the filesystem writes its buffers by hand (eg ext3) | 3085 | * If the filesystem writes its buffers by hand (eg ext3) |
3083 | * then we can have clean buffers against a dirty page. We | 3086 | * then we can have clean buffers against a dirty page. We |
3084 | * clean the page here; otherwise the VM will never notice | 3087 | * clean the page here; otherwise the VM will never notice |
3085 | * that the filesystem did any IO at all. | 3088 | * that the filesystem did any IO at all. |
3086 | * | 3089 | * |
3087 | * Also, during truncate, discard_buffer will have marked all | 3090 | * Also, during truncate, discard_buffer will have marked all |
3088 | * the page's buffers clean. We discover that here and clean | 3091 | * the page's buffers clean. We discover that here and clean |
3089 | * the page also. | 3092 | * the page also. |
3090 | * | 3093 | * |
3091 | * private_lock must be held over this entire operation in order | 3094 | * private_lock must be held over this entire operation in order |
3092 | * to synchronise against __set_page_dirty_buffers and prevent the | 3095 | * to synchronise against __set_page_dirty_buffers and prevent the |
3093 | * dirty bit from being lost. | 3096 | * dirty bit from being lost. |
3094 | */ | 3097 | */ |
3095 | if (ret) | 3098 | if (ret) |
3096 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 3099 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
3097 | spin_unlock(&mapping->private_lock); | 3100 | spin_unlock(&mapping->private_lock); |
3098 | out: | 3101 | out: |
3099 | if (buffers_to_free) { | 3102 | if (buffers_to_free) { |
3100 | struct buffer_head *bh = buffers_to_free; | 3103 | struct buffer_head *bh = buffers_to_free; |
3101 | 3104 | ||
3102 | do { | 3105 | do { |
3103 | struct buffer_head *next = bh->b_this_page; | 3106 | struct buffer_head *next = bh->b_this_page; |
3104 | free_buffer_head(bh); | 3107 | free_buffer_head(bh); |
3105 | bh = next; | 3108 | bh = next; |
3106 | } while (bh != buffers_to_free); | 3109 | } while (bh != buffers_to_free); |
3107 | } | 3110 | } |
3108 | return ret; | 3111 | return ret; |
3109 | } | 3112 | } |
3110 | EXPORT_SYMBOL(try_to_free_buffers); | 3113 | EXPORT_SYMBOL(try_to_free_buffers); |
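For context, try_to_free_buffers() is what a block-based filesystem's ->releasepage() ends up calling; the hypothetical implementation below is deliberately trivial (real ones, such as ext3's, add journal-specific checks first). try_to_free_buffers() itself insists on the page lock and bails out on writeback pages, as shown above.

static int foo_releasepage(struct page *page, gfp_t gfp_mask)
{
	/* The VM calls ->releasepage() under the page lock, which is the
	 * exclusion try_to_free_buffers() requires. */
	return try_to_free_buffers(page);
}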
3111 | 3114 | ||
3112 | void block_sync_page(struct page *page) | 3115 | void block_sync_page(struct page *page) |
3113 | { | 3116 | { |
3114 | struct address_space *mapping; | 3117 | struct address_space *mapping; |
3115 | 3118 | ||
3116 | smp_mb(); | 3119 | smp_mb(); |
3117 | mapping = page_mapping(page); | 3120 | mapping = page_mapping(page); |
3118 | if (mapping) | 3121 | if (mapping) |
3119 | blk_run_backing_dev(mapping->backing_dev_info, page); | 3122 | blk_run_backing_dev(mapping->backing_dev_info, page); |
3120 | } | 3123 | } |
3121 | 3124 | ||
3122 | /* | 3125 | /* |
3123 | * There are no bdflush tunables left. But distributions are | 3126 | * There are no bdflush tunables left. But distributions are |
3124 | * still running obsolete flush daemons, so we terminate them here. | 3127 | * still running obsolete flush daemons, so we terminate them here. |
3125 | * | 3128 | * |
3126 | * Use of bdflush() is deprecated and will be removed in a future kernel. | 3129 | * Use of bdflush() is deprecated and will be removed in a future kernel. |
3127 | * The `pdflush' kernel threads fully replace bdflush daemons and this call. | 3130 | * The `pdflush' kernel threads fully replace bdflush daemons and this call. |
3128 | */ | 3131 | */ |
3129 | asmlinkage long sys_bdflush(int func, long data) | 3132 | asmlinkage long sys_bdflush(int func, long data) |
3130 | { | 3133 | { |
3131 | static int msg_count; | 3134 | static int msg_count; |
3132 | 3135 | ||
3133 | if (!capable(CAP_SYS_ADMIN)) | 3136 | if (!capable(CAP_SYS_ADMIN)) |
3134 | return -EPERM; | 3137 | return -EPERM; |
3135 | 3138 | ||
3136 | if (msg_count < 5) { | 3139 | if (msg_count < 5) { |
3137 | msg_count++; | 3140 | msg_count++; |
3138 | printk(KERN_INFO | 3141 | printk(KERN_INFO |
3139 | "warning: process `%s' used the obsolete bdflush" | 3142 | "warning: process `%s' used the obsolete bdflush" |
3140 | " system call\n", current->comm); | 3143 | " system call\n", current->comm); |
3141 | printk(KERN_INFO "Fix your initscripts?\n"); | 3144 | printk(KERN_INFO "Fix your initscripts?\n"); |
3142 | } | 3145 | } |
3143 | 3146 | ||
3144 | if (func == 1) | 3147 | if (func == 1) |
3145 | do_exit(0); | 3148 | do_exit(0); |
3146 | return 0; | 3149 | return 0; |
3147 | } | 3150 | } |
3148 | 3151 | ||
3149 | /* | 3152 | /* |
3150 | * Buffer-head allocation | 3153 | * Buffer-head allocation |
3151 | */ | 3154 | */ |
3152 | static struct kmem_cache *bh_cachep; | 3155 | static struct kmem_cache *bh_cachep; |
3153 | 3156 | ||
3154 | /* | 3157 | /* |
3155 | * Once the number of bh's in the machine exceeds this level, we start | 3158 | * Once the number of bh's in the machine exceeds this level, we start |
3156 | * stripping them in writeback. | 3159 | * stripping them in writeback. |
3157 | */ | 3160 | */ |
3158 | static int max_buffer_heads; | 3161 | static int max_buffer_heads; |
3159 | 3162 | ||
3160 | int buffer_heads_over_limit; | 3163 | int buffer_heads_over_limit; |
3161 | 3164 | ||
3162 | struct bh_accounting { | 3165 | struct bh_accounting { |
3163 | int nr; /* Number of live bh's */ | 3166 | int nr; /* Number of live bh's */ |
3164 | int ratelimit; /* Limit cacheline bouncing */ | 3167 | int ratelimit; /* Limit cacheline bouncing */ |
3165 | }; | 3168 | }; |
3166 | 3169 | ||
3167 | static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; | 3170 | static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; |
3168 | 3171 | ||
3169 | static void recalc_bh_state(void) | 3172 | static void recalc_bh_state(void) |
3170 | { | 3173 | { |
3171 | int i; | 3174 | int i; |
3172 | int tot = 0; | 3175 | int tot = 0; |
3173 | 3176 | ||
3174 | if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) | 3177 | if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) |
3175 | return; | 3178 | return; |
3176 | __get_cpu_var(bh_accounting).ratelimit = 0; | 3179 | __get_cpu_var(bh_accounting).ratelimit = 0; |
3177 | for_each_online_cpu(i) | 3180 | for_each_online_cpu(i) |
3178 | tot += per_cpu(bh_accounting, i).nr; | 3181 | tot += per_cpu(bh_accounting, i).nr; |
3179 | buffer_heads_over_limit = (tot > max_buffer_heads); | 3182 | buffer_heads_over_limit = (tot > max_buffer_heads); |
3180 | } | 3183 | } |
3181 | 3184 | ||
3182 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) | 3185 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) |
3183 | { | 3186 | { |
3184 | struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); | 3187 | struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); |
3185 | if (ret) { | 3188 | if (ret) { |
3186 | INIT_LIST_HEAD(&ret->b_assoc_buffers); | 3189 | INIT_LIST_HEAD(&ret->b_assoc_buffers); |
3187 | get_cpu_var(bh_accounting).nr++; | 3190 | get_cpu_var(bh_accounting).nr++; |
3188 | recalc_bh_state(); | 3191 | recalc_bh_state(); |
3189 | put_cpu_var(bh_accounting); | 3192 | put_cpu_var(bh_accounting); |
3190 | } | 3193 | } |
3191 | return ret; | 3194 | return ret; |
3192 | } | 3195 | } |
3193 | EXPORT_SYMBOL(alloc_buffer_head); | 3196 | EXPORT_SYMBOL(alloc_buffer_head); |
3194 | 3197 | ||
3195 | void free_buffer_head(struct buffer_head *bh) | 3198 | void free_buffer_head(struct buffer_head *bh) |
3196 | { | 3199 | { |
3197 | BUG_ON(!list_empty(&bh->b_assoc_buffers)); | 3200 | BUG_ON(!list_empty(&bh->b_assoc_buffers)); |
3198 | kmem_cache_free(bh_cachep, bh); | 3201 | kmem_cache_free(bh_cachep, bh); |
3199 | get_cpu_var(bh_accounting).nr--; | 3202 | get_cpu_var(bh_accounting).nr--; |
3200 | recalc_bh_state(); | 3203 | recalc_bh_state(); |
3201 | put_cpu_var(bh_accounting); | 3204 | put_cpu_var(bh_accounting); |
3202 | } | 3205 | } |
3203 | EXPORT_SYMBOL(free_buffer_head); | 3206 | EXPORT_SYMBOL(free_buffer_head); |
3204 | 3207 | ||
3205 | static void buffer_exit_cpu(int cpu) | 3208 | static void buffer_exit_cpu(int cpu) |
3206 | { | 3209 | { |
3207 | int i; | 3210 | int i; |
3208 | struct bh_lru *b = &per_cpu(bh_lrus, cpu); | 3211 | struct bh_lru *b = &per_cpu(bh_lrus, cpu); |
3209 | 3212 | ||
3210 | for (i = 0; i < BH_LRU_SIZE; i++) { | 3213 | for (i = 0; i < BH_LRU_SIZE; i++) { |
3211 | brelse(b->bhs[i]); | 3214 | brelse(b->bhs[i]); |
3212 | b->bhs[i] = NULL; | 3215 | b->bhs[i] = NULL; |
3213 | } | 3216 | } |
3214 | get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; | 3217 | get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; |
3215 | per_cpu(bh_accounting, cpu).nr = 0; | 3218 | per_cpu(bh_accounting, cpu).nr = 0; |
3216 | put_cpu_var(bh_accounting); | 3219 | put_cpu_var(bh_accounting); |
3217 | } | 3220 | } |
3218 | 3221 | ||
3219 | static int buffer_cpu_notify(struct notifier_block *self, | 3222 | static int buffer_cpu_notify(struct notifier_block *self, |
3220 | unsigned long action, void *hcpu) | 3223 | unsigned long action, void *hcpu) |
3221 | { | 3224 | { |
3222 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) | 3225 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) |
3223 | buffer_exit_cpu((unsigned long)hcpu); | 3226 | buffer_exit_cpu((unsigned long)hcpu); |
3224 | return NOTIFY_OK; | 3227 | return NOTIFY_OK; |
3225 | } | 3228 | } |
3226 | 3229 | ||
3227 | /** | 3230 | /** |
3228 | * bh_uptodate_or_lock - Test whether the buffer is uptodate | 3231 | * bh_uptodate_or_lock - Test whether the buffer is uptodate |
3229 | * @bh: struct buffer_head | 3232 | * @bh: struct buffer_head |
3230 | * | 3233 | * |
3231 | * Return true if the buffer is up-to-date and false, | 3234 | * Return true if the buffer is up-to-date and false, |
3232 | * with the buffer locked, if not. | 3235 | * with the buffer locked, if not. |
3233 | */ | 3236 | */ |
3234 | int bh_uptodate_or_lock(struct buffer_head *bh) | 3237 | int bh_uptodate_or_lock(struct buffer_head *bh) |
3235 | { | 3238 | { |
3236 | if (!buffer_uptodate(bh)) { | 3239 | if (!buffer_uptodate(bh)) { |
3237 | lock_buffer(bh); | 3240 | lock_buffer(bh); |
3238 | if (!buffer_uptodate(bh)) | 3241 | if (!buffer_uptodate(bh)) |
3239 | return 0; | 3242 | return 0; |
3240 | unlock_buffer(bh); | 3243 | unlock_buffer(bh); |
3241 | } | 3244 | } |
3242 | return 1; | 3245 | return 1; |
3243 | } | 3246 | } |
3244 | EXPORT_SYMBOL(bh_uptodate_or_lock); | 3247 | EXPORT_SYMBOL(bh_uptodate_or_lock); |
3245 | 3248 | ||
3246 | /** | 3249 | /** |
3247 | * bh_submit_read - Submit a locked buffer for reading | 3250 | * bh_submit_read - Submit a locked buffer for reading |
3248 | * @bh: struct buffer_head | 3251 | * @bh: struct buffer_head |
3249 | * | 3252 | * |
3250 | * Returns zero on success and -EIO on error. | 3253 | * Returns zero on success and -EIO on error. |
3251 | */ | 3254 | */ |
3252 | int bh_submit_read(struct buffer_head *bh) | 3255 | int bh_submit_read(struct buffer_head *bh) |
3253 | { | 3256 | { |
3254 | BUG_ON(!buffer_locked(bh)); | 3257 | BUG_ON(!buffer_locked(bh)); |
3255 | 3258 | ||
3256 | if (buffer_uptodate(bh)) { | 3259 | if (buffer_uptodate(bh)) { |
3257 | unlock_buffer(bh); | 3260 | unlock_buffer(bh); |
3258 | return 0; | 3261 | return 0; |
3259 | } | 3262 | } |
3260 | 3263 | ||
3261 | get_bh(bh); | 3264 | get_bh(bh); |
3262 | bh->b_end_io = end_buffer_read_sync; | 3265 | bh->b_end_io = end_buffer_read_sync; |
3263 | submit_bh(READ, bh); | 3266 | submit_bh(READ, bh); |
3264 | wait_on_buffer(bh); | 3267 | wait_on_buffer(bh); |
3265 | if (buffer_uptodate(bh)) | 3268 | if (buffer_uptodate(bh)) |
3266 | return 0; | 3269 | return 0; |
3267 | return -EIO; | 3270 | return -EIO; |
3268 | } | 3271 | } |
3269 | EXPORT_SYMBOL(bh_submit_read); | 3272 | EXPORT_SYMBOL(bh_submit_read); |
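A sketch of the intended pairing of the two helpers above: check-and-lock first, then submit the read only when needed. The buffer_head is assumed to be mapped already (for example obtained via sb_getblk()); foo_read_if_needed() is hypothetical.

static int foo_read_if_needed(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;		/* already uptodate, buffer left unlocked */
	return bh_submit_read(bh);	/* buffer is locked; submit and wait */
}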
3270 | 3273 | ||
3271 | static void | 3274 | static void |
3272 | init_buffer_head(struct kmem_cache *cachep, void *data) | 3275 | init_buffer_head(struct kmem_cache *cachep, void *data) |
3273 | { | 3276 | { |
3274 | struct buffer_head *bh = data; | 3277 | struct buffer_head *bh = data; |
3275 | 3278 | ||
3276 | memset(bh, 0, sizeof(*bh)); | 3279 | memset(bh, 0, sizeof(*bh)); |
3277 | INIT_LIST_HEAD(&bh->b_assoc_buffers); | 3280 | INIT_LIST_HEAD(&bh->b_assoc_buffers); |
3278 | } | 3281 | } |
3279 | 3282 | ||
3280 | void __init buffer_init(void) | 3283 | void __init buffer_init(void) |
3281 | { | 3284 | { |
3282 | int nrpages; | 3285 | int nrpages; |
3283 | 3286 | ||
3284 | bh_cachep = kmem_cache_create("buffer_head", | 3287 | bh_cachep = kmem_cache_create("buffer_head", |
3285 | sizeof(struct buffer_head), 0, | 3288 | sizeof(struct buffer_head), 0, |
3286 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| | 3289 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| |
3287 | SLAB_MEM_SPREAD), | 3290 | SLAB_MEM_SPREAD), |
3288 | init_buffer_head); | 3291 | init_buffer_head); |
3289 | 3292 | ||
3290 | /* | 3293 | /* |
3291 | * Limit the bh occupancy to 10% of ZONE_NORMAL | 3294 | * Limit the bh occupancy to 10% of ZONE_NORMAL |
3292 | */ | 3295 | */ |
3293 | nrpages = (nr_free_buffer_pages() * 10) / 100; | 3296 | nrpages = (nr_free_buffer_pages() * 10) / 100; |
3294 | max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); | 3297 | max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); |
3295 | hotcpu_notifier(buffer_cpu_notify, 0); | 3298 | hotcpu_notifier(buffer_cpu_notify, 0); |
3296 | } | 3299 | } |
3297 | 3300 | ||
3298 | EXPORT_SYMBOL(__bforget); | 3301 | EXPORT_SYMBOL(__bforget); |
3299 | EXPORT_SYMBOL(__brelse); | 3302 | EXPORT_SYMBOL(__brelse); |
3300 | EXPORT_SYMBOL(__wait_on_buffer); | 3303 | EXPORT_SYMBOL(__wait_on_buffer); |
3301 | EXPORT_SYMBOL(block_commit_write); | 3304 | EXPORT_SYMBOL(block_commit_write); |
3302 | EXPORT_SYMBOL(block_prepare_write); | 3305 | EXPORT_SYMBOL(block_prepare_write); |
3303 | EXPORT_SYMBOL(block_page_mkwrite); | 3306 | EXPORT_SYMBOL(block_page_mkwrite); |
3304 | EXPORT_SYMBOL(block_read_full_page); | 3307 | EXPORT_SYMBOL(block_read_full_page); |
3305 | EXPORT_SYMBOL(block_sync_page); | 3308 | EXPORT_SYMBOL(block_sync_page); |
3306 | EXPORT_SYMBOL(block_truncate_page); | 3309 | EXPORT_SYMBOL(block_truncate_page); |
3307 | EXPORT_SYMBOL(block_write_full_page); | 3310 | EXPORT_SYMBOL(block_write_full_page); |
3308 | EXPORT_SYMBOL(cont_write_begin); | 3311 | EXPORT_SYMBOL(cont_write_begin); |
3309 | EXPORT_SYMBOL(end_buffer_read_sync); | 3312 | EXPORT_SYMBOL(end_buffer_read_sync); |
3310 | EXPORT_SYMBOL(end_buffer_write_sync); | 3313 | EXPORT_SYMBOL(end_buffer_write_sync); |
3311 | EXPORT_SYMBOL(file_fsync); | 3314 | EXPORT_SYMBOL(file_fsync); |
3312 | EXPORT_SYMBOL(fsync_bdev); | 3315 | EXPORT_SYMBOL(fsync_bdev); |
3313 | EXPORT_SYMBOL(generic_block_bmap); | 3316 | EXPORT_SYMBOL(generic_block_bmap); |
3314 | EXPORT_SYMBOL(generic_cont_expand_simple); | 3317 | EXPORT_SYMBOL(generic_cont_expand_simple); |
3315 | EXPORT_SYMBOL(init_buffer); | 3318 | EXPORT_SYMBOL(init_buffer); |
3316 | EXPORT_SYMBOL(invalidate_bdev); | 3319 | EXPORT_SYMBOL(invalidate_bdev); |
3317 | EXPORT_SYMBOL(ll_rw_block); | 3320 | EXPORT_SYMBOL(ll_rw_block); |
3318 | EXPORT_SYMBOL(mark_buffer_dirty); | 3321 | EXPORT_SYMBOL(mark_buffer_dirty); |
3319 | EXPORT_SYMBOL(submit_bh); | 3322 | EXPORT_SYMBOL(submit_bh); |
3320 | EXPORT_SYMBOL(sync_dirty_buffer); | 3323 | EXPORT_SYMBOL(sync_dirty_buffer); |
3321 | EXPORT_SYMBOL(unlock_buffer); | 3324 | EXPORT_SYMBOL(unlock_buffer); |
3322 | 3325 |
fs/mpage.c
1 | /* | 1 | /* |
2 | * fs/mpage.c | 2 | * fs/mpage.c |
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds. | 4 | * Copyright (C) 2002, Linus Torvalds. |
5 | * | 5 | * |
6 | * Contains functions related to preparing and submitting BIOs which contain | 6 | * Contains functions related to preparing and submitting BIOs which contain |
7 | * multiple pagecache pages. | 7 | * multiple pagecache pages. |
8 | * | 8 | * |
9 | * 15May2002 akpm@zip.com.au | 9 | * 15May2002 akpm@zip.com.au |
10 | * Initial version | 10 | * Initial version |
11 | * 27Jun2002 axboe@suse.de | 11 | * 27Jun2002 axboe@suse.de |
12 | * use bio_add_page() to build bio's just the right size | 12 | * use bio_add_page() to build bio's just the right size |
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/kernel.h> | 15 | #include <linux/kernel.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
18 | #include <linux/kdev_t.h> | 18 | #include <linux/kdev_t.h> |
19 | #include <linux/bio.h> | 19 | #include <linux/bio.h> |
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/buffer_head.h> | 21 | #include <linux/buffer_head.h> |
22 | #include <linux/blkdev.h> | 22 | #include <linux/blkdev.h> |
23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
24 | #include <linux/prefetch.h> | 24 | #include <linux/prefetch.h> |
25 | #include <linux/mpage.h> | 25 | #include <linux/mpage.h> |
26 | #include <linux/writeback.h> | 26 | #include <linux/writeback.h> |
27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/pagevec.h> | 28 | #include <linux/pagevec.h> |
29 | 29 | ||
30 | /* | 30 | /* |
31 | * I/O completion handler for multipage BIOs. | 31 | * I/O completion handler for multipage BIOs. |
32 | * | 32 | * |
33 | * The mpage code never puts partial pages into a BIO (except for end-of-file). | 33 | * The mpage code never puts partial pages into a BIO (except for end-of-file). |
34 | * If a page does not map to a contiguous run of blocks then it simply falls | 34 | * If a page does not map to a contiguous run of blocks then it simply falls |
35 | * back to block_read_full_page(). | 35 | * back to block_read_full_page(). |
36 | * | 36 | * |
37 | * Why is this? If a page's completion depends on a number of different BIOs | 37 | * Why is this? If a page's completion depends on a number of different BIOs |
38 | * which can complete in any order (or at the same time) then determining the | 38 | * which can complete in any order (or at the same time) then determining the |
39 | * status of that page is hard. See end_buffer_async_read() for the details. | 39 | * status of that page is hard. See end_buffer_async_read() for the details. |
40 | * There is no point in duplicating all that complexity. | 40 | * There is no point in duplicating all that complexity. |
41 | */ | 41 | */ |
42 | static void mpage_end_io_read(struct bio *bio, int err) | 42 | static void mpage_end_io_read(struct bio *bio, int err) |
43 | { | 43 | { |
44 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 44 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
45 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | 45 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; |
46 | 46 | ||
47 | do { | 47 | do { |
48 | struct page *page = bvec->bv_page; | 48 | struct page *page = bvec->bv_page; |
49 | 49 | ||
50 | if (--bvec >= bio->bi_io_vec) | 50 | if (--bvec >= bio->bi_io_vec) |
51 | prefetchw(&bvec->bv_page->flags); | 51 | prefetchw(&bvec->bv_page->flags); |
52 | 52 | ||
53 | if (uptodate) { | 53 | if (uptodate) { |
54 | SetPageUptodate(page); | 54 | SetPageUptodate(page); |
55 | } else { | 55 | } else { |
56 | ClearPageUptodate(page); | 56 | ClearPageUptodate(page); |
57 | SetPageError(page); | 57 | SetPageError(page); |
58 | } | 58 | } |
59 | unlock_page(page); | 59 | unlock_page(page); |
60 | } while (bvec >= bio->bi_io_vec); | 60 | } while (bvec >= bio->bi_io_vec); |
61 | bio_put(bio); | 61 | bio_put(bio); |
62 | } | 62 | } |
63 | 63 | ||
64 | static void mpage_end_io_write(struct bio *bio, int err) | 64 | static void mpage_end_io_write(struct bio *bio, int err) |
65 | { | 65 | { |
66 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 66 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
67 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | 67 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; |
68 | 68 | ||
69 | do { | 69 | do { |
70 | struct page *page = bvec->bv_page; | 70 | struct page *page = bvec->bv_page; |
71 | 71 | ||
72 | if (--bvec >= bio->bi_io_vec) | 72 | if (--bvec >= bio->bi_io_vec) |
73 | prefetchw(&bvec->bv_page->flags); | 73 | prefetchw(&bvec->bv_page->flags); |
74 | 74 | ||
75 | if (!uptodate){ | 75 | if (!uptodate){ |
76 | SetPageError(page); | 76 | SetPageError(page); |
77 | if (page->mapping) | 77 | if (page->mapping) |
78 | set_bit(AS_EIO, &page->mapping->flags); | 78 | set_bit(AS_EIO, &page->mapping->flags); |
79 | } | 79 | } |
80 | end_page_writeback(page); | 80 | end_page_writeback(page); |
81 | } while (bvec >= bio->bi_io_vec); | 81 | } while (bvec >= bio->bi_io_vec); |
82 | bio_put(bio); | 82 | bio_put(bio); |
83 | } | 83 | } |
84 | 84 | ||
85 | static struct bio *mpage_bio_submit(int rw, struct bio *bio) | 85 | struct bio *mpage_bio_submit(int rw, struct bio *bio) |
86 | { | 86 | { |
87 | bio->bi_end_io = mpage_end_io_read; | 87 | bio->bi_end_io = mpage_end_io_read; |
88 | if (rw == WRITE) | 88 | if (rw == WRITE) |
89 | bio->bi_end_io = mpage_end_io_write; | 89 | bio->bi_end_io = mpage_end_io_write; |
90 | submit_bio(rw, bio); | 90 | submit_bio(rw, bio); |
91 | return NULL; | 91 | return NULL; |
92 | } | 92 | } |
93 | EXPORT_SYMBOL(mpage_bio_submit); | ||
93 | 94 | ||
94 | static struct bio * | 95 | static struct bio * |
95 | mpage_alloc(struct block_device *bdev, | 96 | mpage_alloc(struct block_device *bdev, |
96 | sector_t first_sector, int nr_vecs, | 97 | sector_t first_sector, int nr_vecs, |
97 | gfp_t gfp_flags) | 98 | gfp_t gfp_flags) |
98 | { | 99 | { |
99 | struct bio *bio; | 100 | struct bio *bio; |
100 | 101 | ||
101 | bio = bio_alloc(gfp_flags, nr_vecs); | 102 | bio = bio_alloc(gfp_flags, nr_vecs); |
102 | 103 | ||
103 | if (bio == NULL && (current->flags & PF_MEMALLOC)) { | 104 | if (bio == NULL && (current->flags & PF_MEMALLOC)) { |
104 | while (!bio && (nr_vecs /= 2)) | 105 | while (!bio && (nr_vecs /= 2)) |
105 | bio = bio_alloc(gfp_flags, nr_vecs); | 106 | bio = bio_alloc(gfp_flags, nr_vecs); |
106 | } | 107 | } |
107 | 108 | ||
108 | if (bio) { | 109 | if (bio) { |
109 | bio->bi_bdev = bdev; | 110 | bio->bi_bdev = bdev; |
110 | bio->bi_sector = first_sector; | 111 | bio->bi_sector = first_sector; |
111 | } | 112 | } |
112 | return bio; | 113 | return bio; |
113 | } | 114 | } |
114 | 115 | ||
115 | /* | 116 | /* |
116 | * support function for mpage_readpages. The fs supplied get_block might | 117 | * support function for mpage_readpages. The fs supplied get_block might |
117 | * return an up to date buffer. This is used to map that buffer into | 118 | * return an up to date buffer. This is used to map that buffer into |
118 | * the page, which allows readpage to avoid triggering a duplicate call | 119 | * the page, which allows readpage to avoid triggering a duplicate call |
119 | * to get_block. | 120 | * to get_block. |
120 | * | 121 | * |
121 | * The idea is to avoid adding buffers to pages that don't already have | 122 | * The idea is to avoid adding buffers to pages that don't already have |
122 | * them. So when the buffer is up to date and the page size == block size, | 123 | * them. So when the buffer is up to date and the page size == block size, |
123 | * this marks the page up to date instead of adding new buffers. | 124 | * this marks the page up to date instead of adding new buffers. |
124 | */ | 125 | */ |
125 | static void | 126 | static void |
126 | map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) | 127 | map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) |
127 | { | 128 | { |
128 | struct inode *inode = page->mapping->host; | 129 | struct inode *inode = page->mapping->host; |
129 | struct buffer_head *page_bh, *head; | 130 | struct buffer_head *page_bh, *head; |
130 | int block = 0; | 131 | int block = 0; |
131 | 132 | ||
132 | if (!page_has_buffers(page)) { | 133 | if (!page_has_buffers(page)) { |
133 | /* | 134 | /* |
134 | * don't make any buffers if there is only one buffer on | 135 | * don't make any buffers if there is only one buffer on |
135 | * the page and the page just needs to be set up to date | 136 | * the page and the page just needs to be set up to date |
136 | */ | 137 | */ |
137 | if (inode->i_blkbits == PAGE_CACHE_SHIFT && | 138 | if (inode->i_blkbits == PAGE_CACHE_SHIFT && |
138 | buffer_uptodate(bh)) { | 139 | buffer_uptodate(bh)) { |
139 | SetPageUptodate(page); | 140 | SetPageUptodate(page); |
140 | return; | 141 | return; |
141 | } | 142 | } |
142 | create_empty_buffers(page, 1 << inode->i_blkbits, 0); | 143 | create_empty_buffers(page, 1 << inode->i_blkbits, 0); |
143 | } | 144 | } |
144 | head = page_buffers(page); | 145 | head = page_buffers(page); |
145 | page_bh = head; | 146 | page_bh = head; |
146 | do { | 147 | do { |
147 | if (block == page_block) { | 148 | if (block == page_block) { |
148 | page_bh->b_state = bh->b_state; | 149 | page_bh->b_state = bh->b_state; |
149 | page_bh->b_bdev = bh->b_bdev; | 150 | page_bh->b_bdev = bh->b_bdev; |
150 | page_bh->b_blocknr = bh->b_blocknr; | 151 | page_bh->b_blocknr = bh->b_blocknr; |
151 | break; | 152 | break; |
152 | } | 153 | } |
153 | page_bh = page_bh->b_this_page; | 154 | page_bh = page_bh->b_this_page; |
154 | block++; | 155 | block++; |
155 | } while (page_bh != head); | 156 | } while (page_bh != head); |
156 | } | 157 | } |
157 | 158 | ||
158 | /* | 159 | /* |
159 | * This is the worker routine which does all the work of mapping the disk | 160 | * This is the worker routine which does all the work of mapping the disk |
160 | * blocks and constructs largest possible bios, submits them for IO if the | 161 | * blocks and constructs largest possible bios, submits them for IO if the |
161 | * blocks are not contiguous on the disk. | 162 | * blocks are not contiguous on the disk. |
162 | * | 163 | * |
163 | * We pass a buffer_head back and forth and use its buffer_mapped() flag to | 164 | * We pass a buffer_head back and forth and use its buffer_mapped() flag to |
164 | * represent the validity of its disk mapping and to decide when to do the next | 165 | * represent the validity of its disk mapping and to decide when to do the next |
165 | * get_block() call. | 166 | * get_block() call. |
166 | */ | 167 | */ |
167 | static struct bio * | 168 | static struct bio * |
168 | do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, | 169 | do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, |
169 | sector_t *last_block_in_bio, struct buffer_head *map_bh, | 170 | sector_t *last_block_in_bio, struct buffer_head *map_bh, |
170 | unsigned long *first_logical_block, get_block_t get_block) | 171 | unsigned long *first_logical_block, get_block_t get_block) |
171 | { | 172 | { |
172 | struct inode *inode = page->mapping->host; | 173 | struct inode *inode = page->mapping->host; |
173 | const unsigned blkbits = inode->i_blkbits; | 174 | const unsigned blkbits = inode->i_blkbits; |
174 | const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; | 175 | const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; |
175 | const unsigned blocksize = 1 << blkbits; | 176 | const unsigned blocksize = 1 << blkbits; |
176 | sector_t block_in_file; | 177 | sector_t block_in_file; |
177 | sector_t last_block; | 178 | sector_t last_block; |
178 | sector_t last_block_in_file; | 179 | sector_t last_block_in_file; |
179 | sector_t blocks[MAX_BUF_PER_PAGE]; | 180 | sector_t blocks[MAX_BUF_PER_PAGE]; |
180 | unsigned page_block; | 181 | unsigned page_block; |
181 | unsigned first_hole = blocks_per_page; | 182 | unsigned first_hole = blocks_per_page; |
182 | struct block_device *bdev = NULL; | 183 | struct block_device *bdev = NULL; |
183 | int length; | 184 | int length; |
184 | int fully_mapped = 1; | 185 | int fully_mapped = 1; |
185 | unsigned nblocks; | 186 | unsigned nblocks; |
186 | unsigned relative_block; | 187 | unsigned relative_block; |
187 | 188 | ||
188 | if (page_has_buffers(page)) | 189 | if (page_has_buffers(page)) |
189 | goto confused; | 190 | goto confused; |
190 | 191 | ||
191 | block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); | 192 | block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); |
192 | last_block = block_in_file + nr_pages * blocks_per_page; | 193 | last_block = block_in_file + nr_pages * blocks_per_page; |
193 | last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; | 194 | last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; |
194 | if (last_block > last_block_in_file) | 195 | if (last_block > last_block_in_file) |
195 | last_block = last_block_in_file; | 196 | last_block = last_block_in_file; |
196 | page_block = 0; | 197 | page_block = 0; |
197 | 198 | ||
198 | /* | 199 | /* |
199 | * Map blocks using the result from the previous get_blocks call first. | 200 | * Map blocks using the result from the previous get_blocks call first. |
200 | */ | 201 | */ |
201 | nblocks = map_bh->b_size >> blkbits; | 202 | nblocks = map_bh->b_size >> blkbits; |
202 | if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && | 203 | if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && |
203 | block_in_file < (*first_logical_block + nblocks)) { | 204 | block_in_file < (*first_logical_block + nblocks)) { |
204 | unsigned map_offset = block_in_file - *first_logical_block; | 205 | unsigned map_offset = block_in_file - *first_logical_block; |
205 | unsigned last = nblocks - map_offset; | 206 | unsigned last = nblocks - map_offset; |
206 | 207 | ||
207 | for (relative_block = 0; ; relative_block++) { | 208 | for (relative_block = 0; ; relative_block++) { |
208 | if (relative_block == last) { | 209 | if (relative_block == last) { |
209 | clear_buffer_mapped(map_bh); | 210 | clear_buffer_mapped(map_bh); |
210 | break; | 211 | break; |
211 | } | 212 | } |
212 | if (page_block == blocks_per_page) | 213 | if (page_block == blocks_per_page) |
213 | break; | 214 | break; |
214 | blocks[page_block] = map_bh->b_blocknr + map_offset + | 215 | blocks[page_block] = map_bh->b_blocknr + map_offset + |
215 | relative_block; | 216 | relative_block; |
216 | page_block++; | 217 | page_block++; |
217 | block_in_file++; | 218 | block_in_file++; |
218 | } | 219 | } |
219 | bdev = map_bh->b_bdev; | 220 | bdev = map_bh->b_bdev; |
220 | } | 221 | } |
221 | 222 | ||
222 | /* | 223 | /* |
223 | * Then do more get_blocks calls until we are done with this page. | 224 | * Then do more get_blocks calls until we are done with this page. |
224 | */ | 225 | */ |
225 | map_bh->b_page = page; | 226 | map_bh->b_page = page; |
226 | while (page_block < blocks_per_page) { | 227 | while (page_block < blocks_per_page) { |
227 | map_bh->b_state = 0; | 228 | map_bh->b_state = 0; |
228 | map_bh->b_size = 0; | 229 | map_bh->b_size = 0; |
229 | 230 | ||
230 | if (block_in_file < last_block) { | 231 | if (block_in_file < last_block) { |
231 | map_bh->b_size = (last_block-block_in_file) << blkbits; | 232 | map_bh->b_size = (last_block-block_in_file) << blkbits; |
232 | if (get_block(inode, block_in_file, map_bh, 0)) | 233 | if (get_block(inode, block_in_file, map_bh, 0)) |
233 | goto confused; | 234 | goto confused; |
234 | *first_logical_block = block_in_file; | 235 | *first_logical_block = block_in_file; |
235 | } | 236 | } |
236 | 237 | ||
237 | if (!buffer_mapped(map_bh)) { | 238 | if (!buffer_mapped(map_bh)) { |
238 | fully_mapped = 0; | 239 | fully_mapped = 0; |
239 | if (first_hole == blocks_per_page) | 240 | if (first_hole == blocks_per_page) |
240 | first_hole = page_block; | 241 | first_hole = page_block; |
241 | page_block++; | 242 | page_block++; |
242 | block_in_file++; | 243 | block_in_file++; |
243 | clear_buffer_mapped(map_bh); | 244 | clear_buffer_mapped(map_bh); |
244 | continue; | 245 | continue; |
245 | } | 246 | } |
246 | 247 | ||
247 | /* some filesystems will copy data into the page during | 248 | /* some filesystems will copy data into the page during |
248 | * the get_block call, in which case we don't want to | 249 | * the get_block call, in which case we don't want to |
249 | * read it again. map_buffer_to_page copies the data | 250 | * read it again. map_buffer_to_page copies the data |
250 | * we just collected from get_block into the page's buffers | 251 | * we just collected from get_block into the page's buffers |
251 | * so readpage doesn't have to repeat the get_block call | 252 | * so readpage doesn't have to repeat the get_block call |
252 | */ | 253 | */ |
253 | if (buffer_uptodate(map_bh)) { | 254 | if (buffer_uptodate(map_bh)) { |
254 | map_buffer_to_page(page, map_bh, page_block); | 255 | map_buffer_to_page(page, map_bh, page_block); |
255 | goto confused; | 256 | goto confused; |
256 | } | 257 | } |
257 | 258 | ||
258 | if (first_hole != blocks_per_page) | 259 | if (first_hole != blocks_per_page) |
259 | goto confused; /* hole -> non-hole */ | 260 | goto confused; /* hole -> non-hole */ |
260 | 261 | ||
261 | /* Contiguous blocks? */ | 262 | /* Contiguous blocks? */ |
262 | if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) | 263 | if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) |
263 | goto confused; | 264 | goto confused; |
264 | nblocks = map_bh->b_size >> blkbits; | 265 | nblocks = map_bh->b_size >> blkbits; |
265 | for (relative_block = 0; ; relative_block++) { | 266 | for (relative_block = 0; ; relative_block++) { |
266 | if (relative_block == nblocks) { | 267 | if (relative_block == nblocks) { |
267 | clear_buffer_mapped(map_bh); | 268 | clear_buffer_mapped(map_bh); |
268 | break; | 269 | break; |
269 | } else if (page_block == blocks_per_page) | 270 | } else if (page_block == blocks_per_page) |
270 | break; | 271 | break; |
271 | blocks[page_block] = map_bh->b_blocknr+relative_block; | 272 | blocks[page_block] = map_bh->b_blocknr+relative_block; |
272 | page_block++; | 273 | page_block++; |
273 | block_in_file++; | 274 | block_in_file++; |
274 | } | 275 | } |
275 | bdev = map_bh->b_bdev; | 276 | bdev = map_bh->b_bdev; |
276 | } | 277 | } |
277 | 278 | ||
278 | if (first_hole != blocks_per_page) { | 279 | if (first_hole != blocks_per_page) { |
279 | zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); | 280 | zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); |
280 | if (first_hole == 0) { | 281 | if (first_hole == 0) { |
281 | SetPageUptodate(page); | 282 | SetPageUptodate(page); |
282 | unlock_page(page); | 283 | unlock_page(page); |
283 | goto out; | 284 | goto out; |
284 | } | 285 | } |
285 | } else if (fully_mapped) { | 286 | } else if (fully_mapped) { |
286 | SetPageMappedToDisk(page); | 287 | SetPageMappedToDisk(page); |
287 | } | 288 | } |
288 | 289 | ||
289 | /* | 290 | /* |
290 | * This page will go to BIO. Do we need to send this BIO off first? | 291 | * This page will go to BIO. Do we need to send this BIO off first? |
291 | */ | 292 | */ |
292 | if (bio && (*last_block_in_bio != blocks[0] - 1)) | 293 | if (bio && (*last_block_in_bio != blocks[0] - 1)) |
293 | bio = mpage_bio_submit(READ, bio); | 294 | bio = mpage_bio_submit(READ, bio); |
294 | 295 | ||
295 | alloc_new: | 296 | alloc_new: |
296 | if (bio == NULL) { | 297 | if (bio == NULL) { |
297 | bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), | 298 | bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), |
298 | min_t(int, nr_pages, bio_get_nr_vecs(bdev)), | 299 | min_t(int, nr_pages, bio_get_nr_vecs(bdev)), |
299 | GFP_KERNEL); | 300 | GFP_KERNEL); |
300 | if (bio == NULL) | 301 | if (bio == NULL) |
301 | goto confused; | 302 | goto confused; |
302 | } | 303 | } |
303 | 304 | ||
304 | length = first_hole << blkbits; | 305 | length = first_hole << blkbits; |
305 | if (bio_add_page(bio, page, length, 0) < length) { | 306 | if (bio_add_page(bio, page, length, 0) < length) { |
306 | bio = mpage_bio_submit(READ, bio); | 307 | bio = mpage_bio_submit(READ, bio); |
307 | goto alloc_new; | 308 | goto alloc_new; |
308 | } | 309 | } |
309 | 310 | ||
310 | if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) | 311 | if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) |
311 | bio = mpage_bio_submit(READ, bio); | 312 | bio = mpage_bio_submit(READ, bio); |
312 | else | 313 | else |
313 | *last_block_in_bio = blocks[blocks_per_page - 1]; | 314 | *last_block_in_bio = blocks[blocks_per_page - 1]; |
314 | out: | 315 | out: |
315 | return bio; | 316 | return bio; |
316 | 317 | ||
317 | confused: | 318 | confused: |
318 | if (bio) | 319 | if (bio) |
319 | bio = mpage_bio_submit(READ, bio); | 320 | bio = mpage_bio_submit(READ, bio); |
320 | if (!PageUptodate(page)) | 321 | if (!PageUptodate(page)) |
321 | block_read_full_page(page, get_block); | 322 | block_read_full_page(page, get_block); |
322 | else | 323 | else |
323 | unlock_page(page); | 324 | unlock_page(page); |
324 | goto out; | 325 | goto out; |
325 | } | 326 | } |
326 | 327 | ||
327 | /** | 328 | /** |
328 | * mpage_readpages - populate an address space with some pages & start reads against them | 329 | * mpage_readpages - populate an address space with some pages & start reads against them |
329 | * @mapping: the address_space | 330 | * @mapping: the address_space |
330 | * @pages: The address of a list_head which contains the target pages. These | 331 | * @pages: The address of a list_head which contains the target pages. These |
331 | * pages have their ->index populated and are otherwise uninitialised. | 332 | * pages have their ->index populated and are otherwise uninitialised. |
332 | * The page at @pages->prev has the lowest file offset, and reads should be | 333 | * The page at @pages->prev has the lowest file offset, and reads should be |
333 | * issued in @pages->prev to @pages->next order. | 334 | * issued in @pages->prev to @pages->next order. |
334 | * @nr_pages: The number of pages at *@pages | 335 | * @nr_pages: The number of pages at *@pages |
335 | * @get_block: The filesystem's block mapper function. | 336 | * @get_block: The filesystem's block mapper function. |
336 | * | 337 | * |
337 | * This function walks the pages and the blocks within each page, building and | 338 | * This function walks the pages and the blocks within each page, building and |
338 | * emitting large BIOs. | 339 | * emitting large BIOs. |
339 | * | 340 | * |
340 | * If anything unusual happens, such as: | 341 | * If anything unusual happens, such as: |
341 | * | 342 | * |
342 | * - encountering a page which has buffers | 343 | * - encountering a page which has buffers |
343 | * - encountering a page which has a non-hole after a hole | 344 | * - encountering a page which has a non-hole after a hole |
344 | * - encountering a page with non-contiguous blocks | 345 | * - encountering a page with non-contiguous blocks |
345 | * | 346 | * |
346 | * then this code just gives up and calls the buffer_head-based read function. | 347 | * then this code just gives up and calls the buffer_head-based read function. |
347 | * It does handle a page which has holes at the end - that is a common case: | 348 | * It does handle a page which has holes at the end - that is a common case: |
348 | * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. | 349 | * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. |
349 | * | 350 | * |
350 | * BH_Boundary explanation: | 351 | * BH_Boundary explanation: |
351 | * | 352 | * |
352 | * There is a problem. The mpage read code assembles several pages, gets all | 353 | * There is a problem. The mpage read code assembles several pages, gets all |
353 | * their disk mappings, and then submits them all. That's fine, but obtaining | 354 | * their disk mappings, and then submits them all. That's fine, but obtaining |
354 | * the disk mappings may require I/O. Reads of indirect blocks, for example. | 355 | * the disk mappings may require I/O. Reads of indirect blocks, for example. |
355 | * | 356 | * |
356 | * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be | 357 | * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be |
357 | * submitted in the following order: | 358 | * submitted in the following order: |
358 | * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 | 359 | * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 |
359 | * | 360 | * |
360 | * because the indirect block has to be read to get the mappings of blocks | 361 | * because the indirect block has to be read to get the mappings of blocks |
361 | * 13,14,15,16. Obviously, this impacts performance. | 362 | * 13,14,15,16. Obviously, this impacts performance. |
362 | * | 363 | * |
363 | * So what we do is to allow the filesystem's get_block() function to set | 364 | * So what we do is to allow the filesystem's get_block() function to set |
364 | * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block | 365 | * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block |
365 | * after this one will require I/O against a block which is probably close to | 366 | * after this one will require I/O against a block which is probably close to |
366 | * this one. So you should push what I/O you have currently accumulated. | 367 | * this one. So you should push what I/O you have currently accumulated. |
367 | * | 368 | * |
368 | * This all causes the disk requests to be issued in the correct order. | 369 | * This all causes the disk requests to be issued in the correct order. |
369 | */ | 370 | */ |
370 | int | 371 | int |
371 | mpage_readpages(struct address_space *mapping, struct list_head *pages, | 372 | mpage_readpages(struct address_space *mapping, struct list_head *pages, |
372 | unsigned nr_pages, get_block_t get_block) | 373 | unsigned nr_pages, get_block_t get_block) |
373 | { | 374 | { |
374 | struct bio *bio = NULL; | 375 | struct bio *bio = NULL; |
375 | unsigned page_idx; | 376 | unsigned page_idx; |
376 | sector_t last_block_in_bio = 0; | 377 | sector_t last_block_in_bio = 0; |
377 | struct buffer_head map_bh; | 378 | struct buffer_head map_bh; |
378 | unsigned long first_logical_block = 0; | 379 | unsigned long first_logical_block = 0; |
379 | 380 | ||
380 | clear_buffer_mapped(&map_bh); | 381 | clear_buffer_mapped(&map_bh); |
381 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | 382 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { |
382 | struct page *page = list_entry(pages->prev, struct page, lru); | 383 | struct page *page = list_entry(pages->prev, struct page, lru); |
383 | 384 | ||
384 | prefetchw(&page->flags); | 385 | prefetchw(&page->flags); |
385 | list_del(&page->lru); | 386 | list_del(&page->lru); |
386 | if (!add_to_page_cache_lru(page, mapping, | 387 | if (!add_to_page_cache_lru(page, mapping, |
387 | page->index, GFP_KERNEL)) { | 388 | page->index, GFP_KERNEL)) { |
388 | bio = do_mpage_readpage(bio, page, | 389 | bio = do_mpage_readpage(bio, page, |
389 | nr_pages - page_idx, | 390 | nr_pages - page_idx, |
390 | &last_block_in_bio, &map_bh, | 391 | &last_block_in_bio, &map_bh, |
391 | &first_logical_block, | 392 | &first_logical_block, |
392 | get_block); | 393 | get_block); |
393 | } | 394 | } |
394 | page_cache_release(page); | 395 | page_cache_release(page); |
395 | } | 396 | } |
396 | BUG_ON(!list_empty(pages)); | 397 | BUG_ON(!list_empty(pages)); |
397 | if (bio) | 398 | if (bio) |
398 | mpage_bio_submit(READ, bio); | 399 | mpage_bio_submit(READ, bio); |
399 | return 0; | 400 | return 0; |
400 | } | 401 | } |
401 | EXPORT_SYMBOL(mpage_readpages); | 402 | EXPORT_SYMBOL(mpage_readpages); |
402 | 403 | ||
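The BH_Boundary explanation in the comment block above describes the contract from the filesystem's side. As a minimal sketch (not taken from this commit), a get_block() implementation honouring it could look like the following; example_map_block() and example_next_needs_metadata() are assumed, filesystem-specific helpers, while map_bh() and set_buffer_boundary() are the real buffer_head.h primitives:

/*
 * Illustrative get_block(): map the requested block, then flag the mapping
 * as a boundary when the next logical block would require a metadata read
 * (e.g. an indirect block), so the mpage code submits what it has so far.
 */
static int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create)
{
	sector_t phys;

	if (example_map_block(inode, iblock, &phys, create))	/* assumed helper */
		return -EIO;

	map_bh(bh_result, inode->i_sb, phys);

	if (example_next_needs_metadata(inode, iblock))		/* assumed helper */
		set_buffer_boundary(bh_result);

	return 0;
}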
403 | /* | 404 | /* |
404 | * This isn't called much at all | 405 | * This isn't called much at all |
405 | */ | 406 | */ |
406 | int mpage_readpage(struct page *page, get_block_t get_block) | 407 | int mpage_readpage(struct page *page, get_block_t get_block) |
407 | { | 408 | { |
408 | struct bio *bio = NULL; | 409 | struct bio *bio = NULL; |
409 | sector_t last_block_in_bio = 0; | 410 | sector_t last_block_in_bio = 0; |
410 | struct buffer_head map_bh; | 411 | struct buffer_head map_bh; |
411 | unsigned long first_logical_block = 0; | 412 | unsigned long first_logical_block = 0; |
412 | 413 | ||
413 | clear_buffer_mapped(&map_bh); | 414 | clear_buffer_mapped(&map_bh); |
414 | bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, | 415 | bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, |
415 | &map_bh, &first_logical_block, get_block); | 416 | &map_bh, &first_logical_block, get_block); |
416 | if (bio) | 417 | if (bio) |
417 | mpage_bio_submit(READ, bio); | 418 | mpage_bio_submit(READ, bio); |
418 | return 0; | 419 | return 0; |
419 | } | 420 | } |
420 | EXPORT_SYMBOL(mpage_readpage); | 421 | EXPORT_SYMBOL(mpage_readpage); |
421 | 422 | ||
422 | /* | 423 | /* |
423 | * Writing is not so simple. | 424 | * Writing is not so simple. |
424 | * | 425 | * |
425 | * If the page has buffers then they will be used for obtaining the disk | 426 | * If the page has buffers then they will be used for obtaining the disk |
426 | * mapping. We only support pages which are fully mapped-and-dirty, with a | 427 | * mapping. We only support pages which are fully mapped-and-dirty, with a |
427 | * special case for pages which are unmapped at the end: end-of-file. | 428 | * special case for pages which are unmapped at the end: end-of-file. |
428 | * | 429 | * |
429 | * If the page has no buffers (preferred) then the page is mapped here. | 430 | * If the page has no buffers (preferred) then the page is mapped here. |
430 | * | 431 | * |
431 | * If all blocks are found to be contiguous then the page can go into the | 432 | * If all blocks are found to be contiguous then the page can go into the |
432 | * BIO. Otherwise fall back to the mapping's writepage(). | 433 | * BIO. Otherwise fall back to the mapping's writepage(). |
433 | * | 434 | * |
434 | * FIXME: This code wants an estimate of how many pages are still to be | 435 | * FIXME: This code wants an estimate of how many pages are still to be |
435 | * written, so it can intelligently allocate a suitably-sized BIO. For now, | 436 | * written, so it can intelligently allocate a suitably-sized BIO. For now, |
436 | * just allocate full-size (16-page) BIOs. | 437 | * just allocate full-size (16-page) BIOs. |
437 | */ | 438 | */ |
438 | struct mpage_data { | ||
439 | struct bio *bio; | ||
440 | sector_t last_block_in_bio; | ||
441 | get_block_t *get_block; | ||
442 | unsigned use_writepage; | ||
443 | }; | ||
444 | 439 | ||
445 | static int __mpage_writepage(struct page *page, struct writeback_control *wbc, | 440 | int __mpage_writepage(struct page *page, struct writeback_control *wbc, |
446 | void *data) | 441 | void *data) |
447 | { | 442 | { |
448 | struct mpage_data *mpd = data; | 443 | struct mpage_data *mpd = data; |
449 | struct bio *bio = mpd->bio; | 444 | struct bio *bio = mpd->bio; |
450 | struct address_space *mapping = page->mapping; | 445 | struct address_space *mapping = page->mapping; |
451 | struct inode *inode = page->mapping->host; | 446 | struct inode *inode = page->mapping->host; |
452 | const unsigned blkbits = inode->i_blkbits; | 447 | const unsigned blkbits = inode->i_blkbits; |
453 | unsigned long end_index; | 448 | unsigned long end_index; |
454 | const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; | 449 | const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; |
455 | sector_t last_block; | 450 | sector_t last_block; |
456 | sector_t block_in_file; | 451 | sector_t block_in_file; |
457 | sector_t blocks[MAX_BUF_PER_PAGE]; | 452 | sector_t blocks[MAX_BUF_PER_PAGE]; |
458 | unsigned page_block; | 453 | unsigned page_block; |
459 | unsigned first_unmapped = blocks_per_page; | 454 | unsigned first_unmapped = blocks_per_page; |
460 | struct block_device *bdev = NULL; | 455 | struct block_device *bdev = NULL; |
461 | int boundary = 0; | 456 | int boundary = 0; |
462 | sector_t boundary_block = 0; | 457 | sector_t boundary_block = 0; |
463 | struct block_device *boundary_bdev = NULL; | 458 | struct block_device *boundary_bdev = NULL; |
464 | int length; | 459 | int length; |
465 | struct buffer_head map_bh; | 460 | struct buffer_head map_bh; |
466 | loff_t i_size = i_size_read(inode); | 461 | loff_t i_size = i_size_read(inode); |
467 | int ret = 0; | 462 | int ret = 0; |
468 | 463 | ||
469 | if (page_has_buffers(page)) { | 464 | if (page_has_buffers(page)) { |
470 | struct buffer_head *head = page_buffers(page); | 465 | struct buffer_head *head = page_buffers(page); |
471 | struct buffer_head *bh = head; | 466 | struct buffer_head *bh = head; |
472 | 467 | ||
473 | /* If they're all mapped and dirty, do it */ | 468 | /* If they're all mapped and dirty, do it */ |
474 | page_block = 0; | 469 | page_block = 0; |
475 | do { | 470 | do { |
476 | BUG_ON(buffer_locked(bh)); | 471 | BUG_ON(buffer_locked(bh)); |
477 | if (!buffer_mapped(bh)) { | 472 | if (!buffer_mapped(bh)) { |
478 | /* | 473 | /* |
479 | * unmapped dirty buffers are created by | 474 | * unmapped dirty buffers are created by |
480 | * __set_page_dirty_buffers -> mmapped data | 475 | * __set_page_dirty_buffers -> mmapped data |
481 | */ | 476 | */ |
482 | if (buffer_dirty(bh)) | 477 | if (buffer_dirty(bh)) |
483 | goto confused; | 478 | goto confused; |
484 | if (first_unmapped == blocks_per_page) | 479 | if (first_unmapped == blocks_per_page) |
485 | first_unmapped = page_block; | 480 | first_unmapped = page_block; |
486 | continue; | 481 | continue; |
487 | } | 482 | } |
488 | 483 | ||
489 | if (first_unmapped != blocks_per_page) | 484 | if (first_unmapped != blocks_per_page) |
490 | goto confused; /* hole -> non-hole */ | 485 | goto confused; /* hole -> non-hole */ |
491 | 486 | ||
492 | if (!buffer_dirty(bh) || !buffer_uptodate(bh)) | 487 | if (!buffer_dirty(bh) || !buffer_uptodate(bh)) |
493 | goto confused; | 488 | goto confused; |
494 | if (page_block) { | 489 | if (page_block) { |
495 | if (bh->b_blocknr != blocks[page_block-1] + 1) | 490 | if (bh->b_blocknr != blocks[page_block-1] + 1) |
496 | goto confused; | 491 | goto confused; |
497 | } | 492 | } |
498 | blocks[page_block++] = bh->b_blocknr; | 493 | blocks[page_block++] = bh->b_blocknr; |
499 | boundary = buffer_boundary(bh); | 494 | boundary = buffer_boundary(bh); |
500 | if (boundary) { | 495 | if (boundary) { |
501 | boundary_block = bh->b_blocknr; | 496 | boundary_block = bh->b_blocknr; |
502 | boundary_bdev = bh->b_bdev; | 497 | boundary_bdev = bh->b_bdev; |
503 | } | 498 | } |
504 | bdev = bh->b_bdev; | 499 | bdev = bh->b_bdev; |
505 | } while ((bh = bh->b_this_page) != head); | 500 | } while ((bh = bh->b_this_page) != head); |
506 | 501 | ||
507 | if (first_unmapped) | 502 | if (first_unmapped) |
508 | goto page_is_mapped; | 503 | goto page_is_mapped; |
509 | 504 | ||
510 | /* | 505 | /* |
511 | * Page has buffers, but they are all unmapped. The page was | 506 | * Page has buffers, but they are all unmapped. The page was |
512 | * created by pagein or read over a hole which was handled by | 507 | * created by pagein or read over a hole which was handled by |
513 | * block_read_full_page(). If this address_space is also | 508 | * block_read_full_page(). If this address_space is also |
514 | * using mpage_readpages then this can rarely happen. | 509 | * using mpage_readpages then this can rarely happen. |
515 | */ | 510 | */ |
516 | goto confused; | 511 | goto confused; |
517 | } | 512 | } |
518 | 513 | ||
519 | /* | 514 | /* |
520 | * The page has no buffers: map it to disk | 515 | * The page has no buffers: map it to disk |
521 | */ | 516 | */ |
522 | BUG_ON(!PageUptodate(page)); | 517 | BUG_ON(!PageUptodate(page)); |
523 | block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); | 518 | block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); |
524 | last_block = (i_size - 1) >> blkbits; | 519 | last_block = (i_size - 1) >> blkbits; |
525 | map_bh.b_page = page; | 520 | map_bh.b_page = page; |
526 | for (page_block = 0; page_block < blocks_per_page; ) { | 521 | for (page_block = 0; page_block < blocks_per_page; ) { |
527 | 522 | ||
528 | map_bh.b_state = 0; | 523 | map_bh.b_state = 0; |
529 | map_bh.b_size = 1 << blkbits; | 524 | map_bh.b_size = 1 << blkbits; |
530 | if (mpd->get_block(inode, block_in_file, &map_bh, 1)) | 525 | if (mpd->get_block(inode, block_in_file, &map_bh, 1)) |
531 | goto confused; | 526 | goto confused; |
532 | if (buffer_new(&map_bh)) | 527 | if (buffer_new(&map_bh)) |
533 | unmap_underlying_metadata(map_bh.b_bdev, | 528 | unmap_underlying_metadata(map_bh.b_bdev, |
534 | map_bh.b_blocknr); | 529 | map_bh.b_blocknr); |
535 | if (buffer_boundary(&map_bh)) { | 530 | if (buffer_boundary(&map_bh)) { |
536 | boundary_block = map_bh.b_blocknr; | 531 | boundary_block = map_bh.b_blocknr; |
537 | boundary_bdev = map_bh.b_bdev; | 532 | boundary_bdev = map_bh.b_bdev; |
538 | } | 533 | } |
539 | if (page_block) { | 534 | if (page_block) { |
540 | if (map_bh.b_blocknr != blocks[page_block-1] + 1) | 535 | if (map_bh.b_blocknr != blocks[page_block-1] + 1) |
541 | goto confused; | 536 | goto confused; |
542 | } | 537 | } |
543 | blocks[page_block++] = map_bh.b_blocknr; | 538 | blocks[page_block++] = map_bh.b_blocknr; |
544 | boundary = buffer_boundary(&map_bh); | 539 | boundary = buffer_boundary(&map_bh); |
545 | bdev = map_bh.b_bdev; | 540 | bdev = map_bh.b_bdev; |
546 | if (block_in_file == last_block) | 541 | if (block_in_file == last_block) |
547 | break; | 542 | break; |
548 | block_in_file++; | 543 | block_in_file++; |
549 | } | 544 | } |
550 | BUG_ON(page_block == 0); | 545 | BUG_ON(page_block == 0); |
551 | 546 | ||
552 | first_unmapped = page_block; | 547 | first_unmapped = page_block; |
553 | 548 | ||
554 | page_is_mapped: | 549 | page_is_mapped: |
555 | end_index = i_size >> PAGE_CACHE_SHIFT; | 550 | end_index = i_size >> PAGE_CACHE_SHIFT; |
556 | if (page->index >= end_index) { | 551 | if (page->index >= end_index) { |
557 | /* | 552 | /* |
558 | * The page straddles i_size. It must be zeroed out on each | 553 | * The page straddles i_size. It must be zeroed out on each |
559 | * and every writepage invocation because it may be mmapped. | 554 | * and every writepage invocation because it may be mmapped. |
560 | * "A file is mapped in multiples of the page size. For a file | 555 | * "A file is mapped in multiples of the page size. For a file |
561 | * that is not a multiple of the page size, the remaining memory | 556 | * that is not a multiple of the page size, the remaining memory |
562 | * is zeroed when mapped, and writes to that region are not | 557 | * is zeroed when mapped, and writes to that region are not |
563 | * written out to the file." | 558 | * written out to the file." |
564 | */ | 559 | */ |
565 | unsigned offset = i_size & (PAGE_CACHE_SIZE - 1); | 560 | unsigned offset = i_size & (PAGE_CACHE_SIZE - 1); |
566 | 561 | ||
567 | if (page->index > end_index || !offset) | 562 | if (page->index > end_index || !offset) |
568 | goto confused; | 563 | goto confused; |
569 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); | 564 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); |
570 | } | 565 | } |
571 | 566 | ||
572 | /* | 567 | /* |
573 | * This page will go to BIO. Do we need to send this BIO off first? | 568 | * This page will go to BIO. Do we need to send this BIO off first? |
574 | */ | 569 | */ |
575 | if (bio && mpd->last_block_in_bio != blocks[0] - 1) | 570 | if (bio && mpd->last_block_in_bio != blocks[0] - 1) |
576 | bio = mpage_bio_submit(WRITE, bio); | 571 | bio = mpage_bio_submit(WRITE, bio); |
577 | 572 | ||
578 | alloc_new: | 573 | alloc_new: |
579 | if (bio == NULL) { | 574 | if (bio == NULL) { |
580 | bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), | 575 | bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), |
581 | bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); | 576 | bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); |
582 | if (bio == NULL) | 577 | if (bio == NULL) |
583 | goto confused; | 578 | goto confused; |
584 | } | 579 | } |
585 | 580 | ||
586 | /* | 581 | /* |
587 | * Must try to add the page before marking the buffer clean or | 582 | * Must try to add the page before marking the buffer clean or |
588 | * the confused fail path above (OOM) will be very confused when | 583 | * the confused fail path above (OOM) will be very confused when |
589 | * it finds all bh marked clean (i.e. it will not write anything) | 584 | * it finds all bh marked clean (i.e. it will not write anything) |
590 | */ | 585 | */ |
591 | length = first_unmapped << blkbits; | 586 | length = first_unmapped << blkbits; |
592 | if (bio_add_page(bio, page, length, 0) < length) { | 587 | if (bio_add_page(bio, page, length, 0) < length) { |
593 | bio = mpage_bio_submit(WRITE, bio); | 588 | bio = mpage_bio_submit(WRITE, bio); |
594 | goto alloc_new; | 589 | goto alloc_new; |
595 | } | 590 | } |
596 | 591 | ||
597 | /* | 592 | /* |
598 | * OK, we have our BIO, so we can now mark the buffers clean. Make | 593 | * OK, we have our BIO, so we can now mark the buffers clean. Make |
599 | * sure to only clean buffers which we know we'll be writing. | 594 | * sure to only clean buffers which we know we'll be writing. |
600 | */ | 595 | */ |
601 | if (page_has_buffers(page)) { | 596 | if (page_has_buffers(page)) { |
602 | struct buffer_head *head = page_buffers(page); | 597 | struct buffer_head *head = page_buffers(page); |
603 | struct buffer_head *bh = head; | 598 | struct buffer_head *bh = head; |
604 | unsigned buffer_counter = 0; | 599 | unsigned buffer_counter = 0; |
605 | 600 | ||
606 | do { | 601 | do { |
607 | if (buffer_counter++ == first_unmapped) | 602 | if (buffer_counter++ == first_unmapped) |
608 | break; | 603 | break; |
609 | clear_buffer_dirty(bh); | 604 | clear_buffer_dirty(bh); |
610 | bh = bh->b_this_page; | 605 | bh = bh->b_this_page; |
611 | } while (bh != head); | 606 | } while (bh != head); |
612 | 607 | ||
613 | /* | 608 | /* |
614 | * we cannot drop the bh if the page is not uptodate | 609 | * we cannot drop the bh if the page is not uptodate |
615 | * or a concurrent readpage would fail to serialize with the bh | 610 | * or a concurrent readpage would fail to serialize with the bh |
616 | * and it would read from disk before we reach the platter. | 611 | * and it would read from disk before we reach the platter. |
617 | */ | 612 | */ |
618 | if (buffer_heads_over_limit && PageUptodate(page)) | 613 | if (buffer_heads_over_limit && PageUptodate(page)) |
619 | try_to_free_buffers(page); | 614 | try_to_free_buffers(page); |
620 | } | 615 | } |
621 | 616 | ||
622 | BUG_ON(PageWriteback(page)); | 617 | BUG_ON(PageWriteback(page)); |
623 | set_page_writeback(page); | 618 | set_page_writeback(page); |
624 | unlock_page(page); | 619 | unlock_page(page); |
625 | if (boundary || (first_unmapped != blocks_per_page)) { | 620 | if (boundary || (first_unmapped != blocks_per_page)) { |
626 | bio = mpage_bio_submit(WRITE, bio); | 621 | bio = mpage_bio_submit(WRITE, bio); |
627 | if (boundary_block) { | 622 | if (boundary_block) { |
628 | write_boundary_block(boundary_bdev, | 623 | write_boundary_block(boundary_bdev, |
629 | boundary_block, 1 << blkbits); | 624 | boundary_block, 1 << blkbits); |
630 | } | 625 | } |
631 | } else { | 626 | } else { |
632 | mpd->last_block_in_bio = blocks[blocks_per_page - 1]; | 627 | mpd->last_block_in_bio = blocks[blocks_per_page - 1]; |
633 | } | 628 | } |
634 | goto out; | 629 | goto out; |
635 | 630 | ||
636 | confused: | 631 | confused: |
637 | if (bio) | 632 | if (bio) |
638 | bio = mpage_bio_submit(WRITE, bio); | 633 | bio = mpage_bio_submit(WRITE, bio); |
639 | 634 | ||
640 | if (mpd->use_writepage) { | 635 | if (mpd->use_writepage) { |
641 | ret = mapping->a_ops->writepage(page, wbc); | 636 | ret = mapping->a_ops->writepage(page, wbc); |
642 | } else { | 637 | } else { |
643 | ret = -EAGAIN; | 638 | ret = -EAGAIN; |
644 | goto out; | 639 | goto out; |
645 | } | 640 | } |
646 | /* | 641 | /* |
647 | * The caller has a ref on the inode, so *mapping is stable | 642 | * The caller has a ref on the inode, so *mapping is stable |
648 | */ | 643 | */ |
649 | mapping_set_error(mapping, ret); | 644 | mapping_set_error(mapping, ret); |
650 | out: | 645 | out: |
651 | mpd->bio = bio; | 646 | mpd->bio = bio; |
652 | return ret; | 647 | return ret; |
653 | } | 648 | } |
649 | EXPORT_SYMBOL(__mpage_writepage); | ||
654 | 650 | ||
655 | /** | 651 | /** |
656 | * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them | 652 | * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them |
657 | * @mapping: address space structure to write | 653 | * @mapping: address space structure to write |
658 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | 654 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write |
659 | * @get_block: the filesystem's block mapper function. | 655 | * @get_block: the filesystem's block mapper function. |
660 | * If this is NULL then use a_ops->writepage. Otherwise, go | 656 | * If this is NULL then use a_ops->writepage. Otherwise, go |
661 | * direct-to-BIO. | 657 | * direct-to-BIO. |
662 | * | 658 | * |
663 | * This is a library function, which implements the writepages() | 659 | * This is a library function, which implements the writepages() |
664 | * address_space_operation. | 660 | * address_space_operation. |
665 | * | 661 | * |
666 | * If a page is already under I/O, generic_writepages() skips it, even | 662 | * If a page is already under I/O, generic_writepages() skips it, even |
667 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, | 663 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, |
668 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() | 664 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() |
669 | * and msync() need to guarantee that all the data which was dirty at the time | 665 | * and msync() need to guarantee that all the data which was dirty at the time |
670 | * the call was made get new I/O started against them. If wbc->sync_mode is | 666 | * the call was made get new I/O started against them. If wbc->sync_mode is |
671 | * WB_SYNC_ALL then we were called for data integrity and we must wait for | 667 | * WB_SYNC_ALL then we were called for data integrity and we must wait for |
672 | * existing IO to complete. | 668 | * existing IO to complete. |
673 | */ | 669 | */ |
674 | int | 670 | int |
675 | mpage_writepages(struct address_space *mapping, | 671 | mpage_writepages(struct address_space *mapping, |
676 | struct writeback_control *wbc, get_block_t get_block) | 672 | struct writeback_control *wbc, get_block_t get_block) |
677 | { | 673 | { |
678 | int ret; | 674 | int ret; |
679 | 675 | ||
680 | if (!get_block) | 676 | if (!get_block) |
681 | ret = generic_writepages(mapping, wbc); | 677 | ret = generic_writepages(mapping, wbc); |
682 | else { | 678 | else { |
683 | struct mpage_data mpd = { | 679 | struct mpage_data mpd = { |
684 | .bio = NULL, | 680 | .bio = NULL, |
685 | .last_block_in_bio = 0, | 681 | .last_block_in_bio = 0, |
686 | .get_block = get_block, | 682 | .get_block = get_block, |
687 | .use_writepage = 1, | 683 | .use_writepage = 1, |
688 | }; | 684 | }; |
689 | 685 | ||
690 | ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); | 686 | ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); |
691 | if (mpd.bio) | 687 | if (mpd.bio) |
692 | mpage_bio_submit(WRITE, mpd.bio); | 688 | mpage_bio_submit(WRITE, mpd.bio); |
693 | } | 689 | } |
694 | return ret; | 690 | return ret; |
695 | } | 691 | } |
696 | EXPORT_SYMBOL(mpage_writepages); | 692 | EXPORT_SYMBOL(mpage_writepages); |
697 | 693 | ||
698 | int mpage_writepage(struct page *page, get_block_t get_block, | 694 | int mpage_writepage(struct page *page, get_block_t get_block, |
699 | struct writeback_control *wbc) | 695 | struct writeback_control *wbc) |
700 | { | 696 | { |
701 | struct mpage_data mpd = { | 697 | struct mpage_data mpd = { |
702 | .bio = NULL, | 698 | .bio = NULL, |
703 | .last_block_in_bio = 0, | 699 | .last_block_in_bio = 0, |
704 | .get_block = get_block, | 700 | .get_block = get_block, |
705 | .use_writepage = 0, | 701 | .use_writepage = 0, |
706 | }; | 702 | }; |
707 | int ret = __mpage_writepage(page, wbc, &mpd); | 703 | int ret = __mpage_writepage(page, wbc, &mpd); |
708 | if (mpd.bio) | 704 | if (mpd.bio) |
709 | mpage_bio_submit(WRITE, mpd.bio); | 705 | mpage_bio_submit(WRITE, mpd.bio); |
710 | return ret; | 706 | return ret; |
711 | } | 707 | } |
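With mpage_bio_submit() and __mpage_writepage() no longer static, and with struct mpage_data about to live in include/linux/mpage.h (next hunk), a filesystem can drive this writeback machinery from its own ->writepages instead of going through mpage_writepages(). A minimal sketch, assuming a filesystem-specific example_get_block() mapper; it simply mirrors the mpage_writepages() pattern above and does not attempt to show ext4's actual delayed-allocation logic:

static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	struct mpage_data mpd = {
		.bio			= NULL,
		.last_block_in_bio	= 0,
		.get_block		= example_get_block,	/* assumed fs mapper */
		.use_writepage		= 1,
	};
	int ret;

	/* __mpage_writepage() accumulates contiguous pages into mpd.bio */
	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);

	/* Flush whatever is still pending in the last BIO */
	if (mpd.bio)
		mpage_bio_submit(WRITE, mpd.bio);
	return ret;
}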
include/linux/mpage.h
1 | /* | 1 | /* |
2 | * include/linux/mpage.h | 2 | * include/linux/mpage.h |
3 | * | 3 | * |
4 | * Contains declarations related to preparing and submitting BIOS which contain | 4 | * Contains declarations related to preparing and submitting BIOS which contain |
5 | * multiple pagecache pages. | 5 | * multiple pagecache pages. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | /* | 8 | /* |
9 | * (And no, it doesn't do the #ifdef __MPAGE_H thing, and it doesn't do | 9 | * (And no, it doesn't do the #ifdef __MPAGE_H thing, and it doesn't do |
10 | * nested includes. Get it right in the .c file). | 10 | * nested includes. Get it right in the .c file). |
11 | */ | 11 | */ |
12 | #ifdef CONFIG_BLOCK | 12 | #ifdef CONFIG_BLOCK |
13 | 13 | ||
14 | struct mpage_data { | ||
15 | struct bio *bio; | ||
16 | sector_t last_block_in_bio; | ||
17 | get_block_t *get_block; | ||
18 | unsigned use_writepage; | ||
19 | }; | ||
20 | |||
14 | struct writeback_control; | 21 | struct writeback_control; |
15 | 22 | ||
23 | struct bio *mpage_bio_submit(int rw, struct bio *bio); | ||
16 | int mpage_readpages(struct address_space *mapping, struct list_head *pages, | 24 | int mpage_readpages(struct address_space *mapping, struct list_head *pages, |
17 | unsigned nr_pages, get_block_t get_block); | 25 | unsigned nr_pages, get_block_t get_block); |
18 | int mpage_readpage(struct page *page, get_block_t get_block); | 26 | int mpage_readpage(struct page *page, get_block_t get_block); |
27 | int __mpage_writepage(struct page *page, struct writeback_control *wbc, | ||
28 | void *data); | ||
19 | int mpage_writepages(struct address_space *mapping, | 29 | int mpage_writepages(struct address_space *mapping, |
20 | struct writeback_control *wbc, get_block_t get_block); | 30 | struct writeback_control *wbc, get_block_t get_block); |
21 | int mpage_writepage(struct page *page, get_block_t *get_block, | 31 | int mpage_writepage(struct page *page, get_block_t *get_block, |
22 | struct writeback_control *wbc); | 32 | struct writeback_control *wbc); |
23 | 33 | ||
24 | #endif | 34 | #endif |
25 | 35 |
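Because mpage.h deliberately has no include guard and pulls in no other headers (per the comment at its top), a .c file consuming the newly added declarations supplies the prerequisites itself. A rough sketch of the hookup, with example_get_block() again standing in for the filesystem's real block mapper:

#include <linux/fs.h>		/* get_block_t, struct address_space */
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/writeback.h>
#include <linux/mpage.h>	/* must come after the headers it depends on */

/* Typical ->readpages hookup built on the declarations above */
static int example_readpages(struct file *file, struct address_space *mapping,
			     struct list_head *pages, unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, example_get_block);
}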