Commit 29a814d2ee0e43c2980f33f91c1311ec06c0aa35

Authored by Alex Tomas
Committed by Theodore Ts'o
1 parent 87c89c232c

vfs: add hooks for ext4's delayed allocation support

Export mpage_bio_submit() and __mpage_writepage() for the benefit of
ext4's delayed allocation support. Also change __block_write_full_page
so that if a buffer has the BH_Delay flag set, it calls get_block() to
get the physical block allocated, just as in the !BH_Mapped case.

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Showing 3 changed files with 20 additions and 11 deletions
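
On the buffer.c side, the change is the BH_Delay handling in
__block_write_full_page(). A hedged paraphrase of the new check, not the
literal hunk (the surrounding loop, the block counter and the recover label
are abbreviated from context), looks roughly like this:

	/*
	 * Sketch of the behaviour described in the commit message: a dirty
	 * buffer marked BH_Delay is now passed to get_block() with
	 * create == 1, so the physical block is allocated at writepage
	 * time, just as an unmapped (!BH_Mapped) dirty buffer already was.
	 */
	if (buffer_dirty(bh) && (!buffer_mapped(bh) || buffer_delay(bh))) {
		err = get_block(inode, block, bh, 1);
		if (err)
			goto recover;	/* existing error path in that function */
	}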

1 /* 1 /*
2 * linux/fs/buffer.c 2 * linux/fs/buffer.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds 4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 * 9 *
10 * Removed a lot of unnecessary code and simplified things now that 10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 * 12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating 13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM 14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 * 15 *
16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK 16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
17 * 17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> 18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */ 19 */
20 20
21 #include <linux/kernel.h> 21 #include <linux/kernel.h>
22 #include <linux/syscalls.h> 22 #include <linux/syscalls.h>
23 #include <linux/fs.h> 23 #include <linux/fs.h>
24 #include <linux/mm.h> 24 #include <linux/mm.h>
25 #include <linux/percpu.h> 25 #include <linux/percpu.h>
26 #include <linux/slab.h> 26 #include <linux/slab.h>
27 #include <linux/capability.h> 27 #include <linux/capability.h>
28 #include <linux/blkdev.h> 28 #include <linux/blkdev.h>
29 #include <linux/file.h> 29 #include <linux/file.h>
30 #include <linux/quotaops.h> 30 #include <linux/quotaops.h>
31 #include <linux/highmem.h> 31 #include <linux/highmem.h>
32 #include <linux/module.h> 32 #include <linux/module.h>
33 #include <linux/writeback.h> 33 #include <linux/writeback.h>
34 #include <linux/hash.h> 34 #include <linux/hash.h>
35 #include <linux/suspend.h> 35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h> 36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h> 37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h> 38 #include <linux/bio.h>
39 #include <linux/notifier.h> 39 #include <linux/notifier.h>
40 #include <linux/cpu.h> 40 #include <linux/cpu.h>
41 #include <linux/bitops.h> 41 #include <linux/bitops.h>
42 #include <linux/mpage.h> 42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h> 43 #include <linux/bit_spinlock.h>
44 44
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); 45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46 46
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) 47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 48
49 inline void 49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) 50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 { 51 {
52 bh->b_end_io = handler; 52 bh->b_end_io = handler;
53 bh->b_private = private; 53 bh->b_private = private;
54 } 54 }
55 55
56 static int sync_buffer(void *word) 56 static int sync_buffer(void *word)
57 { 57 {
58 struct block_device *bd; 58 struct block_device *bd;
59 struct buffer_head *bh 59 struct buffer_head *bh
60 = container_of(word, struct buffer_head, b_state); 60 = container_of(word, struct buffer_head, b_state);
61 61
62 smp_mb(); 62 smp_mb();
63 bd = bh->b_bdev; 63 bd = bh->b_bdev;
64 if (bd) 64 if (bd)
65 blk_run_address_space(bd->bd_inode->i_mapping); 65 blk_run_address_space(bd->bd_inode->i_mapping);
66 io_schedule(); 66 io_schedule();
67 return 0; 67 return 0;
68 } 68 }
69 69
70 void __lock_buffer(struct buffer_head *bh) 70 void __lock_buffer(struct buffer_head *bh)
71 { 71 {
72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, 72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73 TASK_UNINTERRUPTIBLE); 73 TASK_UNINTERRUPTIBLE);
74 } 74 }
75 EXPORT_SYMBOL(__lock_buffer); 75 EXPORT_SYMBOL(__lock_buffer);
76 76
77 void unlock_buffer(struct buffer_head *bh) 77 void unlock_buffer(struct buffer_head *bh)
78 { 78 {
79 smp_mb__before_clear_bit(); 79 smp_mb__before_clear_bit();
80 clear_buffer_locked(bh); 80 clear_buffer_locked(bh);
81 smp_mb__after_clear_bit(); 81 smp_mb__after_clear_bit();
82 wake_up_bit(&bh->b_state, BH_Lock); 82 wake_up_bit(&bh->b_state, BH_Lock);
83 } 83 }
84 84
85 /* 85 /*
86 * Block until a buffer comes unlocked. This doesn't stop it 86 * Block until a buffer comes unlocked. This doesn't stop it
87 * from becoming locked again - you have to lock it yourself 87 * from becoming locked again - you have to lock it yourself
88 * if you want to preserve its state. 88 * if you want to preserve its state.
89 */ 89 */
90 void __wait_on_buffer(struct buffer_head * bh) 90 void __wait_on_buffer(struct buffer_head * bh)
91 { 91 {
92 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 92 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
93 } 93 }
94 94
95 static void 95 static void
96 __clear_page_buffers(struct page *page) 96 __clear_page_buffers(struct page *page)
97 { 97 {
98 ClearPagePrivate(page); 98 ClearPagePrivate(page);
99 set_page_private(page, 0); 99 set_page_private(page, 0);
100 page_cache_release(page); 100 page_cache_release(page);
101 } 101 }
102 102
103 static void buffer_io_error(struct buffer_head *bh) 103 static void buffer_io_error(struct buffer_head *bh)
104 { 104 {
105 char b[BDEVNAME_SIZE]; 105 char b[BDEVNAME_SIZE];
106 106
107 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", 107 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
108 bdevname(bh->b_bdev, b), 108 bdevname(bh->b_bdev, b),
109 (unsigned long long)bh->b_blocknr); 109 (unsigned long long)bh->b_blocknr);
110 } 110 }
111 111
112 /* 112 /*
113 * End-of-IO handler helper function which does not touch the bh after 113 * End-of-IO handler helper function which does not touch the bh after
114 * unlocking it. 114 * unlocking it.
115 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but 115 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
116 * a race there is benign: unlock_buffer() only use the bh's address for 116 * a race there is benign: unlock_buffer() only use the bh's address for
117 * hashing after unlocking the buffer, so it doesn't actually touch the bh 117 * hashing after unlocking the buffer, so it doesn't actually touch the bh
118 * itself. 118 * itself.
119 */ 119 */
120 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) 120 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
121 { 121 {
122 if (uptodate) { 122 if (uptodate) {
123 set_buffer_uptodate(bh); 123 set_buffer_uptodate(bh);
124 } else { 124 } else {
125 /* This happens, due to failed READA attempts. */ 125 /* This happens, due to failed READA attempts. */
126 clear_buffer_uptodate(bh); 126 clear_buffer_uptodate(bh);
127 } 127 }
128 unlock_buffer(bh); 128 unlock_buffer(bh);
129 } 129 }
130 130
131 /* 131 /*
132 * Default synchronous end-of-IO handler.. Just mark it up-to-date and 132 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
133 * unlock the buffer. This is what ll_rw_block uses too. 133 * unlock the buffer. This is what ll_rw_block uses too.
134 */ 134 */
135 void end_buffer_read_sync(struct buffer_head *bh, int uptodate) 135 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
136 { 136 {
137 __end_buffer_read_notouch(bh, uptodate); 137 __end_buffer_read_notouch(bh, uptodate);
138 put_bh(bh); 138 put_bh(bh);
139 } 139 }
140 140
141 void end_buffer_write_sync(struct buffer_head *bh, int uptodate) 141 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
142 { 142 {
143 char b[BDEVNAME_SIZE]; 143 char b[BDEVNAME_SIZE];
144 144
145 if (uptodate) { 145 if (uptodate) {
146 set_buffer_uptodate(bh); 146 set_buffer_uptodate(bh);
147 } else { 147 } else {
148 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 148 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
149 buffer_io_error(bh); 149 buffer_io_error(bh);
150 printk(KERN_WARNING "lost page write due to " 150 printk(KERN_WARNING "lost page write due to "
151 "I/O error on %s\n", 151 "I/O error on %s\n",
152 bdevname(bh->b_bdev, b)); 152 bdevname(bh->b_bdev, b));
153 } 153 }
154 set_buffer_write_io_error(bh); 154 set_buffer_write_io_error(bh);
155 clear_buffer_uptodate(bh); 155 clear_buffer_uptodate(bh);
156 } 156 }
157 unlock_buffer(bh); 157 unlock_buffer(bh);
158 put_bh(bh); 158 put_bh(bh);
159 } 159 }
160 160
161 /* 161 /*
162 * Write out and wait upon all the dirty data associated with a block 162 * Write out and wait upon all the dirty data associated with a block
163 * device via its mapping. Does not take the superblock lock. 163 * device via its mapping. Does not take the superblock lock.
164 */ 164 */
165 int sync_blockdev(struct block_device *bdev) 165 int sync_blockdev(struct block_device *bdev)
166 { 166 {
167 int ret = 0; 167 int ret = 0;
168 168
169 if (bdev) 169 if (bdev)
170 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); 170 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
171 return ret; 171 return ret;
172 } 172 }
173 EXPORT_SYMBOL(sync_blockdev); 173 EXPORT_SYMBOL(sync_blockdev);
174 174
175 /* 175 /*
176 * Write out and wait upon all dirty data associated with this 176 * Write out and wait upon all dirty data associated with this
177 * device. Filesystem data as well as the underlying block 177 * device. Filesystem data as well as the underlying block
178 * device. Takes the superblock lock. 178 * device. Takes the superblock lock.
179 */ 179 */
180 int fsync_bdev(struct block_device *bdev) 180 int fsync_bdev(struct block_device *bdev)
181 { 181 {
182 struct super_block *sb = get_super(bdev); 182 struct super_block *sb = get_super(bdev);
183 if (sb) { 183 if (sb) {
184 int res = fsync_super(sb); 184 int res = fsync_super(sb);
185 drop_super(sb); 185 drop_super(sb);
186 return res; 186 return res;
187 } 187 }
188 return sync_blockdev(bdev); 188 return sync_blockdev(bdev);
189 } 189 }
190 190
191 /** 191 /**
192 * freeze_bdev -- lock a filesystem and force it into a consistent state 192 * freeze_bdev -- lock a filesystem and force it into a consistent state
193 * @bdev: blockdevice to lock 193 * @bdev: blockdevice to lock
194 * 194 *
195 * This takes the block device bd_mount_sem to make sure no new mounts 195 * This takes the block device bd_mount_sem to make sure no new mounts
196 * happen on bdev until thaw_bdev() is called. 196 * happen on bdev until thaw_bdev() is called.
197 * If a superblock is found on this device, we take the s_umount semaphore 197 * If a superblock is found on this device, we take the s_umount semaphore
198 * on it to make sure nobody unmounts until the snapshot creation is done. 198 * on it to make sure nobody unmounts until the snapshot creation is done.
199 */ 199 */
200 struct super_block *freeze_bdev(struct block_device *bdev) 200 struct super_block *freeze_bdev(struct block_device *bdev)
201 { 201 {
202 struct super_block *sb; 202 struct super_block *sb;
203 203
204 down(&bdev->bd_mount_sem); 204 down(&bdev->bd_mount_sem);
205 sb = get_super(bdev); 205 sb = get_super(bdev);
206 if (sb && !(sb->s_flags & MS_RDONLY)) { 206 if (sb && !(sb->s_flags & MS_RDONLY)) {
207 sb->s_frozen = SB_FREEZE_WRITE; 207 sb->s_frozen = SB_FREEZE_WRITE;
208 smp_wmb(); 208 smp_wmb();
209 209
210 __fsync_super(sb); 210 __fsync_super(sb);
211 211
212 sb->s_frozen = SB_FREEZE_TRANS; 212 sb->s_frozen = SB_FREEZE_TRANS;
213 smp_wmb(); 213 smp_wmb();
214 214
215 sync_blockdev(sb->s_bdev); 215 sync_blockdev(sb->s_bdev);
216 216
217 if (sb->s_op->write_super_lockfs) 217 if (sb->s_op->write_super_lockfs)
218 sb->s_op->write_super_lockfs(sb); 218 sb->s_op->write_super_lockfs(sb);
219 } 219 }
220 220
221 sync_blockdev(bdev); 221 sync_blockdev(bdev);
222 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ 222 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
223 } 223 }
224 EXPORT_SYMBOL(freeze_bdev); 224 EXPORT_SYMBOL(freeze_bdev);
225 225
226 /** 226 /**
227 * thaw_bdev -- unlock filesystem 227 * thaw_bdev -- unlock filesystem
228 * @bdev: blockdevice to unlock 228 * @bdev: blockdevice to unlock
229 * @sb: associated superblock 229 * @sb: associated superblock
230 * 230 *
231 * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 231 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
232 */ 232 */
233 void thaw_bdev(struct block_device *bdev, struct super_block *sb) 233 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
234 { 234 {
235 if (sb) { 235 if (sb) {
236 BUG_ON(sb->s_bdev != bdev); 236 BUG_ON(sb->s_bdev != bdev);
237 237
238 if (sb->s_op->unlockfs) 238 if (sb->s_op->unlockfs)
239 sb->s_op->unlockfs(sb); 239 sb->s_op->unlockfs(sb);
240 sb->s_frozen = SB_UNFROZEN; 240 sb->s_frozen = SB_UNFROZEN;
241 smp_wmb(); 241 smp_wmb();
242 wake_up(&sb->s_wait_unfrozen); 242 wake_up(&sb->s_wait_unfrozen);
243 drop_super(sb); 243 drop_super(sb);
244 } 244 }
245 245
246 up(&bdev->bd_mount_sem); 246 up(&bdev->bd_mount_sem);
247 } 247 }
248 EXPORT_SYMBOL(thaw_bdev); 248 EXPORT_SYMBOL(thaw_bdev);
249 249
250 /* 250 /*
251 * Various filesystems appear to want __find_get_block to be non-blocking. 251 * Various filesystems appear to want __find_get_block to be non-blocking.
252 * But it's the page lock which protects the buffers. To get around this, 252 * But it's the page lock which protects the buffers. To get around this,
253 * we get exclusion from try_to_free_buffers with the blockdev mapping's 253 * we get exclusion from try_to_free_buffers with the blockdev mapping's
254 * private_lock. 254 * private_lock.
255 * 255 *
256 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention 256 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
257 * may be quite high. This code could TryLock the page, and if that 257 * may be quite high. This code could TryLock the page, and if that
258 * succeeds, there is no need to take private_lock. (But if 258 * succeeds, there is no need to take private_lock. (But if
259 * private_lock is contended then so is mapping->tree_lock). 259 * private_lock is contended then so is mapping->tree_lock).
260 */ 260 */
261 static struct buffer_head * 261 static struct buffer_head *
262 __find_get_block_slow(struct block_device *bdev, sector_t block) 262 __find_get_block_slow(struct block_device *bdev, sector_t block)
263 { 263 {
264 struct inode *bd_inode = bdev->bd_inode; 264 struct inode *bd_inode = bdev->bd_inode;
265 struct address_space *bd_mapping = bd_inode->i_mapping; 265 struct address_space *bd_mapping = bd_inode->i_mapping;
266 struct buffer_head *ret = NULL; 266 struct buffer_head *ret = NULL;
267 pgoff_t index; 267 pgoff_t index;
268 struct buffer_head *bh; 268 struct buffer_head *bh;
269 struct buffer_head *head; 269 struct buffer_head *head;
270 struct page *page; 270 struct page *page;
271 int all_mapped = 1; 271 int all_mapped = 1;
272 272
273 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); 273 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
274 page = find_get_page(bd_mapping, index); 274 page = find_get_page(bd_mapping, index);
275 if (!page) 275 if (!page)
276 goto out; 276 goto out;
277 277
278 spin_lock(&bd_mapping->private_lock); 278 spin_lock(&bd_mapping->private_lock);
279 if (!page_has_buffers(page)) 279 if (!page_has_buffers(page))
280 goto out_unlock; 280 goto out_unlock;
281 head = page_buffers(page); 281 head = page_buffers(page);
282 bh = head; 282 bh = head;
283 do { 283 do {
284 if (bh->b_blocknr == block) { 284 if (bh->b_blocknr == block) {
285 ret = bh; 285 ret = bh;
286 get_bh(bh); 286 get_bh(bh);
287 goto out_unlock; 287 goto out_unlock;
288 } 288 }
289 if (!buffer_mapped(bh)) 289 if (!buffer_mapped(bh))
290 all_mapped = 0; 290 all_mapped = 0;
291 bh = bh->b_this_page; 291 bh = bh->b_this_page;
292 } while (bh != head); 292 } while (bh != head);
293 293
294 /* we might be here because some of the buffers on this page are 294 /* we might be here because some of the buffers on this page are
295 * not mapped. This is due to various races between 295 * not mapped. This is due to various races between
296 * file io on the block device and getblk. It gets dealt with 296 * file io on the block device and getblk. It gets dealt with
297 * elsewhere, don't buffer_error if we had some unmapped buffers 297 * elsewhere, don't buffer_error if we had some unmapped buffers
298 */ 298 */
299 if (all_mapped) { 299 if (all_mapped) {
300 printk("__find_get_block_slow() failed. " 300 printk("__find_get_block_slow() failed. "
301 "block=%llu, b_blocknr=%llu\n", 301 "block=%llu, b_blocknr=%llu\n",
302 (unsigned long long)block, 302 (unsigned long long)block,
303 (unsigned long long)bh->b_blocknr); 303 (unsigned long long)bh->b_blocknr);
304 printk("b_state=0x%08lx, b_size=%zu\n", 304 printk("b_state=0x%08lx, b_size=%zu\n",
305 bh->b_state, bh->b_size); 305 bh->b_state, bh->b_size);
306 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); 306 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
307 } 307 }
308 out_unlock: 308 out_unlock:
309 spin_unlock(&bd_mapping->private_lock); 309 spin_unlock(&bd_mapping->private_lock);
310 page_cache_release(page); 310 page_cache_release(page);
311 out: 311 out:
312 return ret; 312 return ret;
313 } 313 }
314 314
315 /* If invalidate_buffers() will trash dirty buffers, it means some kind 315 /* If invalidate_buffers() will trash dirty buffers, it means some kind
316 of fs corruption is going on. Trashing dirty data always imply losing 316 of fs corruption is going on. Trashing dirty data always imply losing
317 information that was supposed to be just stored on the physical layer 317 information that was supposed to be just stored on the physical layer
318 by the user. 318 by the user.
319 319
320 Thus invalidate_buffers in general usage is not allwowed to trash 320 Thus invalidate_buffers in general usage is not allwowed to trash
321 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to 321 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
322 be preserved. These buffers are simply skipped. 322 be preserved. These buffers are simply skipped.
323 323
324 We also skip buffers which are still in use. For example this can 324 We also skip buffers which are still in use. For example this can
325 happen if a userspace program is reading the block device. 325 happen if a userspace program is reading the block device.
326 326
327 NOTE: In the case where the user removed a removable-media-disk even if 327 NOTE: In the case where the user removed a removable-media-disk even if
328 there's still dirty data not synced on disk (due a bug in the device driver 328 there's still dirty data not synced on disk (due a bug in the device driver
329 or due an error of the user), by not destroying the dirty buffers we could 329 or due an error of the user), by not destroying the dirty buffers we could
330 generate corruption also on the next media inserted, thus a parameter is 330 generate corruption also on the next media inserted, thus a parameter is
331 necessary to handle this case in the most safe way possible (trying 331 necessary to handle this case in the most safe way possible (trying
332 to not corrupt also the new disk inserted with the data belonging to 332 to not corrupt also the new disk inserted with the data belonging to
333 the old now corrupted disk). Also for the ramdisk the natural thing 333 the old now corrupted disk). Also for the ramdisk the natural thing
334 to do in order to release the ramdisk memory is to destroy dirty buffers. 334 to do in order to release the ramdisk memory is to destroy dirty buffers.
335 335
336 These are two special cases. Normal usage imply the device driver 336 These are two special cases. Normal usage imply the device driver
337 to issue a sync on the device (without waiting I/O completion) and 337 to issue a sync on the device (without waiting I/O completion) and
338 then an invalidate_buffers call that doesn't trash dirty buffers. 338 then an invalidate_buffers call that doesn't trash dirty buffers.
339 339
340 For handling cache coherency with the blkdev pagecache the 'update' case 340 For handling cache coherency with the blkdev pagecache the 'update' case
341 is been introduced. It is needed to re-read from disk any pinned 341 is been introduced. It is needed to re-read from disk any pinned
342 buffer. NOTE: re-reading from disk is destructive so we can do it only 342 buffer. NOTE: re-reading from disk is destructive so we can do it only
343 when we assume nobody is changing the buffercache under our I/O and when 343 when we assume nobody is changing the buffercache under our I/O and when
344 we think the disk contains more recent information than the buffercache. 344 we think the disk contains more recent information than the buffercache.
345 The update == 1 pass marks the buffers we need to update, the update == 2 345 The update == 1 pass marks the buffers we need to update, the update == 2
346 pass does the actual I/O. */ 346 pass does the actual I/O. */
347 void invalidate_bdev(struct block_device *bdev) 347 void invalidate_bdev(struct block_device *bdev)
348 { 348 {
349 struct address_space *mapping = bdev->bd_inode->i_mapping; 349 struct address_space *mapping = bdev->bd_inode->i_mapping;
350 350
351 if (mapping->nrpages == 0) 351 if (mapping->nrpages == 0)
352 return; 352 return;
353 353
354 invalidate_bh_lrus(); 354 invalidate_bh_lrus();
355 invalidate_mapping_pages(mapping, 0, -1); 355 invalidate_mapping_pages(mapping, 0, -1);
356 } 356 }
357 357
358 /* 358 /*
359 * Kick pdflush then try to free up some ZONE_NORMAL memory. 359 * Kick pdflush then try to free up some ZONE_NORMAL memory.
360 */ 360 */
361 static void free_more_memory(void) 361 static void free_more_memory(void)
362 { 362 {
363 struct zone *zone; 363 struct zone *zone;
364 int nid; 364 int nid;
365 365
366 wakeup_pdflush(1024); 366 wakeup_pdflush(1024);
367 yield(); 367 yield();
368 368
369 for_each_online_node(nid) { 369 for_each_online_node(nid) {
370 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), 370 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
371 gfp_zone(GFP_NOFS), NULL, 371 gfp_zone(GFP_NOFS), NULL,
372 &zone); 372 &zone);
373 if (zone) 373 if (zone)
374 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, 374 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
375 GFP_NOFS); 375 GFP_NOFS);
376 } 376 }
377 } 377 }
378 378
379 /* 379 /*
380 * I/O completion handler for block_read_full_page() - pages 380 * I/O completion handler for block_read_full_page() - pages
381 * which come unlocked at the end of I/O. 381 * which come unlocked at the end of I/O.
382 */ 382 */
383 static void end_buffer_async_read(struct buffer_head *bh, int uptodate) 383 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
384 { 384 {
385 unsigned long flags; 385 unsigned long flags;
386 struct buffer_head *first; 386 struct buffer_head *first;
387 struct buffer_head *tmp; 387 struct buffer_head *tmp;
388 struct page *page; 388 struct page *page;
389 int page_uptodate = 1; 389 int page_uptodate = 1;
390 390
391 BUG_ON(!buffer_async_read(bh)); 391 BUG_ON(!buffer_async_read(bh));
392 392
393 page = bh->b_page; 393 page = bh->b_page;
394 if (uptodate) { 394 if (uptodate) {
395 set_buffer_uptodate(bh); 395 set_buffer_uptodate(bh);
396 } else { 396 } else {
397 clear_buffer_uptodate(bh); 397 clear_buffer_uptodate(bh);
398 if (printk_ratelimit()) 398 if (printk_ratelimit())
399 buffer_io_error(bh); 399 buffer_io_error(bh);
400 SetPageError(page); 400 SetPageError(page);
401 } 401 }
402 402
403 /* 403 /*
404 * Be _very_ careful from here on. Bad things can happen if 404 * Be _very_ careful from here on. Bad things can happen if
405 * two buffer heads end IO at almost the same time and both 405 * two buffer heads end IO at almost the same time and both
406 * decide that the page is now completely done. 406 * decide that the page is now completely done.
407 */ 407 */
408 first = page_buffers(page); 408 first = page_buffers(page);
409 local_irq_save(flags); 409 local_irq_save(flags);
410 bit_spin_lock(BH_Uptodate_Lock, &first->b_state); 410 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
411 clear_buffer_async_read(bh); 411 clear_buffer_async_read(bh);
412 unlock_buffer(bh); 412 unlock_buffer(bh);
413 tmp = bh; 413 tmp = bh;
414 do { 414 do {
415 if (!buffer_uptodate(tmp)) 415 if (!buffer_uptodate(tmp))
416 page_uptodate = 0; 416 page_uptodate = 0;
417 if (buffer_async_read(tmp)) { 417 if (buffer_async_read(tmp)) {
418 BUG_ON(!buffer_locked(tmp)); 418 BUG_ON(!buffer_locked(tmp));
419 goto still_busy; 419 goto still_busy;
420 } 420 }
421 tmp = tmp->b_this_page; 421 tmp = tmp->b_this_page;
422 } while (tmp != bh); 422 } while (tmp != bh);
423 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 423 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
424 local_irq_restore(flags); 424 local_irq_restore(flags);
425 425
426 /* 426 /*
427 * If none of the buffers had errors and they are all 427 * If none of the buffers had errors and they are all
428 * uptodate then we can set the page uptodate. 428 * uptodate then we can set the page uptodate.
429 */ 429 */
430 if (page_uptodate && !PageError(page)) 430 if (page_uptodate && !PageError(page))
431 SetPageUptodate(page); 431 SetPageUptodate(page);
432 unlock_page(page); 432 unlock_page(page);
433 return; 433 return;
434 434
435 still_busy: 435 still_busy:
436 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 436 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
437 local_irq_restore(flags); 437 local_irq_restore(flags);
438 return; 438 return;
439 } 439 }
440 440
441 /* 441 /*
442 * Completion handler for block_write_full_page() - pages which are unlocked 442 * Completion handler for block_write_full_page() - pages which are unlocked
443 * during I/O, and which have PageWriteback cleared upon I/O completion. 443 * during I/O, and which have PageWriteback cleared upon I/O completion.
444 */ 444 */
445 static void end_buffer_async_write(struct buffer_head *bh, int uptodate) 445 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
446 { 446 {
447 char b[BDEVNAME_SIZE]; 447 char b[BDEVNAME_SIZE];
448 unsigned long flags; 448 unsigned long flags;
449 struct buffer_head *first; 449 struct buffer_head *first;
450 struct buffer_head *tmp; 450 struct buffer_head *tmp;
451 struct page *page; 451 struct page *page;
452 452
453 BUG_ON(!buffer_async_write(bh)); 453 BUG_ON(!buffer_async_write(bh));
454 454
455 page = bh->b_page; 455 page = bh->b_page;
456 if (uptodate) { 456 if (uptodate) {
457 set_buffer_uptodate(bh); 457 set_buffer_uptodate(bh);
458 } else { 458 } else {
459 if (printk_ratelimit()) { 459 if (printk_ratelimit()) {
460 buffer_io_error(bh); 460 buffer_io_error(bh);
461 printk(KERN_WARNING "lost page write due to " 461 printk(KERN_WARNING "lost page write due to "
462 "I/O error on %s\n", 462 "I/O error on %s\n",
463 bdevname(bh->b_bdev, b)); 463 bdevname(bh->b_bdev, b));
464 } 464 }
465 set_bit(AS_EIO, &page->mapping->flags); 465 set_bit(AS_EIO, &page->mapping->flags);
466 set_buffer_write_io_error(bh); 466 set_buffer_write_io_error(bh);
467 clear_buffer_uptodate(bh); 467 clear_buffer_uptodate(bh);
468 SetPageError(page); 468 SetPageError(page);
469 } 469 }
470 470
471 first = page_buffers(page); 471 first = page_buffers(page);
472 local_irq_save(flags); 472 local_irq_save(flags);
473 bit_spin_lock(BH_Uptodate_Lock, &first->b_state); 473 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
474 474
475 clear_buffer_async_write(bh); 475 clear_buffer_async_write(bh);
476 unlock_buffer(bh); 476 unlock_buffer(bh);
477 tmp = bh->b_this_page; 477 tmp = bh->b_this_page;
478 while (tmp != bh) { 478 while (tmp != bh) {
479 if (buffer_async_write(tmp)) { 479 if (buffer_async_write(tmp)) {
480 BUG_ON(!buffer_locked(tmp)); 480 BUG_ON(!buffer_locked(tmp));
481 goto still_busy; 481 goto still_busy;
482 } 482 }
483 tmp = tmp->b_this_page; 483 tmp = tmp->b_this_page;
484 } 484 }
485 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 485 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
486 local_irq_restore(flags); 486 local_irq_restore(flags);
487 end_page_writeback(page); 487 end_page_writeback(page);
488 return; 488 return;
489 489
490 still_busy: 490 still_busy:
491 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 491 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
492 local_irq_restore(flags); 492 local_irq_restore(flags);
493 return; 493 return;
494 } 494 }
495 495
496 /* 496 /*
497 * If a page's buffers are under async readin (end_buffer_async_read 497 * If a page's buffers are under async readin (end_buffer_async_read
498 * completion) then there is a possibility that another thread of 498 * completion) then there is a possibility that another thread of
499 * control could lock one of the buffers after it has completed 499 * control could lock one of the buffers after it has completed
500 * but while some of the other buffers have not completed. This 500 * but while some of the other buffers have not completed. This
501 * locked buffer would confuse end_buffer_async_read() into not unlocking 501 * locked buffer would confuse end_buffer_async_read() into not unlocking
502 * the page. So the absence of BH_Async_Read tells end_buffer_async_read() 502 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
503 * that this buffer is not under async I/O. 503 * that this buffer is not under async I/O.
504 * 504 *
505 * The page comes unlocked when it has no locked buffer_async buffers 505 * The page comes unlocked when it has no locked buffer_async buffers
506 * left. 506 * left.
507 * 507 *
508 * PageLocked prevents anyone starting new async I/O reads any of 508 * PageLocked prevents anyone starting new async I/O reads any of
509 * the buffers. 509 * the buffers.
510 * 510 *
511 * PageWriteback is used to prevent simultaneous writeout of the same 511 * PageWriteback is used to prevent simultaneous writeout of the same
512 * page. 512 * page.
513 * 513 *
514 * PageLocked prevents anyone from starting writeback of a page which is 514 * PageLocked prevents anyone from starting writeback of a page which is
515 * under read I/O (PageWriteback is only ever set against a locked page). 515 * under read I/O (PageWriteback is only ever set against a locked page).
516 */ 516 */
517 static void mark_buffer_async_read(struct buffer_head *bh) 517 static void mark_buffer_async_read(struct buffer_head *bh)
518 { 518 {
519 bh->b_end_io = end_buffer_async_read; 519 bh->b_end_io = end_buffer_async_read;
520 set_buffer_async_read(bh); 520 set_buffer_async_read(bh);
521 } 521 }
522 522
523 void mark_buffer_async_write(struct buffer_head *bh) 523 void mark_buffer_async_write(struct buffer_head *bh)
524 { 524 {
525 bh->b_end_io = end_buffer_async_write; 525 bh->b_end_io = end_buffer_async_write;
526 set_buffer_async_write(bh); 526 set_buffer_async_write(bh);
527 } 527 }
528 EXPORT_SYMBOL(mark_buffer_async_write); 528 EXPORT_SYMBOL(mark_buffer_async_write);
529 529
530 530
531 /* 531 /*
532 * fs/buffer.c contains helper functions for buffer-backed address space's 532 * fs/buffer.c contains helper functions for buffer-backed address space's
533 * fsync functions. A common requirement for buffer-based filesystems is 533 * fsync functions. A common requirement for buffer-based filesystems is
534 * that certain data from the backing blockdev needs to be written out for 534 * that certain data from the backing blockdev needs to be written out for
535 * a successful fsync(). For example, ext2 indirect blocks need to be 535 * a successful fsync(). For example, ext2 indirect blocks need to be
536 * written back and waited upon before fsync() returns. 536 * written back and waited upon before fsync() returns.
537 * 537 *
538 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), 538 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
539 * inode_has_buffers() and invalidate_inode_buffers() are provided for the 539 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
540 * management of a list of dependent buffers at ->i_mapping->private_list. 540 * management of a list of dependent buffers at ->i_mapping->private_list.
541 * 541 *
542 * Locking is a little subtle: try_to_free_buffers() will remove buffers 542 * Locking is a little subtle: try_to_free_buffers() will remove buffers
543 * from their controlling inode's queue when they are being freed. But 543 * from their controlling inode's queue when they are being freed. But
544 * try_to_free_buffers() will be operating against the *blockdev* mapping 544 * try_to_free_buffers() will be operating against the *blockdev* mapping
545 * at the time, not against the S_ISREG file which depends on those buffers. 545 * at the time, not against the S_ISREG file which depends on those buffers.
546 * So the locking for private_list is via the private_lock in the address_space 546 * So the locking for private_list is via the private_lock in the address_space
547 * which backs the buffers. Which is different from the address_space 547 * which backs the buffers. Which is different from the address_space
548 * against which the buffers are listed. So for a particular address_space, 548 * against which the buffers are listed. So for a particular address_space,
549 * mapping->private_lock does *not* protect mapping->private_list! In fact, 549 * mapping->private_lock does *not* protect mapping->private_list! In fact,
550 * mapping->private_list will always be protected by the backing blockdev's 550 * mapping->private_list will always be protected by the backing blockdev's
551 * ->private_lock. 551 * ->private_lock.
552 * 552 *
553 * Which introduces a requirement: all buffers on an address_space's 553 * Which introduces a requirement: all buffers on an address_space's
554 * ->private_list must be from the same address_space: the blockdev's. 554 * ->private_list must be from the same address_space: the blockdev's.
555 * 555 *
556 * address_spaces which do not place buffers at ->private_list via these 556 * address_spaces which do not place buffers at ->private_list via these
557 * utility functions are free to use private_lock and private_list for 557 * utility functions are free to use private_lock and private_list for
558 * whatever they want. The only requirement is that list_empty(private_list) 558 * whatever they want. The only requirement is that list_empty(private_list)
559 * be true at clear_inode() time. 559 * be true at clear_inode() time.
560 * 560 *
561 * FIXME: clear_inode should not call invalidate_inode_buffers(). The 561 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
562 * filesystems should do that. invalidate_inode_buffers() should just go 562 * filesystems should do that. invalidate_inode_buffers() should just go
563 * BUG_ON(!list_empty). 563 * BUG_ON(!list_empty).
564 * 564 *
565 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should 565 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
566 * take an address_space, not an inode. And it should be called 566 * take an address_space, not an inode. And it should be called
567 * mark_buffer_dirty_fsync() to clearly define why those buffers are being 567 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
568 * queued up. 568 * queued up.
569 * 569 *
570 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the 570 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
571 * list if it is already on a list. Because if the buffer is on a list, 571 * list if it is already on a list. Because if the buffer is on a list,
572 * it *must* already be on the right one. If not, the filesystem is being 572 * it *must* already be on the right one. If not, the filesystem is being
573 * silly. This will save a ton of locking. But first we have to ensure 573 * silly. This will save a ton of locking. But first we have to ensure
574 * that buffers are taken *off* the old inode's list when they are freed 574 * that buffers are taken *off* the old inode's list when they are freed
575 * (presumably in truncate). That requires careful auditing of all 575 * (presumably in truncate). That requires careful auditing of all
576 * filesystems (do it inside bforget()). It could also be done by bringing 576 * filesystems (do it inside bforget()). It could also be done by bringing
577 * b_inode back. 577 * b_inode back.
578 */ 578 */
579 579
580 /* 580 /*
581 * The buffer's backing address_space's private_lock must be held 581 * The buffer's backing address_space's private_lock must be held
582 */ 582 */
583 static inline void __remove_assoc_queue(struct buffer_head *bh) 583 static inline void __remove_assoc_queue(struct buffer_head *bh)
584 { 584 {
585 list_del_init(&bh->b_assoc_buffers); 585 list_del_init(&bh->b_assoc_buffers);
586 WARN_ON(!bh->b_assoc_map); 586 WARN_ON(!bh->b_assoc_map);
587 if (buffer_write_io_error(bh)) 587 if (buffer_write_io_error(bh))
588 set_bit(AS_EIO, &bh->b_assoc_map->flags); 588 set_bit(AS_EIO, &bh->b_assoc_map->flags);
589 bh->b_assoc_map = NULL; 589 bh->b_assoc_map = NULL;
590 } 590 }
591 591
592 int inode_has_buffers(struct inode *inode) 592 int inode_has_buffers(struct inode *inode)
593 { 593 {
594 return !list_empty(&inode->i_data.private_list); 594 return !list_empty(&inode->i_data.private_list);
595 } 595 }
596 596
597 /* 597 /*
598 * osync is designed to support O_SYNC io. It waits synchronously for 598 * osync is designed to support O_SYNC io. It waits synchronously for
599 * all already-submitted IO to complete, but does not queue any new 599 * all already-submitted IO to complete, but does not queue any new
600 * writes to the disk. 600 * writes to the disk.
601 * 601 *
602 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as 602 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
603 * you dirty the buffers, and then use osync_inode_buffers to wait for 603 * you dirty the buffers, and then use osync_inode_buffers to wait for
604 * completion. Any other dirty buffers which are not yet queued for 604 * completion. Any other dirty buffers which are not yet queued for
605 * write will not be flushed to disk by the osync. 605 * write will not be flushed to disk by the osync.
606 */ 606 */
607 static int osync_buffers_list(spinlock_t *lock, struct list_head *list) 607 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
608 { 608 {
609 struct buffer_head *bh; 609 struct buffer_head *bh;
610 struct list_head *p; 610 struct list_head *p;
611 int err = 0; 611 int err = 0;
612 612
613 spin_lock(lock); 613 spin_lock(lock);
614 repeat: 614 repeat:
615 list_for_each_prev(p, list) { 615 list_for_each_prev(p, list) {
616 bh = BH_ENTRY(p); 616 bh = BH_ENTRY(p);
617 if (buffer_locked(bh)) { 617 if (buffer_locked(bh)) {
618 get_bh(bh); 618 get_bh(bh);
619 spin_unlock(lock); 619 spin_unlock(lock);
620 wait_on_buffer(bh); 620 wait_on_buffer(bh);
621 if (!buffer_uptodate(bh)) 621 if (!buffer_uptodate(bh))
622 err = -EIO; 622 err = -EIO;
623 brelse(bh); 623 brelse(bh);
624 spin_lock(lock); 624 spin_lock(lock);
625 goto repeat; 625 goto repeat;
626 } 626 }
627 } 627 }
628 spin_unlock(lock); 628 spin_unlock(lock);
629 return err; 629 return err;
630 } 630 }
631 631
632 /** 632 /**
633 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 633 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
634 * @mapping: the mapping which wants those buffers written 634 * @mapping: the mapping which wants those buffers written
635 * 635 *
636 * Starts I/O against the buffers at mapping->private_list, and waits upon 636 * Starts I/O against the buffers at mapping->private_list, and waits upon
637 * that I/O. 637 * that I/O.
638 * 638 *
639 * Basically, this is a convenience function for fsync(). 639 * Basically, this is a convenience function for fsync().
640 * @mapping is a file or directory which needs those buffers to be written for 640 * @mapping is a file or directory which needs those buffers to be written for
641 * a successful fsync(). 641 * a successful fsync().
642 */ 642 */
643 int sync_mapping_buffers(struct address_space *mapping) 643 int sync_mapping_buffers(struct address_space *mapping)
644 { 644 {
645 struct address_space *buffer_mapping = mapping->assoc_mapping; 645 struct address_space *buffer_mapping = mapping->assoc_mapping;
646 646
647 if (buffer_mapping == NULL || list_empty(&mapping->private_list)) 647 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
648 return 0; 648 return 0;
649 649
650 return fsync_buffers_list(&buffer_mapping->private_lock, 650 return fsync_buffers_list(&buffer_mapping->private_lock,
651 &mapping->private_list); 651 &mapping->private_list);
652 } 652 }
653 EXPORT_SYMBOL(sync_mapping_buffers); 653 EXPORT_SYMBOL(sync_mapping_buffers);
654 654
655 /* 655 /*
656 * Called when we've recently written block `bblock', and it is known that 656 * Called when we've recently written block `bblock', and it is known that
657 * `bblock' was for a buffer_boundary() buffer. This means that the block at 657 * `bblock' was for a buffer_boundary() buffer. This means that the block at
658 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's 658 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
659 * dirty, schedule it for IO. So that indirects merge nicely with their data. 659 * dirty, schedule it for IO. So that indirects merge nicely with their data.
660 */ 660 */
661 void write_boundary_block(struct block_device *bdev, 661 void write_boundary_block(struct block_device *bdev,
662 sector_t bblock, unsigned blocksize) 662 sector_t bblock, unsigned blocksize)
663 { 663 {
664 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); 664 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
665 if (bh) { 665 if (bh) {
666 if (buffer_dirty(bh)) 666 if (buffer_dirty(bh))
667 ll_rw_block(WRITE, 1, &bh); 667 ll_rw_block(WRITE, 1, &bh);
668 put_bh(bh); 668 put_bh(bh);
669 } 669 }
670 } 670 }
671 671
672 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) 672 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
673 { 673 {
674 struct address_space *mapping = inode->i_mapping; 674 struct address_space *mapping = inode->i_mapping;
675 struct address_space *buffer_mapping = bh->b_page->mapping; 675 struct address_space *buffer_mapping = bh->b_page->mapping;
676 676
677 mark_buffer_dirty(bh); 677 mark_buffer_dirty(bh);
678 if (!mapping->assoc_mapping) { 678 if (!mapping->assoc_mapping) {
679 mapping->assoc_mapping = buffer_mapping; 679 mapping->assoc_mapping = buffer_mapping;
680 } else { 680 } else {
681 BUG_ON(mapping->assoc_mapping != buffer_mapping); 681 BUG_ON(mapping->assoc_mapping != buffer_mapping);
682 } 682 }
683 if (!bh->b_assoc_map) { 683 if (!bh->b_assoc_map) {
684 spin_lock(&buffer_mapping->private_lock); 684 spin_lock(&buffer_mapping->private_lock);
685 list_move_tail(&bh->b_assoc_buffers, 685 list_move_tail(&bh->b_assoc_buffers,
686 &mapping->private_list); 686 &mapping->private_list);
687 bh->b_assoc_map = mapping; 687 bh->b_assoc_map = mapping;
688 spin_unlock(&buffer_mapping->private_lock); 688 spin_unlock(&buffer_mapping->private_lock);
689 } 689 }
690 } 690 }
691 EXPORT_SYMBOL(mark_buffer_dirty_inode); 691 EXPORT_SYMBOL(mark_buffer_dirty_inode);
692 692
693 /* 693 /*
694 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode 694 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
695 * dirty. 695 * dirty.
696 * 696 *
697 * If warn is true, then emit a warning if the page is not uptodate and has 697 * If warn is true, then emit a warning if the page is not uptodate and has
698 * not been truncated. 698 * not been truncated.
699 */ 699 */
700 static int __set_page_dirty(struct page *page, 700 static int __set_page_dirty(struct page *page,
701 struct address_space *mapping, int warn) 701 struct address_space *mapping, int warn)
702 { 702 {
703 if (unlikely(!mapping)) 703 if (unlikely(!mapping))
704 return !TestSetPageDirty(page); 704 return !TestSetPageDirty(page);
705 705
706 if (TestSetPageDirty(page)) 706 if (TestSetPageDirty(page))
707 return 0; 707 return 0;
708 708
709 write_lock_irq(&mapping->tree_lock); 709 write_lock_irq(&mapping->tree_lock);
710 if (page->mapping) { /* Race with truncate? */ 710 if (page->mapping) { /* Race with truncate? */
711 WARN_ON_ONCE(warn && !PageUptodate(page)); 711 WARN_ON_ONCE(warn && !PageUptodate(page));
712 712
713 if (mapping_cap_account_dirty(mapping)) { 713 if (mapping_cap_account_dirty(mapping)) {
714 __inc_zone_page_state(page, NR_FILE_DIRTY); 714 __inc_zone_page_state(page, NR_FILE_DIRTY);
715 __inc_bdi_stat(mapping->backing_dev_info, 715 __inc_bdi_stat(mapping->backing_dev_info,
716 BDI_RECLAIMABLE); 716 BDI_RECLAIMABLE);
717 task_io_account_write(PAGE_CACHE_SIZE); 717 task_io_account_write(PAGE_CACHE_SIZE);
718 } 718 }
719 radix_tree_tag_set(&mapping->page_tree, 719 radix_tree_tag_set(&mapping->page_tree,
720 page_index(page), PAGECACHE_TAG_DIRTY); 720 page_index(page), PAGECACHE_TAG_DIRTY);
721 } 721 }
722 write_unlock_irq(&mapping->tree_lock); 722 write_unlock_irq(&mapping->tree_lock);
723 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 723 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
724 724
725 return 1; 725 return 1;
726 } 726 }
727 727
728 /* 728 /*
729 * Add a page to the dirty page list. 729 * Add a page to the dirty page list.
730 * 730 *
731 * It is a sad fact of life that this function is called from several places 731 * It is a sad fact of life that this function is called from several places
732 * deeply under spinlocking. It may not sleep. 732 * deeply under spinlocking. It may not sleep.
733 * 733 *
734 * If the page has buffers, the uptodate buffers are set dirty, to preserve 734 * If the page has buffers, the uptodate buffers are set dirty, to preserve
735 * dirty-state coherency between the page and the buffers. It the page does 735 * dirty-state coherency between the page and the buffers. It the page does
736 * not have buffers then when they are later attached they will all be set 736 * not have buffers then when they are later attached they will all be set
737 * dirty. 737 * dirty.
738 * 738 *
739 * The buffers are dirtied before the page is dirtied. There's a small race 739 * The buffers are dirtied before the page is dirtied. There's a small race
740 * window in which a writepage caller may see the page cleanness but not the 740 * window in which a writepage caller may see the page cleanness but not the
741 * buffer dirtiness. That's fine. If this code were to set the page dirty 741 * buffer dirtiness. That's fine. If this code were to set the page dirty
742 * before the buffers, a concurrent writepage caller could clear the page dirty 742 * before the buffers, a concurrent writepage caller could clear the page dirty
743 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean 743 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
744 * page on the dirty page list. 744 * page on the dirty page list.
745 * 745 *
746 * We use private_lock to lock against try_to_free_buffers while using the 746 * We use private_lock to lock against try_to_free_buffers while using the
747 * page's buffer list. Also use this to protect against clean buffers being 747 * page's buffer list. Also use this to protect against clean buffers being
748 * added to the page after it was set dirty. 748 * added to the page after it was set dirty.
749 * 749 *
750 * FIXME: may need to call ->reservepage here as well. That's rather up to the 750 * FIXME: may need to call ->reservepage here as well. That's rather up to the
751 * address_space though. 751 * address_space though.
752 */ 752 */
753 int __set_page_dirty_buffers(struct page *page) 753 int __set_page_dirty_buffers(struct page *page)
754 { 754 {
755 struct address_space *mapping = page_mapping(page); 755 struct address_space *mapping = page_mapping(page);
756 756
757 if (unlikely(!mapping)) 757 if (unlikely(!mapping))
758 return !TestSetPageDirty(page); 758 return !TestSetPageDirty(page);
759 759
760 spin_lock(&mapping->private_lock); 760 spin_lock(&mapping->private_lock);
761 if (page_has_buffers(page)) { 761 if (page_has_buffers(page)) {
762 struct buffer_head *head = page_buffers(page); 762 struct buffer_head *head = page_buffers(page);
763 struct buffer_head *bh = head; 763 struct buffer_head *bh = head;
764 764
765 do { 765 do {
766 set_buffer_dirty(bh); 766 set_buffer_dirty(bh);
767 bh = bh->b_this_page; 767 bh = bh->b_this_page;
768 } while (bh != head); 768 } while (bh != head);
769 } 769 }
770 spin_unlock(&mapping->private_lock); 770 spin_unlock(&mapping->private_lock);
771 771
772 return __set_page_dirty(page, mapping, 1); 772 return __set_page_dirty(page, mapping, 1);
773 } 773 }
774 EXPORT_SYMBOL(__set_page_dirty_buffers); 774 EXPORT_SYMBOL(__set_page_dirty_buffers);
775 775
776 /* 776 /*
777 * Write out and wait upon a list of buffers. 777 * Write out and wait upon a list of buffers.
778 * 778 *
779 * We have conflicting pressures: we want to make sure that all 779 * We have conflicting pressures: we want to make sure that all
780 * initially dirty buffers get waited on, but that any subsequently 780 * initially dirty buffers get waited on, but that any subsequently
781 * dirtied buffers don't. After all, we don't want fsync to last 781 * dirtied buffers don't. After all, we don't want fsync to last
782 * forever if somebody is actively writing to the file. 782 * forever if somebody is actively writing to the file.
783 * 783 *
784 * Do this in two main stages: first we copy dirty buffers to a 784 * Do this in two main stages: first we copy dirty buffers to a
785 * temporary inode list, queueing the writes as we go. Then we clean 785 * temporary inode list, queueing the writes as we go. Then we clean
786 * up, waiting for those writes to complete. 786 * up, waiting for those writes to complete.
787 * 787 *
788 * During this second stage, any subsequent updates to the file may end 788 * During this second stage, any subsequent updates to the file may end
789 * up refiling the buffer on the original inode's dirty list again, so 789 * up refiling the buffer on the original inode's dirty list again, so
790 * there is a chance we will end up with a buffer queued for write but 790 * there is a chance we will end up with a buffer queued for write but
791 * not yet completed on that list. So, as a final cleanup we go through 791 * not yet completed on that list. So, as a final cleanup we go through
792 * the osync code to catch these locked, dirty buffers without requeuing 792 * the osync code to catch these locked, dirty buffers without requeuing
793 * any newly dirty buffers for write. 793 * any newly dirty buffers for write.
794 */ 794 */
795 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) 795 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
796 { 796 {
797 struct buffer_head *bh; 797 struct buffer_head *bh;
798 struct list_head tmp; 798 struct list_head tmp;
799 struct address_space *mapping; 799 struct address_space *mapping;
800 int err = 0, err2; 800 int err = 0, err2;
801 801
802 INIT_LIST_HEAD(&tmp); 802 INIT_LIST_HEAD(&tmp);
803 803
804 spin_lock(lock); 804 spin_lock(lock);
805 while (!list_empty(list)) { 805 while (!list_empty(list)) {
806 bh = BH_ENTRY(list->next); 806 bh = BH_ENTRY(list->next);
807 mapping = bh->b_assoc_map; 807 mapping = bh->b_assoc_map;
808 __remove_assoc_queue(bh); 808 __remove_assoc_queue(bh);
809 /* Avoid race with mark_buffer_dirty_inode() which does 809 /* Avoid race with mark_buffer_dirty_inode() which does
810 * a lockless check and we rely on seeing the dirty bit */ 810 * a lockless check and we rely on seeing the dirty bit */
811 smp_mb(); 811 smp_mb();
812 if (buffer_dirty(bh) || buffer_locked(bh)) { 812 if (buffer_dirty(bh) || buffer_locked(bh)) {
813 list_add(&bh->b_assoc_buffers, &tmp); 813 list_add(&bh->b_assoc_buffers, &tmp);
814 bh->b_assoc_map = mapping; 814 bh->b_assoc_map = mapping;
815 if (buffer_dirty(bh)) { 815 if (buffer_dirty(bh)) {
816 get_bh(bh); 816 get_bh(bh);
817 spin_unlock(lock); 817 spin_unlock(lock);
818 /* 818 /*
819 * Ensure any pending I/O completes so that 819 * Ensure any pending I/O completes so that
820 * ll_rw_block() actually writes the current 820 * ll_rw_block() actually writes the current
821 * contents - it is a noop if I/O is still in 821 * contents - it is a noop if I/O is still in
822 * flight on potentially older contents. 822 * flight on potentially older contents.
823 */ 823 */
824 ll_rw_block(SWRITE_SYNC, 1, &bh); 824 ll_rw_block(SWRITE_SYNC, 1, &bh);
825 brelse(bh); 825 brelse(bh);
826 spin_lock(lock); 826 spin_lock(lock);
827 } 827 }
828 } 828 }
829 } 829 }
830 830
831 while (!list_empty(&tmp)) { 831 while (!list_empty(&tmp)) {
832 bh = BH_ENTRY(tmp.prev); 832 bh = BH_ENTRY(tmp.prev);
833 get_bh(bh); 833 get_bh(bh);
834 mapping = bh->b_assoc_map; 834 mapping = bh->b_assoc_map;
835 __remove_assoc_queue(bh); 835 __remove_assoc_queue(bh);
836 /* Avoid race with mark_buffer_dirty_inode() which does 836 /* Avoid race with mark_buffer_dirty_inode() which does
837 * a lockless check and we rely on seeing the dirty bit */ 837 * a lockless check and we rely on seeing the dirty bit */
838 smp_mb(); 838 smp_mb();
839 if (buffer_dirty(bh)) { 839 if (buffer_dirty(bh)) {
840 list_add(&bh->b_assoc_buffers, 840 list_add(&bh->b_assoc_buffers,
841 &mapping->private_list); 841 &mapping->private_list);
842 bh->b_assoc_map = mapping; 842 bh->b_assoc_map = mapping;
843 } 843 }
844 spin_unlock(lock); 844 spin_unlock(lock);
845 wait_on_buffer(bh); 845 wait_on_buffer(bh);
846 if (!buffer_uptodate(bh)) 846 if (!buffer_uptodate(bh))
847 err = -EIO; 847 err = -EIO;
848 brelse(bh); 848 brelse(bh);
849 spin_lock(lock); 849 spin_lock(lock);
850 } 850 }
851 851
852 spin_unlock(lock); 852 spin_unlock(lock);
853 err2 = osync_buffers_list(lock, list); 853 err2 = osync_buffers_list(lock, list);
854 if (err) 854 if (err)
855 return err; 855 return err;
856 else 856 else
857 return err2; 857 return err2;
858 } 858 }
859 859
860 /* 860 /*
861 * Invalidate any and all dirty buffers on a given inode. We are 861 * Invalidate any and all dirty buffers on a given inode. We are
862 * probably unmounting the fs, but that doesn't mean we have already 862 * probably unmounting the fs, but that doesn't mean we have already
863 * done a sync(). Just drop the buffers from the inode list. 863 * done a sync(). Just drop the buffers from the inode list.
864 * 864 *
865 * NOTE: we take the inode's blockdev's mapping's private_lock. Which 865 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
866 * assumes that all the buffers are against the blockdev. Not true 866 * assumes that all the buffers are against the blockdev. Not true
867 * for reiserfs. 867 * for reiserfs.
868 */ 868 */
869 void invalidate_inode_buffers(struct inode *inode) 869 void invalidate_inode_buffers(struct inode *inode)
870 { 870 {
871 if (inode_has_buffers(inode)) { 871 if (inode_has_buffers(inode)) {
872 struct address_space *mapping = &inode->i_data; 872 struct address_space *mapping = &inode->i_data;
873 struct list_head *list = &mapping->private_list; 873 struct list_head *list = &mapping->private_list;
874 struct address_space *buffer_mapping = mapping->assoc_mapping; 874 struct address_space *buffer_mapping = mapping->assoc_mapping;
875 875
876 spin_lock(&buffer_mapping->private_lock); 876 spin_lock(&buffer_mapping->private_lock);
877 while (!list_empty(list)) 877 while (!list_empty(list))
878 __remove_assoc_queue(BH_ENTRY(list->next)); 878 __remove_assoc_queue(BH_ENTRY(list->next));
879 spin_unlock(&buffer_mapping->private_lock); 879 spin_unlock(&buffer_mapping->private_lock);
880 } 880 }
881 } 881 }
882 882
883 /* 883 /*
884 * Remove any clean buffers from the inode's buffer list. This is called 884 * Remove any clean buffers from the inode's buffer list. This is called
885 * when we're trying to free the inode itself. Those buffers can pin it. 885 * when we're trying to free the inode itself. Those buffers can pin it.
886 * 886 *
887 * Returns true if all buffers were removed. 887 * Returns true if all buffers were removed.
888 */ 888 */
889 int remove_inode_buffers(struct inode *inode) 889 int remove_inode_buffers(struct inode *inode)
890 { 890 {
891 int ret = 1; 891 int ret = 1;
892 892
893 if (inode_has_buffers(inode)) { 893 if (inode_has_buffers(inode)) {
894 struct address_space *mapping = &inode->i_data; 894 struct address_space *mapping = &inode->i_data;
895 struct list_head *list = &mapping->private_list; 895 struct list_head *list = &mapping->private_list;
896 struct address_space *buffer_mapping = mapping->assoc_mapping; 896 struct address_space *buffer_mapping = mapping->assoc_mapping;
897 897
898 spin_lock(&buffer_mapping->private_lock); 898 spin_lock(&buffer_mapping->private_lock);
899 while (!list_empty(list)) { 899 while (!list_empty(list)) {
900 struct buffer_head *bh = BH_ENTRY(list->next); 900 struct buffer_head *bh = BH_ENTRY(list->next);
901 if (buffer_dirty(bh)) { 901 if (buffer_dirty(bh)) {
902 ret = 0; 902 ret = 0;
903 break; 903 break;
904 } 904 }
905 __remove_assoc_queue(bh); 905 __remove_assoc_queue(bh);
906 } 906 }
907 spin_unlock(&buffer_mapping->private_lock); 907 spin_unlock(&buffer_mapping->private_lock);
908 } 908 }
909 return ret; 909 return ret;
910 } 910 }
911 911
912 /* 912 /*
913 * Create the appropriate buffers when given a page for data area and 913 * Create the appropriate buffers when given a page for data area and
914 * the size of each buffer.. Use the bh->b_this_page linked list to 914 * the size of each buffer.. Use the bh->b_this_page linked list to
915 * follow the buffers created. Return NULL if unable to create more 915 * follow the buffers created. Return NULL if unable to create more
916 * buffers. 916 * buffers.
917 * 917 *
918 * The retry flag is used to differentiate async IO (paging, swapping), 918 * The retry flag is used to differentiate async IO (paging, swapping),
919 * which may not fail, from ordinary buffer allocations. 919 * which may not fail, from ordinary buffer allocations.
920 */ 920 */
921 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, 921 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
922 int retry) 922 int retry)
923 { 923 {
924 struct buffer_head *bh, *head; 924 struct buffer_head *bh, *head;
925 long offset; 925 long offset;
926 926
927 try_again: 927 try_again:
928 head = NULL; 928 head = NULL;
929 offset = PAGE_SIZE; 929 offset = PAGE_SIZE;
930 while ((offset -= size) >= 0) { 930 while ((offset -= size) >= 0) {
931 bh = alloc_buffer_head(GFP_NOFS); 931 bh = alloc_buffer_head(GFP_NOFS);
932 if (!bh) 932 if (!bh)
933 goto no_grow; 933 goto no_grow;
934 934
935 bh->b_bdev = NULL; 935 bh->b_bdev = NULL;
936 bh->b_this_page = head; 936 bh->b_this_page = head;
937 bh->b_blocknr = -1; 937 bh->b_blocknr = -1;
938 head = bh; 938 head = bh;
939 939
940 bh->b_state = 0; 940 bh->b_state = 0;
941 atomic_set(&bh->b_count, 0); 941 atomic_set(&bh->b_count, 0);
942 bh->b_private = NULL; 942 bh->b_private = NULL;
943 bh->b_size = size; 943 bh->b_size = size;
944 944
945 /* Link the buffer to its page */ 945 /* Link the buffer to its page */
946 set_bh_page(bh, page, offset); 946 set_bh_page(bh, page, offset);
947 947
948 init_buffer(bh, NULL, NULL); 948 init_buffer(bh, NULL, NULL);
949 } 949 }
950 return head; 950 return head;
951 /* 951 /*
952 * In case anything failed, we just free everything we got. 952 * In case anything failed, we just free everything we got.
953 */ 953 */
954 no_grow: 954 no_grow:
955 if (head) { 955 if (head) {
956 do { 956 do {
957 bh = head; 957 bh = head;
958 head = head->b_this_page; 958 head = head->b_this_page;
959 free_buffer_head(bh); 959 free_buffer_head(bh);
960 } while (head); 960 } while (head);
961 } 961 }
962 962
963 /* 963 /*
964 * Return failure for non-async IO requests. Async IO requests 964 * Return failure for non-async IO requests. Async IO requests
965 * are not allowed to fail, so we have to wait until buffer heads 965 * are not allowed to fail, so we have to wait until buffer heads
966 * become available. But we don't want tasks sleeping with 966 * become available. But we don't want tasks sleeping with
967 * partially complete buffers, so all were released above. 967 * partially complete buffers, so all were released above.
968 */ 968 */
969 if (!retry) 969 if (!retry)
970 return NULL; 970 return NULL;
971 971
972 /* We're _really_ low on memory. Now we just 972 /* We're _really_ low on memory. Now we just
973 * wait for old buffer heads to become free due to 973 * wait for old buffer heads to become free due to
974 * finishing IO. Since this is an async request and 974 * finishing IO. Since this is an async request and
975 * the reserve list is empty, we're sure there are 975 * the reserve list is empty, we're sure there are
976 * async buffer heads in use. 976 * async buffer heads in use.
977 */ 977 */
978 free_more_memory(); 978 free_more_memory();
979 goto try_again; 979 goto try_again;
980 } 980 }
981 EXPORT_SYMBOL_GPL(alloc_page_buffers); 981 EXPORT_SYMBOL_GPL(alloc_page_buffers);
982 982
983 static inline void 983 static inline void
984 link_dev_buffers(struct page *page, struct buffer_head *head) 984 link_dev_buffers(struct page *page, struct buffer_head *head)
985 { 985 {
986 struct buffer_head *bh, *tail; 986 struct buffer_head *bh, *tail;
987 987
988 bh = head; 988 bh = head;
989 do { 989 do {
990 tail = bh; 990 tail = bh;
991 bh = bh->b_this_page; 991 bh = bh->b_this_page;
992 } while (bh); 992 } while (bh);
993 tail->b_this_page = head; 993 tail->b_this_page = head;
994 attach_page_buffers(page, head); 994 attach_page_buffers(page, head);
995 } 995 }
996 996
997 /* 997 /*
998 * Initialise the state of a blockdev page's buffers. 998 * Initialise the state of a blockdev page's buffers.
999 */ 999 */
1000 static void 1000 static void
1001 init_page_buffers(struct page *page, struct block_device *bdev, 1001 init_page_buffers(struct page *page, struct block_device *bdev,
1002 sector_t block, int size) 1002 sector_t block, int size)
1003 { 1003 {
1004 struct buffer_head *head = page_buffers(page); 1004 struct buffer_head *head = page_buffers(page);
1005 struct buffer_head *bh = head; 1005 struct buffer_head *bh = head;
1006 int uptodate = PageUptodate(page); 1006 int uptodate = PageUptodate(page);
1007 1007
1008 do { 1008 do {
1009 if (!buffer_mapped(bh)) { 1009 if (!buffer_mapped(bh)) {
1010 init_buffer(bh, NULL, NULL); 1010 init_buffer(bh, NULL, NULL);
1011 bh->b_bdev = bdev; 1011 bh->b_bdev = bdev;
1012 bh->b_blocknr = block; 1012 bh->b_blocknr = block;
1013 if (uptodate) 1013 if (uptodate)
1014 set_buffer_uptodate(bh); 1014 set_buffer_uptodate(bh);
1015 set_buffer_mapped(bh); 1015 set_buffer_mapped(bh);
1016 } 1016 }
1017 block++; 1017 block++;
1018 bh = bh->b_this_page; 1018 bh = bh->b_this_page;
1019 } while (bh != head); 1019 } while (bh != head);
1020 } 1020 }
1021 1021
1022 /* 1022 /*
1023 * Create the page-cache page that contains the requested block. 1023 * Create the page-cache page that contains the requested block.
1024 * 1024 *
1025 * This is used purely for blockdev mappings. 1025 * This is used purely for blockdev mappings.
1026 */ 1026 */
1027 static struct page * 1027 static struct page *
1028 grow_dev_page(struct block_device *bdev, sector_t block, 1028 grow_dev_page(struct block_device *bdev, sector_t block,
1029 pgoff_t index, int size) 1029 pgoff_t index, int size)
1030 { 1030 {
1031 struct inode *inode = bdev->bd_inode; 1031 struct inode *inode = bdev->bd_inode;
1032 struct page *page; 1032 struct page *page;
1033 struct buffer_head *bh; 1033 struct buffer_head *bh;
1034 1034
1035 page = find_or_create_page(inode->i_mapping, index, 1035 page = find_or_create_page(inode->i_mapping, index,
1036 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); 1036 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1037 if (!page) 1037 if (!page)
1038 return NULL; 1038 return NULL;
1039 1039
1040 BUG_ON(!PageLocked(page)); 1040 BUG_ON(!PageLocked(page));
1041 1041
1042 if (page_has_buffers(page)) { 1042 if (page_has_buffers(page)) {
1043 bh = page_buffers(page); 1043 bh = page_buffers(page);
1044 if (bh->b_size == size) { 1044 if (bh->b_size == size) {
1045 init_page_buffers(page, bdev, block, size); 1045 init_page_buffers(page, bdev, block, size);
1046 return page; 1046 return page;
1047 } 1047 }
1048 if (!try_to_free_buffers(page)) 1048 if (!try_to_free_buffers(page))
1049 goto failed; 1049 goto failed;
1050 } 1050 }
1051 1051
1052 /* 1052 /*
1053 * Allocate some buffers for this page 1053 * Allocate some buffers for this page
1054 */ 1054 */
1055 bh = alloc_page_buffers(page, size, 0); 1055 bh = alloc_page_buffers(page, size, 0);
1056 if (!bh) 1056 if (!bh)
1057 goto failed; 1057 goto failed;
1058 1058
1059 /* 1059 /*
1060 * Link the page to the buffers and initialise them. Take the 1060 * Link the page to the buffers and initialise them. Take the
1061 * lock to be atomic wrt __find_get_block(), which does not 1061 * lock to be atomic wrt __find_get_block(), which does not
1062 * run under the page lock. 1062 * run under the page lock.
1063 */ 1063 */
1064 spin_lock(&inode->i_mapping->private_lock); 1064 spin_lock(&inode->i_mapping->private_lock);
1065 link_dev_buffers(page, bh); 1065 link_dev_buffers(page, bh);
1066 init_page_buffers(page, bdev, block, size); 1066 init_page_buffers(page, bdev, block, size);
1067 spin_unlock(&inode->i_mapping->private_lock); 1067 spin_unlock(&inode->i_mapping->private_lock);
1068 return page; 1068 return page;
1069 1069
1070 failed: 1070 failed:
1071 BUG(); 1071 BUG();
1072 unlock_page(page); 1072 unlock_page(page);
1073 page_cache_release(page); 1073 page_cache_release(page);
1074 return NULL; 1074 return NULL;
1075 } 1075 }
1076 1076
1077 /* 1077 /*
1078 * Create buffers for the specified block device block's page. If 1078 * Create buffers for the specified block device block's page. If
1079 * that page was dirty, the buffers are set dirty also. 1079 * that page was dirty, the buffers are set dirty also.
1080 */ 1080 */
1081 static int 1081 static int
1082 grow_buffers(struct block_device *bdev, sector_t block, int size) 1082 grow_buffers(struct block_device *bdev, sector_t block, int size)
1083 { 1083 {
1084 struct page *page; 1084 struct page *page;
1085 pgoff_t index; 1085 pgoff_t index;
1086 int sizebits; 1086 int sizebits;
1087 1087
1088 sizebits = -1; 1088 sizebits = -1;
1089 do { 1089 do {
1090 sizebits++; 1090 sizebits++;
1091 } while ((size << sizebits) < PAGE_SIZE); 1091 } while ((size << sizebits) < PAGE_SIZE);
1092 1092
1093 index = block >> sizebits; 1093 index = block >> sizebits;
1094 1094
1095 /* 1095 /*
1096 * Check for a block which wants to lie outside our maximum possible 1096 * Check for a block which wants to lie outside our maximum possible
1097 * pagecache index. (this comparison is done using sector_t types). 1097 * pagecache index. (this comparison is done using sector_t types).
1098 */ 1098 */
1099 if (unlikely(index != block >> sizebits)) { 1099 if (unlikely(index != block >> sizebits)) {
1100 char b[BDEVNAME_SIZE]; 1100 char b[BDEVNAME_SIZE];
1101 1101
1102 printk(KERN_ERR "%s: requested out-of-range block %llu for " 1102 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1103 "device %s\n", 1103 "device %s\n",
1104 __func__, (unsigned long long)block, 1104 __func__, (unsigned long long)block,
1105 bdevname(bdev, b)); 1105 bdevname(bdev, b));
1106 return -EIO; 1106 return -EIO;
1107 } 1107 }
1108 block = index << sizebits; 1108 block = index << sizebits;
1109 /* Create a page with the proper size buffers.. */ 1109 /* Create a page with the proper size buffers.. */
1110 page = grow_dev_page(bdev, block, index, size); 1110 page = grow_dev_page(bdev, block, index, size);
1111 if (!page) 1111 if (!page)
1112 return 0; 1112 return 0;
1113 unlock_page(page); 1113 unlock_page(page);
1114 page_cache_release(page); 1114 page_cache_release(page);
1115 return 1; 1115 return 1;
1116 } 1116 }
1117 1117
1118 static struct buffer_head * 1118 static struct buffer_head *
1119 __getblk_slow(struct block_device *bdev, sector_t block, int size) 1119 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1120 { 1120 {
1121 /* Size must be multiple of hard sectorsize */ 1121 /* Size must be multiple of hard sectorsize */
1122 if (unlikely(size & (bdev_hardsect_size(bdev)-1) || 1122 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1123 (size < 512 || size > PAGE_SIZE))) { 1123 (size < 512 || size > PAGE_SIZE))) {
1124 printk(KERN_ERR "getblk(): invalid block size %d requested\n", 1124 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1125 size); 1125 size);
1126 printk(KERN_ERR "hardsect size: %d\n", 1126 printk(KERN_ERR "hardsect size: %d\n",
1127 bdev_hardsect_size(bdev)); 1127 bdev_hardsect_size(bdev));
1128 1128
1129 dump_stack(); 1129 dump_stack();
1130 return NULL; 1130 return NULL;
1131 } 1131 }
1132 1132
1133 for (;;) { 1133 for (;;) {
1134 struct buffer_head * bh; 1134 struct buffer_head * bh;
1135 int ret; 1135 int ret;
1136 1136
1137 bh = __find_get_block(bdev, block, size); 1137 bh = __find_get_block(bdev, block, size);
1138 if (bh) 1138 if (bh)
1139 return bh; 1139 return bh;
1140 1140
1141 ret = grow_buffers(bdev, block, size); 1141 ret = grow_buffers(bdev, block, size);
1142 if (ret < 0) 1142 if (ret < 0)
1143 return NULL; 1143 return NULL;
1144 if (ret == 0) 1144 if (ret == 0)
1145 free_more_memory(); 1145 free_more_memory();
1146 } 1146 }
1147 } 1147 }
1148 1148
1149 /* 1149 /*
1150 * The relationship between dirty buffers and dirty pages: 1150 * The relationship between dirty buffers and dirty pages:
1151 * 1151 *
1152 * Whenever a page has any dirty buffers, the page's dirty bit is set, and 1152 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1153 * the page is tagged dirty in its radix tree. 1153 * the page is tagged dirty in its radix tree.
1154 * 1154 *
1155 * At all times, the dirtiness of the buffers represents the dirtiness of 1155 * At all times, the dirtiness of the buffers represents the dirtiness of
1156 * subsections of the page. If the page has buffers, the page dirty bit is 1156 * subsections of the page. If the page has buffers, the page dirty bit is
1157 * merely a hint about the true dirty state. 1157 * merely a hint about the true dirty state.
1158 * 1158 *
1159 * When a page is set dirty in its entirety, all its buffers are marked dirty 1159 * When a page is set dirty in its entirety, all its buffers are marked dirty
1160 * (if the page has buffers). 1160 * (if the page has buffers).
1161 * 1161 *
1162 * When a buffer is marked dirty, its page is dirtied, but the page's other 1162 * When a buffer is marked dirty, its page is dirtied, but the page's other
1163 * buffers are not. 1163 * buffers are not.
1164 * 1164 *
1165 * Also. When blockdev buffers are explicitly read with bread(), they 1165 * Also. When blockdev buffers are explicitly read with bread(), they
1166 * individually become uptodate. But their backing page remains not 1166 * individually become uptodate. But their backing page remains not
1167 * uptodate - even if all of its buffers are uptodate. A subsequent 1167 * uptodate - even if all of its buffers are uptodate. A subsequent
1168 * block_read_full_page() against that page will discover all the uptodate 1168 * block_read_full_page() against that page will discover all the uptodate
1169 * buffers, will set the page uptodate and will perform no I/O. 1169 * buffers, will set the page uptodate and will perform no I/O.
1170 */ 1170 */
1171 1171
1172 /** 1172 /**
1173 * mark_buffer_dirty - mark a buffer_head as needing writeout 1173 * mark_buffer_dirty - mark a buffer_head as needing writeout
1174 * @bh: the buffer_head to mark dirty 1174 * @bh: the buffer_head to mark dirty
1175 * 1175 *
1176 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its 1176 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1177 * backing page dirty, then tag the page as dirty in its address_space's radix 1177 * backing page dirty, then tag the page as dirty in its address_space's radix
1178 * tree and then attach the address_space's inode to its superblock's dirty 1178 * tree and then attach the address_space's inode to its superblock's dirty
1179 * inode list. 1179 * inode list.
1180 * 1180 *
1181 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1181 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1182 * mapping->tree_lock and the global inode_lock. 1182 * mapping->tree_lock and the global inode_lock.
1183 */ 1183 */
1184 void mark_buffer_dirty(struct buffer_head *bh) 1184 void mark_buffer_dirty(struct buffer_head *bh)
1185 { 1185 {
1186 WARN_ON_ONCE(!buffer_uptodate(bh)); 1186 WARN_ON_ONCE(!buffer_uptodate(bh));
1187 1187
1188 /* 1188 /*
1189 * Very *carefully* optimize the it-is-already-dirty case. 1189 * Very *carefully* optimize the it-is-already-dirty case.
1190 * 1190 *
1191 * Don't let the final "is it dirty" escape to before we 1191 * Don't let the final "is it dirty" escape to before we
1192 * perhaps modified the buffer. 1192 * perhaps modified the buffer.
1193 */ 1193 */
1194 if (buffer_dirty(bh)) { 1194 if (buffer_dirty(bh)) {
1195 smp_mb(); 1195 smp_mb();
1196 if (buffer_dirty(bh)) 1196 if (buffer_dirty(bh))
1197 return; 1197 return;
1198 } 1198 }
1199 1199
1200 if (!test_set_buffer_dirty(bh)) 1200 if (!test_set_buffer_dirty(bh))
1201 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); 1201 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1202 } 1202 }
1203 1203
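For context on how callers use this: a filesystem typically takes a reference with sb_getblk()/__getblk() (no I/O), updates the buffer contents, and then calls mark_buffer_dirty() so writeback picks the block up later. A minimal sketch follows; the examplefs name, the helper, and the choice of error handling are assumptions, and only the buffer_head calls are the real API.

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/string.h>

/* Minimal sketch (hypothetical examplefs helper): overwrite one block in
 * the page cache and leave it to writeback, relying on mark_buffer_dirty()
 * to dirty the buffer, its page and the owning inode as described above. */
static int examplefs_zero_block(struct super_block *sb, sector_t blocknr)
{
        struct buffer_head *bh = sb_getblk(sb, blocknr);        /* no I/O here */

        if (!bh)
                return -ENOMEM;
        lock_buffer(bh);
        memset(bh->b_data, 0, bh->b_size);
        set_buffer_uptodate(bh);                /* contents are now fully valid */
        unlock_buffer(bh);
        mark_buffer_dirty(bh);
        brelse(bh);                             /* drop the sb_getblk() reference */
        return 0;
}
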
1204 /* 1204 /*
1205 * Decrement a buffer_head's reference count. If all buffers against a page 1205 * Decrement a buffer_head's reference count. If all buffers against a page
1206 * have zero reference count, are clean and unlocked, and if the page is clean 1206 * have zero reference count, are clean and unlocked, and if the page is clean
1207 * and unlocked then try_to_free_buffers() may strip the buffers from the page 1207 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1208 * in preparation for freeing it (sometimes, rarely, buffers are removed from 1208 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1209 * a page but it ends up not being freed, and buffers may later be reattached). 1209 * a page but it ends up not being freed, and buffers may later be reattached).
1210 */ 1210 */
1211 void __brelse(struct buffer_head * buf) 1211 void __brelse(struct buffer_head * buf)
1212 { 1212 {
1213 if (atomic_read(&buf->b_count)) { 1213 if (atomic_read(&buf->b_count)) {
1214 put_bh(buf); 1214 put_bh(buf);
1215 return; 1215 return;
1216 } 1216 }
1217 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); 1217 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1218 WARN_ON(1); 1218 WARN_ON(1);
1219 } 1219 }
1220 1220
1221 /* 1221 /*
1222 * bforget() is like brelse(), except it discards any 1222 * bforget() is like brelse(), except it discards any
1223 * potentially dirty data. 1223 * potentially dirty data.
1224 */ 1224 */
1225 void __bforget(struct buffer_head *bh) 1225 void __bforget(struct buffer_head *bh)
1226 { 1226 {
1227 clear_buffer_dirty(bh); 1227 clear_buffer_dirty(bh);
1228 if (bh->b_assoc_map) { 1228 if (bh->b_assoc_map) {
1229 struct address_space *buffer_mapping = bh->b_page->mapping; 1229 struct address_space *buffer_mapping = bh->b_page->mapping;
1230 1230
1231 spin_lock(&buffer_mapping->private_lock); 1231 spin_lock(&buffer_mapping->private_lock);
1232 list_del_init(&bh->b_assoc_buffers); 1232 list_del_init(&bh->b_assoc_buffers);
1233 bh->b_assoc_map = NULL; 1233 bh->b_assoc_map = NULL;
1234 spin_unlock(&buffer_mapping->private_lock); 1234 spin_unlock(&buffer_mapping->private_lock);
1235 } 1235 }
1236 __brelse(bh); 1236 __brelse(bh);
1237 } 1237 }
1238 1238
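A short usage contrast for the two release paths: brelse() only drops a reference, while bforget() also discards any dirty data first, which is what a filesystem wants when the block is about to be freed on disk anyway. The examplefs name and helper below are assumptions; sb_find_get_block() and bforget() are the real API.

#include <linux/buffer_head.h>

/* Minimal sketch: discard the cached buffer for a block that the (hypothetical)
 * examplefs is about to free on disk, so its dirty data is never written back. */
static void examplefs_drop_freed_block(struct super_block *sb, sector_t blocknr)
{
        struct buffer_head *bh = sb_find_get_block(sb, blocknr);

        if (bh)
                bforget(bh);    /* clears the dirty bit and drops our reference */
}
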
1239 static struct buffer_head *__bread_slow(struct buffer_head *bh) 1239 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1240 { 1240 {
1241 lock_buffer(bh); 1241 lock_buffer(bh);
1242 if (buffer_uptodate(bh)) { 1242 if (buffer_uptodate(bh)) {
1243 unlock_buffer(bh); 1243 unlock_buffer(bh);
1244 return bh; 1244 return bh;
1245 } else { 1245 } else {
1246 get_bh(bh); 1246 get_bh(bh);
1247 bh->b_end_io = end_buffer_read_sync; 1247 bh->b_end_io = end_buffer_read_sync;
1248 submit_bh(READ, bh); 1248 submit_bh(READ, bh);
1249 wait_on_buffer(bh); 1249 wait_on_buffer(bh);
1250 if (buffer_uptodate(bh)) 1250 if (buffer_uptodate(bh))
1251 return bh; 1251 return bh;
1252 } 1252 }
1253 brelse(bh); 1253 brelse(bh);
1254 return NULL; 1254 return NULL;
1255 } 1255 }
1256 1256
1257 /* 1257 /*
1258 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). 1258 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1259 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their 1259 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1260 * refcount elevated by one when they're in an LRU. A buffer can only appear 1260 * refcount elevated by one when they're in an LRU. A buffer can only appear
1261 * once in a particular CPU's LRU. A single buffer can be present in multiple 1261 * once in a particular CPU's LRU. A single buffer can be present in multiple
1262 * CPU's LRUs at the same time. 1262 * CPU's LRUs at the same time.
1263 * 1263 *
1264 * This is a transparent caching front-end to sb_bread(), sb_getblk() and 1264 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1265 * sb_find_get_block(). 1265 * sb_find_get_block().
1266 * 1266 *
1267 * The LRUs themselves only need locking against invalidate_bh_lrus. We use 1267 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1268 * a local interrupt disable for that. 1268 * a local interrupt disable for that.
1269 */ 1269 */
1270 1270
1271 #define BH_LRU_SIZE 8 1271 #define BH_LRU_SIZE 8
1272 1272
1273 struct bh_lru { 1273 struct bh_lru {
1274 struct buffer_head *bhs[BH_LRU_SIZE]; 1274 struct buffer_head *bhs[BH_LRU_SIZE];
1275 }; 1275 };
1276 1276
1277 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; 1277 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1278 1278
1279 #ifdef CONFIG_SMP 1279 #ifdef CONFIG_SMP
1280 #define bh_lru_lock() local_irq_disable() 1280 #define bh_lru_lock() local_irq_disable()
1281 #define bh_lru_unlock() local_irq_enable() 1281 #define bh_lru_unlock() local_irq_enable()
1282 #else 1282 #else
1283 #define bh_lru_lock() preempt_disable() 1283 #define bh_lru_lock() preempt_disable()
1284 #define bh_lru_unlock() preempt_enable() 1284 #define bh_lru_unlock() preempt_enable()
1285 #endif 1285 #endif
1286 1286
1287 static inline void check_irqs_on(void) 1287 static inline void check_irqs_on(void)
1288 { 1288 {
1289 #ifdef irqs_disabled 1289 #ifdef irqs_disabled
1290 BUG_ON(irqs_disabled()); 1290 BUG_ON(irqs_disabled());
1291 #endif 1291 #endif
1292 } 1292 }
1293 1293
1294 /* 1294 /*
1295 * The LRU management algorithm is dopey-but-simple. Sorry. 1295 * The LRU management algorithm is dopey-but-simple. Sorry.
1296 */ 1296 */
1297 static void bh_lru_install(struct buffer_head *bh) 1297 static void bh_lru_install(struct buffer_head *bh)
1298 { 1298 {
1299 struct buffer_head *evictee = NULL; 1299 struct buffer_head *evictee = NULL;
1300 struct bh_lru *lru; 1300 struct bh_lru *lru;
1301 1301
1302 check_irqs_on(); 1302 check_irqs_on();
1303 bh_lru_lock(); 1303 bh_lru_lock();
1304 lru = &__get_cpu_var(bh_lrus); 1304 lru = &__get_cpu_var(bh_lrus);
1305 if (lru->bhs[0] != bh) { 1305 if (lru->bhs[0] != bh) {
1306 struct buffer_head *bhs[BH_LRU_SIZE]; 1306 struct buffer_head *bhs[BH_LRU_SIZE];
1307 int in; 1307 int in;
1308 int out = 0; 1308 int out = 0;
1309 1309
1310 get_bh(bh); 1310 get_bh(bh);
1311 bhs[out++] = bh; 1311 bhs[out++] = bh;
1312 for (in = 0; in < BH_LRU_SIZE; in++) { 1312 for (in = 0; in < BH_LRU_SIZE; in++) {
1313 struct buffer_head *bh2 = lru->bhs[in]; 1313 struct buffer_head *bh2 = lru->bhs[in];
1314 1314
1315 if (bh2 == bh) { 1315 if (bh2 == bh) {
1316 __brelse(bh2); 1316 __brelse(bh2);
1317 } else { 1317 } else {
1318 if (out >= BH_LRU_SIZE) { 1318 if (out >= BH_LRU_SIZE) {
1319 BUG_ON(evictee != NULL); 1319 BUG_ON(evictee != NULL);
1320 evictee = bh2; 1320 evictee = bh2;
1321 } else { 1321 } else {
1322 bhs[out++] = bh2; 1322 bhs[out++] = bh2;
1323 } 1323 }
1324 } 1324 }
1325 } 1325 }
1326 while (out < BH_LRU_SIZE) 1326 while (out < BH_LRU_SIZE)
1327 bhs[out++] = NULL; 1327 bhs[out++] = NULL;
1328 memcpy(lru->bhs, bhs, sizeof(bhs)); 1328 memcpy(lru->bhs, bhs, sizeof(bhs));
1329 } 1329 }
1330 bh_lru_unlock(); 1330 bh_lru_unlock();
1331 1331
1332 if (evictee) 1332 if (evictee)
1333 __brelse(evictee); 1333 __brelse(evictee);
1334 } 1334 }
1335 1335
1336 /* 1336 /*
1337 * Look up the bh in this cpu's LRU. If it's there, move it to the head. 1337 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1338 */ 1338 */
1339 static struct buffer_head * 1339 static struct buffer_head *
1340 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) 1340 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1341 { 1341 {
1342 struct buffer_head *ret = NULL; 1342 struct buffer_head *ret = NULL;
1343 struct bh_lru *lru; 1343 struct bh_lru *lru;
1344 unsigned int i; 1344 unsigned int i;
1345 1345
1346 check_irqs_on(); 1346 check_irqs_on();
1347 bh_lru_lock(); 1347 bh_lru_lock();
1348 lru = &__get_cpu_var(bh_lrus); 1348 lru = &__get_cpu_var(bh_lrus);
1349 for (i = 0; i < BH_LRU_SIZE; i++) { 1349 for (i = 0; i < BH_LRU_SIZE; i++) {
1350 struct buffer_head *bh = lru->bhs[i]; 1350 struct buffer_head *bh = lru->bhs[i];
1351 1351
1352 if (bh && bh->b_bdev == bdev && 1352 if (bh && bh->b_bdev == bdev &&
1353 bh->b_blocknr == block && bh->b_size == size) { 1353 bh->b_blocknr == block && bh->b_size == size) {
1354 if (i) { 1354 if (i) {
1355 while (i) { 1355 while (i) {
1356 lru->bhs[i] = lru->bhs[i - 1]; 1356 lru->bhs[i] = lru->bhs[i - 1];
1357 i--; 1357 i--;
1358 } 1358 }
1359 lru->bhs[0] = bh; 1359 lru->bhs[0] = bh;
1360 } 1360 }
1361 get_bh(bh); 1361 get_bh(bh);
1362 ret = bh; 1362 ret = bh;
1363 break; 1363 break;
1364 } 1364 }
1365 } 1365 }
1366 bh_lru_unlock(); 1366 bh_lru_unlock();
1367 return ret; 1367 return ret;
1368 } 1368 }
1369 1369
1370 /* 1370 /*
1371 * Perform a pagecache lookup for the matching buffer. If it's there, refresh 1371 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1372 * it in the LRU and mark it as accessed. If it is not present then return 1372 * it in the LRU and mark it as accessed. If it is not present then return
1373 * NULL 1373 * NULL
1374 */ 1374 */
1375 struct buffer_head * 1375 struct buffer_head *
1376 __find_get_block(struct block_device *bdev, sector_t block, unsigned size) 1376 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1377 { 1377 {
1378 struct buffer_head *bh = lookup_bh_lru(bdev, block, size); 1378 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1379 1379
1380 if (bh == NULL) { 1380 if (bh == NULL) {
1381 bh = __find_get_block_slow(bdev, block); 1381 bh = __find_get_block_slow(bdev, block);
1382 if (bh) 1382 if (bh)
1383 bh_lru_install(bh); 1383 bh_lru_install(bh);
1384 } 1384 }
1385 if (bh) 1385 if (bh)
1386 touch_buffer(bh); 1386 touch_buffer(bh);
1387 return bh; 1387 return bh;
1388 } 1388 }
1389 EXPORT_SYMBOL(__find_get_block); 1389 EXPORT_SYMBOL(__find_get_block);
1390 1390
1391 /* 1391 /*
1392 * __getblk will locate (and, if necessary, create) the buffer_head 1392 * __getblk will locate (and, if necessary, create) the buffer_head
1393 * which corresponds to the passed block_device, block and size. The 1393 * which corresponds to the passed block_device, block and size. The
1394 * returned buffer has its reference count incremented. 1394 * returned buffer has its reference count incremented.
1395 * 1395 *
1396 * __getblk() cannot fail - it just keeps trying. If you pass it an 1396 * __getblk() cannot fail - it just keeps trying. If you pass it an
1397 * illegal block number, __getblk() will happily return a buffer_head 1397 * illegal block number, __getblk() will happily return a buffer_head
1398 * which represents the non-existent block. Very weird. 1398 * which represents the non-existent block. Very weird.
1399 * 1399 *
1400 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() 1400 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1401 * attempt is failing. FIXME, perhaps? 1401 * attempt is failing. FIXME, perhaps?
1402 */ 1402 */
1403 struct buffer_head * 1403 struct buffer_head *
1404 __getblk(struct block_device *bdev, sector_t block, unsigned size) 1404 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1405 { 1405 {
1406 struct buffer_head *bh = __find_get_block(bdev, block, size); 1406 struct buffer_head *bh = __find_get_block(bdev, block, size);
1407 1407
1408 might_sleep(); 1408 might_sleep();
1409 if (bh == NULL) 1409 if (bh == NULL)
1410 bh = __getblk_slow(bdev, block, size); 1410 bh = __getblk_slow(bdev, block, size);
1411 return bh; 1411 return bh;
1412 } 1412 }
1413 EXPORT_SYMBOL(__getblk); 1413 EXPORT_SYMBOL(__getblk);
1414 1414
1415 /* 1415 /*
1416 * Do async read-ahead on a buffer.. 1416 * Do async read-ahead on a buffer..
1417 */ 1417 */
1418 void __breadahead(struct block_device *bdev, sector_t block, unsigned size) 1418 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1419 { 1419 {
1420 struct buffer_head *bh = __getblk(bdev, block, size); 1420 struct buffer_head *bh = __getblk(bdev, block, size);
1421 if (likely(bh)) { 1421 if (likely(bh)) {
1422 ll_rw_block(READA, 1, &bh); 1422 ll_rw_block(READA, 1, &bh);
1423 brelse(bh); 1423 brelse(bh);
1424 } 1424 }
1425 } 1425 }
1426 EXPORT_SYMBOL(__breadahead); 1426 EXPORT_SYMBOL(__breadahead);
1427 1427
1428 /** 1428 /**
1429 * __bread() - reads a specified block and returns the bh 1429 * __bread() - reads a specified block and returns the bh
1430 * @bdev: the block_device to read from 1430 * @bdev: the block_device to read from
1431 * @block: number of block 1431 * @block: number of block
1432 * @size: size (in bytes) to read 1432 * @size: size (in bytes) to read
1433 * 1433 *
1434 * Reads a specified block, and returns buffer head that contains it. 1434 * Reads a specified block, and returns buffer head that contains it.
1435 * It returns NULL if the block was unreadable. 1435 * It returns NULL if the block was unreadable.
1436 */ 1436 */
1437 struct buffer_head * 1437 struct buffer_head *
1438 __bread(struct block_device *bdev, sector_t block, unsigned size) 1438 __bread(struct block_device *bdev, sector_t block, unsigned size)
1439 { 1439 {
1440 struct buffer_head *bh = __getblk(bdev, block, size); 1440 struct buffer_head *bh = __getblk(bdev, block, size);
1441 1441
1442 if (likely(bh) && !buffer_uptodate(bh)) 1442 if (likely(bh) && !buffer_uptodate(bh))
1443 bh = __bread_slow(bh); 1443 bh = __bread_slow(bh);
1444 return bh; 1444 return bh;
1445 } 1445 }
1446 EXPORT_SYMBOL(__bread); 1446 EXPORT_SYMBOL(__bread);
1447 1447
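As a read-side usage sketch: __bread() returns a referenced, up-to-date buffer_head (or NULL if the block was unreadable) and the caller drops the reference with brelse(). The block number 0 and the 4096-byte block size below are illustrative assumptions, as is the example_* name.

#include <linux/kernel.h>
#include <linux/buffer_head.h>

/* Minimal sketch: read the first block of a block device and dump the
 * first few bytes of its contents. */
static int example_peek_first_block(struct block_device *bdev)
{
        struct buffer_head *bh = __bread(bdev, 0, 4096);

        if (!bh)
                return -EIO;            /* block was unreadable */
        print_hex_dump_bytes("blk0: ", DUMP_PREFIX_OFFSET, bh->b_data, 16);
        brelse(bh);                     /* release the reference __bread() took */
        return 0;
}
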
1448 /* 1448 /*
1449 * invalidate_bh_lrus() is called rarely - but not only at unmount. 1449 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1450 * This doesn't race because it runs in each cpu either in irq 1450 * This doesn't race because it runs in each cpu either in irq
1451 * or with preempt disabled. 1451 * or with preempt disabled.
1452 */ 1452 */
1453 static void invalidate_bh_lru(void *arg) 1453 static void invalidate_bh_lru(void *arg)
1454 { 1454 {
1455 struct bh_lru *b = &get_cpu_var(bh_lrus); 1455 struct bh_lru *b = &get_cpu_var(bh_lrus);
1456 int i; 1456 int i;
1457 1457
1458 for (i = 0; i < BH_LRU_SIZE; i++) { 1458 for (i = 0; i < BH_LRU_SIZE; i++) {
1459 brelse(b->bhs[i]); 1459 brelse(b->bhs[i]);
1460 b->bhs[i] = NULL; 1460 b->bhs[i] = NULL;
1461 } 1461 }
1462 put_cpu_var(bh_lrus); 1462 put_cpu_var(bh_lrus);
1463 } 1463 }
1464 1464
1465 void invalidate_bh_lrus(void) 1465 void invalidate_bh_lrus(void)
1466 { 1466 {
1467 on_each_cpu(invalidate_bh_lru, NULL, 1, 1); 1467 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1468 } 1468 }
1469 EXPORT_SYMBOL_GPL(invalidate_bh_lrus); 1469 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1470 1470
1471 void set_bh_page(struct buffer_head *bh, 1471 void set_bh_page(struct buffer_head *bh,
1472 struct page *page, unsigned long offset) 1472 struct page *page, unsigned long offset)
1473 { 1473 {
1474 bh->b_page = page; 1474 bh->b_page = page;
1475 BUG_ON(offset >= PAGE_SIZE); 1475 BUG_ON(offset >= PAGE_SIZE);
1476 if (PageHighMem(page)) 1476 if (PageHighMem(page))
1477 /* 1477 /*
1478 * This catches illegal uses and preserves the offset: 1478 * This catches illegal uses and preserves the offset:
1479 */ 1479 */
1480 bh->b_data = (char *)(0 + offset); 1480 bh->b_data = (char *)(0 + offset);
1481 else 1481 else
1482 bh->b_data = page_address(page) + offset; 1482 bh->b_data = page_address(page) + offset;
1483 } 1483 }
1484 EXPORT_SYMBOL(set_bh_page); 1484 EXPORT_SYMBOL(set_bh_page);
1485 1485
1486 /* 1486 /*
1487 * Called when truncating a buffer on a page completely. 1487 * Called when truncating a buffer on a page completely.
1488 */ 1488 */
1489 static void discard_buffer(struct buffer_head * bh) 1489 static void discard_buffer(struct buffer_head * bh)
1490 { 1490 {
1491 lock_buffer(bh); 1491 lock_buffer(bh);
1492 clear_buffer_dirty(bh); 1492 clear_buffer_dirty(bh);
1493 bh->b_bdev = NULL; 1493 bh->b_bdev = NULL;
1494 clear_buffer_mapped(bh); 1494 clear_buffer_mapped(bh);
1495 clear_buffer_req(bh); 1495 clear_buffer_req(bh);
1496 clear_buffer_new(bh); 1496 clear_buffer_new(bh);
1497 clear_buffer_delay(bh); 1497 clear_buffer_delay(bh);
1498 clear_buffer_unwritten(bh); 1498 clear_buffer_unwritten(bh);
1499 unlock_buffer(bh); 1499 unlock_buffer(bh);
1500 } 1500 }
1501 1501
1502 /** 1502 /**
1503 * block_invalidatepage - invalidate part or all of a buffer-backed page 1503 * block_invalidatepage - invalidate part or all of a buffer-backed page
1504 * 1504 *
1505 * @page: the page which is affected 1505 * @page: the page which is affected
1506 * @offset: the index of the truncation point 1506 * @offset: the index of the truncation point
1507 * 1507 *
1508 * block_invalidatepage() is called when all or part of the page has become 1508 * block_invalidatepage() is called when all or part of the page has become
1509 * invalidated by a truncate operation. 1509 * invalidated by a truncate operation.
1510 * 1510 *
1511 * block_invalidatepage() does not have to release all buffers, but it must 1511 * block_invalidatepage() does not have to release all buffers, but it must
1512 * ensure that no dirty buffer is left outside @offset and that no I/O 1512 * ensure that no dirty buffer is left outside @offset and that no I/O
1513 * is underway against any of the blocks which are outside the truncation 1513 * is underway against any of the blocks which are outside the truncation
1514 * point. Because the caller is about to free (and possibly reuse) those 1514 * point. Because the caller is about to free (and possibly reuse) those
1515 * blocks on-disk. 1515 * blocks on-disk.
1516 */ 1516 */
1517 void block_invalidatepage(struct page *page, unsigned long offset) 1517 void block_invalidatepage(struct page *page, unsigned long offset)
1518 { 1518 {
1519 struct buffer_head *head, *bh, *next; 1519 struct buffer_head *head, *bh, *next;
1520 unsigned int curr_off = 0; 1520 unsigned int curr_off = 0;
1521 1521
1522 BUG_ON(!PageLocked(page)); 1522 BUG_ON(!PageLocked(page));
1523 if (!page_has_buffers(page)) 1523 if (!page_has_buffers(page))
1524 goto out; 1524 goto out;
1525 1525
1526 head = page_buffers(page); 1526 head = page_buffers(page);
1527 bh = head; 1527 bh = head;
1528 do { 1528 do {
1529 unsigned int next_off = curr_off + bh->b_size; 1529 unsigned int next_off = curr_off + bh->b_size;
1530 next = bh->b_this_page; 1530 next = bh->b_this_page;
1531 1531
1532 /* 1532 /*
1533 * is this block fully invalidated? 1533 * is this block fully invalidated?
1534 */ 1534 */
1535 if (offset <= curr_off) 1535 if (offset <= curr_off)
1536 discard_buffer(bh); 1536 discard_buffer(bh);
1537 curr_off = next_off; 1537 curr_off = next_off;
1538 bh = next; 1538 bh = next;
1539 } while (bh != head); 1539 } while (bh != head);
1540 1540
1541 /* 1541 /*
1542 * We release buffers only if the entire page is being invalidated. 1542 * We release buffers only if the entire page is being invalidated.
1543 * The get_block cached value has been unconditionally invalidated, 1543 * The get_block cached value has been unconditionally invalidated,
1544 * so real IO is not possible anymore. 1544 * so real IO is not possible anymore.
1545 */ 1545 */
1546 if (offset == 0) 1546 if (offset == 0)
1547 try_to_release_page(page, 0); 1547 try_to_release_page(page, 0);
1548 out: 1548 out:
1549 return; 1549 return;
1550 } 1550 }
1551 EXPORT_SYMBOL(block_invalidatepage); 1551 EXPORT_SYMBOL(block_invalidatepage);
1552 1552
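To show where these exported helpers usually end up, here is a sketch of how a simple block-based filesystem might wire them into its address_space_operations, handing its own get_block callback to the generic read and write paths. All examplefs_* names are hypothetical (one possible examplefs_get_block is sketched after __block_write_full_page below); block_read_full_page, block_write_full_page, block_sync_page and block_invalidatepage are the real helpers from fs/buffer.c.

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>

/* Hypothetical filesystem callback; see the delayed-allocation sketch below. */
int examplefs_get_block(struct inode *, sector_t, struct buffer_head *, int);

static int examplefs_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, examplefs_get_block);
}

static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, examplefs_get_block, wbc);
}

static const struct address_space_operations examplefs_aops = {
        .readpage       = examplefs_readpage,
        .writepage      = examplefs_writepage,
        .sync_page      = block_sync_page,
        .invalidatepage = block_invalidatepage,
};
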
1553 /* 1553 /*
1554 * We attach and possibly dirty the buffers atomically wrt 1554 * We attach and possibly dirty the buffers atomically wrt
1555 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers 1555 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1556 * is already excluded via the page lock. 1556 * is already excluded via the page lock.
1557 */ 1557 */
1558 void create_empty_buffers(struct page *page, 1558 void create_empty_buffers(struct page *page,
1559 unsigned long blocksize, unsigned long b_state) 1559 unsigned long blocksize, unsigned long b_state)
1560 { 1560 {
1561 struct buffer_head *bh, *head, *tail; 1561 struct buffer_head *bh, *head, *tail;
1562 1562
1563 head = alloc_page_buffers(page, blocksize, 1); 1563 head = alloc_page_buffers(page, blocksize, 1);
1564 bh = head; 1564 bh = head;
1565 do { 1565 do {
1566 bh->b_state |= b_state; 1566 bh->b_state |= b_state;
1567 tail = bh; 1567 tail = bh;
1568 bh = bh->b_this_page; 1568 bh = bh->b_this_page;
1569 } while (bh); 1569 } while (bh);
1570 tail->b_this_page = head; 1570 tail->b_this_page = head;
1571 1571
1572 spin_lock(&page->mapping->private_lock); 1572 spin_lock(&page->mapping->private_lock);
1573 if (PageUptodate(page) || PageDirty(page)) { 1573 if (PageUptodate(page) || PageDirty(page)) {
1574 bh = head; 1574 bh = head;
1575 do { 1575 do {
1576 if (PageDirty(page)) 1576 if (PageDirty(page))
1577 set_buffer_dirty(bh); 1577 set_buffer_dirty(bh);
1578 if (PageUptodate(page)) 1578 if (PageUptodate(page))
1579 set_buffer_uptodate(bh); 1579 set_buffer_uptodate(bh);
1580 bh = bh->b_this_page; 1580 bh = bh->b_this_page;
1581 } while (bh != head); 1581 } while (bh != head);
1582 } 1582 }
1583 attach_page_buffers(page, head); 1583 attach_page_buffers(page, head);
1584 spin_unlock(&page->mapping->private_lock); 1584 spin_unlock(&page->mapping->private_lock);
1585 } 1585 }
1586 EXPORT_SYMBOL(create_empty_buffers); 1586 EXPORT_SYMBOL(create_empty_buffers);
1587 1587
1588 /* 1588 /*
1589 * We are taking a block for data and we don't want any output from any 1589 * We are taking a block for data and we don't want any output from any
1590 * buffer-cache aliases starting from return from that function and 1590 * buffer-cache aliases starting from return from that function and
1591 * until the moment when something will explicitly mark the buffer 1591 * until the moment when something will explicitly mark the buffer
1592 * dirty (hopefully that will not happen until we free that block ;-) 1592 * dirty (hopefully that will not happen until we free that block ;-)
1593 * We don't even need to mark it not-uptodate - nobody can expect 1593 * We don't even need to mark it not-uptodate - nobody can expect
1594 * anything from a newly allocated buffer anyway. We used to use 1594 * anything from a newly allocated buffer anyway. We used to use
1595 * unmap_buffer() for such invalidation, but that was wrong. We definitely 1595 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1596 * don't want to mark the alias unmapped, for example - it would confuse 1596 * don't want to mark the alias unmapped, for example - it would confuse
1597 * anyone who might pick it with bread() afterwards... 1597 * anyone who might pick it with bread() afterwards...
1598 * 1598 *
1599 * Also.. Note that bforget() doesn't lock the buffer. So there can 1599 * Also.. Note that bforget() doesn't lock the buffer. So there can
1600 * be writeout I/O going on against recently-freed buffers. We don't 1600 * be writeout I/O going on against recently-freed buffers. We don't
1601 * wait on that I/O in bforget() - it's more efficient to wait on the I/O 1601 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1602 * only if we really need to. That happens here. 1602 * only if we really need to. That happens here.
1603 */ 1603 */
1604 void unmap_underlying_metadata(struct block_device *bdev, sector_t block) 1604 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1605 { 1605 {
1606 struct buffer_head *old_bh; 1606 struct buffer_head *old_bh;
1607 1607
1608 might_sleep(); 1608 might_sleep();
1609 1609
1610 old_bh = __find_get_block_slow(bdev, block); 1610 old_bh = __find_get_block_slow(bdev, block);
1611 if (old_bh) { 1611 if (old_bh) {
1612 clear_buffer_dirty(old_bh); 1612 clear_buffer_dirty(old_bh);
1613 wait_on_buffer(old_bh); 1613 wait_on_buffer(old_bh);
1614 clear_buffer_req(old_bh); 1614 clear_buffer_req(old_bh);
1615 __brelse(old_bh); 1615 __brelse(old_bh);
1616 } 1616 }
1617 } 1617 }
1618 EXPORT_SYMBOL(unmap_underlying_metadata); 1618 EXPORT_SYMBOL(unmap_underlying_metadata);
1619 1619
1620 /* 1620 /*
1621 * NOTE! All mapped/uptodate combinations are valid: 1621 * NOTE! All mapped/uptodate combinations are valid:
1622 * 1622 *
1623 * Mapped Uptodate Meaning 1623 * Mapped Uptodate Meaning
1624 * 1624 *
1625 * No No "unknown" - must do get_block() 1625 * No No "unknown" - must do get_block()
1626 * No Yes "hole" - zero-filled 1626 * No Yes "hole" - zero-filled
1627 * Yes No "allocated" - allocated on disk, not read in 1627 * Yes No "allocated" - allocated on disk, not read in
1628 * Yes Yes "valid" - allocated and up-to-date in memory. 1628 * Yes Yes "valid" - allocated and up-to-date in memory.
1629 * 1629 *
1630 * "Dirty" is valid only with the last case (mapped+uptodate). 1630 * "Dirty" is valid only with the last case (mapped+uptodate).
1631 */ 1631 */
1632 1632
1633 /* 1633 /*
1634 * While block_write_full_page is writing back the dirty buffers under 1634 * While block_write_full_page is writing back the dirty buffers under
1635 * the page lock, whoever dirtied the buffers may decide to clean them 1635 * the page lock, whoever dirtied the buffers may decide to clean them
1636 * again at any time. We handle that by only looking at the buffer 1636 * again at any time. We handle that by only looking at the buffer
1637 * state inside lock_buffer(). 1637 * state inside lock_buffer().
1638 * 1638 *
1639 * If block_write_full_page() is called for regular writeback 1639 * If block_write_full_page() is called for regular writeback
1640 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a 1640 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1641 * locked buffer. This only can happen if someone has written the buffer 1641 * locked buffer. This only can happen if someone has written the buffer
1642 * directly, with submit_bh(). At the address_space level PageWriteback 1642 * directly, with submit_bh(). At the address_space level PageWriteback
1643 * prevents this contention from occurring. 1643 * prevents this contention from occurring.
1644 */ 1644 */
1645 static int __block_write_full_page(struct inode *inode, struct page *page, 1645 static int __block_write_full_page(struct inode *inode, struct page *page,
1646 get_block_t *get_block, struct writeback_control *wbc) 1646 get_block_t *get_block, struct writeback_control *wbc)
1647 { 1647 {
1648 int err; 1648 int err;
1649 sector_t block; 1649 sector_t block;
1650 sector_t last_block; 1650 sector_t last_block;
1651 struct buffer_head *bh, *head; 1651 struct buffer_head *bh, *head;
1652 const unsigned blocksize = 1 << inode->i_blkbits; 1652 const unsigned blocksize = 1 << inode->i_blkbits;
1653 int nr_underway = 0; 1653 int nr_underway = 0;
1654 1654
1655 BUG_ON(!PageLocked(page)); 1655 BUG_ON(!PageLocked(page));
1656 1656
1657 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; 1657 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1658 1658
1659 if (!page_has_buffers(page)) { 1659 if (!page_has_buffers(page)) {
1660 create_empty_buffers(page, blocksize, 1660 create_empty_buffers(page, blocksize,
1661 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1661 (1 << BH_Dirty)|(1 << BH_Uptodate));
1662 } 1662 }
1663 1663
1664 /* 1664 /*
1665 * Be very careful. We have no exclusion from __set_page_dirty_buffers 1665 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1666 * here, and the (potentially unmapped) buffers may become dirty at 1666 * here, and the (potentially unmapped) buffers may become dirty at
1667 * any time. If a buffer becomes dirty here after we've inspected it 1667 * any time. If a buffer becomes dirty here after we've inspected it
1668 * then we just miss that fact, and the page stays dirty. 1668 * then we just miss that fact, and the page stays dirty.
1669 * 1669 *
1670 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; 1670 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1671 * handle that here by just cleaning them. 1671 * handle that here by just cleaning them.
1672 */ 1672 */
1673 1673
1674 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1674 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1675 head = page_buffers(page); 1675 head = page_buffers(page);
1676 bh = head; 1676 bh = head;
1677 1677
1678 /* 1678 /*
1679 * Get all the dirty buffers mapped to disk addresses and 1679 * Get all the dirty buffers mapped to disk addresses and
1680 * handle any aliases from the underlying blockdev's mapping. 1680 * handle any aliases from the underlying blockdev's mapping.
1681 */ 1681 */
1682 do { 1682 do {
1683 if (block > last_block) { 1683 if (block > last_block) {
1684 /* 1684 /*
1685 * mapped buffers outside i_size will occur, because 1685 * mapped buffers outside i_size will occur, because
1686 * this page can be outside i_size when there is a 1686 * this page can be outside i_size when there is a
1687 * truncate in progress. 1687 * truncate in progress.
1688 */ 1688 */
1689 /* 1689 /*
1690 * The buffer was zeroed by block_write_full_page() 1690 * The buffer was zeroed by block_write_full_page()
1691 */ 1691 */
1692 clear_buffer_dirty(bh); 1692 clear_buffer_dirty(bh);
1693 set_buffer_uptodate(bh); 1693 set_buffer_uptodate(bh);
1694 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { 1694 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1695 buffer_dirty(bh)) {
1695 WARN_ON(bh->b_size != blocksize); 1696 WARN_ON(bh->b_size != blocksize);
1696 err = get_block(inode, block, bh, 1); 1697 err = get_block(inode, block, bh, 1);
1697 if (err) 1698 if (err)
1698 goto recover; 1699 goto recover;
1700 clear_buffer_delay(bh);
1699 if (buffer_new(bh)) { 1701 if (buffer_new(bh)) {
1700 /* blockdev mappings never come here */ 1702 /* blockdev mappings never come here */
1701 clear_buffer_new(bh); 1703 clear_buffer_new(bh);
1702 unmap_underlying_metadata(bh->b_bdev, 1704 unmap_underlying_metadata(bh->b_bdev,
1703 bh->b_blocknr); 1705 bh->b_blocknr);
1704 } 1706 }
1705 } 1707 }
1706 bh = bh->b_this_page; 1708 bh = bh->b_this_page;
1707 block++; 1709 block++;
1708 } while (bh != head); 1710 } while (bh != head);
1709 1711
1710 do { 1712 do {
1711 if (!buffer_mapped(bh)) 1713 if (!buffer_mapped(bh))
1712 continue; 1714 continue;
1713 /* 1715 /*
1714 * If it's a fully non-blocking write attempt and we cannot 1716 * If it's a fully non-blocking write attempt and we cannot
1715 * lock the buffer then redirty the page. Note that this can 1717 * lock the buffer then redirty the page. Note that this can
1716 * potentially cause a busy-wait loop from pdflush and kswapd 1718 * potentially cause a busy-wait loop from pdflush and kswapd
1717 * activity, but those code paths have their own higher-level 1719 * activity, but those code paths have their own higher-level
1718 * throttling. 1720 * throttling.
1719 */ 1721 */
1720 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1722 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1721 lock_buffer(bh); 1723 lock_buffer(bh);
1722 } else if (test_set_buffer_locked(bh)) { 1724 } else if (test_set_buffer_locked(bh)) {
1723 redirty_page_for_writepage(wbc, page); 1725 redirty_page_for_writepage(wbc, page);
1724 continue; 1726 continue;
1725 } 1727 }
1726 if (test_clear_buffer_dirty(bh)) { 1728 if (test_clear_buffer_dirty(bh)) {
1727 mark_buffer_async_write(bh); 1729 mark_buffer_async_write(bh);
1728 } else { 1730 } else {
1729 unlock_buffer(bh); 1731 unlock_buffer(bh);
1730 } 1732 }
1731 } while ((bh = bh->b_this_page) != head); 1733 } while ((bh = bh->b_this_page) != head);
1732 1734
1733 /* 1735 /*
1734 * The page and its buffers are protected by PageWriteback(), so we can 1736 * The page and its buffers are protected by PageWriteback(), so we can
1735 * drop the bh refcounts early. 1737 * drop the bh refcounts early.
1736 */ 1738 */
1737 BUG_ON(PageWriteback(page)); 1739 BUG_ON(PageWriteback(page));
1738 set_page_writeback(page); 1740 set_page_writeback(page);
1739 1741
1740 do { 1742 do {
1741 struct buffer_head *next = bh->b_this_page; 1743 struct buffer_head *next = bh->b_this_page;
1742 if (buffer_async_write(bh)) { 1744 if (buffer_async_write(bh)) {
1743 submit_bh(WRITE, bh); 1745 submit_bh(WRITE, bh);
1744 nr_underway++; 1746 nr_underway++;
1745 } 1747 }
1746 bh = next; 1748 bh = next;
1747 } while (bh != head); 1749 } while (bh != head);
1748 unlock_page(page); 1750 unlock_page(page);
1749 1751
1750 err = 0; 1752 err = 0;
1751 done: 1753 done:
1752 if (nr_underway == 0) { 1754 if (nr_underway == 0) {
1753 /* 1755 /*
1754 * The page was marked dirty, but the buffers were 1756 * The page was marked dirty, but the buffers were
1755 * clean. Someone wrote them back by hand with 1757 * clean. Someone wrote them back by hand with
1756 * ll_rw_block/submit_bh. A rare case. 1758 * ll_rw_block/submit_bh. A rare case.
1757 */ 1759 */
1758 end_page_writeback(page); 1760 end_page_writeback(page);
1759 1761
1760 /* 1762 /*
1761 * The page and buffer_heads can be released at any time from 1763 * The page and buffer_heads can be released at any time from
1762 * here on. 1764 * here on.
1763 */ 1765 */
1764 } 1766 }
1765 return err; 1767 return err;
1766 1768
1767 recover: 1769 recover:
1768 /* 1770 /*
1769 * ENOSPC, or some other error. We may already have added some 1771 * ENOSPC, or some other error. We may already have added some
1770 * blocks to the file, so we need to write these out to avoid 1772 * blocks to the file, so we need to write these out to avoid
1771 * exposing stale data. 1773 * exposing stale data.
1772 * The page is currently locked and not marked for writeback 1774 * The page is currently locked and not marked for writeback
1773 */ 1775 */
1774 bh = head; 1776 bh = head;
1775 /* Recovery: lock and submit the mapped buffers */ 1777 /* Recovery: lock and submit the mapped buffers */
1776 do { 1778 do {
1777 if (buffer_mapped(bh) && buffer_dirty(bh)) { 1779 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1780 !buffer_delay(bh)) {
1778 lock_buffer(bh); 1781 lock_buffer(bh);
1779 mark_buffer_async_write(bh); 1782 mark_buffer_async_write(bh);
1780 } else { 1783 } else {
1781 /* 1784 /*
1782 * The buffer may have been set dirty during 1785 * The buffer may have been set dirty during
1783 * attachment to a dirty page. 1786 * attachment to a dirty page.
1784 */ 1787 */
1785 clear_buffer_dirty(bh); 1788 clear_buffer_dirty(bh);
1786 } 1789 }
1787 } while ((bh = bh->b_this_page) != head); 1790 } while ((bh = bh->b_this_page) != head);
1788 SetPageError(page); 1791 SetPageError(page);
1789 BUG_ON(PageWriteback(page)); 1792 BUG_ON(PageWriteback(page));
1790 mapping_set_error(page->mapping, err); 1793 mapping_set_error(page->mapping, err);
1791 set_page_writeback(page); 1794 set_page_writeback(page);
1792 do { 1795 do {
1793 struct buffer_head *next = bh->b_this_page; 1796 struct buffer_head *next = bh->b_this_page;
1794 if (buffer_async_write(bh)) { 1797 if (buffer_async_write(bh)) {
1795 clear_buffer_dirty(bh); 1798 clear_buffer_dirty(bh);
1796 submit_bh(WRITE, bh); 1799 submit_bh(WRITE, bh);
1797 nr_underway++; 1800 nr_underway++;
1798 } 1801 }
1799 bh = next; 1802 bh = next;
1800 } while (bh != head); 1803 } while (bh != head);
1801 unlock_page(page); 1804 unlock_page(page);
1802 goto done; 1805 goto done;
1803 } 1806 }
1804 1807
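The BH_Delay case added to __block_write_full_page above is the hook for delayed allocation: a filesystem's write path can mark a dirty buffer BH_Delay instead of allocating a block, and writeback now calls get_block() with create == 1 for that buffer and then clears the delay bit. A minimal sketch of a matching get_block callback follows; examplefs_lookup_block() and examplefs_alloc_block() are purely hypothetical mapping/allocator helpers, with 0 standing in for "no block".

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Hypothetical helpers standing in for the filesystem's block map and
 * allocator; the buffer-state handling is the point of the sketch. */
extern sector_t examplefs_lookup_block(struct inode *inode, sector_t iblock);
extern sector_t examplefs_alloc_block(struct inode *inode, sector_t iblock);

int examplefs_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
{
        sector_t phys = examplefs_lookup_block(inode, iblock);

        if (phys)
                goto map;
        if (!create)
                return 0;       /* read of a hole: leave the buffer unmapped */

        /* A delayed (BH_Delay) buffer reaching writeback lands here:
         * allocate the on-disk block only now. */
        phys = examplefs_alloc_block(inode, iblock);
        if (!phys)
                return -ENOSPC;
        set_buffer_new(bh_result);      /* caller unmaps any metadata alias */
map:
        map_bh(bh_result, inode->i_sb, phys);   /* BH_Mapped + bdev + blocknr */
        return 0;
}
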
1805 /* 1808 /*
1806 * If a page has any new buffers, zero them out here, and mark them uptodate 1809 * If a page has any new buffers, zero them out here, and mark them uptodate
1807 * and dirty so they'll be written out (in order to prevent uninitialised 1810 * and dirty so they'll be written out (in order to prevent uninitialised
1808 * block data from leaking). And clear the new bit. 1811 * block data from leaking). And clear the new bit.
1809 */ 1812 */
1810 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) 1813 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1811 { 1814 {
1812 unsigned int block_start, block_end; 1815 unsigned int block_start, block_end;
1813 struct buffer_head *head, *bh; 1816 struct buffer_head *head, *bh;
1814 1817
1815 BUG_ON(!PageLocked(page)); 1818 BUG_ON(!PageLocked(page));
1816 if (!page_has_buffers(page)) 1819 if (!page_has_buffers(page))
1817 return; 1820 return;
1818 1821
1819 bh = head = page_buffers(page); 1822 bh = head = page_buffers(page);
1820 block_start = 0; 1823 block_start = 0;
1821 do { 1824 do {
1822 block_end = block_start + bh->b_size; 1825 block_end = block_start + bh->b_size;
1823 1826
1824 if (buffer_new(bh)) { 1827 if (buffer_new(bh)) {
1825 if (block_end > from && block_start < to) { 1828 if (block_end > from && block_start < to) {
1826 if (!PageUptodate(page)) { 1829 if (!PageUptodate(page)) {
1827 unsigned start, size; 1830 unsigned start, size;
1828 1831
1829 start = max(from, block_start); 1832 start = max(from, block_start);
1830 size = min(to, block_end) - start; 1833 size = min(to, block_end) - start;
1831 1834
1832 zero_user(page, start, size); 1835 zero_user(page, start, size);
1833 set_buffer_uptodate(bh); 1836 set_buffer_uptodate(bh);
1834 } 1837 }
1835 1838
1836 clear_buffer_new(bh); 1839 clear_buffer_new(bh);
1837 mark_buffer_dirty(bh); 1840 mark_buffer_dirty(bh);
1838 } 1841 }
1839 } 1842 }
1840 1843
1841 block_start = block_end; 1844 block_start = block_end;
1842 bh = bh->b_this_page; 1845 bh = bh->b_this_page;
1843 } while (bh != head); 1846 } while (bh != head);
1844 } 1847 }
1845 EXPORT_SYMBOL(page_zero_new_buffers); 1848 EXPORT_SYMBOL(page_zero_new_buffers);
1846 1849
1847 static int __block_prepare_write(struct inode *inode, struct page *page, 1850 static int __block_prepare_write(struct inode *inode, struct page *page,
1848 unsigned from, unsigned to, get_block_t *get_block) 1851 unsigned from, unsigned to, get_block_t *get_block)
1849 { 1852 {
1850 unsigned block_start, block_end; 1853 unsigned block_start, block_end;
1851 sector_t block; 1854 sector_t block;
1852 int err = 0; 1855 int err = 0;
1853 unsigned blocksize, bbits; 1856 unsigned blocksize, bbits;
1854 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; 1857 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1855 1858
1856 BUG_ON(!PageLocked(page)); 1859 BUG_ON(!PageLocked(page));
1857 BUG_ON(from > PAGE_CACHE_SIZE); 1860 BUG_ON(from > PAGE_CACHE_SIZE);
1858 BUG_ON(to > PAGE_CACHE_SIZE); 1861 BUG_ON(to > PAGE_CACHE_SIZE);
1859 BUG_ON(from > to); 1862 BUG_ON(from > to);
1860 1863
1861 blocksize = 1 << inode->i_blkbits; 1864 blocksize = 1 << inode->i_blkbits;
1862 if (!page_has_buffers(page)) 1865 if (!page_has_buffers(page))
1863 create_empty_buffers(page, blocksize, 0); 1866 create_empty_buffers(page, blocksize, 0);
1864 head = page_buffers(page); 1867 head = page_buffers(page);
1865 1868
1866 bbits = inode->i_blkbits; 1869 bbits = inode->i_blkbits;
1867 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); 1870 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1868 1871
1869 for(bh = head, block_start = 0; bh != head || !block_start; 1872 for(bh = head, block_start = 0; bh != head || !block_start;
1870 block++, block_start=block_end, bh = bh->b_this_page) { 1873 block++, block_start=block_end, bh = bh->b_this_page) {
1871 block_end = block_start + blocksize; 1874 block_end = block_start + blocksize;
1872 if (block_end <= from || block_start >= to) { 1875 if (block_end <= from || block_start >= to) {
1873 if (PageUptodate(page)) { 1876 if (PageUptodate(page)) {
1874 if (!buffer_uptodate(bh)) 1877 if (!buffer_uptodate(bh))
1875 set_buffer_uptodate(bh); 1878 set_buffer_uptodate(bh);
1876 } 1879 }
1877 continue; 1880 continue;
1878 } 1881 }
1879 if (buffer_new(bh)) 1882 if (buffer_new(bh))
1880 clear_buffer_new(bh); 1883 clear_buffer_new(bh);
1881 if (!buffer_mapped(bh)) { 1884 if (!buffer_mapped(bh)) {
1882 WARN_ON(bh->b_size != blocksize); 1885 WARN_ON(bh->b_size != blocksize);
1883 err = get_block(inode, block, bh, 1); 1886 err = get_block(inode, block, bh, 1);
1884 if (err) 1887 if (err)
1885 break; 1888 break;
1886 if (buffer_new(bh)) { 1889 if (buffer_new(bh)) {
1887 unmap_underlying_metadata(bh->b_bdev, 1890 unmap_underlying_metadata(bh->b_bdev,
1888 bh->b_blocknr); 1891 bh->b_blocknr);
1889 if (PageUptodate(page)) { 1892 if (PageUptodate(page)) {
1890 clear_buffer_new(bh); 1893 clear_buffer_new(bh);
1891 set_buffer_uptodate(bh); 1894 set_buffer_uptodate(bh);
1892 mark_buffer_dirty(bh); 1895 mark_buffer_dirty(bh);
1893 continue; 1896 continue;
1894 } 1897 }
1895 if (block_end > to || block_start < from) 1898 if (block_end > to || block_start < from)
1896 zero_user_segments(page, 1899 zero_user_segments(page,
1897 to, block_end, 1900 to, block_end,
1898 block_start, from); 1901 block_start, from);
1899 continue; 1902 continue;
1900 } 1903 }
1901 } 1904 }
1902 if (PageUptodate(page)) { 1905 if (PageUptodate(page)) {
1903 if (!buffer_uptodate(bh)) 1906 if (!buffer_uptodate(bh))
1904 set_buffer_uptodate(bh); 1907 set_buffer_uptodate(bh);
1905 continue; 1908 continue;
1906 } 1909 }
1907 if (!buffer_uptodate(bh) && !buffer_delay(bh) && 1910 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1908 !buffer_unwritten(bh) && 1911 !buffer_unwritten(bh) &&
1909 (block_start < from || block_end > to)) { 1912 (block_start < from || block_end > to)) {
1910 ll_rw_block(READ, 1, &bh); 1913 ll_rw_block(READ, 1, &bh);
1911 *wait_bh++=bh; 1914 *wait_bh++=bh;
1912 } 1915 }
1913 } 1916 }
1914 /* 1917 /*
1915 * If we issued read requests - let them complete. 1918 * If we issued read requests - let them complete.
1916 */ 1919 */
1917 while(wait_bh > wait) { 1920 while(wait_bh > wait) {
1918 wait_on_buffer(*--wait_bh); 1921 wait_on_buffer(*--wait_bh);
1919 if (!buffer_uptodate(*wait_bh)) 1922 if (!buffer_uptodate(*wait_bh))
1920 err = -EIO; 1923 err = -EIO;
1921 } 1924 }
1922 if (unlikely(err)) 1925 if (unlikely(err))
1923 page_zero_new_buffers(page, from, to); 1926 page_zero_new_buffers(page, from, to);
1924 return err; 1927 return err;
1925 } 1928 }
1926 1929
1927 static int __block_commit_write(struct inode *inode, struct page *page, 1930 static int __block_commit_write(struct inode *inode, struct page *page,
1928 unsigned from, unsigned to) 1931 unsigned from, unsigned to)
1929 { 1932 {
1930 unsigned block_start, block_end; 1933 unsigned block_start, block_end;
1931 int partial = 0; 1934 int partial = 0;
1932 unsigned blocksize; 1935 unsigned blocksize;
1933 struct buffer_head *bh, *head; 1936 struct buffer_head *bh, *head;
1934 1937
1935 blocksize = 1 << inode->i_blkbits; 1938 blocksize = 1 << inode->i_blkbits;
1936 1939
1937 for(bh = head = page_buffers(page), block_start = 0; 1940 for(bh = head = page_buffers(page), block_start = 0;
1938 bh != head || !block_start; 1941 bh != head || !block_start;
1939 block_start=block_end, bh = bh->b_this_page) { 1942 block_start=block_end, bh = bh->b_this_page) {
1940 block_end = block_start + blocksize; 1943 block_end = block_start + blocksize;
1941 if (block_end <= from || block_start >= to) { 1944 if (block_end <= from || block_start >= to) {
1942 if (!buffer_uptodate(bh)) 1945 if (!buffer_uptodate(bh))
1943 partial = 1; 1946 partial = 1;
1944 } else { 1947 } else {
1945 set_buffer_uptodate(bh); 1948 set_buffer_uptodate(bh);
1946 mark_buffer_dirty(bh); 1949 mark_buffer_dirty(bh);
1947 } 1950 }
1948 clear_buffer_new(bh); 1951 clear_buffer_new(bh);
1949 } 1952 }
1950 1953
1951 /* 1954 /*
1952 * If this is a partial write which happened to make all buffers 1955 * If this is a partial write which happened to make all buffers
1953 * uptodate then we can optimize away a bogus readpage() for 1956 * uptodate then we can optimize away a bogus readpage() for
1954 * the next read(). Here we 'discover' whether the page went 1957 * the next read(). Here we 'discover' whether the page went
1955 * uptodate as a result of this (potentially partial) write. 1958 * uptodate as a result of this (potentially partial) write.
1956 */ 1959 */
1957 if (!partial) 1960 if (!partial)
1958 SetPageUptodate(page); 1961 SetPageUptodate(page);
1959 return 0; 1962 return 0;
1960 } 1963 }
1961 1964
1962 /* 1965 /*
1963 * block_write_begin takes care of the basic task of block allocation and 1966 * block_write_begin takes care of the basic task of block allocation and
1964 * bringing partial write blocks uptodate first. 1967 * bringing partial write blocks uptodate first.
1965 * 1968 *
1966 * If *pagep is not NULL, then block_write_begin uses the locked page 1969 * If *pagep is not NULL, then block_write_begin uses the locked page
1967 * at *pagep rather than allocating its own. In this case, the page will 1970 * at *pagep rather than allocating its own. In this case, the page will
1968 * not be unlocked or deallocated on failure. 1971 * not be unlocked or deallocated on failure.
1969 */ 1972 */
1970 int block_write_begin(struct file *file, struct address_space *mapping, 1973 int block_write_begin(struct file *file, struct address_space *mapping,
1971 loff_t pos, unsigned len, unsigned flags, 1974 loff_t pos, unsigned len, unsigned flags,
1972 struct page **pagep, void **fsdata, 1975 struct page **pagep, void **fsdata,
1973 get_block_t *get_block) 1976 get_block_t *get_block)
1974 { 1977 {
1975 struct inode *inode = mapping->host; 1978 struct inode *inode = mapping->host;
1976 int status = 0; 1979 int status = 0;
1977 struct page *page; 1980 struct page *page;
1978 pgoff_t index; 1981 pgoff_t index;
1979 unsigned start, end; 1982 unsigned start, end;
1980 int ownpage = 0; 1983 int ownpage = 0;
1981 1984
1982 index = pos >> PAGE_CACHE_SHIFT; 1985 index = pos >> PAGE_CACHE_SHIFT;
1983 start = pos & (PAGE_CACHE_SIZE - 1); 1986 start = pos & (PAGE_CACHE_SIZE - 1);
1984 end = start + len; 1987 end = start + len;
1985 1988
1986 page = *pagep; 1989 page = *pagep;
1987 if (page == NULL) { 1990 if (page == NULL) {
1988 ownpage = 1; 1991 ownpage = 1;
1989 page = __grab_cache_page(mapping, index); 1992 page = __grab_cache_page(mapping, index);
1990 if (!page) { 1993 if (!page) {
1991 status = -ENOMEM; 1994 status = -ENOMEM;
1992 goto out; 1995 goto out;
1993 } 1996 }
1994 *pagep = page; 1997 *pagep = page;
1995 } else 1998 } else
1996 BUG_ON(!PageLocked(page)); 1999 BUG_ON(!PageLocked(page));
1997 2000
1998 status = __block_prepare_write(inode, page, start, end, get_block); 2001 status = __block_prepare_write(inode, page, start, end, get_block);
1999 if (unlikely(status)) { 2002 if (unlikely(status)) {
2000 ClearPageUptodate(page); 2003 ClearPageUptodate(page);
2001 2004
2002 if (ownpage) { 2005 if (ownpage) {
2003 unlock_page(page); 2006 unlock_page(page);
2004 page_cache_release(page); 2007 page_cache_release(page);
2005 *pagep = NULL; 2008 *pagep = NULL;
2006 2009
2007 /* 2010 /*
2008 * prepare_write() may have instantiated a few blocks 2011 * prepare_write() may have instantiated a few blocks
2009 * outside i_size. Trim these off again. Don't need 2012 * outside i_size. Trim these off again. Don't need
2010 * i_size_read because we hold i_mutex. 2013 * i_size_read because we hold i_mutex.
2011 */ 2014 */
2012 if (pos + len > inode->i_size) 2015 if (pos + len > inode->i_size)
2013 vmtruncate(inode, inode->i_size); 2016 vmtruncate(inode, inode->i_size);
2014 } 2017 }
2015 goto out; 2018 goto out;
2016 } 2019 }
2017 2020
2018 out: 2021 out:
2019 return status; 2022 return status;
2020 } 2023 }
2021 EXPORT_SYMBOL(block_write_begin); 2024 EXPORT_SYMBOL(block_write_begin);
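A filesystem typically wraps block_write_begin() directly in its ->write_begin address_space operation, passing NULL in *pagep so the helper grabs and locks the page itself. A minimal sketch under that assumption; myfs_write_begin and myfs_get_block are hypothetical names, not part of this patch:

	static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		*pagep = NULL;	/* let block_write_begin allocate and lock the page */
		return block_write_begin(file, mapping, pos, len, flags,
					 pagep, fsdata, myfs_get_block);
	}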
2022 2025
2023 int block_write_end(struct file *file, struct address_space *mapping, 2026 int block_write_end(struct file *file, struct address_space *mapping,
2024 loff_t pos, unsigned len, unsigned copied, 2027 loff_t pos, unsigned len, unsigned copied,
2025 struct page *page, void *fsdata) 2028 struct page *page, void *fsdata)
2026 { 2029 {
2027 struct inode *inode = mapping->host; 2030 struct inode *inode = mapping->host;
2028 unsigned start; 2031 unsigned start;
2029 2032
2030 start = pos & (PAGE_CACHE_SIZE - 1); 2033 start = pos & (PAGE_CACHE_SIZE - 1);
2031 2034
2032 if (unlikely(copied < len)) { 2035 if (unlikely(copied < len)) {
2033 /* 2036 /*
2034 * The buffers that were written will now be uptodate, so we 2037 * The buffers that were written will now be uptodate, so we
2035 * don't have to worry about a readpage reading them and 2038 * don't have to worry about a readpage reading them and
2036 * overwriting a partial write. However if we have encountered 2039 * overwriting a partial write. However if we have encountered
2037 * a short write and only partially written into a buffer, it 2040 * a short write and only partially written into a buffer, it
2038 * will not be marked uptodate, so a readpage might come in and 2041 * will not be marked uptodate, so a readpage might come in and
2039 * destroy our partial write. 2042 * destroy our partial write.
2040 * 2043 *
2041 * Do the simplest thing, and just treat any short write to a 2044 * Do the simplest thing, and just treat any short write to a
2042 * non uptodate page as a zero-length write, and force the 2045 * non uptodate page as a zero-length write, and force the
2043 * caller to redo the whole thing. 2046 * caller to redo the whole thing.
2044 */ 2047 */
2045 if (!PageUptodate(page)) 2048 if (!PageUptodate(page))
2046 copied = 0; 2049 copied = 0;
2047 2050
2048 page_zero_new_buffers(page, start+copied, start+len); 2051 page_zero_new_buffers(page, start+copied, start+len);
2049 } 2052 }
2050 flush_dcache_page(page); 2053 flush_dcache_page(page);
2051 2054
2052 /* This could be a short (even 0-length) commit */ 2055 /* This could be a short (even 0-length) commit */
2053 __block_commit_write(inode, page, start, start+copied); 2056 __block_commit_write(inode, page, start, start+copied);
2054 2057
2055 return copied; 2058 return copied;
2056 } 2059 }
2057 EXPORT_SYMBOL(block_write_end); 2060 EXPORT_SYMBOL(block_write_end);
2058 2061
2059 int generic_write_end(struct file *file, struct address_space *mapping, 2062 int generic_write_end(struct file *file, struct address_space *mapping,
2060 loff_t pos, unsigned len, unsigned copied, 2063 loff_t pos, unsigned len, unsigned copied,
2061 struct page *page, void *fsdata) 2064 struct page *page, void *fsdata)
2062 { 2065 {
2063 struct inode *inode = mapping->host; 2066 struct inode *inode = mapping->host;
2064 int i_size_changed = 0; 2067 int i_size_changed = 0;
2065 2068
2066 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 2069 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2067 2070
2068 /* 2071 /*
2069 * No need to use i_size_read() here, the i_size 2072 * No need to use i_size_read() here, the i_size
2070 * cannot change under us because we hold i_mutex. 2073 * cannot change under us because we hold i_mutex.
2071 * 2074 *
2072 * But it's important to update i_size while still holding page lock: 2075 * But it's important to update i_size while still holding page lock:
2073 * page writeout could otherwise come in and zero beyond i_size. 2076 * page writeout could otherwise come in and zero beyond i_size.
2074 */ 2077 */
2075 if (pos+copied > inode->i_size) { 2078 if (pos+copied > inode->i_size) {
2076 i_size_write(inode, pos+copied); 2079 i_size_write(inode, pos+copied);
2077 i_size_changed = 1; 2080 i_size_changed = 1;
2078 } 2081 }
2079 2082
2080 unlock_page(page); 2083 unlock_page(page);
2081 page_cache_release(page); 2084 page_cache_release(page);
2082 2085
2083 /* 2086 /*
2084 * Don't mark the inode dirty under page lock. First, it unnecessarily 2087 * Don't mark the inode dirty under page lock. First, it unnecessarily
2085 * makes the holding time of page lock longer. Second, it forces lock 2088 * makes the holding time of page lock longer. Second, it forces lock
2086 * ordering of page lock and transaction start for journaling 2089 * ordering of page lock and transaction start for journaling
2087 * filesystems. 2090 * filesystems.
2088 */ 2091 */
2089 if (i_size_changed) 2092 if (i_size_changed)
2090 mark_inode_dirty(inode); 2093 mark_inode_dirty(inode);
2091 2094
2092 return copied; 2095 return copied;
2093 } 2096 }
2094 EXPORT_SYMBOL(generic_write_end); 2097 EXPORT_SYMBOL(generic_write_end);
2095 2098
2096 /* 2099 /*
2097 * Generic "read page" function for block devices that have the normal 2100 * Generic "read page" function for block devices that have the normal
2098 * get_block functionality. This is most of the block device filesystems. 2101 * get_block functionality. This is most of the block device filesystems.
2099 * Reads the page asynchronously --- the unlock_buffer() and 2102 * Reads the page asynchronously --- the unlock_buffer() and
2100 * set/clear_buffer_uptodate() functions propagate buffer state into the 2103 * set/clear_buffer_uptodate() functions propagate buffer state into the
2101 * page struct once IO has completed. 2104 * page struct once IO has completed.
2102 */ 2105 */
2103 int block_read_full_page(struct page *page, get_block_t *get_block) 2106 int block_read_full_page(struct page *page, get_block_t *get_block)
2104 { 2107 {
2105 struct inode *inode = page->mapping->host; 2108 struct inode *inode = page->mapping->host;
2106 sector_t iblock, lblock; 2109 sector_t iblock, lblock;
2107 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 2110 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2108 unsigned int blocksize; 2111 unsigned int blocksize;
2109 int nr, i; 2112 int nr, i;
2110 int fully_mapped = 1; 2113 int fully_mapped = 1;
2111 2114
2112 BUG_ON(!PageLocked(page)); 2115 BUG_ON(!PageLocked(page));
2113 blocksize = 1 << inode->i_blkbits; 2116 blocksize = 1 << inode->i_blkbits;
2114 if (!page_has_buffers(page)) 2117 if (!page_has_buffers(page))
2115 create_empty_buffers(page, blocksize, 0); 2118 create_empty_buffers(page, blocksize, 0);
2116 head = page_buffers(page); 2119 head = page_buffers(page);
2117 2120
2118 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2121 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2119 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; 2122 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2120 bh = head; 2123 bh = head;
2121 nr = 0; 2124 nr = 0;
2122 i = 0; 2125 i = 0;
2123 2126
2124 do { 2127 do {
2125 if (buffer_uptodate(bh)) 2128 if (buffer_uptodate(bh))
2126 continue; 2129 continue;
2127 2130
2128 if (!buffer_mapped(bh)) { 2131 if (!buffer_mapped(bh)) {
2129 int err = 0; 2132 int err = 0;
2130 2133
2131 fully_mapped = 0; 2134 fully_mapped = 0;
2132 if (iblock < lblock) { 2135 if (iblock < lblock) {
2133 WARN_ON(bh->b_size != blocksize); 2136 WARN_ON(bh->b_size != blocksize);
2134 err = get_block(inode, iblock, bh, 0); 2137 err = get_block(inode, iblock, bh, 0);
2135 if (err) 2138 if (err)
2136 SetPageError(page); 2139 SetPageError(page);
2137 } 2140 }
2138 if (!buffer_mapped(bh)) { 2141 if (!buffer_mapped(bh)) {
2139 zero_user(page, i * blocksize, blocksize); 2142 zero_user(page, i * blocksize, blocksize);
2140 if (!err) 2143 if (!err)
2141 set_buffer_uptodate(bh); 2144 set_buffer_uptodate(bh);
2142 continue; 2145 continue;
2143 } 2146 }
2144 /* 2147 /*
2145 * get_block() might have updated the buffer 2148 * get_block() might have updated the buffer
2146 * synchronously 2149 * synchronously
2147 */ 2150 */
2148 if (buffer_uptodate(bh)) 2151 if (buffer_uptodate(bh))
2149 continue; 2152 continue;
2150 } 2153 }
2151 arr[nr++] = bh; 2154 arr[nr++] = bh;
2152 } while (i++, iblock++, (bh = bh->b_this_page) != head); 2155 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2153 2156
2154 if (fully_mapped) 2157 if (fully_mapped)
2155 SetPageMappedToDisk(page); 2158 SetPageMappedToDisk(page);
2156 2159
2157 if (!nr) { 2160 if (!nr) {
2158 /* 2161 /*
2159 * All buffers are uptodate - we can set the page uptodate 2162 * All buffers are uptodate - we can set the page uptodate
2160 * as well. But not if get_block() returned an error. 2163 * as well. But not if get_block() returned an error.
2161 */ 2164 */
2162 if (!PageError(page)) 2165 if (!PageError(page))
2163 SetPageUptodate(page); 2166 SetPageUptodate(page);
2164 unlock_page(page); 2167 unlock_page(page);
2165 return 0; 2168 return 0;
2166 } 2169 }
2167 2170
2168 /* Stage two: lock the buffers */ 2171 /* Stage two: lock the buffers */
2169 for (i = 0; i < nr; i++) { 2172 for (i = 0; i < nr; i++) {
2170 bh = arr[i]; 2173 bh = arr[i];
2171 lock_buffer(bh); 2174 lock_buffer(bh);
2172 mark_buffer_async_read(bh); 2175 mark_buffer_async_read(bh);
2173 } 2176 }
2174 2177
2175 /* 2178 /*
2176 * Stage 3: start the IO. Check for uptodateness 2179 * Stage 3: start the IO. Check for uptodateness
2177 * inside the buffer lock in case another process reading 2180 * inside the buffer lock in case another process reading
2178 * the underlying blockdev brought it uptodate (the sct fix). 2181 * the underlying blockdev brought it uptodate (the sct fix).
2179 */ 2182 */
2180 for (i = 0; i < nr; i++) { 2183 for (i = 0; i < nr; i++) {
2181 bh = arr[i]; 2184 bh = arr[i];
2182 if (buffer_uptodate(bh)) 2185 if (buffer_uptodate(bh))
2183 end_buffer_async_read(bh, 1); 2186 end_buffer_async_read(bh, 1);
2184 else 2187 else
2185 submit_bh(READ, bh); 2188 submit_bh(READ, bh);
2186 } 2189 }
2187 return 0; 2190 return 0;
2188 } 2191 }
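Since block_read_full_page() takes the filesystem's get_block directly, the usual ->readpage hook is a one-line wrapper; a hedged sketch with a hypothetical myfs_get_block:

	static int myfs_readpage(struct file *file, struct page *page)
	{
		return block_read_full_page(page, myfs_get_block);
	}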
2189 2192
2190 /* utility function for filesystems that need to do work on expanding 2193 /* utility function for filesystems that need to do work on expanding
2191 * truncates. Uses filesystem pagecache writes to allow the filesystem to 2194 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2192 * deal with the hole. 2195 * deal with the hole.
2193 */ 2196 */
2194 int generic_cont_expand_simple(struct inode *inode, loff_t size) 2197 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2195 { 2198 {
2196 struct address_space *mapping = inode->i_mapping; 2199 struct address_space *mapping = inode->i_mapping;
2197 struct page *page; 2200 struct page *page;
2198 void *fsdata; 2201 void *fsdata;
2199 unsigned long limit; 2202 unsigned long limit;
2200 int err; 2203 int err;
2201 2204
2202 err = -EFBIG; 2205 err = -EFBIG;
2203 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 2206 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2204 if (limit != RLIM_INFINITY && size > (loff_t)limit) { 2207 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2205 send_sig(SIGXFSZ, current, 0); 2208 send_sig(SIGXFSZ, current, 0);
2206 goto out; 2209 goto out;
2207 } 2210 }
2208 if (size > inode->i_sb->s_maxbytes) 2211 if (size > inode->i_sb->s_maxbytes)
2209 goto out; 2212 goto out;
2210 2213
2211 err = pagecache_write_begin(NULL, mapping, size, 0, 2214 err = pagecache_write_begin(NULL, mapping, size, 0,
2212 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, 2215 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2213 &page, &fsdata); 2216 &page, &fsdata);
2214 if (err) 2217 if (err)
2215 goto out; 2218 goto out;
2216 2219
2217 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); 2220 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2218 BUG_ON(err > 0); 2221 BUG_ON(err > 0);
2219 2222
2220 out: 2223 out:
2221 return err; 2224 return err;
2222 } 2225 }
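A filesystem that needs to extend the file on a size-increasing setattr can call this helper before committing the new attributes. A rough sketch only, with permission checks omitted; myfs_setattr is a hypothetical name:

	static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int err = 0;

		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size > i_size_read(inode))
			err = generic_cont_expand_simple(inode, attr->ia_size);
		if (err)
			return err;
		return inode_setattr(inode, attr);
	}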
2223 2226
2224 static int cont_expand_zero(struct file *file, struct address_space *mapping, 2227 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2225 loff_t pos, loff_t *bytes) 2228 loff_t pos, loff_t *bytes)
2226 { 2229 {
2227 struct inode *inode = mapping->host; 2230 struct inode *inode = mapping->host;
2228 unsigned blocksize = 1 << inode->i_blkbits; 2231 unsigned blocksize = 1 << inode->i_blkbits;
2229 struct page *page; 2232 struct page *page;
2230 void *fsdata; 2233 void *fsdata;
2231 pgoff_t index, curidx; 2234 pgoff_t index, curidx;
2232 loff_t curpos; 2235 loff_t curpos;
2233 unsigned zerofrom, offset, len; 2236 unsigned zerofrom, offset, len;
2234 int err = 0; 2237 int err = 0;
2235 2238
2236 index = pos >> PAGE_CACHE_SHIFT; 2239 index = pos >> PAGE_CACHE_SHIFT;
2237 offset = pos & ~PAGE_CACHE_MASK; 2240 offset = pos & ~PAGE_CACHE_MASK;
2238 2241
2239 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { 2242 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2240 zerofrom = curpos & ~PAGE_CACHE_MASK; 2243 zerofrom = curpos & ~PAGE_CACHE_MASK;
2241 if (zerofrom & (blocksize-1)) { 2244 if (zerofrom & (blocksize-1)) {
2242 *bytes |= (blocksize-1); 2245 *bytes |= (blocksize-1);
2243 (*bytes)++; 2246 (*bytes)++;
2244 } 2247 }
2245 len = PAGE_CACHE_SIZE - zerofrom; 2248 len = PAGE_CACHE_SIZE - zerofrom;
2246 2249
2247 err = pagecache_write_begin(file, mapping, curpos, len, 2250 err = pagecache_write_begin(file, mapping, curpos, len,
2248 AOP_FLAG_UNINTERRUPTIBLE, 2251 AOP_FLAG_UNINTERRUPTIBLE,
2249 &page, &fsdata); 2252 &page, &fsdata);
2250 if (err) 2253 if (err)
2251 goto out; 2254 goto out;
2252 zero_user(page, zerofrom, len); 2255 zero_user(page, zerofrom, len);
2253 err = pagecache_write_end(file, mapping, curpos, len, len, 2256 err = pagecache_write_end(file, mapping, curpos, len, len,
2254 page, fsdata); 2257 page, fsdata);
2255 if (err < 0) 2258 if (err < 0)
2256 goto out; 2259 goto out;
2257 BUG_ON(err != len); 2260 BUG_ON(err != len);
2258 err = 0; 2261 err = 0;
2259 2262
2260 balance_dirty_pages_ratelimited(mapping); 2263 balance_dirty_pages_ratelimited(mapping);
2261 } 2264 }
2262 2265
2263 /* page covers the boundary, find the boundary offset */ 2266 /* page covers the boundary, find the boundary offset */
2264 if (index == curidx) { 2267 if (index == curidx) {
2265 zerofrom = curpos & ~PAGE_CACHE_MASK; 2268 zerofrom = curpos & ~PAGE_CACHE_MASK;
2266 /* if we are going to expand the file, the last block will be filled */ 2269 /* if we are going to expand the file, the last block will be filled */
2267 if (offset <= zerofrom) { 2270 if (offset <= zerofrom) {
2268 goto out; 2271 goto out;
2269 } 2272 }
2270 if (zerofrom & (blocksize-1)) { 2273 if (zerofrom & (blocksize-1)) {
2271 *bytes |= (blocksize-1); 2274 *bytes |= (blocksize-1);
2272 (*bytes)++; 2275 (*bytes)++;
2273 } 2276 }
2274 len = offset - zerofrom; 2277 len = offset - zerofrom;
2275 2278
2276 err = pagecache_write_begin(file, mapping, curpos, len, 2279 err = pagecache_write_begin(file, mapping, curpos, len,
2277 AOP_FLAG_UNINTERRUPTIBLE, 2280 AOP_FLAG_UNINTERRUPTIBLE,
2278 &page, &fsdata); 2281 &page, &fsdata);
2279 if (err) 2282 if (err)
2280 goto out; 2283 goto out;
2281 zero_user(page, zerofrom, len); 2284 zero_user(page, zerofrom, len);
2282 err = pagecache_write_end(file, mapping, curpos, len, len, 2285 err = pagecache_write_end(file, mapping, curpos, len, len,
2283 page, fsdata); 2286 page, fsdata);
2284 if (err < 0) 2287 if (err < 0)
2285 goto out; 2288 goto out;
2286 BUG_ON(err != len); 2289 BUG_ON(err != len);
2287 err = 0; 2290 err = 0;
2288 } 2291 }
2289 out: 2292 out:
2290 return err; 2293 return err;
2291 } 2294 }
2292 2295
2293 /* 2296 /*
2294 * For moronic filesystems that do not allow holes in files. 2297 * For moronic filesystems that do not allow holes in files.
2295 * We may have to extend the file. 2298 * We may have to extend the file.
2296 */ 2299 */
2297 int cont_write_begin(struct file *file, struct address_space *mapping, 2300 int cont_write_begin(struct file *file, struct address_space *mapping,
2298 loff_t pos, unsigned len, unsigned flags, 2301 loff_t pos, unsigned len, unsigned flags,
2299 struct page **pagep, void **fsdata, 2302 struct page **pagep, void **fsdata,
2300 get_block_t *get_block, loff_t *bytes) 2303 get_block_t *get_block, loff_t *bytes)
2301 { 2304 {
2302 struct inode *inode = mapping->host; 2305 struct inode *inode = mapping->host;
2303 unsigned blocksize = 1 << inode->i_blkbits; 2306 unsigned blocksize = 1 << inode->i_blkbits;
2304 unsigned zerofrom; 2307 unsigned zerofrom;
2305 int err; 2308 int err;
2306 2309
2307 err = cont_expand_zero(file, mapping, pos, bytes); 2310 err = cont_expand_zero(file, mapping, pos, bytes);
2308 if (err) 2311 if (err)
2309 goto out; 2312 goto out;
2310 2313
2311 zerofrom = *bytes & ~PAGE_CACHE_MASK; 2314 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2312 if (pos+len > *bytes && zerofrom & (blocksize-1)) { 2315 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2313 *bytes |= (blocksize-1); 2316 *bytes |= (blocksize-1);
2314 (*bytes)++; 2317 (*bytes)++;
2315 } 2318 }
2316 2319
2317 *pagep = NULL; 2320 *pagep = NULL;
2318 err = block_write_begin(file, mapping, pos, len, 2321 err = block_write_begin(file, mapping, pos, len,
2319 flags, pagep, fsdata, get_block); 2322 flags, pagep, fsdata, get_block);
2320 out: 2323 out:
2321 return err; 2324 return err;
2322 } 2325 }
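The *bytes cookie is a per-inode loff_t the filesystem keeps, recording how far the file has been allocated and zeroed so far; cont_expand_zero() advances it as it fills the gap. A hedged sketch for such a filesystem, where NOHOLEFS_I(inode)->i_allocated, noholefs_write_begin and noholefs_get_block are all hypothetical:

	static int noholefs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		*pagep = NULL;
		return cont_write_begin(file, mapping, pos, len, flags,
					pagep, fsdata, noholefs_get_block,
					&NOHOLEFS_I(mapping->host)->i_allocated);
	}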
2323 2326
2324 int block_prepare_write(struct page *page, unsigned from, unsigned to, 2327 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2325 get_block_t *get_block) 2328 get_block_t *get_block)
2326 { 2329 {
2327 struct inode *inode = page->mapping->host; 2330 struct inode *inode = page->mapping->host;
2328 int err = __block_prepare_write(inode, page, from, to, get_block); 2331 int err = __block_prepare_write(inode, page, from, to, get_block);
2329 if (err) 2332 if (err)
2330 ClearPageUptodate(page); 2333 ClearPageUptodate(page);
2331 return err; 2334 return err;
2332 } 2335 }
2333 2336
2334 int block_commit_write(struct page *page, unsigned from, unsigned to) 2337 int block_commit_write(struct page *page, unsigned from, unsigned to)
2335 { 2338 {
2336 struct inode *inode = page->mapping->host; 2339 struct inode *inode = page->mapping->host;
2337 __block_commit_write(inode,page,from,to); 2340 __block_commit_write(inode,page,from,to);
2338 return 0; 2341 return 0;
2339 } 2342 }
2340 2343
2341 /* 2344 /*
2342 * block_page_mkwrite() is not allowed to change the file size as it gets 2345 * block_page_mkwrite() is not allowed to change the file size as it gets
2343 * called from a page fault handler when a page is first dirtied. Hence we must 2346 * called from a page fault handler when a page is first dirtied. Hence we must
2344 * be careful to check for EOF conditions here. We set the page up correctly 2347 * be careful to check for EOF conditions here. We set the page up correctly
2345 * for a written page which means we get ENOSPC checking when writing into 2348 * for a written page which means we get ENOSPC checking when writing into
2346 * holes and correct delalloc and unwritten extent mapping on filesystems that 2349 * holes and correct delalloc and unwritten extent mapping on filesystems that
2347 * support these features. 2350 * support these features.
2348 * 2351 *
2349 * We are not allowed to take the i_mutex here so we have to play games to 2352 * We are not allowed to take the i_mutex here so we have to play games to
2350 * protect against truncate races as the page could now be beyond EOF. Because 2353 * protect against truncate races as the page could now be beyond EOF. Because
2351 * vmtruncate() writes the inode size before removing pages, once we have the 2354 * vmtruncate() writes the inode size before removing pages, once we have the
2352 * page lock we can determine safely if the page is beyond EOF. If it is not 2355 * page lock we can determine safely if the page is beyond EOF. If it is not
2353 * beyond EOF, then the page is guaranteed safe against truncation until we 2356 * beyond EOF, then the page is guaranteed safe against truncation until we
2354 * unlock the page. 2357 * unlock the page.
2355 */ 2358 */
2356 int 2359 int
2357 block_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2360 block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2358 get_block_t get_block) 2361 get_block_t get_block)
2359 { 2362 {
2360 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 2363 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2361 unsigned long end; 2364 unsigned long end;
2362 loff_t size; 2365 loff_t size;
2363 int ret = -EINVAL; 2366 int ret = -EINVAL;
2364 2367
2365 lock_page(page); 2368 lock_page(page);
2366 size = i_size_read(inode); 2369 size = i_size_read(inode);
2367 if ((page->mapping != inode->i_mapping) || 2370 if ((page->mapping != inode->i_mapping) ||
2368 (page_offset(page) > size)) { 2371 (page_offset(page) > size)) {
2369 /* page got truncated out from underneath us */ 2372 /* page got truncated out from underneath us */
2370 goto out_unlock; 2373 goto out_unlock;
2371 } 2374 }
2372 2375
2373 /* page is wholly or partially inside EOF */ 2376 /* page is wholly or partially inside EOF */
2374 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) 2377 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2375 end = size & ~PAGE_CACHE_MASK; 2378 end = size & ~PAGE_CACHE_MASK;
2376 else 2379 else
2377 end = PAGE_CACHE_SIZE; 2380 end = PAGE_CACHE_SIZE;
2378 2381
2379 ret = block_prepare_write(page, 0, end, get_block); 2382 ret = block_prepare_write(page, 0, end, get_block);
2380 if (!ret) 2383 if (!ret)
2381 ret = block_commit_write(page, 0, end); 2384 ret = block_commit_write(page, 0, end);
2382 2385
2383 out_unlock: 2386 out_unlock:
2384 unlock_page(page); 2387 unlock_page(page);
2385 return ret; 2388 return ret;
2386 } 2389 }
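At this point in the tree ->page_mkwrite still receives the faulting page directly, so a filesystem can hand it straight to block_page_mkwrite() from its vm_operations. A sketch under that assumption; the myfs_* names are hypothetical:

	static int myfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
	{
		return block_page_mkwrite(vma, page, myfs_get_block);
	}

	static struct vm_operations_struct myfs_file_vm_ops = {
		.fault		= filemap_fault,
		.page_mkwrite	= myfs_page_mkwrite,
	};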
2387 2390
2388 /* 2391 /*
2389 * nobh_write_begin()'s prereads are special: the buffer_heads are freed 2392 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2390 * immediately, while under the page lock. So it needs a special end_io 2393 * immediately, while under the page lock. So it needs a special end_io
2391 * handler which does not touch the bh after unlocking it. 2394 * handler which does not touch the bh after unlocking it.
2392 */ 2395 */
2393 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) 2396 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2394 { 2397 {
2395 __end_buffer_read_notouch(bh, uptodate); 2398 __end_buffer_read_notouch(bh, uptodate);
2396 } 2399 }
2397 2400
2398 /* 2401 /*
2399 * Attach the singly-linked list of buffers created by nobh_write_begin to 2402 * Attach the singly-linked list of buffers created by nobh_write_begin to
2400 * the page (converting it to a circular linked list and taking care of page 2403 * the page (converting it to a circular linked list and taking care of page
2401 * dirty races). 2404 * dirty races).
2402 */ 2405 */
2403 static void attach_nobh_buffers(struct page *page, struct buffer_head *head) 2406 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2404 { 2407 {
2405 struct buffer_head *bh; 2408 struct buffer_head *bh;
2406 2409
2407 BUG_ON(!PageLocked(page)); 2410 BUG_ON(!PageLocked(page));
2408 2411
2409 spin_lock(&page->mapping->private_lock); 2412 spin_lock(&page->mapping->private_lock);
2410 bh = head; 2413 bh = head;
2411 do { 2414 do {
2412 if (PageDirty(page)) 2415 if (PageDirty(page))
2413 set_buffer_dirty(bh); 2416 set_buffer_dirty(bh);
2414 if (!bh->b_this_page) 2417 if (!bh->b_this_page)
2415 bh->b_this_page = head; 2418 bh->b_this_page = head;
2416 bh = bh->b_this_page; 2419 bh = bh->b_this_page;
2417 } while (bh != head); 2420 } while (bh != head);
2418 attach_page_buffers(page, head); 2421 attach_page_buffers(page, head);
2419 spin_unlock(&page->mapping->private_lock); 2422 spin_unlock(&page->mapping->private_lock);
2420 } 2423 }
2421 2424
2422 /* 2425 /*
2423 * On entry, no part of the page is uptodate. 2426 * On entry, no part of the page is uptodate.
2424 * On exit the page is fully uptodate in the areas outside (from,to) 2427 * On exit the page is fully uptodate in the areas outside (from,to)
2425 */ 2428 */
2426 int nobh_write_begin(struct file *file, struct address_space *mapping, 2429 int nobh_write_begin(struct file *file, struct address_space *mapping,
2427 loff_t pos, unsigned len, unsigned flags, 2430 loff_t pos, unsigned len, unsigned flags,
2428 struct page **pagep, void **fsdata, 2431 struct page **pagep, void **fsdata,
2429 get_block_t *get_block) 2432 get_block_t *get_block)
2430 { 2433 {
2431 struct inode *inode = mapping->host; 2434 struct inode *inode = mapping->host;
2432 const unsigned blkbits = inode->i_blkbits; 2435 const unsigned blkbits = inode->i_blkbits;
2433 const unsigned blocksize = 1 << blkbits; 2436 const unsigned blocksize = 1 << blkbits;
2434 struct buffer_head *head, *bh; 2437 struct buffer_head *head, *bh;
2435 struct page *page; 2438 struct page *page;
2436 pgoff_t index; 2439 pgoff_t index;
2437 unsigned from, to; 2440 unsigned from, to;
2438 unsigned block_in_page; 2441 unsigned block_in_page;
2439 unsigned block_start, block_end; 2442 unsigned block_start, block_end;
2440 sector_t block_in_file; 2443 sector_t block_in_file;
2441 int nr_reads = 0; 2444 int nr_reads = 0;
2442 int ret = 0; 2445 int ret = 0;
2443 int is_mapped_to_disk = 1; 2446 int is_mapped_to_disk = 1;
2444 2447
2445 index = pos >> PAGE_CACHE_SHIFT; 2448 index = pos >> PAGE_CACHE_SHIFT;
2446 from = pos & (PAGE_CACHE_SIZE - 1); 2449 from = pos & (PAGE_CACHE_SIZE - 1);
2447 to = from + len; 2450 to = from + len;
2448 2451
2449 page = __grab_cache_page(mapping, index); 2452 page = __grab_cache_page(mapping, index);
2450 if (!page) 2453 if (!page)
2451 return -ENOMEM; 2454 return -ENOMEM;
2452 *pagep = page; 2455 *pagep = page;
2453 *fsdata = NULL; 2456 *fsdata = NULL;
2454 2457
2455 if (page_has_buffers(page)) { 2458 if (page_has_buffers(page)) {
2456 unlock_page(page); 2459 unlock_page(page);
2457 page_cache_release(page); 2460 page_cache_release(page);
2458 *pagep = NULL; 2461 *pagep = NULL;
2459 return block_write_begin(file, mapping, pos, len, flags, pagep, 2462 return block_write_begin(file, mapping, pos, len, flags, pagep,
2460 fsdata, get_block); 2463 fsdata, get_block);
2461 } 2464 }
2462 2465
2463 if (PageMappedToDisk(page)) 2466 if (PageMappedToDisk(page))
2464 return 0; 2467 return 0;
2465 2468
2466 /* 2469 /*
2467 * Allocate buffers so that we can keep track of state, and potentially 2470 * Allocate buffers so that we can keep track of state, and potentially
2468 * attach them to the page if an error occurs. In the common case of 2471 * attach them to the page if an error occurs. In the common case of
2469 * no error, they will just be freed again without ever being attached 2472 * no error, they will just be freed again without ever being attached
2470 * to the page (which is all OK, because we're under the page lock). 2473 * to the page (which is all OK, because we're under the page lock).
2471 * 2474 *
2472 * Be careful: the buffer linked list is a NULL terminated one, rather 2475 * Be careful: the buffer linked list is a NULL terminated one, rather
2473 * than the circular one we're used to. 2476 * than the circular one we're used to.
2474 */ 2477 */
2475 head = alloc_page_buffers(page, blocksize, 0); 2478 head = alloc_page_buffers(page, blocksize, 0);
2476 if (!head) { 2479 if (!head) {
2477 ret = -ENOMEM; 2480 ret = -ENOMEM;
2478 goto out_release; 2481 goto out_release;
2479 } 2482 }
2480 2483
2481 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); 2484 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2482 2485
2483 /* 2486 /*
2484 * We loop across all blocks in the page, whether or not they are 2487 * We loop across all blocks in the page, whether or not they are
2485 * part of the affected region. This is so we can discover if the 2488 * part of the affected region. This is so we can discover if the
2486 * page is fully mapped-to-disk. 2489 * page is fully mapped-to-disk.
2487 */ 2490 */
2488 for (block_start = 0, block_in_page = 0, bh = head; 2491 for (block_start = 0, block_in_page = 0, bh = head;
2489 block_start < PAGE_CACHE_SIZE; 2492 block_start < PAGE_CACHE_SIZE;
2490 block_in_page++, block_start += blocksize, bh = bh->b_this_page) { 2493 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2491 int create; 2494 int create;
2492 2495
2493 block_end = block_start + blocksize; 2496 block_end = block_start + blocksize;
2494 bh->b_state = 0; 2497 bh->b_state = 0;
2495 create = 1; 2498 create = 1;
2496 if (block_start >= to) 2499 if (block_start >= to)
2497 create = 0; 2500 create = 0;
2498 ret = get_block(inode, block_in_file + block_in_page, 2501 ret = get_block(inode, block_in_file + block_in_page,
2499 bh, create); 2502 bh, create);
2500 if (ret) 2503 if (ret)
2501 goto failed; 2504 goto failed;
2502 if (!buffer_mapped(bh)) 2505 if (!buffer_mapped(bh))
2503 is_mapped_to_disk = 0; 2506 is_mapped_to_disk = 0;
2504 if (buffer_new(bh)) 2507 if (buffer_new(bh))
2505 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 2508 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2506 if (PageUptodate(page)) { 2509 if (PageUptodate(page)) {
2507 set_buffer_uptodate(bh); 2510 set_buffer_uptodate(bh);
2508 continue; 2511 continue;
2509 } 2512 }
2510 if (buffer_new(bh) || !buffer_mapped(bh)) { 2513 if (buffer_new(bh) || !buffer_mapped(bh)) {
2511 zero_user_segments(page, block_start, from, 2514 zero_user_segments(page, block_start, from,
2512 to, block_end); 2515 to, block_end);
2513 continue; 2516 continue;
2514 } 2517 }
2515 if (buffer_uptodate(bh)) 2518 if (buffer_uptodate(bh))
2516 continue; /* reiserfs does this */ 2519 continue; /* reiserfs does this */
2517 if (block_start < from || block_end > to) { 2520 if (block_start < from || block_end > to) {
2518 lock_buffer(bh); 2521 lock_buffer(bh);
2519 bh->b_end_io = end_buffer_read_nobh; 2522 bh->b_end_io = end_buffer_read_nobh;
2520 submit_bh(READ, bh); 2523 submit_bh(READ, bh);
2521 nr_reads++; 2524 nr_reads++;
2522 } 2525 }
2523 } 2526 }
2524 2527
2525 if (nr_reads) { 2528 if (nr_reads) {
2526 /* 2529 /*
2527 * The page is locked, so these buffers are protected from 2530 * The page is locked, so these buffers are protected from
2528 * any VM or truncate activity. Hence we don't need to care 2531 * any VM or truncate activity. Hence we don't need to care
2529 * for the buffer_head refcounts. 2532 * for the buffer_head refcounts.
2530 */ 2533 */
2531 for (bh = head; bh; bh = bh->b_this_page) { 2534 for (bh = head; bh; bh = bh->b_this_page) {
2532 wait_on_buffer(bh); 2535 wait_on_buffer(bh);
2533 if (!buffer_uptodate(bh)) 2536 if (!buffer_uptodate(bh))
2534 ret = -EIO; 2537 ret = -EIO;
2535 } 2538 }
2536 if (ret) 2539 if (ret)
2537 goto failed; 2540 goto failed;
2538 } 2541 }
2539 2542
2540 if (is_mapped_to_disk) 2543 if (is_mapped_to_disk)
2541 SetPageMappedToDisk(page); 2544 SetPageMappedToDisk(page);
2542 2545
2543 *fsdata = head; /* to be released by nobh_write_end */ 2546 *fsdata = head; /* to be released by nobh_write_end */
2544 2547
2545 return 0; 2548 return 0;
2546 2549
2547 failed: 2550 failed:
2548 BUG_ON(!ret); 2551 BUG_ON(!ret);
2549 /* 2552 /*
2550 * Error recovery is a bit difficult. We need to zero out blocks that 2553 * Error recovery is a bit difficult. We need to zero out blocks that
2551 * were newly allocated, and dirty them to ensure they get written out. 2554 * were newly allocated, and dirty them to ensure they get written out.
2552 * Buffers need to be attached to the page at this point, otherwise 2555 * Buffers need to be attached to the page at this point, otherwise
2553 * the handling of potential IO errors during writeout would be hard 2556 * the handling of potential IO errors during writeout would be hard
2554 * (could try doing synchronous writeout, but what if that fails too?) 2557 * (could try doing synchronous writeout, but what if that fails too?)
2555 */ 2558 */
2556 attach_nobh_buffers(page, head); 2559 attach_nobh_buffers(page, head);
2557 page_zero_new_buffers(page, from, to); 2560 page_zero_new_buffers(page, from, to);
2558 2561
2559 out_release: 2562 out_release:
2560 unlock_page(page); 2563 unlock_page(page);
2561 page_cache_release(page); 2564 page_cache_release(page);
2562 *pagep = NULL; 2565 *pagep = NULL;
2563 2566
2564 if (pos + len > inode->i_size) 2567 if (pos + len > inode->i_size)
2565 vmtruncate(inode, inode->i_size); 2568 vmtruncate(inode, inode->i_size);
2566 2569
2567 return ret; 2570 return ret;
2568 } 2571 }
2569 EXPORT_SYMBOL(nobh_write_begin); 2572 EXPORT_SYMBOL(nobh_write_begin);
2570 2573
2571 int nobh_write_end(struct file *file, struct address_space *mapping, 2574 int nobh_write_end(struct file *file, struct address_space *mapping,
2572 loff_t pos, unsigned len, unsigned copied, 2575 loff_t pos, unsigned len, unsigned copied,
2573 struct page *page, void *fsdata) 2576 struct page *page, void *fsdata)
2574 { 2577 {
2575 struct inode *inode = page->mapping->host; 2578 struct inode *inode = page->mapping->host;
2576 struct buffer_head *head = fsdata; 2579 struct buffer_head *head = fsdata;
2577 struct buffer_head *bh; 2580 struct buffer_head *bh;
2578 BUG_ON(fsdata != NULL && page_has_buffers(page)); 2581 BUG_ON(fsdata != NULL && page_has_buffers(page));
2579 2582
2580 if (unlikely(copied < len) && !page_has_buffers(page)) 2583 if (unlikely(copied < len) && !page_has_buffers(page))
2581 attach_nobh_buffers(page, head); 2584 attach_nobh_buffers(page, head);
2582 if (page_has_buffers(page)) 2585 if (page_has_buffers(page))
2583 return generic_write_end(file, mapping, pos, len, 2586 return generic_write_end(file, mapping, pos, len,
2584 copied, page, fsdata); 2587 copied, page, fsdata);
2585 2588
2586 SetPageUptodate(page); 2589 SetPageUptodate(page);
2587 set_page_dirty(page); 2590 set_page_dirty(page);
2588 if (pos+copied > inode->i_size) { 2591 if (pos+copied > inode->i_size) {
2589 i_size_write(inode, pos+copied); 2592 i_size_write(inode, pos+copied);
2590 mark_inode_dirty(inode); 2593 mark_inode_dirty(inode);
2591 } 2594 }
2592 2595
2593 unlock_page(page); 2596 unlock_page(page);
2594 page_cache_release(page); 2597 page_cache_release(page);
2595 2598
2596 while (head) { 2599 while (head) {
2597 bh = head; 2600 bh = head;
2598 head = head->b_this_page; 2601 head = head->b_this_page;
2599 free_buffer_head(bh); 2602 free_buffer_head(bh);
2600 } 2603 }
2601 2604
2602 return copied; 2605 return copied;
2603 } 2606 }
2604 EXPORT_SYMBOL(nobh_write_end); 2607 EXPORT_SYMBOL(nobh_write_end);
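Because nobh_write_end()'s signature matches ->write_end exactly, a filesystem running in nobh mode only needs a thin ->write_begin wrapper and can plug nobh_write_end in unchanged. A hedged sketch, with hypothetical myfs_* names:

	static int myfs_nobh_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		return nobh_write_begin(file, mapping, pos, len, flags,
					pagep, fsdata, myfs_get_block);
	}

	/* in the nobh address_space_operations: .write_end = nobh_write_end */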
2605 2608
2606 /* 2609 /*
2607 * nobh_writepage() - based on block_write_full_page() except 2610 * nobh_writepage() - based on block_write_full_page() except
2608 * that it tries to operate without attaching bufferheads to 2611 * that it tries to operate without attaching bufferheads to
2609 * the page. 2612 * the page.
2610 */ 2613 */
2611 int nobh_writepage(struct page *page, get_block_t *get_block, 2614 int nobh_writepage(struct page *page, get_block_t *get_block,
2612 struct writeback_control *wbc) 2615 struct writeback_control *wbc)
2613 { 2616 {
2614 struct inode * const inode = page->mapping->host; 2617 struct inode * const inode = page->mapping->host;
2615 loff_t i_size = i_size_read(inode); 2618 loff_t i_size = i_size_read(inode);
2616 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2619 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2617 unsigned offset; 2620 unsigned offset;
2618 int ret; 2621 int ret;
2619 2622
2620 /* Is the page fully inside i_size? */ 2623 /* Is the page fully inside i_size? */
2621 if (page->index < end_index) 2624 if (page->index < end_index)
2622 goto out; 2625 goto out;
2623 2626
2624 /* Is the page fully outside i_size? (truncate in progress) */ 2627 /* Is the page fully outside i_size? (truncate in progress) */
2625 offset = i_size & (PAGE_CACHE_SIZE-1); 2628 offset = i_size & (PAGE_CACHE_SIZE-1);
2626 if (page->index >= end_index+1 || !offset) { 2629 if (page->index >= end_index+1 || !offset) {
2627 /* 2630 /*
2628 * The page may have dirty, unmapped buffers. For example, 2631 * The page may have dirty, unmapped buffers. For example,
2629 * they may have been added in ext3_writepage(). Make them 2632 * they may have been added in ext3_writepage(). Make them
2630 * freeable here, so the page does not leak. 2633 * freeable here, so the page does not leak.
2631 */ 2634 */
2632 #if 0 2635 #if 0
2633 /* Not really sure about this - do we need this ? */ 2636 /* Not really sure about this - do we need this ? */
2634 if (page->mapping->a_ops->invalidatepage) 2637 if (page->mapping->a_ops->invalidatepage)
2635 page->mapping->a_ops->invalidatepage(page, offset); 2638 page->mapping->a_ops->invalidatepage(page, offset);
2636 #endif 2639 #endif
2637 unlock_page(page); 2640 unlock_page(page);
2638 return 0; /* don't care */ 2641 return 0; /* don't care */
2639 } 2642 }
2640 2643
2641 /* 2644 /*
2642 * The page straddles i_size. It must be zeroed out on each and every 2645 * The page straddles i_size. It must be zeroed out on each and every
2643 * writepage invocation because it may be mmapped. "A file is mapped 2646 * writepage invocation because it may be mmapped. "A file is mapped
2644 * in multiples of the page size. For a file that is not a multiple of 2647 * in multiples of the page size. For a file that is not a multiple of
2645 * the page size, the remaining memory is zeroed when mapped, and 2648 * the page size, the remaining memory is zeroed when mapped, and
2646 * writes to that region are not written out to the file." 2649 * writes to that region are not written out to the file."
2647 */ 2650 */
2648 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2651 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2649 out: 2652 out:
2650 ret = mpage_writepage(page, get_block, wbc); 2653 ret = mpage_writepage(page, get_block, wbc);
2651 if (ret == -EAGAIN) 2654 if (ret == -EAGAIN)
2652 ret = __block_write_full_page(inode, page, get_block, wbc); 2655 ret = __block_write_full_page(inode, page, get_block, wbc);
2653 return ret; 2656 return ret;
2654 } 2657 }
2655 EXPORT_SYMBOL(nobh_writepage); 2658 EXPORT_SYMBOL(nobh_writepage);
2656 2659
2657 int nobh_truncate_page(struct address_space *mapping, 2660 int nobh_truncate_page(struct address_space *mapping,
2658 loff_t from, get_block_t *get_block) 2661 loff_t from, get_block_t *get_block)
2659 { 2662 {
2660 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2663 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2661 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2664 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2662 unsigned blocksize; 2665 unsigned blocksize;
2663 sector_t iblock; 2666 sector_t iblock;
2664 unsigned length, pos; 2667 unsigned length, pos;
2665 struct inode *inode = mapping->host; 2668 struct inode *inode = mapping->host;
2666 struct page *page; 2669 struct page *page;
2667 struct buffer_head map_bh; 2670 struct buffer_head map_bh;
2668 int err; 2671 int err;
2669 2672
2670 blocksize = 1 << inode->i_blkbits; 2673 blocksize = 1 << inode->i_blkbits;
2671 length = offset & (blocksize - 1); 2674 length = offset & (blocksize - 1);
2672 2675
2673 /* Block boundary? Nothing to do */ 2676 /* Block boundary? Nothing to do */
2674 if (!length) 2677 if (!length)
2675 return 0; 2678 return 0;
2676 2679
2677 length = blocksize - length; 2680 length = blocksize - length;
2678 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2681 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2679 2682
2680 page = grab_cache_page(mapping, index); 2683 page = grab_cache_page(mapping, index);
2681 err = -ENOMEM; 2684 err = -ENOMEM;
2682 if (!page) 2685 if (!page)
2683 goto out; 2686 goto out;
2684 2687
2685 if (page_has_buffers(page)) { 2688 if (page_has_buffers(page)) {
2686 has_buffers: 2689 has_buffers:
2687 unlock_page(page); 2690 unlock_page(page);
2688 page_cache_release(page); 2691 page_cache_release(page);
2689 return block_truncate_page(mapping, from, get_block); 2692 return block_truncate_page(mapping, from, get_block);
2690 } 2693 }
2691 2694
2692 /* Find the buffer that contains "offset" */ 2695 /* Find the buffer that contains "offset" */
2693 pos = blocksize; 2696 pos = blocksize;
2694 while (offset >= pos) { 2697 while (offset >= pos) {
2695 iblock++; 2698 iblock++;
2696 pos += blocksize; 2699 pos += blocksize;
2697 } 2700 }
2698 2701
2699 err = get_block(inode, iblock, &map_bh, 0); 2702 err = get_block(inode, iblock, &map_bh, 0);
2700 if (err) 2703 if (err)
2701 goto unlock; 2704 goto unlock;
2702 /* unmapped? It's a hole - nothing to do */ 2705 /* unmapped? It's a hole - nothing to do */
2703 if (!buffer_mapped(&map_bh)) 2706 if (!buffer_mapped(&map_bh))
2704 goto unlock; 2707 goto unlock;
2705 2708
2706 /* Ok, it's mapped. Make sure it's up-to-date */ 2709 /* Ok, it's mapped. Make sure it's up-to-date */
2707 if (!PageUptodate(page)) { 2710 if (!PageUptodate(page)) {
2708 err = mapping->a_ops->readpage(NULL, page); 2711 err = mapping->a_ops->readpage(NULL, page);
2709 if (err) { 2712 if (err) {
2710 page_cache_release(page); 2713 page_cache_release(page);
2711 goto out; 2714 goto out;
2712 } 2715 }
2713 lock_page(page); 2716 lock_page(page);
2714 if (!PageUptodate(page)) { 2717 if (!PageUptodate(page)) {
2715 err = -EIO; 2718 err = -EIO;
2716 goto unlock; 2719 goto unlock;
2717 } 2720 }
2718 if (page_has_buffers(page)) 2721 if (page_has_buffers(page))
2719 goto has_buffers; 2722 goto has_buffers;
2720 } 2723 }
2721 zero_user(page, offset, length); 2724 zero_user(page, offset, length);
2722 set_page_dirty(page); 2725 set_page_dirty(page);
2723 err = 0; 2726 err = 0;
2724 2727
2725 unlock: 2728 unlock:
2726 unlock_page(page); 2729 unlock_page(page);
2727 page_cache_release(page); 2730 page_cache_release(page);
2728 out: 2731 out:
2729 return err; 2732 return err;
2730 } 2733 }
2731 EXPORT_SYMBOL(nobh_truncate_page); 2734 EXPORT_SYMBOL(nobh_truncate_page);
2732 2735
2733 int block_truncate_page(struct address_space *mapping, 2736 int block_truncate_page(struct address_space *mapping,
2734 loff_t from, get_block_t *get_block) 2737 loff_t from, get_block_t *get_block)
2735 { 2738 {
2736 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2739 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2737 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2740 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2738 unsigned blocksize; 2741 unsigned blocksize;
2739 sector_t iblock; 2742 sector_t iblock;
2740 unsigned length, pos; 2743 unsigned length, pos;
2741 struct inode *inode = mapping->host; 2744 struct inode *inode = mapping->host;
2742 struct page *page; 2745 struct page *page;
2743 struct buffer_head *bh; 2746 struct buffer_head *bh;
2744 int err; 2747 int err;
2745 2748
2746 blocksize = 1 << inode->i_blkbits; 2749 blocksize = 1 << inode->i_blkbits;
2747 length = offset & (blocksize - 1); 2750 length = offset & (blocksize - 1);
2748 2751
2749 /* Block boundary? Nothing to do */ 2752 /* Block boundary? Nothing to do */
2750 if (!length) 2753 if (!length)
2751 return 0; 2754 return 0;
2752 2755
2753 length = blocksize - length; 2756 length = blocksize - length;
2754 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2757 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2755 2758
2756 page = grab_cache_page(mapping, index); 2759 page = grab_cache_page(mapping, index);
2757 err = -ENOMEM; 2760 err = -ENOMEM;
2758 if (!page) 2761 if (!page)
2759 goto out; 2762 goto out;
2760 2763
2761 if (!page_has_buffers(page)) 2764 if (!page_has_buffers(page))
2762 create_empty_buffers(page, blocksize, 0); 2765 create_empty_buffers(page, blocksize, 0);
2763 2766
2764 /* Find the buffer that contains "offset" */ 2767 /* Find the buffer that contains "offset" */
2765 bh = page_buffers(page); 2768 bh = page_buffers(page);
2766 pos = blocksize; 2769 pos = blocksize;
2767 while (offset >= pos) { 2770 while (offset >= pos) {
2768 bh = bh->b_this_page; 2771 bh = bh->b_this_page;
2769 iblock++; 2772 iblock++;
2770 pos += blocksize; 2773 pos += blocksize;
2771 } 2774 }
2772 2775
2773 err = 0; 2776 err = 0;
2774 if (!buffer_mapped(bh)) { 2777 if (!buffer_mapped(bh)) {
2775 WARN_ON(bh->b_size != blocksize); 2778 WARN_ON(bh->b_size != blocksize);
2776 err = get_block(inode, iblock, bh, 0); 2779 err = get_block(inode, iblock, bh, 0);
2777 if (err) 2780 if (err)
2778 goto unlock; 2781 goto unlock;
2779 /* unmapped? It's a hole - nothing to do */ 2782 /* unmapped? It's a hole - nothing to do */
2780 if (!buffer_mapped(bh)) 2783 if (!buffer_mapped(bh))
2781 goto unlock; 2784 goto unlock;
2782 } 2785 }
2783 2786
2784 /* Ok, it's mapped. Make sure it's up-to-date */ 2787 /* Ok, it's mapped. Make sure it's up-to-date */
2785 if (PageUptodate(page)) 2788 if (PageUptodate(page))
2786 set_buffer_uptodate(bh); 2789 set_buffer_uptodate(bh);
2787 2790
2788 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { 2791 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2789 err = -EIO; 2792 err = -EIO;
2790 ll_rw_block(READ, 1, &bh); 2793 ll_rw_block(READ, 1, &bh);
2791 wait_on_buffer(bh); 2794 wait_on_buffer(bh);
2792 /* Uhhuh. Read error. Complain and punt. */ 2795 /* Uhhuh. Read error. Complain and punt. */
2793 if (!buffer_uptodate(bh)) 2796 if (!buffer_uptodate(bh))
2794 goto unlock; 2797 goto unlock;
2795 } 2798 }
2796 2799
2797 zero_user(page, offset, length); 2800 zero_user(page, offset, length);
2798 mark_buffer_dirty(bh); 2801 mark_buffer_dirty(bh);
2799 err = 0; 2802 err = 0;
2800 2803
2801 unlock: 2804 unlock:
2802 unlock_page(page); 2805 unlock_page(page);
2803 page_cache_release(page); 2806 page_cache_release(page);
2804 out: 2807 out:
2805 return err; 2808 return err;
2806 } 2809 }
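block_truncate_page() is normally called from the filesystem's truncate path with the new i_size, so that the tail of a now-partial last block is zeroed before the blocks past EOF are released. A rough sketch only; myfs_truncate and myfs_get_block are hypothetical, and the block-freeing step is filesystem specific:

	static void myfs_truncate(struct inode *inode)
	{
		/* zero the partial tail block at the new EOF */
		block_truncate_page(inode->i_mapping, inode->i_size,
				    myfs_get_block);

		/* filesystem-specific freeing of blocks beyond i_size goes here */
	}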
2807 2810
2808 /* 2811 /*
2809 * The generic ->writepage function for buffer-backed address_spaces 2812 * The generic ->writepage function for buffer-backed address_spaces
2810 */ 2813 */
2811 int block_write_full_page(struct page *page, get_block_t *get_block, 2814 int block_write_full_page(struct page *page, get_block_t *get_block,
2812 struct writeback_control *wbc) 2815 struct writeback_control *wbc)
2813 { 2816 {
2814 struct inode * const inode = page->mapping->host; 2817 struct inode * const inode = page->mapping->host;
2815 loff_t i_size = i_size_read(inode); 2818 loff_t i_size = i_size_read(inode);
2816 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2819 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2817 unsigned offset; 2820 unsigned offset;
2818 2821
2819 /* Is the page fully inside i_size? */ 2822 /* Is the page fully inside i_size? */
2820 if (page->index < end_index) 2823 if (page->index < end_index)
2821 return __block_write_full_page(inode, page, get_block, wbc); 2824 return __block_write_full_page(inode, page, get_block, wbc);
2822 2825
2823 /* Is the page fully outside i_size? (truncate in progress) */ 2826 /* Is the page fully outside i_size? (truncate in progress) */
2824 offset = i_size & (PAGE_CACHE_SIZE-1); 2827 offset = i_size & (PAGE_CACHE_SIZE-1);
2825 if (page->index >= end_index+1 || !offset) { 2828 if (page->index >= end_index+1 || !offset) {
2826 /* 2829 /*
2827 * The page may have dirty, unmapped buffers. For example, 2830 * The page may have dirty, unmapped buffers. For example,
2828 * they may have been added in ext3_writepage(). Make them 2831 * they may have been added in ext3_writepage(). Make them
2829 * freeable here, so the page does not leak. 2832 * freeable here, so the page does not leak.
2830 */ 2833 */
2831 do_invalidatepage(page, 0); 2834 do_invalidatepage(page, 0);
2832 unlock_page(page); 2835 unlock_page(page);
2833 return 0; /* don't care */ 2836 return 0; /* don't care */
2834 } 2837 }
2835 2838
2836 /* 2839 /*
2837 * The page straddles i_size. It must be zeroed out on each and every 2840 * The page straddles i_size. It must be zeroed out on each and every
2838 * writepage invocation because it may be mmapped. "A file is mapped 2841 * writepage invocation because it may be mmapped. "A file is mapped
2839 * in multiples of the page size. For a file that is not a multiple of 2842 * in multiples of the page size. For a file that is not a multiple of
2840 * the page size, the remaining memory is zeroed when mapped, and 2843 * the page size, the remaining memory is zeroed when mapped, and
2841 * writes to that region are not written out to the file." 2844 * writes to that region are not written out to the file."
2842 */ 2845 */
2843 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2846 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2844 return __block_write_full_page(inode, page, get_block, wbc); 2847 return __block_write_full_page(inode, page, get_block, wbc);
2845 } 2848 }
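For reference, this is how a conventional filesystem wires block_write_full_page() into its ->writepage; a hedged sketch, again using the hypothetical myfs_get_block():

    #include <linux/buffer_head.h>
    #include <linux/writeback.h>

    extern int myfs_get_block(struct inode *inode, sector_t iblock,
    			  struct buffer_head *bh_result, int create);

    static int myfs_writepage(struct page *page, struct writeback_control *wbc)
    {
    	/* per-block disk mapping is delegated to the fs's get_block */
    	return block_write_full_page(page, myfs_get_block, wbc);
    }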
2846 2849
2847 sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2850 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2848 get_block_t *get_block) 2851 get_block_t *get_block)
2849 { 2852 {
2850 struct buffer_head tmp; 2853 struct buffer_head tmp;
2851 struct inode *inode = mapping->host; 2854 struct inode *inode = mapping->host;
2852 tmp.b_state = 0; 2855 tmp.b_state = 0;
2853 tmp.b_blocknr = 0; 2856 tmp.b_blocknr = 0;
2854 tmp.b_size = 1 << inode->i_blkbits; 2857 tmp.b_size = 1 << inode->i_blkbits;
2855 get_block(inode, block, &tmp, 0); 2858 get_block(inode, block, &tmp, 0);
2856 return tmp.b_blocknr; 2859 return tmp.b_blocknr;
2857 } 2860 }
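generic_block_bmap() is what normally backs the FIBMAP ioctl via the ->bmap address-space operation; a one-line wrapper is all a filesystem needs. A sketch, with the hypothetical myfs_get_block() as before:

    #include <linux/buffer_head.h>

    extern int myfs_get_block(struct inode *, sector_t, struct buffer_head *, int);

    static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
    {
    	/* returns 0 for holes/unmapped blocks, per the helper above */
    	return generic_block_bmap(mapping, block, myfs_get_block);
    }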
2858 2861
2859 static void end_bio_bh_io_sync(struct bio *bio, int err) 2862 static void end_bio_bh_io_sync(struct bio *bio, int err)
2860 { 2863 {
2861 struct buffer_head *bh = bio->bi_private; 2864 struct buffer_head *bh = bio->bi_private;
2862 2865
2863 if (err == -EOPNOTSUPP) { 2866 if (err == -EOPNOTSUPP) {
2864 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2867 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2865 set_bit(BH_Eopnotsupp, &bh->b_state); 2868 set_bit(BH_Eopnotsupp, &bh->b_state);
2866 } 2869 }
2867 2870
2868 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); 2871 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2869 bio_put(bio); 2872 bio_put(bio);
2870 } 2873 }
2871 2874
2872 int submit_bh(int rw, struct buffer_head * bh) 2875 int submit_bh(int rw, struct buffer_head * bh)
2873 { 2876 {
2874 struct bio *bio; 2877 struct bio *bio;
2875 int ret = 0; 2878 int ret = 0;
2876 2879
2877 BUG_ON(!buffer_locked(bh)); 2880 BUG_ON(!buffer_locked(bh));
2878 BUG_ON(!buffer_mapped(bh)); 2881 BUG_ON(!buffer_mapped(bh));
2879 BUG_ON(!bh->b_end_io); 2882 BUG_ON(!bh->b_end_io);
2880 2883
2881 if (buffer_ordered(bh) && (rw == WRITE)) 2884 if (buffer_ordered(bh) && (rw == WRITE))
2882 rw = WRITE_BARRIER; 2885 rw = WRITE_BARRIER;
2883 2886
2884 /* 2887 /*
2885 * Only clear out a write error when rewriting, should this 2888 * Only clear out a write error when rewriting, should this
2886 * include WRITE_SYNC as well? 2889 * include WRITE_SYNC as well?
2887 */ 2890 */
2888 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) 2891 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2889 clear_buffer_write_io_error(bh); 2892 clear_buffer_write_io_error(bh);
2890 2893
2891 /* 2894 /*
2892 * from here on down, it's all bio -- do the initial mapping, 2895 * from here on down, it's all bio -- do the initial mapping,
2893 * submit_bio -> generic_make_request may further map this bio around 2896 * submit_bio -> generic_make_request may further map this bio around
2894 */ 2897 */
2895 bio = bio_alloc(GFP_NOIO, 1); 2898 bio = bio_alloc(GFP_NOIO, 1);
2896 2899
2897 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 2900 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2898 bio->bi_bdev = bh->b_bdev; 2901 bio->bi_bdev = bh->b_bdev;
2899 bio->bi_io_vec[0].bv_page = bh->b_page; 2902 bio->bi_io_vec[0].bv_page = bh->b_page;
2900 bio->bi_io_vec[0].bv_len = bh->b_size; 2903 bio->bi_io_vec[0].bv_len = bh->b_size;
2901 bio->bi_io_vec[0].bv_offset = bh_offset(bh); 2904 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2902 2905
2903 bio->bi_vcnt = 1; 2906 bio->bi_vcnt = 1;
2904 bio->bi_idx = 0; 2907 bio->bi_idx = 0;
2905 bio->bi_size = bh->b_size; 2908 bio->bi_size = bh->b_size;
2906 2909
2907 bio->bi_end_io = end_bio_bh_io_sync; 2910 bio->bi_end_io = end_bio_bh_io_sync;
2908 bio->bi_private = bh; 2911 bio->bi_private = bh;
2909 2912
2910 bio_get(bio); 2913 bio_get(bio);
2911 submit_bio(rw, bio); 2914 submit_bio(rw, bio);
2912 2915
2913 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2916 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2914 ret = -EOPNOTSUPP; 2917 ret = -EOPNOTSUPP;
2915 2918
2916 bio_put(bio); 2919 bio_put(bio);
2917 return ret; 2920 return ret;
2918 } 2921 }
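A hedged example of driving submit_bh() directly for a synchronous read; this is essentially the pattern bh_submit_read() implements near the end of this file:

    #include <linux/buffer_head.h>

    static int myfs_read_bh_sync(struct buffer_head *bh)
    {
    	if (buffer_uptodate(bh))
    		return 0;

    	lock_buffer(bh);
    	if (buffer_uptodate(bh)) {		/* lost a race with another reader */
    		unlock_buffer(bh);
    		return 0;
    	}
    	get_bh(bh);				/* reference consumed by the end_io handler */
    	bh->b_end_io = end_buffer_read_sync;	/* unlocks the buffer and drops that ref */
    	submit_bh(READ, bh);
    	wait_on_buffer(bh);
    	return buffer_uptodate(bh) ? 0 : -EIO;
    }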
2919 2922
2920 /** 2923 /**
2921 * ll_rw_block: low-level access to block devices (DEPRECATED) 2924 * ll_rw_block: low-level access to block devices (DEPRECATED)
2922 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) 2925 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2923 * @nr: number of &struct buffer_heads in the array 2926 * @nr: number of &struct buffer_heads in the array
2924 * @bhs: array of pointers to &struct buffer_head 2927 * @bhs: array of pointers to &struct buffer_head
2925 * 2928 *
2926 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and 2929 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2927 * requests an I/O operation on them, either a %READ or a %WRITE. The third 2930 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2928 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers 2931 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2929 * are sent to disk. The fourth %READA option is described in the documentation 2932 * are sent to disk. The fourth %READA option is described in the documentation
2930 * for generic_make_request() which ll_rw_block() calls. 2933 * for generic_make_request() which ll_rw_block() calls.
2931 * 2934 *
2932 * This function drops any buffer that it cannot get a lock on (with the 2935 * This function drops any buffer that it cannot get a lock on (with the
2933 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be 2936 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2934 * clean when doing a write request, and any buffer that appears to be 2937 * clean when doing a write request, and any buffer that appears to be
2935 * up-to-date when doing a read request. Further, it marks as clean buffers that 2938 * up-to-date when doing a read request. Further, it marks as clean buffers that
2936 * are processed for writing (the buffer cache won't assume that they are 2939 * are processed for writing (the buffer cache won't assume that they are
2937 * actually clean until the buffer gets unlocked). 2940 * actually clean until the buffer gets unlocked).
2938 * 2941 *
2939 * ll_rw_block sets b_end_io to a simple completion handler that marks 2942 * ll_rw_block sets b_end_io to a simple completion handler that marks
2940 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes 2943 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2941 * any waiters. 2944 * any waiters.
2942 * 2945 *
2943 * All of the buffers must be for the same device, and must also be a 2946 * All of the buffers must be for the same device, and must also be a
2944 * multiple of the current approved size for the device. 2947 * multiple of the current approved size for the device.
2945 */ 2948 */
2946 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) 2949 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2947 { 2950 {
2948 int i; 2951 int i;
2949 2952
2950 for (i = 0; i < nr; i++) { 2953 for (i = 0; i < nr; i++) {
2951 struct buffer_head *bh = bhs[i]; 2954 struct buffer_head *bh = bhs[i];
2952 2955
2953 if (rw == SWRITE || rw == SWRITE_SYNC) 2956 if (rw == SWRITE || rw == SWRITE_SYNC)
2954 lock_buffer(bh); 2957 lock_buffer(bh);
2955 else if (test_set_buffer_locked(bh)) 2958 else if (test_set_buffer_locked(bh))
2956 continue; 2959 continue;
2957 2960
2958 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 2961 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
2959 if (test_clear_buffer_dirty(bh)) { 2962 if (test_clear_buffer_dirty(bh)) {
2960 bh->b_end_io = end_buffer_write_sync; 2963 bh->b_end_io = end_buffer_write_sync;
2961 get_bh(bh); 2964 get_bh(bh);
2962 if (rw == SWRITE_SYNC) 2965 if (rw == SWRITE_SYNC)
2963 submit_bh(WRITE_SYNC, bh); 2966 submit_bh(WRITE_SYNC, bh);
2964 else 2967 else
2965 submit_bh(WRITE, bh); 2968 submit_bh(WRITE, bh);
2966 continue; 2969 continue;
2967 } 2970 }
2968 } else { 2971 } else {
2969 if (!buffer_uptodate(bh)) { 2972 if (!buffer_uptodate(bh)) {
2970 bh->b_end_io = end_buffer_read_sync; 2973 bh->b_end_io = end_buffer_read_sync;
2971 get_bh(bh); 2974 get_bh(bh);
2972 submit_bh(rw, bh); 2975 submit_bh(rw, bh);
2973 continue; 2976 continue;
2974 } 2977 }
2975 } 2978 }
2976 unlock_buffer(bh); 2979 unlock_buffer(bh);
2977 } 2980 }
2978 } 2981 }
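A hedged usage sketch for ll_rw_block(): kick off reads on a small cluster of metadata buffers and wait only for the one needed right away. Buffers that are already up to date, or that someone else holds locked, are silently skipped, so the call must not be relied on for data-integrity reads:

    #include <linux/buffer_head.h>

    static struct buffer_head *myfs_bread_cluster(struct super_block *sb,
    					      sector_t block)
    {
    	struct buffer_head *bhs[4];
    	int i;

    	for (i = 0; i < 4; i++)
    		bhs[i] = sb_getblk(sb, block + i);

    	ll_rw_block(READ, 4, bhs);	/* asynchronous; uptodate/locked bhs skipped */

    	for (i = 1; i < 4; i++)
    		brelse(bhs[i]);		/* read-ahead only; the I/O holds its own ref */

    	wait_on_buffer(bhs[0]);		/* the caller needs just the first block */
    	if (!buffer_uptodate(bhs[0])) {
    		brelse(bhs[0]);
    		return NULL;
    	}
    	return bhs[0];
    }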
2979 2982
2980 /* 2983 /*
2981 * For a data-integrity writeout, we need to wait upon any in-progress I/O 2984 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2982 * and then start new I/O and then wait upon it. The caller must have a ref on 2985 * and then start new I/O and then wait upon it. The caller must have a ref on
2983 * the buffer_head. 2986 * the buffer_head.
2984 */ 2987 */
2985 int sync_dirty_buffer(struct buffer_head *bh) 2988 int sync_dirty_buffer(struct buffer_head *bh)
2986 { 2989 {
2987 int ret = 0; 2990 int ret = 0;
2988 2991
2989 WARN_ON(atomic_read(&bh->b_count) < 1); 2992 WARN_ON(atomic_read(&bh->b_count) < 1);
2990 lock_buffer(bh); 2993 lock_buffer(bh);
2991 if (test_clear_buffer_dirty(bh)) { 2994 if (test_clear_buffer_dirty(bh)) {
2992 get_bh(bh); 2995 get_bh(bh);
2993 bh->b_end_io = end_buffer_write_sync; 2996 bh->b_end_io = end_buffer_write_sync;
2994 ret = submit_bh(WRITE_SYNC, bh); 2997 ret = submit_bh(WRITE_SYNC, bh);
2995 wait_on_buffer(bh); 2998 wait_on_buffer(bh);
2996 if (buffer_eopnotsupp(bh)) { 2999 if (buffer_eopnotsupp(bh)) {
2997 clear_buffer_eopnotsupp(bh); 3000 clear_buffer_eopnotsupp(bh);
2998 ret = -EOPNOTSUPP; 3001 ret = -EOPNOTSUPP;
2999 } 3002 }
3000 if (!ret && !buffer_uptodate(bh)) 3003 if (!ret && !buffer_uptodate(bh))
3001 ret = -EIO; 3004 ret = -EIO;
3002 } else { 3005 } else {
3003 unlock_buffer(bh); 3006 unlock_buffer(bh);
3004 } 3007 }
3005 return ret; 3008 return ret;
3006 } 3009 }
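Typical caller pattern for sync_dirty_buffer(), for instance when a filesystem must push an updated superblock to disk before proceeding; a hedged sketch in which sbh is assumed to have come from sb_bread() at mount time:

    #include <linux/buffer_head.h>

    static int myfs_commit_super(struct buffer_head *sbh)
    {
    	mark_buffer_dirty(sbh);
    	return sync_dirty_buffer(sbh);	/* waits for the write; -EIO on failure */
    }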
3007 3010
3008 /* 3011 /*
3009 * try_to_free_buffers() checks if all the buffers on this particular page 3012 * try_to_free_buffers() checks if all the buffers on this particular page
3010 * are unused, and releases them if so. 3013 * are unused, and releases them if so.
3011 * 3014 *
3012 * Exclusion against try_to_free_buffers may be obtained by either 3015 * Exclusion against try_to_free_buffers may be obtained by either
3013 * locking the page or by holding its mapping's private_lock. 3016 * locking the page or by holding its mapping's private_lock.
3014 * 3017 *
3015 * If the page is dirty but all the buffers are clean then we need to 3018 * If the page is dirty but all the buffers are clean then we need to
3016 * be sure to mark the page clean as well. This is because the page 3019 * be sure to mark the page clean as well. This is because the page
3017 * may be against a block device, and a later reattachment of buffers 3020 * may be against a block device, and a later reattachment of buffers
3018 * to a dirty page will set *all* buffers dirty. Which would corrupt 3021 * to a dirty page will set *all* buffers dirty. Which would corrupt
3019 * filesystem data on the same device. 3022 * filesystem data on the same device.
3020 * 3023 *
3021 * The same applies to regular filesystem pages: if all the buffers are 3024 * The same applies to regular filesystem pages: if all the buffers are
3022 * clean then we set the page clean and proceed. To do that, we require 3025 * clean then we set the page clean and proceed. To do that, we require
3023 * total exclusion from __set_page_dirty_buffers(). That is obtained with 3026 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3024 * private_lock. 3027 * private_lock.
3025 * 3028 *
3026 * try_to_free_buffers() is non-blocking. 3029 * try_to_free_buffers() is non-blocking.
3027 */ 3030 */
3028 static inline int buffer_busy(struct buffer_head *bh) 3031 static inline int buffer_busy(struct buffer_head *bh)
3029 { 3032 {
3030 return atomic_read(&bh->b_count) | 3033 return atomic_read(&bh->b_count) |
3031 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); 3034 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3032 } 3035 }
3033 3036
3034 static int 3037 static int
3035 drop_buffers(struct page *page, struct buffer_head **buffers_to_free) 3038 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3036 { 3039 {
3037 struct buffer_head *head = page_buffers(page); 3040 struct buffer_head *head = page_buffers(page);
3038 struct buffer_head *bh; 3041 struct buffer_head *bh;
3039 3042
3040 bh = head; 3043 bh = head;
3041 do { 3044 do {
3042 if (buffer_write_io_error(bh) && page->mapping) 3045 if (buffer_write_io_error(bh) && page->mapping)
3043 set_bit(AS_EIO, &page->mapping->flags); 3046 set_bit(AS_EIO, &page->mapping->flags);
3044 if (buffer_busy(bh)) 3047 if (buffer_busy(bh))
3045 goto failed; 3048 goto failed;
3046 bh = bh->b_this_page; 3049 bh = bh->b_this_page;
3047 } while (bh != head); 3050 } while (bh != head);
3048 3051
3049 do { 3052 do {
3050 struct buffer_head *next = bh->b_this_page; 3053 struct buffer_head *next = bh->b_this_page;
3051 3054
3052 if (bh->b_assoc_map) 3055 if (bh->b_assoc_map)
3053 __remove_assoc_queue(bh); 3056 __remove_assoc_queue(bh);
3054 bh = next; 3057 bh = next;
3055 } while (bh != head); 3058 } while (bh != head);
3056 *buffers_to_free = head; 3059 *buffers_to_free = head;
3057 __clear_page_buffers(page); 3060 __clear_page_buffers(page);
3058 return 1; 3061 return 1;
3059 failed: 3062 failed:
3060 return 0; 3063 return 0;
3061 } 3064 }
3062 3065
3063 int try_to_free_buffers(struct page *page) 3066 int try_to_free_buffers(struct page *page)
3064 { 3067 {
3065 struct address_space * const mapping = page->mapping; 3068 struct address_space * const mapping = page->mapping;
3066 struct buffer_head *buffers_to_free = NULL; 3069 struct buffer_head *buffers_to_free = NULL;
3067 int ret = 0; 3070 int ret = 0;
3068 3071
3069 BUG_ON(!PageLocked(page)); 3072 BUG_ON(!PageLocked(page));
3070 if (PageWriteback(page)) 3073 if (PageWriteback(page))
3071 return 0; 3074 return 0;
3072 3075
3073 if (mapping == NULL) { /* can this still happen? */ 3076 if (mapping == NULL) { /* can this still happen? */
3074 ret = drop_buffers(page, &buffers_to_free); 3077 ret = drop_buffers(page, &buffers_to_free);
3075 goto out; 3078 goto out;
3076 } 3079 }
3077 3080
3078 spin_lock(&mapping->private_lock); 3081 spin_lock(&mapping->private_lock);
3079 ret = drop_buffers(page, &buffers_to_free); 3082 ret = drop_buffers(page, &buffers_to_free);
3080 3083
3081 /* 3084 /*
3082 * If the filesystem writes its buffers by hand (eg ext3) 3085 * If the filesystem writes its buffers by hand (eg ext3)
3083 * then we can have clean buffers against a dirty page. We 3086 * then we can have clean buffers against a dirty page. We
3084 * clean the page here; otherwise the VM will never notice 3087 * clean the page here; otherwise the VM will never notice
3085 * that the filesystem did any IO at all. 3088 * that the filesystem did any IO at all.
3086 * 3089 *
3087 * Also, during truncate, discard_buffer will have marked all 3090 * Also, during truncate, discard_buffer will have marked all
3088 * the page's buffers clean. We discover that here and clean 3091 * the page's buffers clean. We discover that here and clean
3089 * the page also. 3092 * the page also.
3090 * 3093 *
3091 * private_lock must be held over this entire operation in order 3094 * private_lock must be held over this entire operation in order
3092 * to synchronise against __set_page_dirty_buffers and prevent the 3095 * to synchronise against __set_page_dirty_buffers and prevent the
3093 * dirty bit from being lost. 3096 * dirty bit from being lost.
3094 */ 3097 */
3095 if (ret) 3098 if (ret)
3096 cancel_dirty_page(page, PAGE_CACHE_SIZE); 3099 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3097 spin_unlock(&mapping->private_lock); 3100 spin_unlock(&mapping->private_lock);
3098 out: 3101 out:
3099 if (buffers_to_free) { 3102 if (buffers_to_free) {
3100 struct buffer_head *bh = buffers_to_free; 3103 struct buffer_head *bh = buffers_to_free;
3101 3104
3102 do { 3105 do {
3103 struct buffer_head *next = bh->b_this_page; 3106 struct buffer_head *next = bh->b_this_page;
3104 free_buffer_head(bh); 3107 free_buffer_head(bh);
3105 bh = next; 3108 bh = next;
3106 } while (bh != buffers_to_free); 3109 } while (bh != buffers_to_free);
3107 } 3110 }
3108 return ret; 3111 return ret;
3109 } 3112 }
3110 EXPORT_SYMBOL(try_to_free_buffers); 3113 EXPORT_SYMBOL(try_to_free_buffers);
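A filesystem that needs to veto buffer stripping does so from its ->releasepage before falling back to try_to_free_buffers(); a speculative sketch in which PageChecked() stands in for whatever private "still in use" state the filesystem tracks:

    #include <linux/buffer_head.h>
    #include <linux/pagemap.h>

    static int myfs_releasepage(struct page *page, gfp_t gfp)
    {
    	if (PageChecked(page))		/* hypothetical: page still pinned by the fs */
    		return 0;		/* refuse; the VM will try again later */
    	return try_to_free_buffers(page);
    }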
3111 3114
3112 void block_sync_page(struct page *page) 3115 void block_sync_page(struct page *page)
3113 { 3116 {
3114 struct address_space *mapping; 3117 struct address_space *mapping;
3115 3118
3116 smp_mb(); 3119 smp_mb();
3117 mapping = page_mapping(page); 3120 mapping = page_mapping(page);
3118 if (mapping) 3121 if (mapping)
3119 blk_run_backing_dev(mapping->backing_dev_info, page); 3122 blk_run_backing_dev(mapping->backing_dev_info, page);
3120 } 3123 }
3121 3124
3122 /* 3125 /*
3123 * There are no bdflush tunables left. But distributions are 3126 * There are no bdflush tunables left. But distributions are
3124 * still running obsolete flush daemons, so we terminate them here. 3127 * still running obsolete flush daemons, so we terminate them here.
3125 * 3128 *
3126 * Use of bdflush() is deprecated and will be removed in a future kernel. 3129 * Use of bdflush() is deprecated and will be removed in a future kernel.
3127 * The `pdflush' kernel threads fully replace bdflush daemons and this call. 3130 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3128 */ 3131 */
3129 asmlinkage long sys_bdflush(int func, long data) 3132 asmlinkage long sys_bdflush(int func, long data)
3130 { 3133 {
3131 static int msg_count; 3134 static int msg_count;
3132 3135
3133 if (!capable(CAP_SYS_ADMIN)) 3136 if (!capable(CAP_SYS_ADMIN))
3134 return -EPERM; 3137 return -EPERM;
3135 3138
3136 if (msg_count < 5) { 3139 if (msg_count < 5) {
3137 msg_count++; 3140 msg_count++;
3138 printk(KERN_INFO 3141 printk(KERN_INFO
3139 "warning: process `%s' used the obsolete bdflush" 3142 "warning: process `%s' used the obsolete bdflush"
3140 " system call\n", current->comm); 3143 " system call\n", current->comm);
3141 printk(KERN_INFO "Fix your initscripts?\n"); 3144 printk(KERN_INFO "Fix your initscripts?\n");
3142 } 3145 }
3143 3146
3144 if (func == 1) 3147 if (func == 1)
3145 do_exit(0); 3148 do_exit(0);
3146 return 0; 3149 return 0;
3147 } 3150 }
3148 3151
3149 /* 3152 /*
3150 * Buffer-head allocation 3153 * Buffer-head allocation
3151 */ 3154 */
3152 static struct kmem_cache *bh_cachep; 3155 static struct kmem_cache *bh_cachep;
3153 3156
3154 /* 3157 /*
3155 * Once the number of bh's in the machine exceeds this level, we start 3158 * Once the number of bh's in the machine exceeds this level, we start
3156 * stripping them in writeback. 3159 * stripping them in writeback.
3157 */ 3160 */
3158 static int max_buffer_heads; 3161 static int max_buffer_heads;
3159 3162
3160 int buffer_heads_over_limit; 3163 int buffer_heads_over_limit;
3161 3164
3162 struct bh_accounting { 3165 struct bh_accounting {
3163 int nr; /* Number of live bh's */ 3166 int nr; /* Number of live bh's */
3164 int ratelimit; /* Limit cacheline bouncing */ 3167 int ratelimit; /* Limit cacheline bouncing */
3165 }; 3168 };
3166 3169
3167 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; 3170 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3168 3171
3169 static void recalc_bh_state(void) 3172 static void recalc_bh_state(void)
3170 { 3173 {
3171 int i; 3174 int i;
3172 int tot = 0; 3175 int tot = 0;
3173 3176
3174 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) 3177 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3175 return; 3178 return;
3176 __get_cpu_var(bh_accounting).ratelimit = 0; 3179 __get_cpu_var(bh_accounting).ratelimit = 0;
3177 for_each_online_cpu(i) 3180 for_each_online_cpu(i)
3178 tot += per_cpu(bh_accounting, i).nr; 3181 tot += per_cpu(bh_accounting, i).nr;
3179 buffer_heads_over_limit = (tot > max_buffer_heads); 3182 buffer_heads_over_limit = (tot > max_buffer_heads);
3180 } 3183 }
3181 3184
3182 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3185 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3183 { 3186 {
3184 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 3187 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3185 if (ret) { 3188 if (ret) {
3186 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3189 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3187 get_cpu_var(bh_accounting).nr++; 3190 get_cpu_var(bh_accounting).nr++;
3188 recalc_bh_state(); 3191 recalc_bh_state();
3189 put_cpu_var(bh_accounting); 3192 put_cpu_var(bh_accounting);
3190 } 3193 }
3191 return ret; 3194 return ret;
3192 } 3195 }
3193 EXPORT_SYMBOL(alloc_buffer_head); 3196 EXPORT_SYMBOL(alloc_buffer_head);
3194 3197
3195 void free_buffer_head(struct buffer_head *bh) 3198 void free_buffer_head(struct buffer_head *bh)
3196 { 3199 {
3197 BUG_ON(!list_empty(&bh->b_assoc_buffers)); 3200 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3198 kmem_cache_free(bh_cachep, bh); 3201 kmem_cache_free(bh_cachep, bh);
3199 get_cpu_var(bh_accounting).nr--; 3202 get_cpu_var(bh_accounting).nr--;
3200 recalc_bh_state(); 3203 recalc_bh_state();
3201 put_cpu_var(bh_accounting); 3204 put_cpu_var(bh_accounting);
3202 } 3205 }
3203 EXPORT_SYMBOL(free_buffer_head); 3206 EXPORT_SYMBOL(free_buffer_head);
3204 3207
3205 static void buffer_exit_cpu(int cpu) 3208 static void buffer_exit_cpu(int cpu)
3206 { 3209 {
3207 int i; 3210 int i;
3208 struct bh_lru *b = &per_cpu(bh_lrus, cpu); 3211 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3209 3212
3210 for (i = 0; i < BH_LRU_SIZE; i++) { 3213 for (i = 0; i < BH_LRU_SIZE; i++) {
3211 brelse(b->bhs[i]); 3214 brelse(b->bhs[i]);
3212 b->bhs[i] = NULL; 3215 b->bhs[i] = NULL;
3213 } 3216 }
3214 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; 3217 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3215 per_cpu(bh_accounting, cpu).nr = 0; 3218 per_cpu(bh_accounting, cpu).nr = 0;
3216 put_cpu_var(bh_accounting); 3219 put_cpu_var(bh_accounting);
3217 } 3220 }
3218 3221
3219 static int buffer_cpu_notify(struct notifier_block *self, 3222 static int buffer_cpu_notify(struct notifier_block *self,
3220 unsigned long action, void *hcpu) 3223 unsigned long action, void *hcpu)
3221 { 3224 {
3222 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 3225 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3223 buffer_exit_cpu((unsigned long)hcpu); 3226 buffer_exit_cpu((unsigned long)hcpu);
3224 return NOTIFY_OK; 3227 return NOTIFY_OK;
3225 } 3228 }
3226 3229
3227 /** 3230 /**
3228 * bh_uptodate_or_lock - Test whether the buffer is uptodate 3231 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3229 * @bh: struct buffer_head 3232 * @bh: struct buffer_head
3230 * 3233 *
3231 * Return true if the buffer is up-to-date and false, 3234 * Return true if the buffer is up-to-date and false,
3232 * with the buffer locked, if not. 3235 * with the buffer locked, if not.
3233 */ 3236 */
3234 int bh_uptodate_or_lock(struct buffer_head *bh) 3237 int bh_uptodate_or_lock(struct buffer_head *bh)
3235 { 3238 {
3236 if (!buffer_uptodate(bh)) { 3239 if (!buffer_uptodate(bh)) {
3237 lock_buffer(bh); 3240 lock_buffer(bh);
3238 if (!buffer_uptodate(bh)) 3241 if (!buffer_uptodate(bh))
3239 return 0; 3242 return 0;
3240 unlock_buffer(bh); 3243 unlock_buffer(bh);
3241 } 3244 }
3242 return 1; 3245 return 1;
3243 } 3246 }
3244 EXPORT_SYMBOL(bh_uptodate_or_lock); 3247 EXPORT_SYMBOL(bh_uptodate_or_lock);
3245 3248
3246 /** 3249 /**
3247 * bh_submit_read - Submit a locked buffer for reading 3250 * bh_submit_read - Submit a locked buffer for reading
3248 * @bh: struct buffer_head 3251 * @bh: struct buffer_head
3249 * 3252 *
3250 * Returns zero on success and -EIO on error. 3253 * Returns zero on success and -EIO on error.
3251 */ 3254 */
3252 int bh_submit_read(struct buffer_head *bh) 3255 int bh_submit_read(struct buffer_head *bh)
3253 { 3256 {
3254 BUG_ON(!buffer_locked(bh)); 3257 BUG_ON(!buffer_locked(bh));
3255 3258
3256 if (buffer_uptodate(bh)) { 3259 if (buffer_uptodate(bh)) {
3257 unlock_buffer(bh); 3260 unlock_buffer(bh);
3258 return 0; 3261 return 0;
3259 } 3262 }
3260 3263
3261 get_bh(bh); 3264 get_bh(bh);
3262 bh->b_end_io = end_buffer_read_sync; 3265 bh->b_end_io = end_buffer_read_sync;
3263 submit_bh(READ, bh); 3266 submit_bh(READ, bh);
3264 wait_on_buffer(bh); 3267 wait_on_buffer(bh);
3265 if (buffer_uptodate(bh)) 3268 if (buffer_uptodate(bh))
3266 return 0; 3269 return 0;
3267 return -EIO; 3270 return -EIO;
3268 } 3271 }
3269 EXPORT_SYMBOL(bh_submit_read); 3272 EXPORT_SYMBOL(bh_submit_read);
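The two helpers above are designed to be used as a pair: bh_uptodate_or_lock() either reports the buffer as already valid or returns it locked, ready to be handed to bh_submit_read(). A short sketch:

    #include <linux/buffer_head.h>

    static int myfs_read_bh(struct buffer_head *bh)
    {
    	if (bh_uptodate_or_lock(bh))
    		return 0;		/* already up to date; buffer left unlocked */
    	return bh_submit_read(bh);	/* we hold the lock: submit, wait, 0 or -EIO */
    }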
3270 3273
3271 static void 3274 static void
3272 init_buffer_head(struct kmem_cache *cachep, void *data) 3275 init_buffer_head(struct kmem_cache *cachep, void *data)
3273 { 3276 {
3274 struct buffer_head *bh = data; 3277 struct buffer_head *bh = data;
3275 3278
3276 memset(bh, 0, sizeof(*bh)); 3279 memset(bh, 0, sizeof(*bh));
3277 INIT_LIST_HEAD(&bh->b_assoc_buffers); 3280 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3278 } 3281 }
3279 3282
3280 void __init buffer_init(void) 3283 void __init buffer_init(void)
3281 { 3284 {
3282 int nrpages; 3285 int nrpages;
3283 3286
3284 bh_cachep = kmem_cache_create("buffer_head", 3287 bh_cachep = kmem_cache_create("buffer_head",
3285 sizeof(struct buffer_head), 0, 3288 sizeof(struct buffer_head), 0,
3286 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 3289 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3287 SLAB_MEM_SPREAD), 3290 SLAB_MEM_SPREAD),
3288 init_buffer_head); 3291 init_buffer_head);
3289 3292
3290 /* 3293 /*
3291 * Limit the bh occupancy to 10% of ZONE_NORMAL 3294 * Limit the bh occupancy to 10% of ZONE_NORMAL
3292 */ 3295 */
3293 nrpages = (nr_free_buffer_pages() * 10) / 100; 3296 nrpages = (nr_free_buffer_pages() * 10) / 100;
3294 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); 3297 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3295 hotcpu_notifier(buffer_cpu_notify, 0); 3298 hotcpu_notifier(buffer_cpu_notify, 0);
3296 } 3299 }
3297 3300
3298 EXPORT_SYMBOL(__bforget); 3301 EXPORT_SYMBOL(__bforget);
3299 EXPORT_SYMBOL(__brelse); 3302 EXPORT_SYMBOL(__brelse);
3300 EXPORT_SYMBOL(__wait_on_buffer); 3303 EXPORT_SYMBOL(__wait_on_buffer);
3301 EXPORT_SYMBOL(block_commit_write); 3304 EXPORT_SYMBOL(block_commit_write);
3302 EXPORT_SYMBOL(block_prepare_write); 3305 EXPORT_SYMBOL(block_prepare_write);
3303 EXPORT_SYMBOL(block_page_mkwrite); 3306 EXPORT_SYMBOL(block_page_mkwrite);
3304 EXPORT_SYMBOL(block_read_full_page); 3307 EXPORT_SYMBOL(block_read_full_page);
3305 EXPORT_SYMBOL(block_sync_page); 3308 EXPORT_SYMBOL(block_sync_page);
3306 EXPORT_SYMBOL(block_truncate_page); 3309 EXPORT_SYMBOL(block_truncate_page);
3307 EXPORT_SYMBOL(block_write_full_page); 3310 EXPORT_SYMBOL(block_write_full_page);
3308 EXPORT_SYMBOL(cont_write_begin); 3311 EXPORT_SYMBOL(cont_write_begin);
3309 EXPORT_SYMBOL(end_buffer_read_sync); 3312 EXPORT_SYMBOL(end_buffer_read_sync);
3310 EXPORT_SYMBOL(end_buffer_write_sync); 3313 EXPORT_SYMBOL(end_buffer_write_sync);
3311 EXPORT_SYMBOL(file_fsync); 3314 EXPORT_SYMBOL(file_fsync);
3312 EXPORT_SYMBOL(fsync_bdev); 3315 EXPORT_SYMBOL(fsync_bdev);
3313 EXPORT_SYMBOL(generic_block_bmap); 3316 EXPORT_SYMBOL(generic_block_bmap);
3314 EXPORT_SYMBOL(generic_cont_expand_simple); 3317 EXPORT_SYMBOL(generic_cont_expand_simple);
3315 EXPORT_SYMBOL(init_buffer); 3318 EXPORT_SYMBOL(init_buffer);
3316 EXPORT_SYMBOL(invalidate_bdev); 3319 EXPORT_SYMBOL(invalidate_bdev);
3317 EXPORT_SYMBOL(ll_rw_block); 3320 EXPORT_SYMBOL(ll_rw_block);
3318 EXPORT_SYMBOL(mark_buffer_dirty); 3321 EXPORT_SYMBOL(mark_buffer_dirty);
3319 EXPORT_SYMBOL(submit_bh); 3322 EXPORT_SYMBOL(submit_bh);
3320 EXPORT_SYMBOL(sync_dirty_buffer); 3323 EXPORT_SYMBOL(sync_dirty_buffer);
3321 EXPORT_SYMBOL(unlock_buffer); 3324 EXPORT_SYMBOL(unlock_buffer);
3322 3325
1 /* 1 /*
2 * fs/mpage.c 2 * fs/mpage.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * Contains functions related to preparing and submitting BIOs which contain 6 * Contains functions related to preparing and submitting BIOs which contain
7 * multiple pagecache pages. 7 * multiple pagecache pages.
8 * 8 *
9 * 15May2002 akpm@zip.com.au 9 * 15May2002 akpm@zip.com.au
10 * Initial version 10 * Initial version
11 * 27Jun2002 axboe@suse.de 11 * 27Jun2002 axboe@suse.de
12 * use bio_add_page() to build bio's just the right size 12 * use bio_add_page() to build bio's just the right size
13 */ 13 */
14 14
15 #include <linux/kernel.h> 15 #include <linux/kernel.h>
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/mm.h> 17 #include <linux/mm.h>
18 #include <linux/kdev_t.h> 18 #include <linux/kdev_t.h>
19 #include <linux/bio.h> 19 #include <linux/bio.h>
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/buffer_head.h> 21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h> 22 #include <linux/blkdev.h>
23 #include <linux/highmem.h> 23 #include <linux/highmem.h>
24 #include <linux/prefetch.h> 24 #include <linux/prefetch.h>
25 #include <linux/mpage.h> 25 #include <linux/mpage.h>
26 #include <linux/writeback.h> 26 #include <linux/writeback.h>
27 #include <linux/backing-dev.h> 27 #include <linux/backing-dev.h>
28 #include <linux/pagevec.h> 28 #include <linux/pagevec.h>
29 29
30 /* 30 /*
31 * I/O completion handler for multipage BIOs. 31 * I/O completion handler for multipage BIOs.
32 * 32 *
33 * The mpage code never puts partial pages into a BIO (except for end-of-file). 33 * The mpage code never puts partial pages into a BIO (except for end-of-file).
34 * If a page does not map to a contiguous run of blocks then it simply falls 34 * If a page does not map to a contiguous run of blocks then it simply falls
35 * back to block_read_full_page(). 35 * back to block_read_full_page().
36 * 36 *
37 * Why is this? If a page's completion depends on a number of different BIOs 37 * Why is this? If a page's completion depends on a number of different BIOs
38 * which can complete in any order (or at the same time) then determining the 38 * which can complete in any order (or at the same time) then determining the
39 * status of that page is hard. See end_buffer_async_read() for the details. 39 * status of that page is hard. See end_buffer_async_read() for the details.
40 * There is no point in duplicating all that complexity. 40 * There is no point in duplicating all that complexity.
41 */ 41 */
42 static void mpage_end_io_read(struct bio *bio, int err) 42 static void mpage_end_io_read(struct bio *bio, int err)
43 { 43 {
44 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 44 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
45 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 45 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
46 46
47 do { 47 do {
48 struct page *page = bvec->bv_page; 48 struct page *page = bvec->bv_page;
49 49
50 if (--bvec >= bio->bi_io_vec) 50 if (--bvec >= bio->bi_io_vec)
51 prefetchw(&bvec->bv_page->flags); 51 prefetchw(&bvec->bv_page->flags);
52 52
53 if (uptodate) { 53 if (uptodate) {
54 SetPageUptodate(page); 54 SetPageUptodate(page);
55 } else { 55 } else {
56 ClearPageUptodate(page); 56 ClearPageUptodate(page);
57 SetPageError(page); 57 SetPageError(page);
58 } 58 }
59 unlock_page(page); 59 unlock_page(page);
60 } while (bvec >= bio->bi_io_vec); 60 } while (bvec >= bio->bi_io_vec);
61 bio_put(bio); 61 bio_put(bio);
62 } 62 }
63 63
64 static void mpage_end_io_write(struct bio *bio, int err) 64 static void mpage_end_io_write(struct bio *bio, int err)
65 { 65 {
66 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 66 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
67 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 67 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
68 68
69 do { 69 do {
70 struct page *page = bvec->bv_page; 70 struct page *page = bvec->bv_page;
71 71
72 if (--bvec >= bio->bi_io_vec) 72 if (--bvec >= bio->bi_io_vec)
73 prefetchw(&bvec->bv_page->flags); 73 prefetchw(&bvec->bv_page->flags);
74 74
75 if (!uptodate){ 75 if (!uptodate){
76 SetPageError(page); 76 SetPageError(page);
77 if (page->mapping) 77 if (page->mapping)
78 set_bit(AS_EIO, &page->mapping->flags); 78 set_bit(AS_EIO, &page->mapping->flags);
79 } 79 }
80 end_page_writeback(page); 80 end_page_writeback(page);
81 } while (bvec >= bio->bi_io_vec); 81 } while (bvec >= bio->bi_io_vec);
82 bio_put(bio); 82 bio_put(bio);
83 } 83 }
84 84
85 static struct bio *mpage_bio_submit(int rw, struct bio *bio) 85 struct bio *mpage_bio_submit(int rw, struct bio *bio)
86 { 86 {
87 bio->bi_end_io = mpage_end_io_read; 87 bio->bi_end_io = mpage_end_io_read;
88 if (rw == WRITE) 88 if (rw == WRITE)
89 bio->bi_end_io = mpage_end_io_write; 89 bio->bi_end_io = mpage_end_io_write;
90 submit_bio(rw, bio); 90 submit_bio(rw, bio);
91 return NULL; 91 return NULL;
92 } 92 }
93 EXPORT_SYMBOL(mpage_bio_submit);
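mpage_bio_submit() is made non-static and exported here, and __mpage_writepage() further down similarly loses its static qualifier, so that code outside fs/mpage.c can drive the mpage write path itself. A hedged sketch of what such an external caller might look like, closely mirroring mpage_writepages(); it assumes struct mpage_data and the new prototypes become visible through a shared header, and myfs_get_block() is hypothetical:

    #include <linux/mpage.h>
    #include <linux/writeback.h>
    #include <linux/buffer_head.h>

    extern int myfs_get_block(struct inode *, sector_t, struct buffer_head *, int);

    static int myfs_writepages(struct address_space *mapping,
    			   struct writeback_control *wbc)
    {
    	struct mpage_data mpd = {
    		.bio			= NULL,
    		.last_block_in_bio	= 0,
    		.get_block		= myfs_get_block,
    		.use_writepage		= 1,
    	};
    	int ret;

    	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
    	if (mpd.bio)
    		mpage_bio_submit(WRITE, mpd.bio);	/* flush the final partial bio */
    	return ret;
    }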
93 94
94 static struct bio * 95 static struct bio *
95 mpage_alloc(struct block_device *bdev, 96 mpage_alloc(struct block_device *bdev,
96 sector_t first_sector, int nr_vecs, 97 sector_t first_sector, int nr_vecs,
97 gfp_t gfp_flags) 98 gfp_t gfp_flags)
98 { 99 {
99 struct bio *bio; 100 struct bio *bio;
100 101
101 bio = bio_alloc(gfp_flags, nr_vecs); 102 bio = bio_alloc(gfp_flags, nr_vecs);
102 103
103 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 104 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
104 while (!bio && (nr_vecs /= 2)) 105 while (!bio && (nr_vecs /= 2))
105 bio = bio_alloc(gfp_flags, nr_vecs); 106 bio = bio_alloc(gfp_flags, nr_vecs);
106 } 107 }
107 108
108 if (bio) { 109 if (bio) {
109 bio->bi_bdev = bdev; 110 bio->bi_bdev = bdev;
110 bio->bi_sector = first_sector; 111 bio->bi_sector = first_sector;
111 } 112 }
112 return bio; 113 return bio;
113 } 114 }
114 115
115 /* 116 /*
116 * support function for mpage_readpages. The fs supplied get_block might 117 * support function for mpage_readpages. The fs supplied get_block might
117 * return an up to date buffer. This is used to map that buffer into 118 * return an up to date buffer. This is used to map that buffer into
118 * the page, which allows readpage to avoid triggering a duplicate call 119 * the page, which allows readpage to avoid triggering a duplicate call
119 * to get_block. 120 * to get_block.
120 * 121 *
121 * The idea is to avoid adding buffers to pages that don't already have 122 * The idea is to avoid adding buffers to pages that don't already have
122 * them. So when the buffer is up to date and the page size == block size, 123 * them. So when the buffer is up to date and the page size == block size,
123 * this marks the page up to date instead of adding new buffers. 124 * this marks the page up to date instead of adding new buffers.
124 */ 125 */
125 static void 126 static void
126 map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) 127 map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
127 { 128 {
128 struct inode *inode = page->mapping->host; 129 struct inode *inode = page->mapping->host;
129 struct buffer_head *page_bh, *head; 130 struct buffer_head *page_bh, *head;
130 int block = 0; 131 int block = 0;
131 132
132 if (!page_has_buffers(page)) { 133 if (!page_has_buffers(page)) {
133 /* 134 /*
134 * don't make any buffers if there is only one buffer on 135 * don't make any buffers if there is only one buffer on
135 * the page and the page just needs to be set up to date 136 * the page and the page just needs to be set up to date
136 */ 137 */
137 if (inode->i_blkbits == PAGE_CACHE_SHIFT && 138 if (inode->i_blkbits == PAGE_CACHE_SHIFT &&
138 buffer_uptodate(bh)) { 139 buffer_uptodate(bh)) {
139 SetPageUptodate(page); 140 SetPageUptodate(page);
140 return; 141 return;
141 } 142 }
142 create_empty_buffers(page, 1 << inode->i_blkbits, 0); 143 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
143 } 144 }
144 head = page_buffers(page); 145 head = page_buffers(page);
145 page_bh = head; 146 page_bh = head;
146 do { 147 do {
147 if (block == page_block) { 148 if (block == page_block) {
148 page_bh->b_state = bh->b_state; 149 page_bh->b_state = bh->b_state;
149 page_bh->b_bdev = bh->b_bdev; 150 page_bh->b_bdev = bh->b_bdev;
150 page_bh->b_blocknr = bh->b_blocknr; 151 page_bh->b_blocknr = bh->b_blocknr;
151 break; 152 break;
152 } 153 }
153 page_bh = page_bh->b_this_page; 154 page_bh = page_bh->b_this_page;
154 block++; 155 block++;
155 } while (page_bh != head); 156 } while (page_bh != head);
156 } 157 }
157 158
158 /* 159 /*
159 * This is the worker routine which does all the work of mapping the disk 160 * This is the worker routine which does all the work of mapping the disk
160 * blocks and constructs largest possible bios, submits them for IO if the 161 * blocks and constructs largest possible bios, submits them for IO if the
161 * blocks are not contiguous on the disk. 162 * blocks are not contiguous on the disk.
162 * 163 *
163 * We pass a buffer_head back and forth and use its buffer_mapped() flag to 164 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
164 * represent the validity of its disk mapping and to decide when to do the next 165 * represent the validity of its disk mapping and to decide when to do the next
165 * get_block() call. 166 * get_block() call.
166 */ 167 */
167 static struct bio * 168 static struct bio *
168 do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, 169 do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
169 sector_t *last_block_in_bio, struct buffer_head *map_bh, 170 sector_t *last_block_in_bio, struct buffer_head *map_bh,
170 unsigned long *first_logical_block, get_block_t get_block) 171 unsigned long *first_logical_block, get_block_t get_block)
171 { 172 {
172 struct inode *inode = page->mapping->host; 173 struct inode *inode = page->mapping->host;
173 const unsigned blkbits = inode->i_blkbits; 174 const unsigned blkbits = inode->i_blkbits;
174 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; 175 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
175 const unsigned blocksize = 1 << blkbits; 176 const unsigned blocksize = 1 << blkbits;
176 sector_t block_in_file; 177 sector_t block_in_file;
177 sector_t last_block; 178 sector_t last_block;
178 sector_t last_block_in_file; 179 sector_t last_block_in_file;
179 sector_t blocks[MAX_BUF_PER_PAGE]; 180 sector_t blocks[MAX_BUF_PER_PAGE];
180 unsigned page_block; 181 unsigned page_block;
181 unsigned first_hole = blocks_per_page; 182 unsigned first_hole = blocks_per_page;
182 struct block_device *bdev = NULL; 183 struct block_device *bdev = NULL;
183 int length; 184 int length;
184 int fully_mapped = 1; 185 int fully_mapped = 1;
185 unsigned nblocks; 186 unsigned nblocks;
186 unsigned relative_block; 187 unsigned relative_block;
187 188
188 if (page_has_buffers(page)) 189 if (page_has_buffers(page))
189 goto confused; 190 goto confused;
190 191
191 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); 192 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
192 last_block = block_in_file + nr_pages * blocks_per_page; 193 last_block = block_in_file + nr_pages * blocks_per_page;
193 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; 194 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
194 if (last_block > last_block_in_file) 195 if (last_block > last_block_in_file)
195 last_block = last_block_in_file; 196 last_block = last_block_in_file;
196 page_block = 0; 197 page_block = 0;
197 198
198 /* 199 /*
199 * Map blocks using the result from the previous get_blocks call first. 200 * Map blocks using the result from the previous get_blocks call first.
200 */ 201 */
201 nblocks = map_bh->b_size >> blkbits; 202 nblocks = map_bh->b_size >> blkbits;
202 if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && 203 if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
203 block_in_file < (*first_logical_block + nblocks)) { 204 block_in_file < (*first_logical_block + nblocks)) {
204 unsigned map_offset = block_in_file - *first_logical_block; 205 unsigned map_offset = block_in_file - *first_logical_block;
205 unsigned last = nblocks - map_offset; 206 unsigned last = nblocks - map_offset;
206 207
207 for (relative_block = 0; ; relative_block++) { 208 for (relative_block = 0; ; relative_block++) {
208 if (relative_block == last) { 209 if (relative_block == last) {
209 clear_buffer_mapped(map_bh); 210 clear_buffer_mapped(map_bh);
210 break; 211 break;
211 } 212 }
212 if (page_block == blocks_per_page) 213 if (page_block == blocks_per_page)
213 break; 214 break;
214 blocks[page_block] = map_bh->b_blocknr + map_offset + 215 blocks[page_block] = map_bh->b_blocknr + map_offset +
215 relative_block; 216 relative_block;
216 page_block++; 217 page_block++;
217 block_in_file++; 218 block_in_file++;
218 } 219 }
219 bdev = map_bh->b_bdev; 220 bdev = map_bh->b_bdev;
220 } 221 }
221 222
222 /* 223 /*
223 * Then do more get_blocks calls until we are done with this page. 224 * Then do more get_blocks calls until we are done with this page.
224 */ 225 */
225 map_bh->b_page = page; 226 map_bh->b_page = page;
226 while (page_block < blocks_per_page) { 227 while (page_block < blocks_per_page) {
227 map_bh->b_state = 0; 228 map_bh->b_state = 0;
228 map_bh->b_size = 0; 229 map_bh->b_size = 0;
229 230
230 if (block_in_file < last_block) { 231 if (block_in_file < last_block) {
231 map_bh->b_size = (last_block-block_in_file) << blkbits; 232 map_bh->b_size = (last_block-block_in_file) << blkbits;
232 if (get_block(inode, block_in_file, map_bh, 0)) 233 if (get_block(inode, block_in_file, map_bh, 0))
233 goto confused; 234 goto confused;
234 *first_logical_block = block_in_file; 235 *first_logical_block = block_in_file;
235 } 236 }
236 237
237 if (!buffer_mapped(map_bh)) { 238 if (!buffer_mapped(map_bh)) {
238 fully_mapped = 0; 239 fully_mapped = 0;
239 if (first_hole == blocks_per_page) 240 if (first_hole == blocks_per_page)
240 first_hole = page_block; 241 first_hole = page_block;
241 page_block++; 242 page_block++;
242 block_in_file++; 243 block_in_file++;
243 clear_buffer_mapped(map_bh); 244 clear_buffer_mapped(map_bh);
244 continue; 245 continue;
245 } 246 }
246 247
247 /* some filesystems will copy data into the page during 248 /* some filesystems will copy data into the page during
248 * the get_block call, in which case we don't want to 249 * the get_block call, in which case we don't want to
249 * read it again. map_buffer_to_page copies the data 250 * read it again. map_buffer_to_page copies the data
250 * we just collected from get_block into the page's buffers 251 * we just collected from get_block into the page's buffers
251 * so readpage doesn't have to repeat the get_block call 252 * so readpage doesn't have to repeat the get_block call
252 */ 253 */
253 if (buffer_uptodate(map_bh)) { 254 if (buffer_uptodate(map_bh)) {
254 map_buffer_to_page(page, map_bh, page_block); 255 map_buffer_to_page(page, map_bh, page_block);
255 goto confused; 256 goto confused;
256 } 257 }
257 258
258 if (first_hole != blocks_per_page) 259 if (first_hole != blocks_per_page)
259 goto confused; /* hole -> non-hole */ 260 goto confused; /* hole -> non-hole */
260 261
261 /* Contiguous blocks? */ 262 /* Contiguous blocks? */
262 if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) 263 if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
263 goto confused; 264 goto confused;
264 nblocks = map_bh->b_size >> blkbits; 265 nblocks = map_bh->b_size >> blkbits;
265 for (relative_block = 0; ; relative_block++) { 266 for (relative_block = 0; ; relative_block++) {
266 if (relative_block == nblocks) { 267 if (relative_block == nblocks) {
267 clear_buffer_mapped(map_bh); 268 clear_buffer_mapped(map_bh);
268 break; 269 break;
269 } else if (page_block == blocks_per_page) 270 } else if (page_block == blocks_per_page)
270 break; 271 break;
271 blocks[page_block] = map_bh->b_blocknr+relative_block; 272 blocks[page_block] = map_bh->b_blocknr+relative_block;
272 page_block++; 273 page_block++;
273 block_in_file++; 274 block_in_file++;
274 } 275 }
275 bdev = map_bh->b_bdev; 276 bdev = map_bh->b_bdev;
276 } 277 }
277 278
278 if (first_hole != blocks_per_page) { 279 if (first_hole != blocks_per_page) {
279 zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); 280 zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
280 if (first_hole == 0) { 281 if (first_hole == 0) {
281 SetPageUptodate(page); 282 SetPageUptodate(page);
282 unlock_page(page); 283 unlock_page(page);
283 goto out; 284 goto out;
284 } 285 }
285 } else if (fully_mapped) { 286 } else if (fully_mapped) {
286 SetPageMappedToDisk(page); 287 SetPageMappedToDisk(page);
287 } 288 }
288 289
289 /* 290 /*
290 * This page will go to BIO. Do we need to send this BIO off first? 291 * This page will go to BIO. Do we need to send this BIO off first?
291 */ 292 */
292 if (bio && (*last_block_in_bio != blocks[0] - 1)) 293 if (bio && (*last_block_in_bio != blocks[0] - 1))
293 bio = mpage_bio_submit(READ, bio); 294 bio = mpage_bio_submit(READ, bio);
294 295
295 alloc_new: 296 alloc_new:
296 if (bio == NULL) { 297 if (bio == NULL) {
297 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), 298 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
298 min_t(int, nr_pages, bio_get_nr_vecs(bdev)), 299 min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
299 GFP_KERNEL); 300 GFP_KERNEL);
300 if (bio == NULL) 301 if (bio == NULL)
301 goto confused; 302 goto confused;
302 } 303 }
303 304
304 length = first_hole << blkbits; 305 length = first_hole << blkbits;
305 if (bio_add_page(bio, page, length, 0) < length) { 306 if (bio_add_page(bio, page, length, 0) < length) {
306 bio = mpage_bio_submit(READ, bio); 307 bio = mpage_bio_submit(READ, bio);
307 goto alloc_new; 308 goto alloc_new;
308 } 309 }
309 310
310 if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) 311 if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
311 bio = mpage_bio_submit(READ, bio); 312 bio = mpage_bio_submit(READ, bio);
312 else 313 else
313 *last_block_in_bio = blocks[blocks_per_page - 1]; 314 *last_block_in_bio = blocks[blocks_per_page - 1];
314 out: 315 out:
315 return bio; 316 return bio;
316 317
317 confused: 318 confused:
318 if (bio) 319 if (bio)
319 bio = mpage_bio_submit(READ, bio); 320 bio = mpage_bio_submit(READ, bio);
320 if (!PageUptodate(page)) 321 if (!PageUptodate(page))
321 block_read_full_page(page, get_block); 322 block_read_full_page(page, get_block);
322 else 323 else
323 unlock_page(page); 324 unlock_page(page);
324 goto out; 325 goto out;
325 } 326 }
326 327
327 /** 328 /**
328 * mpage_readpages - populate an address space with some pages & start reads against them 329 * mpage_readpages - populate an address space with some pages & start reads against them
329 * @mapping: the address_space 330 * @mapping: the address_space
330 * @pages: The address of a list_head which contains the target pages. These 331 * @pages: The address of a list_head which contains the target pages. These
331 * pages have their ->index populated and are otherwise uninitialised. 332 * pages have their ->index populated and are otherwise uninitialised.
332 * The page at @pages->prev has the lowest file offset, and reads should be 333 * The page at @pages->prev has the lowest file offset, and reads should be
333 * issued in @pages->prev to @pages->next order. 334 * issued in @pages->prev to @pages->next order.
334 * @nr_pages: The number of pages at *@pages 335 * @nr_pages: The number of pages at *@pages
335 * @get_block: The filesystem's block mapper function. 336 * @get_block: The filesystem's block mapper function.
336 * 337 *
337 * This function walks the pages and the blocks within each page, building and 338 * This function walks the pages and the blocks within each page, building and
338 * emitting large BIOs. 339 * emitting large BIOs.
339 * 340 *
340 * If anything unusual happens, such as: 341 * If anything unusual happens, such as:
341 * 342 *
342 * - encountering a page which has buffers 343 * - encountering a page which has buffers
343 * - encountering a page which has a non-hole after a hole 344 * - encountering a page which has a non-hole after a hole
344 * - encountering a page with non-contiguous blocks 345 * - encountering a page with non-contiguous blocks
345 * 346 *
346 * then this code just gives up and calls the buffer_head-based read function. 347 * then this code just gives up and calls the buffer_head-based read function.
347 * It does handle a page which has holes at the end - that is a common case: 348 * It does handle a page which has holes at the end - that is a common case:
348 * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. 349 * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
349 * 350 *
350 * BH_Boundary explanation: 351 * BH_Boundary explanation:
351 * 352 *
352 * There is a problem. The mpage read code assembles several pages, gets all 353 * There is a problem. The mpage read code assembles several pages, gets all
353 * their disk mappings, and then submits them all. That's fine, but obtaining 354 * their disk mappings, and then submits them all. That's fine, but obtaining
354 * the disk mappings may require I/O. Reads of indirect blocks, for example. 355 * the disk mappings may require I/O. Reads of indirect blocks, for example.
355 * 356 *
356 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be 357 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
357 * submitted in the following order: 358 * submitted in the following order:
358 * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 359 * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
359 * 360 *
360 * because the indirect block has to be read to get the mappings of blocks 361 * because the indirect block has to be read to get the mappings of blocks
361 * 13,14,15,16. Obviously, this impacts performance. 362 * 13,14,15,16. Obviously, this impacts performance.
362 * 363 *
363 * So what we do is to allow the filesystem's get_block() function to set 364 * So what we do is to allow the filesystem's get_block() function to set
364 * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block 365 * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block
365 * after this one will require I/O against a block which is probably close to 366 * after this one will require I/O against a block which is probably close to
366 * this one. So you should push what I/O you have currently accumulated. 367 * this one. So you should push what I/O you have currently accumulated.
367 * 368 *
368 * This all causes the disk requests to be issued in the correct order. 369 * This all causes the disk requests to be issued in the correct order.
369 */ 370 */
370 int 371 int
371 mpage_readpages(struct address_space *mapping, struct list_head *pages, 372 mpage_readpages(struct address_space *mapping, struct list_head *pages,
372 unsigned nr_pages, get_block_t get_block) 373 unsigned nr_pages, get_block_t get_block)
373 { 374 {
374 struct bio *bio = NULL; 375 struct bio *bio = NULL;
375 unsigned page_idx; 376 unsigned page_idx;
376 sector_t last_block_in_bio = 0; 377 sector_t last_block_in_bio = 0;
377 struct buffer_head map_bh; 378 struct buffer_head map_bh;
378 unsigned long first_logical_block = 0; 379 unsigned long first_logical_block = 0;
379 380
380 clear_buffer_mapped(&map_bh); 381 clear_buffer_mapped(&map_bh);
381 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 382 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
382 struct page *page = list_entry(pages->prev, struct page, lru); 383 struct page *page = list_entry(pages->prev, struct page, lru);
383 384
384 prefetchw(&page->flags); 385 prefetchw(&page->flags);
385 list_del(&page->lru); 386 list_del(&page->lru);
386 if (!add_to_page_cache_lru(page, mapping, 387 if (!add_to_page_cache_lru(page, mapping,
387 page->index, GFP_KERNEL)) { 388 page->index, GFP_KERNEL)) {
388 bio = do_mpage_readpage(bio, page, 389 bio = do_mpage_readpage(bio, page,
389 nr_pages - page_idx, 390 nr_pages - page_idx,
390 &last_block_in_bio, &map_bh, 391 &last_block_in_bio, &map_bh,
391 &first_logical_block, 392 &first_logical_block,
392 get_block); 393 get_block);
393 } 394 }
394 page_cache_release(page); 395 page_cache_release(page);
395 } 396 }
396 BUG_ON(!list_empty(pages)); 397 BUG_ON(!list_empty(pages));
397 if (bio) 398 if (bio)
398 mpage_bio_submit(READ, bio); 399 mpage_bio_submit(READ, bio);
399 return 0; 400 return 0;
400 } 401 }
401 EXPORT_SYMBOL(mpage_readpages); 402 EXPORT_SYMBOL(mpage_readpages);
402 403
403 /* 404 /*
404 * This isn't called much at all 405 * This isn't called much at all
405 */ 406 */
406 int mpage_readpage(struct page *page, get_block_t get_block) 407 int mpage_readpage(struct page *page, get_block_t get_block)
407 { 408 {
408 struct bio *bio = NULL; 409 struct bio *bio = NULL;
409 sector_t last_block_in_bio = 0; 410 sector_t last_block_in_bio = 0;
410 struct buffer_head map_bh; 411 struct buffer_head map_bh;
411 unsigned long first_logical_block = 0; 412 unsigned long first_logical_block = 0;
412 413
413 clear_buffer_mapped(&map_bh); 414 clear_buffer_mapped(&map_bh);
414 bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, 415 bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
415 &map_bh, &first_logical_block, get_block); 416 &map_bh, &first_logical_block, get_block);
416 if (bio) 417 if (bio)
417 mpage_bio_submit(READ, bio); 418 mpage_bio_submit(READ, bio);
418 return 0; 419 return 0;
419 } 420 }
420 EXPORT_SYMBOL(mpage_readpage); 421 EXPORT_SYMBOL(mpage_readpage);
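For completeness, the conventional wiring of mpage_readpage() and mpage_readpages() into a filesystem's address_space_operations (hypothetical myfs_get_block() again):

    #include <linux/mpage.h>
    #include <linux/buffer_head.h>

    extern int myfs_get_block(struct inode *, sector_t, struct buffer_head *, int);

    static int myfs_readpage(struct file *file, struct page *page)
    {
    	return mpage_readpage(page, myfs_get_block);
    }

    static int myfs_readpages(struct file *file, struct address_space *mapping,
    			  struct list_head *pages, unsigned nr_pages)
    {
    	return mpage_readpages(mapping, pages, nr_pages, myfs_get_block);
    }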
421 422
422 /* 423 /*
423 * Writing is not so simple. 424 * Writing is not so simple.
424 * 425 *
425 * If the page has buffers then they will be used for obtaining the disk 426 * If the page has buffers then they will be used for obtaining the disk
426 * mapping. We only support pages which are fully mapped-and-dirty, with a 427 * mapping. We only support pages which are fully mapped-and-dirty, with a
427 * special case for pages which are unmapped at the end: end-of-file. 428 * special case for pages which are unmapped at the end: end-of-file.
428 * 429 *
429 * If the page has no buffers (preferred) then the page is mapped here. 430 * If the page has no buffers (preferred) then the page is mapped here.
430 * 431 *
431 * If all blocks are found to be contiguous then the page can go into the 432 * If all blocks are found to be contiguous then the page can go into the
432 * BIO. Otherwise fall back to the mapping's writepage(). 433 * BIO. Otherwise fall back to the mapping's writepage().
433 * 434 *
434 * FIXME: This code wants an estimate of how many pages are still to be 435 * FIXME: This code wants an estimate of how many pages are still to be
435 * written, so it can intelligently allocate a suitably-sized BIO. For now, 436 * written, so it can intelligently allocate a suitably-sized BIO. For now,
436 * just allocate full-size (16-page) BIOs. 437 * just allocate full-size (16-page) BIOs.
437 */ 438 */
438 struct mpage_data {
439 struct bio *bio;
440 sector_t last_block_in_bio;
441 get_block_t *get_block;
442 unsigned use_writepage;
443 };
444 439
445 static int __mpage_writepage(struct page *page, struct writeback_control *wbc, 440 int __mpage_writepage(struct page *page, struct writeback_control *wbc,
446 void *data) 441 void *data)
447 { 442 {
448 struct mpage_data *mpd = data; 443 struct mpage_data *mpd = data;
449 struct bio *bio = mpd->bio; 444 struct bio *bio = mpd->bio;
450 struct address_space *mapping = page->mapping; 445 struct address_space *mapping = page->mapping;
451 struct inode *inode = page->mapping->host; 446 struct inode *inode = page->mapping->host;
452 const unsigned blkbits = inode->i_blkbits; 447 const unsigned blkbits = inode->i_blkbits;
453 unsigned long end_index; 448 unsigned long end_index;
454 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; 449 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
455 sector_t last_block; 450 sector_t last_block;
456 sector_t block_in_file; 451 sector_t block_in_file;
457 sector_t blocks[MAX_BUF_PER_PAGE]; 452 sector_t blocks[MAX_BUF_PER_PAGE];
458 unsigned page_block; 453 unsigned page_block;
459 unsigned first_unmapped = blocks_per_page; 454 unsigned first_unmapped = blocks_per_page;
460 struct block_device *bdev = NULL; 455 struct block_device *bdev = NULL;
461 int boundary = 0; 456 int boundary = 0;
462 sector_t boundary_block = 0; 457 sector_t boundary_block = 0;
463 struct block_device *boundary_bdev = NULL; 458 struct block_device *boundary_bdev = NULL;
464 int length; 459 int length;
465 struct buffer_head map_bh; 460 struct buffer_head map_bh;
466 loff_t i_size = i_size_read(inode); 461 loff_t i_size = i_size_read(inode);
467 int ret = 0; 462 int ret = 0;
468 463
469 if (page_has_buffers(page)) { 464 if (page_has_buffers(page)) {
470 struct buffer_head *head = page_buffers(page); 465 struct buffer_head *head = page_buffers(page);
471 struct buffer_head *bh = head; 466 struct buffer_head *bh = head;
472 467
473 /* If they're all mapped and dirty, do it */ 468 /* If they're all mapped and dirty, do it */
474 page_block = 0; 469 page_block = 0;
475 do { 470 do {
476 BUG_ON(buffer_locked(bh)); 471 BUG_ON(buffer_locked(bh));
477 if (!buffer_mapped(bh)) { 472 if (!buffer_mapped(bh)) {
478 /* 473 /*
479 * unmapped dirty buffers are created by 474 * unmapped dirty buffers are created by
480 * __set_page_dirty_buffers -> mmapped data 475 * __set_page_dirty_buffers -> mmapped data
481 */ 476 */
482 if (buffer_dirty(bh)) 477 if (buffer_dirty(bh))
483 goto confused; 478 goto confused;
484 if (first_unmapped == blocks_per_page) 479 if (first_unmapped == blocks_per_page)
485 first_unmapped = page_block; 480 first_unmapped = page_block;
486 continue; 481 continue;
487 } 482 }
488 483
489 if (first_unmapped != blocks_per_page) 484 if (first_unmapped != blocks_per_page)
490 goto confused; /* hole -> non-hole */ 485 goto confused; /* hole -> non-hole */
491 486
492 if (!buffer_dirty(bh) || !buffer_uptodate(bh)) 487 if (!buffer_dirty(bh) || !buffer_uptodate(bh))
493 goto confused; 488 goto confused;
494 if (page_block) { 489 if (page_block) {
495 if (bh->b_blocknr != blocks[page_block-1] + 1) 490 if (bh->b_blocknr != blocks[page_block-1] + 1)
496 goto confused; 491 goto confused;
497 } 492 }
498 blocks[page_block++] = bh->b_blocknr; 493 blocks[page_block++] = bh->b_blocknr;
499 boundary = buffer_boundary(bh); 494 boundary = buffer_boundary(bh);
500 if (boundary) { 495 if (boundary) {
501 boundary_block = bh->b_blocknr; 496 boundary_block = bh->b_blocknr;
502 boundary_bdev = bh->b_bdev; 497 boundary_bdev = bh->b_bdev;
503 } 498 }
504 bdev = bh->b_bdev; 499 bdev = bh->b_bdev;
505 } while ((bh = bh->b_this_page) != head); 500 } while ((bh = bh->b_this_page) != head);
506 501
507 if (first_unmapped) 502 if (first_unmapped)
508 goto page_is_mapped; 503 goto page_is_mapped;
509 504
510 /* 505 /*
511 * Page has buffers, but they are all unmapped. The page was 506 * Page has buffers, but they are all unmapped. The page was
512 * created by pagein or read over a hole which was handled by 507 * created by pagein or read over a hole which was handled by
513 * block_read_full_page(). If this address_space is also 508 * block_read_full_page(). If this address_space is also
514 * using mpage_readpages then this can rarely happen. 509 * using mpage_readpages then this can rarely happen.
515 */ 510 */
516 goto confused; 511 goto confused;
517 } 512 }
518 513
519 /* 514 /*
520 * The page has no buffers: map it to disk 515 * The page has no buffers: map it to disk
521 */ 516 */
522 BUG_ON(!PageUptodate(page)); 517 BUG_ON(!PageUptodate(page));
523 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); 518 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
524 last_block = (i_size - 1) >> blkbits; 519 last_block = (i_size - 1) >> blkbits;
525 map_bh.b_page = page; 520 map_bh.b_page = page;
526 for (page_block = 0; page_block < blocks_per_page; ) { 521 for (page_block = 0; page_block < blocks_per_page; ) {
527 522
528 map_bh.b_state = 0; 523 map_bh.b_state = 0;
529 map_bh.b_size = 1 << blkbits; 524 map_bh.b_size = 1 << blkbits;
530 if (mpd->get_block(inode, block_in_file, &map_bh, 1)) 525 if (mpd->get_block(inode, block_in_file, &map_bh, 1))
531 goto confused; 526 goto confused;
532 if (buffer_new(&map_bh)) 527 if (buffer_new(&map_bh))
533 unmap_underlying_metadata(map_bh.b_bdev, 528 unmap_underlying_metadata(map_bh.b_bdev,
534 map_bh.b_blocknr); 529 map_bh.b_blocknr);
535 if (buffer_boundary(&map_bh)) { 530 if (buffer_boundary(&map_bh)) {
536 boundary_block = map_bh.b_blocknr; 531 boundary_block = map_bh.b_blocknr;
537 boundary_bdev = map_bh.b_bdev; 532 boundary_bdev = map_bh.b_bdev;
538 } 533 }
539 if (page_block) { 534 if (page_block) {
540 if (map_bh.b_blocknr != blocks[page_block-1] + 1) 535 if (map_bh.b_blocknr != blocks[page_block-1] + 1)
541 goto confused; 536 goto confused;
542 } 537 }
543 blocks[page_block++] = map_bh.b_blocknr; 538 blocks[page_block++] = map_bh.b_blocknr;
544 boundary = buffer_boundary(&map_bh); 539 boundary = buffer_boundary(&map_bh);
545 bdev = map_bh.b_bdev; 540 bdev = map_bh.b_bdev;
546 if (block_in_file == last_block) 541 if (block_in_file == last_block)
547 break; 542 break;
548 block_in_file++; 543 block_in_file++;
549 } 544 }
550 BUG_ON(page_block == 0); 545 BUG_ON(page_block == 0);
551 546
552 first_unmapped = page_block; 547 first_unmapped = page_block;
553 548
554 page_is_mapped: 549 page_is_mapped:
555 end_index = i_size >> PAGE_CACHE_SHIFT; 550 end_index = i_size >> PAGE_CACHE_SHIFT;
556 if (page->index >= end_index) { 551 if (page->index >= end_index) {
557 /* 552 /*
558 * The page straddles i_size. It must be zeroed out on each 553 * The page straddles i_size. It must be zeroed out on each
559 * and every writepage invocation because it may be mmapped. 554 * and every writepage invocation because it may be mmapped.
560 * "A file is mapped in multiples of the page size. For a file 555 * "A file is mapped in multiples of the page size. For a file
561 * that is not a multiple of the page size, the remaining memory 556 * that is not a multiple of the page size, the remaining memory
562 * is zeroed when mapped, and writes to that region are not 557 * is zeroed when mapped, and writes to that region are not
563 * written out to the file." 558 * written out to the file."
564 */ 559 */
565 unsigned offset = i_size & (PAGE_CACHE_SIZE - 1); 560 unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
566 561
567 if (page->index > end_index || !offset) 562 if (page->index > end_index || !offset)
568 goto confused; 563 goto confused;
569 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 564 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
570 } 565 }
571 566
572 /* 567 /*
573 * This page will go to BIO. Do we need to send this BIO off first? 568 * This page will go to BIO. Do we need to send this BIO off first?
574 */ 569 */
575 if (bio && mpd->last_block_in_bio != blocks[0] - 1) 570 if (bio && mpd->last_block_in_bio != blocks[0] - 1)
576 bio = mpage_bio_submit(WRITE, bio); 571 bio = mpage_bio_submit(WRITE, bio);
577 572
578 alloc_new: 573 alloc_new:
579 if (bio == NULL) { 574 if (bio == NULL) {
580 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), 575 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
581 bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); 576 bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
582 if (bio == NULL) 577 if (bio == NULL)
583 goto confused; 578 goto confused;
584 } 579 }
585 580
586 /* 581 /*
587 * Must try to add the page before marking the buffer clean or 582 * Must try to add the page before marking the buffer clean or
588 * the confused fail path above (OOM) will be very confused when 583 * the confused fail path above (OOM) will be very confused when
589 * it finds all bh marked clean (i.e. it will not write anything) 584 * it finds all bh marked clean (i.e. it will not write anything)
590 */ 585 */
591 length = first_unmapped << blkbits; 586 length = first_unmapped << blkbits;
592 if (bio_add_page(bio, page, length, 0) < length) { 587 if (bio_add_page(bio, page, length, 0) < length) {
593 bio = mpage_bio_submit(WRITE, bio); 588 bio = mpage_bio_submit(WRITE, bio);
594 goto alloc_new; 589 goto alloc_new;
595 } 590 }
596 591
597 /* 592 /*
598 * OK, we have our BIO, so we can now mark the buffers clean. Make 593 * OK, we have our BIO, so we can now mark the buffers clean. Make
599 * sure to only clean buffers which we know we'll be writing. 594 * sure to only clean buffers which we know we'll be writing.
600 */ 595 */
601 if (page_has_buffers(page)) { 596 if (page_has_buffers(page)) {
602 struct buffer_head *head = page_buffers(page); 597 struct buffer_head *head = page_buffers(page);
603 struct buffer_head *bh = head; 598 struct buffer_head *bh = head;
604 unsigned buffer_counter = 0; 599 unsigned buffer_counter = 0;
605 600
606 do { 601 do {
607 if (buffer_counter++ == first_unmapped) 602 if (buffer_counter++ == first_unmapped)
608 break; 603 break;
609 clear_buffer_dirty(bh); 604 clear_buffer_dirty(bh);
610 bh = bh->b_this_page; 605 bh = bh->b_this_page;
611 } while (bh != head); 606 } while (bh != head);
612 607
613 /* 608 /*
614 * we cannot drop the bh if the page is not uptodate 609 * we cannot drop the bh if the page is not uptodate
615 * or a concurrent readpage would fail to serialize with the bh 610 * or a concurrent readpage would fail to serialize with the bh
616 * and it would read from disk before we reach the platter. 611 * and it would read from disk before we reach the platter.
617 */ 612 */
618 if (buffer_heads_over_limit && PageUptodate(page)) 613 if (buffer_heads_over_limit && PageUptodate(page))
619 try_to_free_buffers(page); 614 try_to_free_buffers(page);
620 } 615 }
621 616
622 BUG_ON(PageWriteback(page)); 617 BUG_ON(PageWriteback(page));
623 set_page_writeback(page); 618 set_page_writeback(page);
624 unlock_page(page); 619 unlock_page(page);
625 if (boundary || (first_unmapped != blocks_per_page)) { 620 if (boundary || (first_unmapped != blocks_per_page)) {
626 bio = mpage_bio_submit(WRITE, bio); 621 bio = mpage_bio_submit(WRITE, bio);
627 if (boundary_block) { 622 if (boundary_block) {
628 write_boundary_block(boundary_bdev, 623 write_boundary_block(boundary_bdev,
629 boundary_block, 1 << blkbits); 624 boundary_block, 1 << blkbits);
630 } 625 }
631 } else { 626 } else {
632 mpd->last_block_in_bio = blocks[blocks_per_page - 1]; 627 mpd->last_block_in_bio = blocks[blocks_per_page - 1];
633 } 628 }
634 goto out; 629 goto out;
635 630
636 confused: 631 confused:
637 if (bio) 632 if (bio)
638 bio = mpage_bio_submit(WRITE, bio); 633 bio = mpage_bio_submit(WRITE, bio);
639 634
640 if (mpd->use_writepage) { 635 if (mpd->use_writepage) {
641 ret = mapping->a_ops->writepage(page, wbc); 636 ret = mapping->a_ops->writepage(page, wbc);
642 } else { 637 } else {
643 ret = -EAGAIN; 638 ret = -EAGAIN;
644 goto out; 639 goto out;
645 } 640 }
646 /* 641 /*
647 * The caller has a ref on the inode, so *mapping is stable 642 * The caller has a ref on the inode, so *mapping is stable
648 */ 643 */
649 mapping_set_error(mapping, ret); 644 mapping_set_error(mapping, ret);
650 out: 645 out:
651 mpd->bio = bio; 646 mpd->bio = bio;
652 return ret; 647 return ret;
653 } 648 }
649 EXPORT_SYMBOL(__mpage_writepage);
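With struct mpage_data, mpage_bio_submit() and __mpage_writepage() now declared in include/linux/mpage.h (see the hunk below) and __mpage_writepage() exported just above, a filesystem can drive this writepage worker from its own writeback loop instead of going through mpage_writepages(). A minimal sketch of that pattern, mirroring what mpage_writepages() itself does further down; myfs_da_writepages and myfs_get_block are placeholder names:

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/mpage.h>

extern int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_da_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	struct mpage_data mpd = {
		.bio			= NULL,
		.last_block_in_bio	= 0,
		.get_block		= myfs_get_block,
		.use_writepage		= 1,	/* fall back to ->writepage on "confused" pages */
	};
	int ret;

	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
	if (mpd.bio)
		mpage_bio_submit(WRITE, mpd.bio);	/* flush the last partially filled BIO */
	return ret;
}

Because the caller owns the mpage_data, it is free to choose a different get_block per invocation or to add its own processing around the write_cache_pages() call.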
654 650
655 /** 651 /**
656 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 652 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
657 * @mapping: address space structure to write 653 * @mapping: address space structure to write
658 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 654 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
659 * @get_block: the filesystem's block mapper function. 655 * @get_block: the filesystem's block mapper function.
660 * If this is NULL then use a_ops->writepage. Otherwise, go 656 * If this is NULL then use a_ops->writepage. Otherwise, go
661 * direct-to-BIO. 657 * direct-to-BIO.
662 * 658 *
663 * This is a library function, which implements the writepages() 659 * This is a library function, which implements the writepages()
664 * address_space_operation. 660 * address_space_operation.
665 * 661 *
666 * If a page is already under I/O, generic_writepages() skips it, even 662 * If a page is already under I/O, generic_writepages() skips it, even
667 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 663 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
668 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 664 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
669 * and msync() need to guarantee that all the data which was dirty at the time 665 * and msync() need to guarantee that all the data which was dirty at the time
670 * the call was made get new I/O started against them. If wbc->sync_mode is 666 * the call was made get new I/O started against them. If wbc->sync_mode is
671 * WB_SYNC_ALL then we were called for data integrity and we must wait for 667 * WB_SYNC_ALL then we were called for data integrity and we must wait for
672 * existing IO to complete. 668 * existing IO to complete.
673 */ 669 */
674 int 670 int
675 mpage_writepages(struct address_space *mapping, 671 mpage_writepages(struct address_space *mapping,
676 struct writeback_control *wbc, get_block_t get_block) 672 struct writeback_control *wbc, get_block_t get_block)
677 { 673 {
678 int ret; 674 int ret;
679 675
680 if (!get_block) 676 if (!get_block)
681 ret = generic_writepages(mapping, wbc); 677 ret = generic_writepages(mapping, wbc);
682 else { 678 else {
683 struct mpage_data mpd = { 679 struct mpage_data mpd = {
684 .bio = NULL, 680 .bio = NULL,
685 .last_block_in_bio = 0, 681 .last_block_in_bio = 0,
686 .get_block = get_block, 682 .get_block = get_block,
687 .use_writepage = 1, 683 .use_writepage = 1,
688 }; 684 };
689 685
690 ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); 686 ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
691 if (mpd.bio) 687 if (mpd.bio)
692 mpage_bio_submit(WRITE, mpd.bio); 688 mpage_bio_submit(WRITE, mpd.bio);
693 } 689 }
694 return ret; 690 return ret;
695 } 691 }
696 EXPORT_SYMBOL(mpage_writepages); 692 EXPORT_SYMBOL(mpage_writepages);
697 693
698 int mpage_writepage(struct page *page, get_block_t get_block, 694 int mpage_writepage(struct page *page, get_block_t get_block,
699 struct writeback_control *wbc) 695 struct writeback_control *wbc)
700 { 696 {
701 struct mpage_data mpd = { 697 struct mpage_data mpd = {
702 .bio = NULL, 698 .bio = NULL,
703 .last_block_in_bio = 0, 699 .last_block_in_bio = 0,
704 .get_block = get_block, 700 .get_block = get_block,
705 .use_writepage = 0, 701 .use_writepage = 0,
706 }; 702 };
707 int ret = __mpage_writepage(page, wbc, &mpd); 703 int ret = __mpage_writepage(page, wbc, &mpd);
708 if (mpd.bio) 704 if (mpd.bio)
709 mpage_bio_submit(WRITE, mpd.bio); 705 mpage_bio_submit(WRITE, mpd.bio);
710 return ret; 706 return ret;
711 } 707 }
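For completeness, the conventional way the write helpers are wired into a filesystem's address_space_operations, building on the read-side sketch after mpage_readpage() above. The myfs_* names are placeholders, and block_write_full_page() stands in for whatever the filesystem uses as its ->writepage; ->writepage is still needed even when ->writepages goes direct-to-BIO, because __mpage_writepage() falls back to it (the "confused" path) for pages it cannot send as one contiguous BIO:

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>

extern int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	/* buffer_head based fallback for pages the mpage path cannot handle */
	return block_write_full_page(page, myfs_get_block, wbc);
}

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	return mpage_writepages(mapping, wbc, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	/* .readpage / .readpages as in the earlier read-side sketch */
	.writepage	= myfs_writepage,
	.writepages	= myfs_writepages,
};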
include/linux/mpage.h
1 /* 1 /*
2 * include/linux/mpage.h 2 * include/linux/mpage.h
3 * 3 *
4 * Contains declarations related to preparing and submitting BIOs which contain 4 * Contains declarations related to preparing and submitting BIOs which contain
5 * multiple pagecache pages. 5 * multiple pagecache pages.
6 */ 6 */
7 7
8 /* 8 /*
9 * (And no, it doesn't do the #ifdef __MPAGE_H thing, and it doesn't do 9 * (And no, it doesn't do the #ifdef __MPAGE_H thing, and it doesn't do
10 * nested includes. Get it right in the .c file). 10 * nested includes. Get it right in the .c file).
11 */ 11 */
12 #ifdef CONFIG_BLOCK 12 #ifdef CONFIG_BLOCK
13 13
14 struct mpage_data {
15 struct bio *bio;
16 sector_t last_block_in_bio;
17 get_block_t *get_block;
18 unsigned use_writepage;
19 };
20
14 struct writeback_control; 21 struct writeback_control;
15 22
23 struct bio *mpage_bio_submit(int rw, struct bio *bio);
16 int mpage_readpages(struct address_space *mapping, struct list_head *pages, 24 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
17 unsigned nr_pages, get_block_t get_block); 25 unsigned nr_pages, get_block_t get_block);
18 int mpage_readpage(struct page *page, get_block_t get_block); 26 int mpage_readpage(struct page *page, get_block_t get_block);
27 int __mpage_writepage(struct page *page, struct writeback_control *wbc,
28 void *data);
19 int mpage_writepages(struct address_space *mapping, 29 int mpage_writepages(struct address_space *mapping,
20 struct writeback_control *wbc, get_block_t get_block); 30 struct writeback_control *wbc, get_block_t get_block);
21 int mpage_writepage(struct page *page, get_block_t *get_block, 31 int mpage_writepage(struct page *page, get_block_t *get_block,
22 struct writeback_control *wbc); 32 struct writeback_control *wbc);
23 33
24 #endif 34 #endif
25 35