Commit f01ef569cddb1a8627b1c6b3a134998ad1cf4b22

Authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
  mm: properly reflect task dirty limits in dirty_exceeded logic
  writeback: don't busy retry writeback on new/freeing inodes
  writeback: scale IO chunk size up to half device bandwidth
  writeback: trace global_dirty_state
  writeback: introduce max-pause and pass-good dirty limits
  writeback: introduce smoothed global dirty limit
  writeback: consolidate variable names in balance_dirty_pages()
  writeback: show bdi write bandwidth in debugfs
  writeback: bdi write bandwidth estimation
  writeback: account per-bdi accumulated written pages
  writeback: make writeback_control.nr_to_write straight
  writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
  writeback: trace event writeback_queue_io
  writeback: trace event writeback_single_inode
  writeback: remove .nonblocking and .encountered_congestion
  writeback: remove writeback_control.more_io
  writeback: skip balance_dirty_pages() for in-memory fs
  writeback: add bdi_dirty_limit() kernel-doc
  writeback: avoid extra sync work at enqueue time
  writeback: elevate queue_io() into wb_writeback()
  ...

Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c

Showing 15 changed files; the inline diff of fs/block_dev.c follows.
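The most visible change in this file is bdev_inode_switch_bdi(): with the per-bdi writeback locking introduced by this series, the global inode_wb_list_lock is gone and the function now takes the ->wb.list_lock of both the old and the new backing_dev_info via bdi_lock_two(), returning early when the two are the same to avoid self-deadlock. Below is a minimal userspace sketch of that two-lock pattern, assuming nothing beyond POSIX threads; the toy_* names are illustrative, not kernel APIs, and the address-based ordering mirrors what bdi_lock_two() does in spirit but is shown here only as one way to obtain a consistent lock order.

	/*
	 * Userspace sketch (not kernel code) of the locking pattern the new
	 * bdev_inode_switch_bdi() relies on: when an object migrates between
	 * two per-instance lists, take both list locks in a stable order so
	 * that two concurrent migrations in opposite directions cannot
	 * deadlock, and bail out early when source and destination share the
	 * same lock.  All toy_* names are illustrative only.
	 */
	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>

	struct toy_bdi {
		pthread_mutex_t list_lock;
		int nr_dirty;			/* stand-in for the b_dirty list */
	};

	static void toy_lock_two(struct toy_bdi *a, struct toy_bdi *b)
	{
		/* Always lock the lower address first to get a global order. */
		if ((uintptr_t)a < (uintptr_t)b) {
			pthread_mutex_lock(&a->list_lock);
			pthread_mutex_lock(&b->list_lock);
		} else {
			pthread_mutex_lock(&b->list_lock);
			pthread_mutex_lock(&a->list_lock);
		}
	}

	static void toy_switch_bdi(struct toy_bdi *old, struct toy_bdi *dst)
	{
		if (old == dst)			/* self-deadlock avoidance */
			return;
		toy_lock_two(old, dst);
		old->nr_dirty--;		/* "list_move" of one dirty inode */
		dst->nr_dirty++;
		pthread_mutex_unlock(&old->list_lock);
		pthread_mutex_unlock(&dst->list_lock);
	}

	int main(void)
	{
		struct toy_bdi a = { PTHREAD_MUTEX_INITIALIZER, 1 };
		struct toy_bdi b = { PTHREAD_MUTEX_INITIALIZER, 0 };

		toy_switch_bdi(&a, &b);
		printf("a.nr_dirty=%d b.nr_dirty=%d\n", a.nr_dirty, b.nr_dirty);
		return 0;
	}

Compile with "cc -pthread" and run; moving the one dirty object from a to b under both locks is the analogue of the list_move() done under the two list_locks in the hunk below.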

1 /* 1 /*
2 * linux/fs/block_dev.c 2 * linux/fs/block_dev.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 5 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
6 */ 6 */
7 7
8 #include <linux/init.h> 8 #include <linux/init.h>
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/fcntl.h> 10 #include <linux/fcntl.h>
11 #include <linux/slab.h> 11 #include <linux/slab.h>
12 #include <linux/kmod.h> 12 #include <linux/kmod.h>
13 #include <linux/major.h> 13 #include <linux/major.h>
14 #include <linux/device_cgroup.h> 14 #include <linux/device_cgroup.h>
15 #include <linux/highmem.h> 15 #include <linux/highmem.h>
16 #include <linux/blkdev.h> 16 #include <linux/blkdev.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/blkpg.h> 18 #include <linux/blkpg.h>
19 #include <linux/buffer_head.h> 19 #include <linux/buffer_head.h>
20 #include <linux/pagevec.h> 20 #include <linux/pagevec.h>
21 #include <linux/writeback.h> 21 #include <linux/writeback.h>
22 #include <linux/mpage.h> 22 #include <linux/mpage.h>
23 #include <linux/mount.h> 23 #include <linux/mount.h>
24 #include <linux/uio.h> 24 #include <linux/uio.h>
25 #include <linux/namei.h> 25 #include <linux/namei.h>
26 #include <linux/log2.h> 26 #include <linux/log2.h>
27 #include <linux/kmemleak.h> 27 #include <linux/kmemleak.h>
28 #include <asm/uaccess.h> 28 #include <asm/uaccess.h>
29 #include "internal.h" 29 #include "internal.h"
30 30
31 struct bdev_inode { 31 struct bdev_inode {
32 struct block_device bdev; 32 struct block_device bdev;
33 struct inode vfs_inode; 33 struct inode vfs_inode;
34 }; 34 };
35 35
36 static const struct address_space_operations def_blk_aops; 36 static const struct address_space_operations def_blk_aops;
37 37
38 static inline struct bdev_inode *BDEV_I(struct inode *inode) 38 static inline struct bdev_inode *BDEV_I(struct inode *inode)
39 { 39 {
40 return container_of(inode, struct bdev_inode, vfs_inode); 40 return container_of(inode, struct bdev_inode, vfs_inode);
41 } 41 }
42 42
43 inline struct block_device *I_BDEV(struct inode *inode) 43 inline struct block_device *I_BDEV(struct inode *inode)
44 { 44 {
45 return &BDEV_I(inode)->bdev; 45 return &BDEV_I(inode)->bdev;
46 } 46 }
47
48 EXPORT_SYMBOL(I_BDEV); 47 EXPORT_SYMBOL(I_BDEV);
49 48
50 /* 49 /*
51 * move the inode from it's current bdi to the a new bdi. if the inode is dirty 50 * Move the inode from its current bdi to a new bdi. If the inode is dirty we
52 * we need to move it onto the dirty list of @dst so that the inode is always 51 * need to move it onto the dirty list of @dst so that the inode is always on
53 * on the right list. 52 * the right list.
54 */ 53 */
55 static void bdev_inode_switch_bdi(struct inode *inode, 54 static void bdev_inode_switch_bdi(struct inode *inode,
56 struct backing_dev_info *dst) 55 struct backing_dev_info *dst)
57 { 56 {
58 spin_lock(&inode_wb_list_lock); 57 struct backing_dev_info *old = inode->i_data.backing_dev_info;
58
59 if (unlikely(dst == old)) /* deadlock avoidance */
60 return;
61 bdi_lock_two(&old->wb, &dst->wb);
59 spin_lock(&inode->i_lock); 62 spin_lock(&inode->i_lock);
60 inode->i_data.backing_dev_info = dst; 63 inode->i_data.backing_dev_info = dst;
61 if (inode->i_state & I_DIRTY) 64 if (inode->i_state & I_DIRTY)
62 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 65 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
63 spin_unlock(&inode->i_lock); 66 spin_unlock(&inode->i_lock);
64 spin_unlock(&inode_wb_list_lock); 67 spin_unlock(&old->wb.list_lock);
68 spin_unlock(&dst->wb.list_lock);
65 } 69 }
66 70
67 static sector_t max_block(struct block_device *bdev) 71 static sector_t max_block(struct block_device *bdev)
68 { 72 {
69 sector_t retval = ~((sector_t)0); 73 sector_t retval = ~((sector_t)0);
70 loff_t sz = i_size_read(bdev->bd_inode); 74 loff_t sz = i_size_read(bdev->bd_inode);
71 75
72 if (sz) { 76 if (sz) {
73 unsigned int size = block_size(bdev); 77 unsigned int size = block_size(bdev);
74 unsigned int sizebits = blksize_bits(size); 78 unsigned int sizebits = blksize_bits(size);
75 retval = (sz >> sizebits); 79 retval = (sz >> sizebits);
76 } 80 }
77 return retval; 81 return retval;
78 } 82 }
79 83
80 /* Kill _all_ buffers and pagecache , dirty or not.. */ 84 /* Kill _all_ buffers and pagecache , dirty or not.. */
81 static void kill_bdev(struct block_device *bdev) 85 static void kill_bdev(struct block_device *bdev)
82 { 86 {
83 if (bdev->bd_inode->i_mapping->nrpages == 0) 87 if (bdev->bd_inode->i_mapping->nrpages == 0)
84 return; 88 return;
85 invalidate_bh_lrus(); 89 invalidate_bh_lrus();
86 truncate_inode_pages(bdev->bd_inode->i_mapping, 0); 90 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
87 } 91 }
88 92
89 int set_blocksize(struct block_device *bdev, int size) 93 int set_blocksize(struct block_device *bdev, int size)
90 { 94 {
91 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 95 /* Size must be a power of two, and between 512 and PAGE_SIZE */
92 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) 96 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
93 return -EINVAL; 97 return -EINVAL;
94 98
95 /* Size cannot be smaller than the size supported by the device */ 99 /* Size cannot be smaller than the size supported by the device */
96 if (size < bdev_logical_block_size(bdev)) 100 if (size < bdev_logical_block_size(bdev))
97 return -EINVAL; 101 return -EINVAL;
98 102
99 /* Don't change the size if it is same as current */ 103 /* Don't change the size if it is same as current */
100 if (bdev->bd_block_size != size) { 104 if (bdev->bd_block_size != size) {
101 sync_blockdev(bdev); 105 sync_blockdev(bdev);
102 bdev->bd_block_size = size; 106 bdev->bd_block_size = size;
103 bdev->bd_inode->i_blkbits = blksize_bits(size); 107 bdev->bd_inode->i_blkbits = blksize_bits(size);
104 kill_bdev(bdev); 108 kill_bdev(bdev);
105 } 109 }
106 return 0; 110 return 0;
107 } 111 }
108 112
109 EXPORT_SYMBOL(set_blocksize); 113 EXPORT_SYMBOL(set_blocksize);
110 114
111 int sb_set_blocksize(struct super_block *sb, int size) 115 int sb_set_blocksize(struct super_block *sb, int size)
112 { 116 {
113 if (set_blocksize(sb->s_bdev, size)) 117 if (set_blocksize(sb->s_bdev, size))
114 return 0; 118 return 0;
115 /* If we get here, we know size is power of two 119 /* If we get here, we know size is power of two
116 * and it's value is between 512 and PAGE_SIZE */ 120 * and it's value is between 512 and PAGE_SIZE */
117 sb->s_blocksize = size; 121 sb->s_blocksize = size;
118 sb->s_blocksize_bits = blksize_bits(size); 122 sb->s_blocksize_bits = blksize_bits(size);
119 return sb->s_blocksize; 123 return sb->s_blocksize;
120 } 124 }
121 125
122 EXPORT_SYMBOL(sb_set_blocksize); 126 EXPORT_SYMBOL(sb_set_blocksize);
123 127
124 int sb_min_blocksize(struct super_block *sb, int size) 128 int sb_min_blocksize(struct super_block *sb, int size)
125 { 129 {
126 int minsize = bdev_logical_block_size(sb->s_bdev); 130 int minsize = bdev_logical_block_size(sb->s_bdev);
127 if (size < minsize) 131 if (size < minsize)
128 size = minsize; 132 size = minsize;
129 return sb_set_blocksize(sb, size); 133 return sb_set_blocksize(sb, size);
130 } 134 }
131 135
132 EXPORT_SYMBOL(sb_min_blocksize); 136 EXPORT_SYMBOL(sb_min_blocksize);
133 137
134 static int 138 static int
135 blkdev_get_block(struct inode *inode, sector_t iblock, 139 blkdev_get_block(struct inode *inode, sector_t iblock,
136 struct buffer_head *bh, int create) 140 struct buffer_head *bh, int create)
137 { 141 {
138 if (iblock >= max_block(I_BDEV(inode))) { 142 if (iblock >= max_block(I_BDEV(inode))) {
139 if (create) 143 if (create)
140 return -EIO; 144 return -EIO;
141 145
142 /* 146 /*
143 * for reads, we're just trying to fill a partial page. 147 * for reads, we're just trying to fill a partial page.
144 * return a hole, they will have to call get_block again 148 * return a hole, they will have to call get_block again
145 * before they can fill it, and they will get -EIO at that 149 * before they can fill it, and they will get -EIO at that
146 * time 150 * time
147 */ 151 */
148 return 0; 152 return 0;
149 } 153 }
150 bh->b_bdev = I_BDEV(inode); 154 bh->b_bdev = I_BDEV(inode);
151 bh->b_blocknr = iblock; 155 bh->b_blocknr = iblock;
152 set_buffer_mapped(bh); 156 set_buffer_mapped(bh);
153 return 0; 157 return 0;
154 } 158 }
155 159
156 static int 160 static int
157 blkdev_get_blocks(struct inode *inode, sector_t iblock, 161 blkdev_get_blocks(struct inode *inode, sector_t iblock,
158 struct buffer_head *bh, int create) 162 struct buffer_head *bh, int create)
159 { 163 {
160 sector_t end_block = max_block(I_BDEV(inode)); 164 sector_t end_block = max_block(I_BDEV(inode));
161 unsigned long max_blocks = bh->b_size >> inode->i_blkbits; 165 unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
162 166
163 if ((iblock + max_blocks) > end_block) { 167 if ((iblock + max_blocks) > end_block) {
164 max_blocks = end_block - iblock; 168 max_blocks = end_block - iblock;
165 if ((long)max_blocks <= 0) { 169 if ((long)max_blocks <= 0) {
166 if (create) 170 if (create)
167 return -EIO; /* write fully beyond EOF */ 171 return -EIO; /* write fully beyond EOF */
168 /* 172 /*
169 * It is a read which is fully beyond EOF. We return 173 * It is a read which is fully beyond EOF. We return
170 * a !buffer_mapped buffer 174 * a !buffer_mapped buffer
171 */ 175 */
172 max_blocks = 0; 176 max_blocks = 0;
173 } 177 }
174 } 178 }
175 179
176 bh->b_bdev = I_BDEV(inode); 180 bh->b_bdev = I_BDEV(inode);
177 bh->b_blocknr = iblock; 181 bh->b_blocknr = iblock;
178 bh->b_size = max_blocks << inode->i_blkbits; 182 bh->b_size = max_blocks << inode->i_blkbits;
179 if (max_blocks) 183 if (max_blocks)
180 set_buffer_mapped(bh); 184 set_buffer_mapped(bh);
181 return 0; 185 return 0;
182 } 186 }
183 187
184 static ssize_t 188 static ssize_t
185 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 189 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
186 loff_t offset, unsigned long nr_segs) 190 loff_t offset, unsigned long nr_segs)
187 { 191 {
188 struct file *file = iocb->ki_filp; 192 struct file *file = iocb->ki_filp;
189 struct inode *inode = file->f_mapping->host; 193 struct inode *inode = file->f_mapping->host;
190 194
191 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, 195 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
192 nr_segs, blkdev_get_blocks, NULL, NULL, 0); 196 nr_segs, blkdev_get_blocks, NULL, NULL, 0);
193 } 197 }
194 198
195 int __sync_blockdev(struct block_device *bdev, int wait) 199 int __sync_blockdev(struct block_device *bdev, int wait)
196 { 200 {
197 if (!bdev) 201 if (!bdev)
198 return 0; 202 return 0;
199 if (!wait) 203 if (!wait)
200 return filemap_flush(bdev->bd_inode->i_mapping); 204 return filemap_flush(bdev->bd_inode->i_mapping);
201 return filemap_write_and_wait(bdev->bd_inode->i_mapping); 205 return filemap_write_and_wait(bdev->bd_inode->i_mapping);
202 } 206 }
203 207
204 /* 208 /*
205 * Write out and wait upon all the dirty data associated with a block 209 * Write out and wait upon all the dirty data associated with a block
206 * device via its mapping. Does not take the superblock lock. 210 * device via its mapping. Does not take the superblock lock.
207 */ 211 */
208 int sync_blockdev(struct block_device *bdev) 212 int sync_blockdev(struct block_device *bdev)
209 { 213 {
210 return __sync_blockdev(bdev, 1); 214 return __sync_blockdev(bdev, 1);
211 } 215 }
212 EXPORT_SYMBOL(sync_blockdev); 216 EXPORT_SYMBOL(sync_blockdev);
213 217
214 /* 218 /*
215 * Write out and wait upon all dirty data associated with this 219 * Write out and wait upon all dirty data associated with this
216 * device. Filesystem data as well as the underlying block 220 * device. Filesystem data as well as the underlying block
217 * device. Takes the superblock lock. 221 * device. Takes the superblock lock.
218 */ 222 */
219 int fsync_bdev(struct block_device *bdev) 223 int fsync_bdev(struct block_device *bdev)
220 { 224 {
221 struct super_block *sb = get_super(bdev); 225 struct super_block *sb = get_super(bdev);
222 if (sb) { 226 if (sb) {
223 int res = sync_filesystem(sb); 227 int res = sync_filesystem(sb);
224 drop_super(sb); 228 drop_super(sb);
225 return res; 229 return res;
226 } 230 }
227 return sync_blockdev(bdev); 231 return sync_blockdev(bdev);
228 } 232 }
229 EXPORT_SYMBOL(fsync_bdev); 233 EXPORT_SYMBOL(fsync_bdev);
230 234
231 /** 235 /**
232 * freeze_bdev -- lock a filesystem and force it into a consistent state 236 * freeze_bdev -- lock a filesystem and force it into a consistent state
233 * @bdev: blockdevice to lock 237 * @bdev: blockdevice to lock
234 * 238 *
235 * If a superblock is found on this device, we take the s_umount semaphore 239 * If a superblock is found on this device, we take the s_umount semaphore
236 * on it to make sure nobody unmounts until the snapshot creation is done. 240 * on it to make sure nobody unmounts until the snapshot creation is done.
237 * The reference counter (bd_fsfreeze_count) guarantees that only the last 241 * The reference counter (bd_fsfreeze_count) guarantees that only the last
238 * unfreeze process can unfreeze the frozen filesystem actually when multiple 242 * unfreeze process can unfreeze the frozen filesystem actually when multiple
239 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and 243 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
240 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze 244 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
241 * actually. 245 * actually.
242 */ 246 */
243 struct super_block *freeze_bdev(struct block_device *bdev) 247 struct super_block *freeze_bdev(struct block_device *bdev)
244 { 248 {
245 struct super_block *sb; 249 struct super_block *sb;
246 int error = 0; 250 int error = 0;
247 251
248 mutex_lock(&bdev->bd_fsfreeze_mutex); 252 mutex_lock(&bdev->bd_fsfreeze_mutex);
249 if (++bdev->bd_fsfreeze_count > 1) { 253 if (++bdev->bd_fsfreeze_count > 1) {
250 /* 254 /*
251 * We don't even need to grab a reference - the first call 255 * We don't even need to grab a reference - the first call
252 * to freeze_bdev grab an active reference and only the last 256 * to freeze_bdev grab an active reference and only the last
253 * thaw_bdev drops it. 257 * thaw_bdev drops it.
254 */ 258 */
255 sb = get_super(bdev); 259 sb = get_super(bdev);
256 drop_super(sb); 260 drop_super(sb);
257 mutex_unlock(&bdev->bd_fsfreeze_mutex); 261 mutex_unlock(&bdev->bd_fsfreeze_mutex);
258 return sb; 262 return sb;
259 } 263 }
260 264
261 sb = get_active_super(bdev); 265 sb = get_active_super(bdev);
262 if (!sb) 266 if (!sb)
263 goto out; 267 goto out;
264 error = freeze_super(sb); 268 error = freeze_super(sb);
265 if (error) { 269 if (error) {
266 deactivate_super(sb); 270 deactivate_super(sb);
267 bdev->bd_fsfreeze_count--; 271 bdev->bd_fsfreeze_count--;
268 mutex_unlock(&bdev->bd_fsfreeze_mutex); 272 mutex_unlock(&bdev->bd_fsfreeze_mutex);
269 return ERR_PTR(error); 273 return ERR_PTR(error);
270 } 274 }
271 deactivate_super(sb); 275 deactivate_super(sb);
272 out: 276 out:
273 sync_blockdev(bdev); 277 sync_blockdev(bdev);
274 mutex_unlock(&bdev->bd_fsfreeze_mutex); 278 mutex_unlock(&bdev->bd_fsfreeze_mutex);
275 return sb; /* thaw_bdev releases s->s_umount */ 279 return sb; /* thaw_bdev releases s->s_umount */
276 } 280 }
277 EXPORT_SYMBOL(freeze_bdev); 281 EXPORT_SYMBOL(freeze_bdev);
278 282
279 /** 283 /**
280 * thaw_bdev -- unlock filesystem 284 * thaw_bdev -- unlock filesystem
281 * @bdev: blockdevice to unlock 285 * @bdev: blockdevice to unlock
282 * @sb: associated superblock 286 * @sb: associated superblock
283 * 287 *
284 * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 288 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
285 */ 289 */
286 int thaw_bdev(struct block_device *bdev, struct super_block *sb) 290 int thaw_bdev(struct block_device *bdev, struct super_block *sb)
287 { 291 {
288 int error = -EINVAL; 292 int error = -EINVAL;
289 293
290 mutex_lock(&bdev->bd_fsfreeze_mutex); 294 mutex_lock(&bdev->bd_fsfreeze_mutex);
291 if (!bdev->bd_fsfreeze_count) 295 if (!bdev->bd_fsfreeze_count)
292 goto out; 296 goto out;
293 297
294 error = 0; 298 error = 0;
295 if (--bdev->bd_fsfreeze_count > 0) 299 if (--bdev->bd_fsfreeze_count > 0)
296 goto out; 300 goto out;
297 301
298 if (!sb) 302 if (!sb)
299 goto out; 303 goto out;
300 304
301 error = thaw_super(sb); 305 error = thaw_super(sb);
302 if (error) { 306 if (error) {
303 bdev->bd_fsfreeze_count++; 307 bdev->bd_fsfreeze_count++;
304 mutex_unlock(&bdev->bd_fsfreeze_mutex); 308 mutex_unlock(&bdev->bd_fsfreeze_mutex);
305 return error; 309 return error;
306 } 310 }
307 out: 311 out:
308 mutex_unlock(&bdev->bd_fsfreeze_mutex); 312 mutex_unlock(&bdev->bd_fsfreeze_mutex);
309 return 0; 313 return 0;
310 } 314 }
311 EXPORT_SYMBOL(thaw_bdev); 315 EXPORT_SYMBOL(thaw_bdev);
312 316
313 static int blkdev_writepage(struct page *page, struct writeback_control *wbc) 317 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
314 { 318 {
315 return block_write_full_page(page, blkdev_get_block, wbc); 319 return block_write_full_page(page, blkdev_get_block, wbc);
316 } 320 }
317 321
318 static int blkdev_readpage(struct file * file, struct page * page) 322 static int blkdev_readpage(struct file * file, struct page * page)
319 { 323 {
320 return block_read_full_page(page, blkdev_get_block); 324 return block_read_full_page(page, blkdev_get_block);
321 } 325 }
322 326
323 static int blkdev_write_begin(struct file *file, struct address_space *mapping, 327 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
324 loff_t pos, unsigned len, unsigned flags, 328 loff_t pos, unsigned len, unsigned flags,
325 struct page **pagep, void **fsdata) 329 struct page **pagep, void **fsdata)
326 { 330 {
327 return block_write_begin(mapping, pos, len, flags, pagep, 331 return block_write_begin(mapping, pos, len, flags, pagep,
328 blkdev_get_block); 332 blkdev_get_block);
329 } 333 }
330 334
331 static int blkdev_write_end(struct file *file, struct address_space *mapping, 335 static int blkdev_write_end(struct file *file, struct address_space *mapping,
332 loff_t pos, unsigned len, unsigned copied, 336 loff_t pos, unsigned len, unsigned copied,
333 struct page *page, void *fsdata) 337 struct page *page, void *fsdata)
334 { 338 {
335 int ret; 339 int ret;
336 ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); 340 ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
337 341
338 unlock_page(page); 342 unlock_page(page);
339 page_cache_release(page); 343 page_cache_release(page);
340 344
341 return ret; 345 return ret;
342 } 346 }
343 347
344 /* 348 /*
345 * private llseek: 349 * private llseek:
346 * for a block special file file->f_path.dentry->d_inode->i_size is zero 350 * for a block special file file->f_path.dentry->d_inode->i_size is zero
347 * so we compute the size by hand (just as in block_read/write above) 351 * so we compute the size by hand (just as in block_read/write above)
348 */ 352 */
349 static loff_t block_llseek(struct file *file, loff_t offset, int origin) 353 static loff_t block_llseek(struct file *file, loff_t offset, int origin)
350 { 354 {
351 struct inode *bd_inode = file->f_mapping->host; 355 struct inode *bd_inode = file->f_mapping->host;
352 loff_t size; 356 loff_t size;
353 loff_t retval; 357 loff_t retval;
354 358
355 mutex_lock(&bd_inode->i_mutex); 359 mutex_lock(&bd_inode->i_mutex);
356 size = i_size_read(bd_inode); 360 size = i_size_read(bd_inode);
357 361
358 retval = -EINVAL; 362 retval = -EINVAL;
359 switch (origin) { 363 switch (origin) {
360 case SEEK_END: 364 case SEEK_END:
361 offset += size; 365 offset += size;
362 break; 366 break;
363 case SEEK_CUR: 367 case SEEK_CUR:
364 offset += file->f_pos; 368 offset += file->f_pos;
365 case SEEK_SET: 369 case SEEK_SET:
366 break; 370 break;
367 default: 371 default:
368 goto out; 372 goto out;
369 } 373 }
370 if (offset >= 0 && offset <= size) { 374 if (offset >= 0 && offset <= size) {
371 if (offset != file->f_pos) { 375 if (offset != file->f_pos) {
372 file->f_pos = offset; 376 file->f_pos = offset;
373 } 377 }
374 retval = offset; 378 retval = offset;
375 } 379 }
376 out: 380 out:
377 mutex_unlock(&bd_inode->i_mutex); 381 mutex_unlock(&bd_inode->i_mutex);
378 return retval; 382 return retval;
379 } 383 }
380 384
381 int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) 385 int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
382 { 386 {
383 struct inode *bd_inode = filp->f_mapping->host; 387 struct inode *bd_inode = filp->f_mapping->host;
384 struct block_device *bdev = I_BDEV(bd_inode); 388 struct block_device *bdev = I_BDEV(bd_inode);
385 int error; 389 int error;
386 390
387 /* 391 /*
388 * There is no need to serialise calls to blkdev_issue_flush with 392 * There is no need to serialise calls to blkdev_issue_flush with
389 * i_mutex and doing so causes performance issues with concurrent 393 * i_mutex and doing so causes performance issues with concurrent
390 * O_SYNC writers to a block device. 394 * O_SYNC writers to a block device.
391 */ 395 */
392 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); 396 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
393 if (error == -EOPNOTSUPP) 397 if (error == -EOPNOTSUPP)
394 error = 0; 398 error = 0;
395 399
396 return error; 400 return error;
397 } 401 }
398 EXPORT_SYMBOL(blkdev_fsync); 402 EXPORT_SYMBOL(blkdev_fsync);
399 403
400 /* 404 /*
401 * pseudo-fs 405 * pseudo-fs
402 */ 406 */
403 407
404 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); 408 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
405 static struct kmem_cache * bdev_cachep __read_mostly; 409 static struct kmem_cache * bdev_cachep __read_mostly;
406 410
407 static struct inode *bdev_alloc_inode(struct super_block *sb) 411 static struct inode *bdev_alloc_inode(struct super_block *sb)
408 { 412 {
409 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); 413 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
410 if (!ei) 414 if (!ei)
411 return NULL; 415 return NULL;
412 return &ei->vfs_inode; 416 return &ei->vfs_inode;
413 } 417 }
414 418
415 static void bdev_i_callback(struct rcu_head *head) 419 static void bdev_i_callback(struct rcu_head *head)
416 { 420 {
417 struct inode *inode = container_of(head, struct inode, i_rcu); 421 struct inode *inode = container_of(head, struct inode, i_rcu);
418 struct bdev_inode *bdi = BDEV_I(inode); 422 struct bdev_inode *bdi = BDEV_I(inode);
419 423
420 INIT_LIST_HEAD(&inode->i_dentry); 424 INIT_LIST_HEAD(&inode->i_dentry);
421 kmem_cache_free(bdev_cachep, bdi); 425 kmem_cache_free(bdev_cachep, bdi);
422 } 426 }
423 427
424 static void bdev_destroy_inode(struct inode *inode) 428 static void bdev_destroy_inode(struct inode *inode)
425 { 429 {
426 call_rcu(&inode->i_rcu, bdev_i_callback); 430 call_rcu(&inode->i_rcu, bdev_i_callback);
427 } 431 }
428 432
429 static void init_once(void *foo) 433 static void init_once(void *foo)
430 { 434 {
431 struct bdev_inode *ei = (struct bdev_inode *) foo; 435 struct bdev_inode *ei = (struct bdev_inode *) foo;
432 struct block_device *bdev = &ei->bdev; 436 struct block_device *bdev = &ei->bdev;
433 437
434 memset(bdev, 0, sizeof(*bdev)); 438 memset(bdev, 0, sizeof(*bdev));
435 mutex_init(&bdev->bd_mutex); 439 mutex_init(&bdev->bd_mutex);
436 INIT_LIST_HEAD(&bdev->bd_inodes); 440 INIT_LIST_HEAD(&bdev->bd_inodes);
437 INIT_LIST_HEAD(&bdev->bd_list); 441 INIT_LIST_HEAD(&bdev->bd_list);
438 #ifdef CONFIG_SYSFS 442 #ifdef CONFIG_SYSFS
439 INIT_LIST_HEAD(&bdev->bd_holder_disks); 443 INIT_LIST_HEAD(&bdev->bd_holder_disks);
440 #endif 444 #endif
441 inode_init_once(&ei->vfs_inode); 445 inode_init_once(&ei->vfs_inode);
442 /* Initialize mutex for freeze. */ 446 /* Initialize mutex for freeze. */
443 mutex_init(&bdev->bd_fsfreeze_mutex); 447 mutex_init(&bdev->bd_fsfreeze_mutex);
444 } 448 }
445 449
446 static inline void __bd_forget(struct inode *inode) 450 static inline void __bd_forget(struct inode *inode)
447 { 451 {
448 list_del_init(&inode->i_devices); 452 list_del_init(&inode->i_devices);
449 inode->i_bdev = NULL; 453 inode->i_bdev = NULL;
450 inode->i_mapping = &inode->i_data; 454 inode->i_mapping = &inode->i_data;
451 } 455 }
452 456
453 static void bdev_evict_inode(struct inode *inode) 457 static void bdev_evict_inode(struct inode *inode)
454 { 458 {
455 struct block_device *bdev = &BDEV_I(inode)->bdev; 459 struct block_device *bdev = &BDEV_I(inode)->bdev;
456 struct list_head *p; 460 struct list_head *p;
457 truncate_inode_pages(&inode->i_data, 0); 461 truncate_inode_pages(&inode->i_data, 0);
458 invalidate_inode_buffers(inode); /* is it needed here? */ 462 invalidate_inode_buffers(inode); /* is it needed here? */
459 end_writeback(inode); 463 end_writeback(inode);
460 spin_lock(&bdev_lock); 464 spin_lock(&bdev_lock);
461 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { 465 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
462 __bd_forget(list_entry(p, struct inode, i_devices)); 466 __bd_forget(list_entry(p, struct inode, i_devices));
463 } 467 }
464 list_del_init(&bdev->bd_list); 468 list_del_init(&bdev->bd_list);
465 spin_unlock(&bdev_lock); 469 spin_unlock(&bdev_lock);
466 } 470 }
467 471
468 static const struct super_operations bdev_sops = { 472 static const struct super_operations bdev_sops = {
469 .statfs = simple_statfs, 473 .statfs = simple_statfs,
470 .alloc_inode = bdev_alloc_inode, 474 .alloc_inode = bdev_alloc_inode,
471 .destroy_inode = bdev_destroy_inode, 475 .destroy_inode = bdev_destroy_inode,
472 .drop_inode = generic_delete_inode, 476 .drop_inode = generic_delete_inode,
473 .evict_inode = bdev_evict_inode, 477 .evict_inode = bdev_evict_inode,
474 }; 478 };
475 479
476 static struct dentry *bd_mount(struct file_system_type *fs_type, 480 static struct dentry *bd_mount(struct file_system_type *fs_type,
477 int flags, const char *dev_name, void *data) 481 int flags, const char *dev_name, void *data)
478 { 482 {
479 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); 483 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
480 } 484 }
481 485
482 static struct file_system_type bd_type = { 486 static struct file_system_type bd_type = {
483 .name = "bdev", 487 .name = "bdev",
484 .mount = bd_mount, 488 .mount = bd_mount,
485 .kill_sb = kill_anon_super, 489 .kill_sb = kill_anon_super,
486 }; 490 };
487 491
488 struct super_block *blockdev_superblock __read_mostly; 492 struct super_block *blockdev_superblock __read_mostly;
489 493
490 void __init bdev_cache_init(void) 494 void __init bdev_cache_init(void)
491 { 495 {
492 int err; 496 int err;
493 struct vfsmount *bd_mnt; 497 struct vfsmount *bd_mnt;
494 498
495 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 499 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
496 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 500 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
497 SLAB_MEM_SPREAD|SLAB_PANIC), 501 SLAB_MEM_SPREAD|SLAB_PANIC),
498 init_once); 502 init_once);
499 err = register_filesystem(&bd_type); 503 err = register_filesystem(&bd_type);
500 if (err) 504 if (err)
501 panic("Cannot register bdev pseudo-fs"); 505 panic("Cannot register bdev pseudo-fs");
502 bd_mnt = kern_mount(&bd_type); 506 bd_mnt = kern_mount(&bd_type);
503 if (IS_ERR(bd_mnt)) 507 if (IS_ERR(bd_mnt))
504 panic("Cannot create bdev pseudo-fs"); 508 panic("Cannot create bdev pseudo-fs");
505 /* 509 /*
506 * This vfsmount structure is only used to obtain the 510 * This vfsmount structure is only used to obtain the
507 * blockdev_superblock, so tell kmemleak not to report it. 511 * blockdev_superblock, so tell kmemleak not to report it.
508 */ 512 */
509 kmemleak_not_leak(bd_mnt); 513 kmemleak_not_leak(bd_mnt);
510 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ 514 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
511 } 515 }
512 516
513 /* 517 /*
514 * Most likely _very_ bad one - but then it's hardly critical for small 518 * Most likely _very_ bad one - but then it's hardly critical for small
515 * /dev and can be fixed when somebody will need really large one. 519 * /dev and can be fixed when somebody will need really large one.
516 * Keep in mind that it will be fed through icache hash function too. 520 * Keep in mind that it will be fed through icache hash function too.
517 */ 521 */
518 static inline unsigned long hash(dev_t dev) 522 static inline unsigned long hash(dev_t dev)
519 { 523 {
520 return MAJOR(dev)+MINOR(dev); 524 return MAJOR(dev)+MINOR(dev);
521 } 525 }
522 526
523 static int bdev_test(struct inode *inode, void *data) 527 static int bdev_test(struct inode *inode, void *data)
524 { 528 {
525 return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; 529 return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
526 } 530 }
527 531
528 static int bdev_set(struct inode *inode, void *data) 532 static int bdev_set(struct inode *inode, void *data)
529 { 533 {
530 BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; 534 BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
531 return 0; 535 return 0;
532 } 536 }
533 537
534 static LIST_HEAD(all_bdevs); 538 static LIST_HEAD(all_bdevs);
535 539
536 struct block_device *bdget(dev_t dev) 540 struct block_device *bdget(dev_t dev)
537 { 541 {
538 struct block_device *bdev; 542 struct block_device *bdev;
539 struct inode *inode; 543 struct inode *inode;
540 544
541 inode = iget5_locked(blockdev_superblock, hash(dev), 545 inode = iget5_locked(blockdev_superblock, hash(dev),
542 bdev_test, bdev_set, &dev); 546 bdev_test, bdev_set, &dev);
543 547
544 if (!inode) 548 if (!inode)
545 return NULL; 549 return NULL;
546 550
547 bdev = &BDEV_I(inode)->bdev; 551 bdev = &BDEV_I(inode)->bdev;
548 552
549 if (inode->i_state & I_NEW) { 553 if (inode->i_state & I_NEW) {
550 bdev->bd_contains = NULL; 554 bdev->bd_contains = NULL;
551 bdev->bd_inode = inode; 555 bdev->bd_inode = inode;
552 bdev->bd_block_size = (1 << inode->i_blkbits); 556 bdev->bd_block_size = (1 << inode->i_blkbits);
553 bdev->bd_part_count = 0; 557 bdev->bd_part_count = 0;
554 bdev->bd_invalidated = 0; 558 bdev->bd_invalidated = 0;
555 inode->i_mode = S_IFBLK; 559 inode->i_mode = S_IFBLK;
556 inode->i_rdev = dev; 560 inode->i_rdev = dev;
557 inode->i_bdev = bdev; 561 inode->i_bdev = bdev;
558 inode->i_data.a_ops = &def_blk_aops; 562 inode->i_data.a_ops = &def_blk_aops;
559 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 563 mapping_set_gfp_mask(&inode->i_data, GFP_USER);
560 inode->i_data.backing_dev_info = &default_backing_dev_info; 564 inode->i_data.backing_dev_info = &default_backing_dev_info;
561 spin_lock(&bdev_lock); 565 spin_lock(&bdev_lock);
562 list_add(&bdev->bd_list, &all_bdevs); 566 list_add(&bdev->bd_list, &all_bdevs);
563 spin_unlock(&bdev_lock); 567 spin_unlock(&bdev_lock);
564 unlock_new_inode(inode); 568 unlock_new_inode(inode);
565 } 569 }
566 return bdev; 570 return bdev;
567 } 571 }
568 572
569 EXPORT_SYMBOL(bdget); 573 EXPORT_SYMBOL(bdget);
570 574
571 /** 575 /**
572 * bdgrab -- Grab a reference to an already referenced block device 576 * bdgrab -- Grab a reference to an already referenced block device
573 * @bdev: Block device to grab a reference to. 577 * @bdev: Block device to grab a reference to.
574 */ 578 */
575 struct block_device *bdgrab(struct block_device *bdev) 579 struct block_device *bdgrab(struct block_device *bdev)
576 { 580 {
577 ihold(bdev->bd_inode); 581 ihold(bdev->bd_inode);
578 return bdev; 582 return bdev;
579 } 583 }
580 584
581 long nr_blockdev_pages(void) 585 long nr_blockdev_pages(void)
582 { 586 {
583 struct block_device *bdev; 587 struct block_device *bdev;
584 long ret = 0; 588 long ret = 0;
585 spin_lock(&bdev_lock); 589 spin_lock(&bdev_lock);
586 list_for_each_entry(bdev, &all_bdevs, bd_list) { 590 list_for_each_entry(bdev, &all_bdevs, bd_list) {
587 ret += bdev->bd_inode->i_mapping->nrpages; 591 ret += bdev->bd_inode->i_mapping->nrpages;
588 } 592 }
589 spin_unlock(&bdev_lock); 593 spin_unlock(&bdev_lock);
590 return ret; 594 return ret;
591 } 595 }
592 596
593 void bdput(struct block_device *bdev) 597 void bdput(struct block_device *bdev)
594 { 598 {
595 iput(bdev->bd_inode); 599 iput(bdev->bd_inode);
596 } 600 }
597 601
598 EXPORT_SYMBOL(bdput); 602 EXPORT_SYMBOL(bdput);
599 603
600 static struct block_device *bd_acquire(struct inode *inode) 604 static struct block_device *bd_acquire(struct inode *inode)
601 { 605 {
602 struct block_device *bdev; 606 struct block_device *bdev;
603 607
604 spin_lock(&bdev_lock); 608 spin_lock(&bdev_lock);
605 bdev = inode->i_bdev; 609 bdev = inode->i_bdev;
606 if (bdev) { 610 if (bdev) {
607 ihold(bdev->bd_inode); 611 ihold(bdev->bd_inode);
608 spin_unlock(&bdev_lock); 612 spin_unlock(&bdev_lock);
609 return bdev; 613 return bdev;
610 } 614 }
611 spin_unlock(&bdev_lock); 615 spin_unlock(&bdev_lock);
612 616
613 bdev = bdget(inode->i_rdev); 617 bdev = bdget(inode->i_rdev);
614 if (bdev) { 618 if (bdev) {
615 spin_lock(&bdev_lock); 619 spin_lock(&bdev_lock);
616 if (!inode->i_bdev) { 620 if (!inode->i_bdev) {
617 /* 621 /*
618 * We take an additional reference to bd_inode, 622 * We take an additional reference to bd_inode,
619 * and it's released in clear_inode() of inode. 623 * and it's released in clear_inode() of inode.
620 * So, we can access it via ->i_mapping always 624 * So, we can access it via ->i_mapping always
621 * without igrab(). 625 * without igrab().
622 */ 626 */
623 ihold(bdev->bd_inode); 627 ihold(bdev->bd_inode);
624 inode->i_bdev = bdev; 628 inode->i_bdev = bdev;
625 inode->i_mapping = bdev->bd_inode->i_mapping; 629 inode->i_mapping = bdev->bd_inode->i_mapping;
626 list_add(&inode->i_devices, &bdev->bd_inodes); 630 list_add(&inode->i_devices, &bdev->bd_inodes);
627 } 631 }
628 spin_unlock(&bdev_lock); 632 spin_unlock(&bdev_lock);
629 } 633 }
630 return bdev; 634 return bdev;
631 } 635 }
632 636
633 /* Call when you free inode */ 637 /* Call when you free inode */
634 638
635 void bd_forget(struct inode *inode) 639 void bd_forget(struct inode *inode)
636 { 640 {
637 struct block_device *bdev = NULL; 641 struct block_device *bdev = NULL;
638 642
639 spin_lock(&bdev_lock); 643 spin_lock(&bdev_lock);
640 if (inode->i_bdev) { 644 if (inode->i_bdev) {
641 if (!sb_is_blkdev_sb(inode->i_sb)) 645 if (!sb_is_blkdev_sb(inode->i_sb))
642 bdev = inode->i_bdev; 646 bdev = inode->i_bdev;
643 __bd_forget(inode); 647 __bd_forget(inode);
644 } 648 }
645 spin_unlock(&bdev_lock); 649 spin_unlock(&bdev_lock);
646 650
647 if (bdev) 651 if (bdev)
648 iput(bdev->bd_inode); 652 iput(bdev->bd_inode);
649 } 653 }
650 654
651 /** 655 /**
652 * bd_may_claim - test whether a block device can be claimed 656 * bd_may_claim - test whether a block device can be claimed
653 * @bdev: block device of interest 657 * @bdev: block device of interest
654 * @whole: whole block device containing @bdev, may equal @bdev 658 * @whole: whole block device containing @bdev, may equal @bdev
655 * @holder: holder trying to claim @bdev 659 * @holder: holder trying to claim @bdev
656 * 660 *
657 * Test whether @bdev can be claimed by @holder. 661 * Test whether @bdev can be claimed by @holder.
658 * 662 *
659 * CONTEXT: 663 * CONTEXT:
660 * spin_lock(&bdev_lock). 664 * spin_lock(&bdev_lock).
661 * 665 *
662 * RETURNS: 666 * RETURNS:
663 * %true if @bdev can be claimed, %false otherwise. 667 * %true if @bdev can be claimed, %false otherwise.
664 */ 668 */
665 static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, 669 static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
666 void *holder) 670 void *holder)
667 { 671 {
668 if (bdev->bd_holder == holder) 672 if (bdev->bd_holder == holder)
669 return true; /* already a holder */ 673 return true; /* already a holder */
670 else if (bdev->bd_holder != NULL) 674 else if (bdev->bd_holder != NULL)
671 return false; /* held by someone else */ 675 return false; /* held by someone else */
672 else if (bdev->bd_contains == bdev) 676 else if (bdev->bd_contains == bdev)
673 return true; /* is a whole device which isn't held */ 677 return true; /* is a whole device which isn't held */
674 678
675 else if (whole->bd_holder == bd_may_claim) 679 else if (whole->bd_holder == bd_may_claim)
676 return true; /* is a partition of a device that is being partitioned */ 680 return true; /* is a partition of a device that is being partitioned */
677 else if (whole->bd_holder != NULL) 681 else if (whole->bd_holder != NULL)
678 return false; /* is a partition of a held device */ 682 return false; /* is a partition of a held device */
679 else 683 else
680 return true; /* is a partition of an un-held device */ 684 return true; /* is a partition of an un-held device */
681 } 685 }
682 686
683 /** 687 /**
684 * bd_prepare_to_claim - prepare to claim a block device 688 * bd_prepare_to_claim - prepare to claim a block device
685 * @bdev: block device of interest 689 * @bdev: block device of interest
686 * @whole: the whole device containing @bdev, may equal @bdev 690 * @whole: the whole device containing @bdev, may equal @bdev
687 * @holder: holder trying to claim @bdev 691 * @holder: holder trying to claim @bdev
688 * 692 *
689 * Prepare to claim @bdev. This function fails if @bdev is already 693 * Prepare to claim @bdev. This function fails if @bdev is already
690 * claimed by another holder and waits if another claiming is in 694 * claimed by another holder and waits if another claiming is in
691 * progress. This function doesn't actually claim. On successful 695 * progress. This function doesn't actually claim. On successful
692 * return, the caller has ownership of bd_claiming and bd_holder[s]. 696 * return, the caller has ownership of bd_claiming and bd_holder[s].
693 * 697 *
694 * CONTEXT: 698 * CONTEXT:
695 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab 699 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
696 * it multiple times. 700 * it multiple times.
697 * 701 *
698 * RETURNS: 702 * RETURNS:
699 * 0 if @bdev can be claimed, -EBUSY otherwise. 703 * 0 if @bdev can be claimed, -EBUSY otherwise.
700 */ 704 */
701 static int bd_prepare_to_claim(struct block_device *bdev, 705 static int bd_prepare_to_claim(struct block_device *bdev,
702 struct block_device *whole, void *holder) 706 struct block_device *whole, void *holder)
703 { 707 {
704 retry: 708 retry:
705 /* if someone else claimed, fail */ 709 /* if someone else claimed, fail */
706 if (!bd_may_claim(bdev, whole, holder)) 710 if (!bd_may_claim(bdev, whole, holder))
707 return -EBUSY; 711 return -EBUSY;
708 712
709 /* if claiming is already in progress, wait for it to finish */ 713 /* if claiming is already in progress, wait for it to finish */
710 if (whole->bd_claiming) { 714 if (whole->bd_claiming) {
711 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); 715 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
712 DEFINE_WAIT(wait); 716 DEFINE_WAIT(wait);
713 717
714 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); 718 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
715 spin_unlock(&bdev_lock); 719 spin_unlock(&bdev_lock);
716 schedule(); 720 schedule();
717 finish_wait(wq, &wait); 721 finish_wait(wq, &wait);
718 spin_lock(&bdev_lock); 722 spin_lock(&bdev_lock);
719 goto retry; 723 goto retry;
720 } 724 }
721 725
722 /* yay, all mine */ 726 /* yay, all mine */
723 return 0; 727 return 0;
724 } 728 }
725 729
726 /** 730 /**
727 * bd_start_claiming - start claiming a block device 731 * bd_start_claiming - start claiming a block device
728 * @bdev: block device of interest 732 * @bdev: block device of interest
729 * @holder: holder trying to claim @bdev 733 * @holder: holder trying to claim @bdev
730 * 734 *
731 * @bdev is about to be opened exclusively. Check @bdev can be opened 735 * @bdev is about to be opened exclusively. Check @bdev can be opened
732 * exclusively and mark that an exclusive open is in progress. Each 736 * exclusively and mark that an exclusive open is in progress. Each
733 * successful call to this function must be matched with a call to 737 * successful call to this function must be matched with a call to
734 * either bd_finish_claiming() or bd_abort_claiming() (which do not 738 * either bd_finish_claiming() or bd_abort_claiming() (which do not
735 * fail). 739 * fail).
736 * 740 *
737 * This function is used to gain exclusive access to the block device 741 * This function is used to gain exclusive access to the block device
738 * without actually causing other exclusive open attempts to fail. It 742 * without actually causing other exclusive open attempts to fail. It
739 * should be used when the open sequence itself requires exclusive 743 * should be used when the open sequence itself requires exclusive
740 * access but may subsequently fail. 744 * access but may subsequently fail.
741 * 745 *
742 * CONTEXT: 746 * CONTEXT:
743 * Might sleep. 747 * Might sleep.
744 * 748 *
745 * RETURNS: 749 * RETURNS:
746 * Pointer to the block device containing @bdev on success, ERR_PTR() 750 * Pointer to the block device containing @bdev on success, ERR_PTR()
747 * value on failure. 751 * value on failure.
748 */ 752 */
749 static struct block_device *bd_start_claiming(struct block_device *bdev, 753 static struct block_device *bd_start_claiming(struct block_device *bdev,
750 void *holder) 754 void *holder)
751 { 755 {
752 struct gendisk *disk; 756 struct gendisk *disk;
753 struct block_device *whole; 757 struct block_device *whole;
754 int partno, err; 758 int partno, err;
755 759
756 might_sleep(); 760 might_sleep();
757 761
758 /* 762 /*
759 * @bdev might not have been initialized properly yet, look up 763 * @bdev might not have been initialized properly yet, look up
760 * and grab the outer block device the hard way. 764 * and grab the outer block device the hard way.
761 */ 765 */
762 disk = get_gendisk(bdev->bd_dev, &partno); 766 disk = get_gendisk(bdev->bd_dev, &partno);
763 if (!disk) 767 if (!disk)
764 return ERR_PTR(-ENXIO); 768 return ERR_PTR(-ENXIO);
765 769
766 /* 770 /*
767 * Normally, @bdev should equal what's returned from bdget_disk() 771 * Normally, @bdev should equal what's returned from bdget_disk()
768 * if partno is 0; however, some drivers (floppy) use multiple 772 * if partno is 0; however, some drivers (floppy) use multiple
769 * bdev's for the same physical device and @bdev may be one of the 773 * bdev's for the same physical device and @bdev may be one of the
770 * aliases. Keep @bdev if partno is 0. This means claimer 774 * aliases. Keep @bdev if partno is 0. This means claimer
771 * tracking is broken for those devices but it has always been that 775 * tracking is broken for those devices but it has always been that
772 * way. 776 * way.
773 */ 777 */
774 if (partno) 778 if (partno)
775 whole = bdget_disk(disk, 0); 779 whole = bdget_disk(disk, 0);
776 else 780 else
777 whole = bdgrab(bdev); 781 whole = bdgrab(bdev);
778 782
779 module_put(disk->fops->owner); 783 module_put(disk->fops->owner);
780 put_disk(disk); 784 put_disk(disk);
781 if (!whole) 785 if (!whole)
782 return ERR_PTR(-ENOMEM); 786 return ERR_PTR(-ENOMEM);
783 787
784 /* prepare to claim, if successful, mark claiming in progress */ 788 /* prepare to claim, if successful, mark claiming in progress */
785 spin_lock(&bdev_lock); 789 spin_lock(&bdev_lock);
786 790
787 err = bd_prepare_to_claim(bdev, whole, holder); 791 err = bd_prepare_to_claim(bdev, whole, holder);
788 if (err == 0) { 792 if (err == 0) {
789 whole->bd_claiming = holder; 793 whole->bd_claiming = holder;
790 spin_unlock(&bdev_lock); 794 spin_unlock(&bdev_lock);
791 return whole; 795 return whole;
792 } else { 796 } else {
793 spin_unlock(&bdev_lock); 797 spin_unlock(&bdev_lock);
794 bdput(whole); 798 bdput(whole);
795 return ERR_PTR(err); 799 return ERR_PTR(err);
796 } 800 }
797 } 801 }
798 802
799 #ifdef CONFIG_SYSFS 803 #ifdef CONFIG_SYSFS
800 struct bd_holder_disk { 804 struct bd_holder_disk {
801 struct list_head list; 805 struct list_head list;
802 struct gendisk *disk; 806 struct gendisk *disk;
803 int refcnt; 807 int refcnt;
804 }; 808 };
805 809
806 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, 810 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
807 struct gendisk *disk) 811 struct gendisk *disk)
808 { 812 {
809 struct bd_holder_disk *holder; 813 struct bd_holder_disk *holder;
810 814
811 list_for_each_entry(holder, &bdev->bd_holder_disks, list) 815 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
812 if (holder->disk == disk) 816 if (holder->disk == disk)
813 return holder; 817 return holder;
814 return NULL; 818 return NULL;
815 } 819 }
816 820
817 static int add_symlink(struct kobject *from, struct kobject *to) 821 static int add_symlink(struct kobject *from, struct kobject *to)
818 { 822 {
819 return sysfs_create_link(from, to, kobject_name(to)); 823 return sysfs_create_link(from, to, kobject_name(to));
820 } 824 }
821 825
822 static void del_symlink(struct kobject *from, struct kobject *to) 826 static void del_symlink(struct kobject *from, struct kobject *to)
823 { 827 {
824 sysfs_remove_link(from, kobject_name(to)); 828 sysfs_remove_link(from, kobject_name(to));
825 } 829 }
826 830
827 /** 831 /**
828 * bd_link_disk_holder - create symlinks between holding disk and slave bdev 832 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
829 * @bdev: the claimed slave bdev 833 * @bdev: the claimed slave bdev
830 * @disk: the holding disk 834 * @disk: the holding disk
831 * 835 *
832 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. 836 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
833 * 837 *
834 * This functions creates the following sysfs symlinks. 838 * This functions creates the following sysfs symlinks.
835 * 839 *
836 * - from "slaves" directory of the holder @disk to the claimed @bdev 840 * - from "slaves" directory of the holder @disk to the claimed @bdev
837 * - from "holders" directory of the @bdev to the holder @disk 841 * - from "holders" directory of the @bdev to the holder @disk
838 * 842 *
839 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is 843 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
840 * passed to bd_link_disk_holder(), then: 844 * passed to bd_link_disk_holder(), then:
841 * 845 *
842 * /sys/block/dm-0/slaves/sda --> /sys/block/sda 846 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
843 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 847 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
844 * 848 *
845 * The caller must have claimed @bdev before calling this function and 849 * The caller must have claimed @bdev before calling this function and
846 * ensure that both @bdev and @disk are valid during the creation and 850 * ensure that both @bdev and @disk are valid during the creation and
847 * lifetime of these symlinks. 851 * lifetime of these symlinks.
848 * 852 *
849 * CONTEXT: 853 * CONTEXT:
850 * Might sleep. 854 * Might sleep.
851 * 855 *
852 * RETURNS: 856 * RETURNS:
853 * 0 on success, -errno on failure. 857 * 0 on success, -errno on failure.
854 */ 858 */
855 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) 859 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
856 { 860 {
857 struct bd_holder_disk *holder; 861 struct bd_holder_disk *holder;
858 int ret = 0; 862 int ret = 0;
859 863
860 mutex_lock(&bdev->bd_mutex); 864 mutex_lock(&bdev->bd_mutex);
861 865
862 WARN_ON_ONCE(!bdev->bd_holder); 866 WARN_ON_ONCE(!bdev->bd_holder);
863 867
864 /* FIXME: remove the following once add_disk() handles errors */ 868 /* FIXME: remove the following once add_disk() handles errors */
865 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) 869 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
866 goto out_unlock; 870 goto out_unlock;
867 871
868 holder = bd_find_holder_disk(bdev, disk); 872 holder = bd_find_holder_disk(bdev, disk);
869 if (holder) { 873 if (holder) {
870 holder->refcnt++; 874 holder->refcnt++;
871 goto out_unlock; 875 goto out_unlock;
872 } 876 }
873 877
874 holder = kzalloc(sizeof(*holder), GFP_KERNEL); 878 holder = kzalloc(sizeof(*holder), GFP_KERNEL);
875 if (!holder) { 879 if (!holder) {
876 ret = -ENOMEM; 880 ret = -ENOMEM;
877 goto out_unlock; 881 goto out_unlock;
878 } 882 }
879 883
880 INIT_LIST_HEAD(&holder->list); 884 INIT_LIST_HEAD(&holder->list);
881 holder->disk = disk; 885 holder->disk = disk;
882 holder->refcnt = 1; 886 holder->refcnt = 1;
883 887
884 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 888 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
885 if (ret) 889 if (ret)
886 goto out_free; 890 goto out_free;
887 891
888 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 892 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
889 if (ret) 893 if (ret)
890 goto out_del; 894 goto out_del;
891 /* 895 /*
892 * bdev could be deleted beneath us which would implicitly destroy 896 * bdev could be deleted beneath us which would implicitly destroy
893 * the holder directory. Hold on to it. 897 * the holder directory. Hold on to it.
894 */ 898 */
895 kobject_get(bdev->bd_part->holder_dir); 899 kobject_get(bdev->bd_part->holder_dir);
896 900
897 list_add(&holder->list, &bdev->bd_holder_disks); 901 list_add(&holder->list, &bdev->bd_holder_disks);
898 goto out_unlock; 902 goto out_unlock;
899 903
900 out_del: 904 out_del:
901 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 905 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
902 out_free: 906 out_free:
903 kfree(holder); 907 kfree(holder);
904 out_unlock: 908 out_unlock:
905 mutex_unlock(&bdev->bd_mutex); 909 mutex_unlock(&bdev->bd_mutex);
906 return ret; 910 return ret;
907 } 911 }
908 EXPORT_SYMBOL_GPL(bd_link_disk_holder); 912 EXPORT_SYMBOL_GPL(bd_link_disk_holder);
909 913
910 /** 914 /**
911 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() 915 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
912 * @bdev: the calimed slave bdev 916 * @bdev: the calimed slave bdev
913 * @disk: the holding disk 917 * @disk: the holding disk
914 * 918 *
915 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. 919 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
916 * 920 *
917 * CONTEXT: 921 * CONTEXT:
918 * Might sleep. 922 * Might sleep.
919 */ 923 */
920 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) 924 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
921 { 925 {
922 struct bd_holder_disk *holder; 926 struct bd_holder_disk *holder;
923 927
924 mutex_lock(&bdev->bd_mutex); 928 mutex_lock(&bdev->bd_mutex);
925 929
926 holder = bd_find_holder_disk(bdev, disk); 930 holder = bd_find_holder_disk(bdev, disk);
927 931
928 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { 932 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
929 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 933 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
930 del_symlink(bdev->bd_part->holder_dir, 934 del_symlink(bdev->bd_part->holder_dir,
931 &disk_to_dev(disk)->kobj); 935 &disk_to_dev(disk)->kobj);
932 kobject_put(bdev->bd_part->holder_dir); 936 kobject_put(bdev->bd_part->holder_dir);
933 list_del_init(&holder->list); 937 list_del_init(&holder->list);
934 kfree(holder); 938 kfree(holder);
935 } 939 }
936 940
937 mutex_unlock(&bdev->bd_mutex); 941 mutex_unlock(&bdev->bd_mutex);
938 } 942 }
939 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); 943 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
940 #endif 944 #endif
941 945
942 /** 946 /**
943 * flush_disk - invalidates all buffer-cache entries on a disk 947 * flush_disk - invalidates all buffer-cache entries on a disk
944 * 948 *
945 * @bdev: struct block device to be flushed 949 * @bdev: struct block device to be flushed
946 * @kill_dirty: flag to guide handling of dirty inodes 950 * @kill_dirty: flag to guide handling of dirty inodes
947 * 951 *
948 * Invalidates all buffer-cache entries on a disk. It should be called 952 * Invalidates all buffer-cache entries on a disk. It should be called
949 * when a disk has been changed -- either by a media change or online 953 * when a disk has been changed -- either by a media change or online
950 * resize. 954 * resize.
951 */ 955 */
952 static void flush_disk(struct block_device *bdev, bool kill_dirty) 956 static void flush_disk(struct block_device *bdev, bool kill_dirty)
953 { 957 {
954 if (__invalidate_device(bdev, kill_dirty)) { 958 if (__invalidate_device(bdev, kill_dirty)) {
955 char name[BDEVNAME_SIZE] = ""; 959 char name[BDEVNAME_SIZE] = "";
956 960
957 if (bdev->bd_disk) 961 if (bdev->bd_disk)
958 disk_name(bdev->bd_disk, 0, name); 962 disk_name(bdev->bd_disk, 0, name);
959 printk(KERN_WARNING "VFS: busy inodes on changed media or " 963 printk(KERN_WARNING "VFS: busy inodes on changed media or "
960 "resized disk %s\n", name); 964 "resized disk %s\n", name);
961 } 965 }
962 966
963 if (!bdev->bd_disk) 967 if (!bdev->bd_disk)
964 return; 968 return;
965 if (disk_partitionable(bdev->bd_disk)) 969 if (disk_partitionable(bdev->bd_disk))
966 bdev->bd_invalidated = 1; 970 bdev->bd_invalidated = 1;
967 } 971 }
968 972
969 /** 973 /**
970 * check_disk_size_change - checks for disk size change and adjusts bdev size. 974 * check_disk_size_change - checks for disk size change and adjusts bdev size.
971 * @disk: struct gendisk to check 975 * @disk: struct gendisk to check
972 * @bdev: struct bdev to adjust. 976 * @bdev: struct bdev to adjust.
973 * 977 *
974 * This routine checks to see if the bdev size does not match the disk size 978 * This routine checks to see if the bdev size does not match the disk size
975 * and adjusts it if it differs. 979 * and adjusts it if it differs.
976 */ 980 */
977 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) 981 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
978 { 982 {
979 loff_t disk_size, bdev_size; 983 loff_t disk_size, bdev_size;
980 984
981 disk_size = (loff_t)get_capacity(disk) << 9; 985 disk_size = (loff_t)get_capacity(disk) << 9;
982 bdev_size = i_size_read(bdev->bd_inode); 986 bdev_size = i_size_read(bdev->bd_inode);
983 if (disk_size != bdev_size) { 987 if (disk_size != bdev_size) {
984 char name[BDEVNAME_SIZE]; 988 char name[BDEVNAME_SIZE];
985 989
986 disk_name(disk, 0, name); 990 disk_name(disk, 0, name);
987 printk(KERN_INFO 991 printk(KERN_INFO
988 "%s: detected capacity change from %lld to %lld\n", 992 "%s: detected capacity change from %lld to %lld\n",
989 name, bdev_size, disk_size); 993 name, bdev_size, disk_size);
990 i_size_write(bdev->bd_inode, disk_size); 994 i_size_write(bdev->bd_inode, disk_size);
991 flush_disk(bdev, false); 995 flush_disk(bdev, false);
992 } 996 }
993 } 997 }
994 EXPORT_SYMBOL(check_disk_size_change); 998 EXPORT_SYMBOL(check_disk_size_change);
995 999
996 /** 1000 /**
997 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back 1001 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
998 * @disk: struct gendisk to be revalidated 1002 * @disk: struct gendisk to be revalidated
999 * 1003 *
1000 * This routine is a wrapper for lower-level driver's revalidate_disk 1004 * This routine is a wrapper for lower-level driver's revalidate_disk
1001 * call-backs. It is used to do common pre and post operations needed 1005 * call-backs. It is used to do common pre and post operations needed
1002 * for all revalidate_disk operations. 1006 * for all revalidate_disk operations.
1003 */ 1007 */
1004 int revalidate_disk(struct gendisk *disk) 1008 int revalidate_disk(struct gendisk *disk)
1005 { 1009 {
1006 struct block_device *bdev; 1010 struct block_device *bdev;
1007 int ret = 0; 1011 int ret = 0;
1008 1012
1009 if (disk->fops->revalidate_disk) 1013 if (disk->fops->revalidate_disk)
1010 ret = disk->fops->revalidate_disk(disk); 1014 ret = disk->fops->revalidate_disk(disk);
1011 1015
1012 bdev = bdget_disk(disk, 0); 1016 bdev = bdget_disk(disk, 0);
1013 if (!bdev) 1017 if (!bdev)
1014 return ret; 1018 return ret;
1015 1019
1016 mutex_lock(&bdev->bd_mutex); 1020 mutex_lock(&bdev->bd_mutex);
1017 check_disk_size_change(disk, bdev); 1021 check_disk_size_change(disk, bdev);
1018 mutex_unlock(&bdev->bd_mutex); 1022 mutex_unlock(&bdev->bd_mutex);
1019 bdput(bdev); 1023 bdput(bdev);
1020 return ret; 1024 return ret;
1021 } 1025 }
1022 EXPORT_SYMBOL(revalidate_disk); 1026 EXPORT_SYMBOL(revalidate_disk);
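Editorial note, not part of the diff: a minimal, hedged sketch of how a lower-level driver might pick up a capacity change through this wrapper. The function name and new_sectors parameter are hypothetical; only set_capacity() and revalidate_disk() are the real interfaces referenced above, and revalidate_disk() in turn runs the driver's ->revalidate_disk() callback (if any) and check_disk_size_change() to resync bdev->bd_inode->i_size.

#include <linux/fs.h>
#include <linux/genhd.h>

/* Hypothetical virtual-disk driver path run after its backing store grew. */
static void example_disk_resized(struct gendisk *disk, sector_t new_sectors)
{
	set_capacity(disk, new_sectors);	/* update the gendisk capacity */
	revalidate_disk(disk);			/* propagate it to the bdev inode */
}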
1023 1027
1024 /* 1028 /*
1025 * This routine checks whether a removable media has been changed, 1029 * This routine checks whether a removable media has been changed,
1026 * and invalidates all buffer-cache-entries in that case. This 1030 * and invalidates all buffer-cache-entries in that case. This
1027 * is a relatively slow routine, so we have to try to minimize using 1031 * is a relatively slow routine, so we have to try to minimize using
1028 * it. Thus it is called only upon a 'mount' or 'open'. This 1032 * it. Thus it is called only upon a 'mount' or 'open'. This
1029 * is the best way of combining speed and utility, I think. 1033 * is the best way of combining speed and utility, I think.
1030 * People changing diskettes in the middle of an operation deserve 1034 * People changing diskettes in the middle of an operation deserve
1031 * to lose :-) 1035 * to lose :-)
1032 */ 1036 */
1033 int check_disk_change(struct block_device *bdev) 1037 int check_disk_change(struct block_device *bdev)
1034 { 1038 {
1035 struct gendisk *disk = bdev->bd_disk; 1039 struct gendisk *disk = bdev->bd_disk;
1036 const struct block_device_operations *bdops = disk->fops; 1040 const struct block_device_operations *bdops = disk->fops;
1037 unsigned int events; 1041 unsigned int events;
1038 1042
1039 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | 1043 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1040 DISK_EVENT_EJECT_REQUEST); 1044 DISK_EVENT_EJECT_REQUEST);
1041 if (!(events & DISK_EVENT_MEDIA_CHANGE)) 1045 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1042 return 0; 1046 return 0;
1043 1047
1044 flush_disk(bdev, true); 1048 flush_disk(bdev, true);
1045 if (bdops->revalidate_disk) 1049 if (bdops->revalidate_disk)
1046 bdops->revalidate_disk(bdev->bd_disk); 1050 bdops->revalidate_disk(bdev->bd_disk);
1047 return 1; 1051 return 1;
1048 } 1052 }
1049 1053
1050 EXPORT_SYMBOL(check_disk_change); 1054 EXPORT_SYMBOL(check_disk_change);
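Editorial note: the usual call site for check_disk_change() is a removable-media driver's ->open(), so a media swap invalidates stale buffers before any new I/O. A hedged sketch; the driver method below is hypothetical and only the check_disk_change() call reflects the interface shown above.

#include <linux/fs.h>
#include <linux/genhd.h>

/* Hypothetical block_device_operations ->open() implementation. */
static int example_open(struct block_device *bdev, fmode_t mode)
{
	/* Flushes the buffer cache and revalidates if the media changed. */
	check_disk_change(bdev);
	return 0;
}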
1051 1055
1052 void bd_set_size(struct block_device *bdev, loff_t size) 1056 void bd_set_size(struct block_device *bdev, loff_t size)
1053 { 1057 {
1054 unsigned bsize = bdev_logical_block_size(bdev); 1058 unsigned bsize = bdev_logical_block_size(bdev);
1055 1059
1056 bdev->bd_inode->i_size = size; 1060 bdev->bd_inode->i_size = size;
1057 while (bsize < PAGE_CACHE_SIZE) { 1061 while (bsize < PAGE_CACHE_SIZE) {
1058 if (size & bsize) 1062 if (size & bsize)
1059 break; 1063 break;
1060 bsize <<= 1; 1064 bsize <<= 1;
1061 } 1065 }
1062 bdev->bd_block_size = bsize; 1066 bdev->bd_block_size = bsize;
1063 bdev->bd_inode->i_blkbits = blksize_bits(bsize); 1067 bdev->bd_inode->i_blkbits = blksize_bits(bsize);
1064 } 1068 }
1065 EXPORT_SYMBOL(bd_set_size); 1069 EXPORT_SYMBOL(bd_set_size);
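Editorial note: the loop in bd_set_size() picks the largest power-of-two soft block size, capped at the page size, that evenly divides the device size. A standalone worked example of the same arithmetic (user-space C; PAGE_CACHE_SIZE is assumed to be 4096 here):

#include <stdio.h>

/* Mirrors the rounding loop in bd_set_size(): start from the logical
 * block size and double it while it still divides the device size,
 * capping at the assumed 4096-byte page size. */
static unsigned pick_block_size(unsigned long long size, unsigned bsize)
{
	while (bsize < 4096) {
		if (size & bsize)	/* bsize no longer divides size */
			break;
		bsize <<= 1;
	}
	return bsize;
}

int main(void)
{
	/* 1 GiB device: divisible by 4096, so the soft block size is 4096. */
	printf("%u\n", pick_block_size(1024ULL * 1024 * 1024, 512));
	/* 1000000000-byte device: 512 divides it but 1024 does not -> 512. */
	printf("%u\n", pick_block_size(1000000000ULL, 512));
	return 0;
}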
1066 1070
1067 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); 1071 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1068 1072
1069 /* 1073 /*
1070 * bd_mutex locking: 1074 * bd_mutex locking:
1071 * 1075 *
1072 * mutex_lock(part->bd_mutex) 1076 * mutex_lock(part->bd_mutex)
1073 * mutex_lock_nested(whole->bd_mutex, 1) 1077 * mutex_lock_nested(whole->bd_mutex, 1)
1074 */ 1078 */
1075 1079
1076 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) 1080 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1077 { 1081 {
1078 struct gendisk *disk; 1082 struct gendisk *disk;
1079 int ret; 1083 int ret;
1080 int partno; 1084 int partno;
1081 int perm = 0; 1085 int perm = 0;
1082 1086
1083 if (mode & FMODE_READ) 1087 if (mode & FMODE_READ)
1084 perm |= MAY_READ; 1088 perm |= MAY_READ;
1085 if (mode & FMODE_WRITE) 1089 if (mode & FMODE_WRITE)
1086 perm |= MAY_WRITE; 1090 perm |= MAY_WRITE;
1087 /* 1091 /*
1088 * hooks: /n/, see "layering violations". 1092 * hooks: /n/, see "layering violations".
1089 */ 1093 */
1090 if (!for_part) { 1094 if (!for_part) {
1091 ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1095 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1092 if (ret != 0) { 1096 if (ret != 0) {
1093 bdput(bdev); 1097 bdput(bdev);
1094 return ret; 1098 return ret;
1095 } 1099 }
1096 } 1100 }
1097 1101
1098 restart: 1102 restart:
1099 1103
1100 ret = -ENXIO; 1104 ret = -ENXIO;
1101 disk = get_gendisk(bdev->bd_dev, &partno); 1105 disk = get_gendisk(bdev->bd_dev, &partno);
1102 if (!disk) 1106 if (!disk)
1103 goto out; 1107 goto out;
1104 1108
1105 disk_block_events(disk); 1109 disk_block_events(disk);
1106 mutex_lock_nested(&bdev->bd_mutex, for_part); 1110 mutex_lock_nested(&bdev->bd_mutex, for_part);
1107 if (!bdev->bd_openers) { 1111 if (!bdev->bd_openers) {
1108 bdev->bd_disk = disk; 1112 bdev->bd_disk = disk;
1109 bdev->bd_contains = bdev; 1113 bdev->bd_contains = bdev;
1110 if (!partno) { 1114 if (!partno) {
1111 struct backing_dev_info *bdi; 1115 struct backing_dev_info *bdi;
1112 1116
1113 ret = -ENXIO; 1117 ret = -ENXIO;
1114 bdev->bd_part = disk_get_part(disk, partno); 1118 bdev->bd_part = disk_get_part(disk, partno);
1115 if (!bdev->bd_part) 1119 if (!bdev->bd_part)
1116 goto out_clear; 1120 goto out_clear;
1117 1121
1118 ret = 0; 1122 ret = 0;
1119 if (disk->fops->open) { 1123 if (disk->fops->open) {
1120 ret = disk->fops->open(bdev, mode); 1124 ret = disk->fops->open(bdev, mode);
1121 if (ret == -ERESTARTSYS) { 1125 if (ret == -ERESTARTSYS) {
1122 /* Lost a race with 'disk' being 1126 /* Lost a race with 'disk' being
1123 * deleted, try again. 1127 * deleted, try again.
1124 * See md.c 1128 * See md.c
1125 */ 1129 */
1126 disk_put_part(bdev->bd_part); 1130 disk_put_part(bdev->bd_part);
1127 bdev->bd_part = NULL; 1131 bdev->bd_part = NULL;
1128 bdev->bd_disk = NULL; 1132 bdev->bd_disk = NULL;
1129 mutex_unlock(&bdev->bd_mutex); 1133 mutex_unlock(&bdev->bd_mutex);
1130 disk_unblock_events(disk); 1134 disk_unblock_events(disk);
1131 module_put(disk->fops->owner); 1135 module_put(disk->fops->owner);
1132 put_disk(disk); 1136 put_disk(disk);
1133 goto restart; 1137 goto restart;
1134 } 1138 }
1135 } 1139 }
1136 1140
1137 if (!ret && !bdev->bd_openers) { 1141 if (!ret && !bdev->bd_openers) {
1138 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1142 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1139 bdi = blk_get_backing_dev_info(bdev); 1143 bdi = blk_get_backing_dev_info(bdev);
1140 if (bdi == NULL) 1144 if (bdi == NULL)
1141 bdi = &default_backing_dev_info; 1145 bdi = &default_backing_dev_info;
1142 bdev_inode_switch_bdi(bdev->bd_inode, bdi); 1146 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1143 } 1147 }
1144 1148
1145 /* 1149 /*
1146 * If the device is invalidated, rescan partition 1150 * If the device is invalidated, rescan partition
1147 * if open succeeded or failed with -ENOMEDIUM. 1151 * if open succeeded or failed with -ENOMEDIUM.
1148 * The latter is necessary to prevent ghost 1152 * The latter is necessary to prevent ghost
1149 * partitions on a removed medium. 1153 * partitions on a removed medium.
1150 */ 1154 */
1151 if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) 1155 if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
1152 rescan_partitions(disk, bdev); 1156 rescan_partitions(disk, bdev);
1153 if (ret) 1157 if (ret)
1154 goto out_clear; 1158 goto out_clear;
1155 } else { 1159 } else {
1156 struct block_device *whole; 1160 struct block_device *whole;
1157 whole = bdget_disk(disk, 0); 1161 whole = bdget_disk(disk, 0);
1158 ret = -ENOMEM; 1162 ret = -ENOMEM;
1159 if (!whole) 1163 if (!whole)
1160 goto out_clear; 1164 goto out_clear;
1161 BUG_ON(for_part); 1165 BUG_ON(for_part);
1162 ret = __blkdev_get(whole, mode, 1); 1166 ret = __blkdev_get(whole, mode, 1);
1163 if (ret) 1167 if (ret)
1164 goto out_clear; 1168 goto out_clear;
1165 bdev->bd_contains = whole; 1169 bdev->bd_contains = whole;
1166 bdev_inode_switch_bdi(bdev->bd_inode, 1170 bdev_inode_switch_bdi(bdev->bd_inode,
1167 whole->bd_inode->i_data.backing_dev_info); 1171 whole->bd_inode->i_data.backing_dev_info);
1168 bdev->bd_part = disk_get_part(disk, partno); 1172 bdev->bd_part = disk_get_part(disk, partno);
1169 if (!(disk->flags & GENHD_FL_UP) || 1173 if (!(disk->flags & GENHD_FL_UP) ||
1170 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1174 !bdev->bd_part || !bdev->bd_part->nr_sects) {
1171 ret = -ENXIO; 1175 ret = -ENXIO;
1172 goto out_clear; 1176 goto out_clear;
1173 } 1177 }
1174 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1178 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1175 } 1179 }
1176 } else { 1180 } else {
1177 if (bdev->bd_contains == bdev) { 1181 if (bdev->bd_contains == bdev) {
1178 ret = 0; 1182 ret = 0;
1179 if (bdev->bd_disk->fops->open) 1183 if (bdev->bd_disk->fops->open)
1180 ret = bdev->bd_disk->fops->open(bdev, mode); 1184 ret = bdev->bd_disk->fops->open(bdev, mode);
1181 /* the same as first opener case, read comment there */ 1185 /* the same as first opener case, read comment there */
1182 if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) 1186 if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
1183 rescan_partitions(bdev->bd_disk, bdev); 1187 rescan_partitions(bdev->bd_disk, bdev);
1184 if (ret) 1188 if (ret)
1185 goto out_unlock_bdev; 1189 goto out_unlock_bdev;
1186 } 1190 }
1187 /* only one opener holds refs to the module and disk */ 1191 /* only one opener holds refs to the module and disk */
1188 module_put(disk->fops->owner); 1192 module_put(disk->fops->owner);
1189 put_disk(disk); 1193 put_disk(disk);
1190 } 1194 }
1191 bdev->bd_openers++; 1195 bdev->bd_openers++;
1192 if (for_part) 1196 if (for_part)
1193 bdev->bd_part_count++; 1197 bdev->bd_part_count++;
1194 mutex_unlock(&bdev->bd_mutex); 1198 mutex_unlock(&bdev->bd_mutex);
1195 disk_unblock_events(disk); 1199 disk_unblock_events(disk);
1196 return 0; 1200 return 0;
1197 1201
1198 out_clear: 1202 out_clear:
1199 disk_put_part(bdev->bd_part); 1203 disk_put_part(bdev->bd_part);
1200 bdev->bd_disk = NULL; 1204 bdev->bd_disk = NULL;
1201 bdev->bd_part = NULL; 1205 bdev->bd_part = NULL;
1202 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); 1206 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1203 if (bdev != bdev->bd_contains) 1207 if (bdev != bdev->bd_contains)
1204 __blkdev_put(bdev->bd_contains, mode, 1); 1208 __blkdev_put(bdev->bd_contains, mode, 1);
1205 bdev->bd_contains = NULL; 1209 bdev->bd_contains = NULL;
1206 out_unlock_bdev: 1210 out_unlock_bdev:
1207 mutex_unlock(&bdev->bd_mutex); 1211 mutex_unlock(&bdev->bd_mutex);
1208 disk_unblock_events(disk); 1212 disk_unblock_events(disk);
1209 module_put(disk->fops->owner); 1213 module_put(disk->fops->owner);
1210 put_disk(disk); 1214 put_disk(disk);
1211 out: 1215 out:
1212 bdput(bdev); 1216 bdput(bdev);
1213 1217
1214 return ret; 1218 return ret;
1215 } 1219 }
1216 1220
1217 /** 1221 /**
1218 * blkdev_get - open a block device 1222 * blkdev_get - open a block device
1219 * @bdev: block_device to open 1223 * @bdev: block_device to open
1220 * @mode: FMODE_* mask 1224 * @mode: FMODE_* mask
1221 * @holder: exclusive holder identifier 1225 * @holder: exclusive holder identifier
1222 * 1226 *
1223 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is 1227 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1224 * open with exclusive access. Specifying %FMODE_EXCL with %NULL 1228 * open with exclusive access. Specifying %FMODE_EXCL with %NULL
1225 * @holder is invalid. Exclusive opens may nest for the same @holder. 1229 * @holder is invalid. Exclusive opens may nest for the same @holder.
1226 * 1230 *
1227 * On success, the reference count of @bdev is unchanged. On failure, 1231 * On success, the reference count of @bdev is unchanged. On failure,
1228 * @bdev is put. 1232 * @bdev is put.
1229 * 1233 *
1230 * CONTEXT: 1234 * CONTEXT:
1231 * Might sleep. 1235 * Might sleep.
1232 * 1236 *
1233 * RETURNS: 1237 * RETURNS:
1234 * 0 on success, -errno on failure. 1238 * 0 on success, -errno on failure.
1235 */ 1239 */
1236 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) 1240 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1237 { 1241 {
1238 struct block_device *whole = NULL; 1242 struct block_device *whole = NULL;
1239 int res; 1243 int res;
1240 1244
1241 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); 1245 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1242 1246
1243 if ((mode & FMODE_EXCL) && holder) { 1247 if ((mode & FMODE_EXCL) && holder) {
1244 whole = bd_start_claiming(bdev, holder); 1248 whole = bd_start_claiming(bdev, holder);
1245 if (IS_ERR(whole)) { 1249 if (IS_ERR(whole)) {
1246 bdput(bdev); 1250 bdput(bdev);
1247 return PTR_ERR(whole); 1251 return PTR_ERR(whole);
1248 } 1252 }
1249 } 1253 }
1250 1254
1251 res = __blkdev_get(bdev, mode, 0); 1255 res = __blkdev_get(bdev, mode, 0);
1252 1256
1253 if (whole) { 1257 if (whole) {
1254 struct gendisk *disk = whole->bd_disk; 1258 struct gendisk *disk = whole->bd_disk;
1255 1259
1256 /* finish claiming */ 1260 /* finish claiming */
1257 mutex_lock(&bdev->bd_mutex); 1261 mutex_lock(&bdev->bd_mutex);
1258 spin_lock(&bdev_lock); 1262 spin_lock(&bdev_lock);
1259 1263
1260 if (!res) { 1264 if (!res) {
1261 BUG_ON(!bd_may_claim(bdev, whole, holder)); 1265 BUG_ON(!bd_may_claim(bdev, whole, holder));
1262 /* 1266 /*
1263 * Note that for a whole device bd_holders 1267 * Note that for a whole device bd_holders
1264 * will be incremented twice, and bd_holder 1268 * will be incremented twice, and bd_holder
1265 * will be set to bd_may_claim before being 1269 * will be set to bd_may_claim before being
1266 * set to holder 1270 * set to holder
1267 */ 1271 */
1268 whole->bd_holders++; 1272 whole->bd_holders++;
1269 whole->bd_holder = bd_may_claim; 1273 whole->bd_holder = bd_may_claim;
1270 bdev->bd_holders++; 1274 bdev->bd_holders++;
1271 bdev->bd_holder = holder; 1275 bdev->bd_holder = holder;
1272 } 1276 }
1273 1277
1274 /* tell others that we're done */ 1278 /* tell others that we're done */
1275 BUG_ON(whole->bd_claiming != holder); 1279 BUG_ON(whole->bd_claiming != holder);
1276 whole->bd_claiming = NULL; 1280 whole->bd_claiming = NULL;
1277 wake_up_bit(&whole->bd_claiming, 0); 1281 wake_up_bit(&whole->bd_claiming, 0);
1278 1282
1279 spin_unlock(&bdev_lock); 1283 spin_unlock(&bdev_lock);
1280 1284
1281 /* 1285 /*
1282 * Block event polling for write claims if requested. Any 1286 * Block event polling for write claims if requested. Any
1283 * write holder makes the write_holder state stick until 1287 * write holder makes the write_holder state stick until
1284 * all are released. This is good enough and tracking 1288 * all are released. This is good enough and tracking
1285 * individual writeable references is too fragile given the 1289 * individual writeable references is too fragile given the
1286 * way @mode is used in blkdev_get/put(). 1290 * way @mode is used in blkdev_get/put().
1287 */ 1291 */
1288 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && 1292 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
1289 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { 1293 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
1290 bdev->bd_write_holder = true; 1294 bdev->bd_write_holder = true;
1291 disk_block_events(disk); 1295 disk_block_events(disk);
1292 } 1296 }
1293 1297
1294 mutex_unlock(&bdev->bd_mutex); 1298 mutex_unlock(&bdev->bd_mutex);
1295 bdput(whole); 1299 bdput(whole);
1296 } 1300 }
1297 1301
1298 return res; 1302 return res;
1299 } 1303 }
1300 EXPORT_SYMBOL(blkdev_get); 1304 EXPORT_SYMBOL(blkdev_get);
1301 1305
1302 /** 1306 /**
1303 * blkdev_get_by_path - open a block device by name 1307 * blkdev_get_by_path - open a block device by name
1304 * @path: path to the block device to open 1308 * @path: path to the block device to open
1305 * @mode: FMODE_* mask 1309 * @mode: FMODE_* mask
1306 * @holder: exclusive holder identifier 1310 * @holder: exclusive holder identifier
1307 * 1311 *
1308 * Open the blockdevice described by the device file at @path. @mode 1312 * Open the blockdevice described by the device file at @path. @mode
1309 * and @holder are identical to blkdev_get(). 1313 * and @holder are identical to blkdev_get().
1310 * 1314 *
1311 * On success, the returned block_device has reference count of one. 1315 * On success, the returned block_device has reference count of one.
1312 * 1316 *
1313 * CONTEXT: 1317 * CONTEXT:
1314 * Might sleep. 1318 * Might sleep.
1315 * 1319 *
1316 * RETURNS: 1320 * RETURNS:
1317 * Pointer to block_device on success, ERR_PTR(-errno) on failure. 1321 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1318 */ 1322 */
1319 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, 1323 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1320 void *holder) 1324 void *holder)
1321 { 1325 {
1322 struct block_device *bdev; 1326 struct block_device *bdev;
1323 int err; 1327 int err;
1324 1328
1325 bdev = lookup_bdev(path); 1329 bdev = lookup_bdev(path);
1326 if (IS_ERR(bdev)) 1330 if (IS_ERR(bdev))
1327 return bdev; 1331 return bdev;
1328 1332
1329 err = blkdev_get(bdev, mode, holder); 1333 err = blkdev_get(bdev, mode, holder);
1330 if (err) 1334 if (err)
1331 return ERR_PTR(err); 1335 return ERR_PTR(err);
1332 1336
1333 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { 1337 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1334 blkdev_put(bdev, mode); 1338 blkdev_put(bdev, mode);
1335 return ERR_PTR(-EACCES); 1339 return ERR_PTR(-EACCES);
1336 } 1340 }
1337 1341
1338 return bdev; 1342 return bdev;
1339 } 1343 }
1340 EXPORT_SYMBOL(blkdev_get_by_path); 1344 EXPORT_SYMBOL(blkdev_get_by_path);
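Editorial note: a hedged usage sketch in the style of a mount path, claiming the device exclusively with an opaque holder cookie and releasing it with blkdev_put() using the same mode (FMODE_EXCL must be carried through so the claim is dropped). The helper name is hypothetical; blkdev_get_by_path() and blkdev_put() are the documented interfaces.

#include <linux/fs.h>
#include <linux/err.h>

/* Hypothetical: claim a block device by path for exclusive read/write use. */
static int example_probe(const char *path, void *holder)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);	/* ERR_PTR(-errno) on failure */

	/* ... exclusive access to the device here ... */

	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	return 0;
}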
1341 1345
1342 /** 1346 /**
1343 * blkdev_get_by_dev - open a block device by device number 1347 * blkdev_get_by_dev - open a block device by device number
1344 * @dev: device number of block device to open 1348 * @dev: device number of block device to open
1345 * @mode: FMODE_* mask 1349 * @mode: FMODE_* mask
1346 * @holder: exclusive holder identifier 1350 * @holder: exclusive holder identifier
1347 * 1351 *
1348 * Open the blockdevice described by device number @dev. @mode and 1352 * Open the blockdevice described by device number @dev. @mode and
1349 * @holder are identical to blkdev_get(). 1353 * @holder are identical to blkdev_get().
1350 * 1354 *
1351 * Use it ONLY if you really do not have anything better - i.e. when 1355 * Use it ONLY if you really do not have anything better - i.e. when
1352 * you are behind a truly sucky interface and all you are given is a 1356 * you are behind a truly sucky interface and all you are given is a
1353 * device number. _Never_ to be used for internal purposes. If you 1357 * device number. _Never_ to be used for internal purposes. If you
1354 * ever need it - reconsider your API. 1358 * ever need it - reconsider your API.
1355 * 1359 *
1356 * On success, the returned block_device has reference count of one. 1360 * On success, the returned block_device has reference count of one.
1357 * 1361 *
1358 * CONTEXT: 1362 * CONTEXT:
1359 * Might sleep. 1363 * Might sleep.
1360 * 1364 *
1361 * RETURNS: 1365 * RETURNS:
1362 * Pointer to block_device on success, ERR_PTR(-errno) on failure. 1366 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1363 */ 1367 */
1364 struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) 1368 struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1365 { 1369 {
1366 struct block_device *bdev; 1370 struct block_device *bdev;
1367 int err; 1371 int err;
1368 1372
1369 bdev = bdget(dev); 1373 bdev = bdget(dev);
1370 if (!bdev) 1374 if (!bdev)
1371 return ERR_PTR(-ENOMEM); 1375 return ERR_PTR(-ENOMEM);
1372 1376
1373 err = blkdev_get(bdev, mode, holder); 1377 err = blkdev_get(bdev, mode, holder);
1374 if (err) 1378 if (err)
1375 return ERR_PTR(err); 1379 return ERR_PTR(err);
1376 1380
1377 return bdev; 1381 return bdev;
1378 } 1382 }
1379 EXPORT_SYMBOL(blkdev_get_by_dev); 1383 EXPORT_SYMBOL(blkdev_get_by_dev);
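Editorial note: for completeness, a hedged sketch of the by-number variant. The major/minor pair below is only a placeholder example (8:0 is conventionally sda); per the kernel-doc above this interface should be a last resort.

#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/err.h>

/* Hypothetical: open whatever device currently sits at dev_t 8:0,
 * read-only and non-exclusive (NULL holder). */
static struct block_device *example_open_by_dev(void)
{
	return blkdev_get_by_dev(MKDEV(8, 0), FMODE_READ, NULL);
}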
1380 1384
1381 static int blkdev_open(struct inode * inode, struct file * filp) 1385 static int blkdev_open(struct inode * inode, struct file * filp)
1382 { 1386 {
1383 struct block_device *bdev; 1387 struct block_device *bdev;
1384 1388
1385 /* 1389 /*
1386 * Preserve backwards compatibility and allow large file access 1390 * Preserve backwards compatibility and allow large file access
1387 * even if userspace doesn't ask for it explicitly. Some mkfs 1391 * even if userspace doesn't ask for it explicitly. Some mkfs
1388 * binary needs it. We might want to drop this workaround 1392 * binary needs it. We might want to drop this workaround
1389 * during an unstable branch. 1393 * during an unstable branch.
1390 */ 1394 */
1391 filp->f_flags |= O_LARGEFILE; 1395 filp->f_flags |= O_LARGEFILE;
1392 1396
1393 if (filp->f_flags & O_NDELAY) 1397 if (filp->f_flags & O_NDELAY)
1394 filp->f_mode |= FMODE_NDELAY; 1398 filp->f_mode |= FMODE_NDELAY;
1395 if (filp->f_flags & O_EXCL) 1399 if (filp->f_flags & O_EXCL)
1396 filp->f_mode |= FMODE_EXCL; 1400 filp->f_mode |= FMODE_EXCL;
1397 if ((filp->f_flags & O_ACCMODE) == 3) 1401 if ((filp->f_flags & O_ACCMODE) == 3)
1398 filp->f_mode |= FMODE_WRITE_IOCTL; 1402 filp->f_mode |= FMODE_WRITE_IOCTL;
1399 1403
1400 bdev = bd_acquire(inode); 1404 bdev = bd_acquire(inode);
1401 if (bdev == NULL) 1405 if (bdev == NULL)
1402 return -ENOMEM; 1406 return -ENOMEM;
1403 1407
1404 filp->f_mapping = bdev->bd_inode->i_mapping; 1408 filp->f_mapping = bdev->bd_inode->i_mapping;
1405 1409
1406 return blkdev_get(bdev, filp->f_mode, filp); 1410 return blkdev_get(bdev, filp->f_mode, filp);
1407 } 1411 }
1408 1412
1409 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) 1413 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1410 { 1414 {
1411 int ret = 0; 1415 int ret = 0;
1412 struct gendisk *disk = bdev->bd_disk; 1416 struct gendisk *disk = bdev->bd_disk;
1413 struct block_device *victim = NULL; 1417 struct block_device *victim = NULL;
1414 1418
1415 mutex_lock_nested(&bdev->bd_mutex, for_part); 1419 mutex_lock_nested(&bdev->bd_mutex, for_part);
1416 if (for_part) 1420 if (for_part)
1417 bdev->bd_part_count--; 1421 bdev->bd_part_count--;
1418 1422
1419 if (!--bdev->bd_openers) { 1423 if (!--bdev->bd_openers) {
1420 WARN_ON_ONCE(bdev->bd_holders); 1424 WARN_ON_ONCE(bdev->bd_holders);
1421 sync_blockdev(bdev); 1425 sync_blockdev(bdev);
1422 kill_bdev(bdev); 1426 kill_bdev(bdev);
1423 } 1427 }
1424 if (bdev->bd_contains == bdev) { 1428 if (bdev->bd_contains == bdev) {
1425 if (disk->fops->release) 1429 if (disk->fops->release)
1426 ret = disk->fops->release(disk, mode); 1430 ret = disk->fops->release(disk, mode);
1427 } 1431 }
1428 if (!bdev->bd_openers) { 1432 if (!bdev->bd_openers) {
1429 struct module *owner = disk->fops->owner; 1433 struct module *owner = disk->fops->owner;
1430 1434
1431 put_disk(disk); 1435 put_disk(disk);
1432 module_put(owner); 1436 module_put(owner);
1433 disk_put_part(bdev->bd_part); 1437 disk_put_part(bdev->bd_part);
1434 bdev->bd_part = NULL; 1438 bdev->bd_part = NULL;
1435 bdev->bd_disk = NULL; 1439 bdev->bd_disk = NULL;
1436 bdev_inode_switch_bdi(bdev->bd_inode, 1440 bdev_inode_switch_bdi(bdev->bd_inode,
1437 &default_backing_dev_info); 1441 &default_backing_dev_info);
1438 if (bdev != bdev->bd_contains) 1442 if (bdev != bdev->bd_contains)
1439 victim = bdev->bd_contains; 1443 victim = bdev->bd_contains;
1440 bdev->bd_contains = NULL; 1444 bdev->bd_contains = NULL;
1441 } 1445 }
1442 mutex_unlock(&bdev->bd_mutex); 1446 mutex_unlock(&bdev->bd_mutex);
1443 bdput(bdev); 1447 bdput(bdev);
1444 if (victim) 1448 if (victim)
1445 __blkdev_put(victim, mode, 1); 1449 __blkdev_put(victim, mode, 1);
1446 return ret; 1450 return ret;
1447 } 1451 }
1448 1452
1449 int blkdev_put(struct block_device *bdev, fmode_t mode) 1453 int blkdev_put(struct block_device *bdev, fmode_t mode)
1450 { 1454 {
1451 mutex_lock(&bdev->bd_mutex); 1455 mutex_lock(&bdev->bd_mutex);
1452 1456
1453 if (mode & FMODE_EXCL) { 1457 if (mode & FMODE_EXCL) {
1454 bool bdev_free; 1458 bool bdev_free;
1455 1459
1456 /* 1460 /*
1457 * Release a claim on the device. The holder fields 1461 * Release a claim on the device. The holder fields
1458 * are protected with bdev_lock. bd_mutex is to 1462 * are protected with bdev_lock. bd_mutex is to
1459 * synchronize disk_holder unlinking. 1463 * synchronize disk_holder unlinking.
1460 */ 1464 */
1461 spin_lock(&bdev_lock); 1465 spin_lock(&bdev_lock);
1462 1466
1463 WARN_ON_ONCE(--bdev->bd_holders < 0); 1467 WARN_ON_ONCE(--bdev->bd_holders < 0);
1464 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); 1468 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1465 1469
1466 /* bd_contains might point to self, check in a separate step */ 1470 /* bd_contains might point to self, check in a separate step */
1467 if ((bdev_free = !bdev->bd_holders)) 1471 if ((bdev_free = !bdev->bd_holders))
1468 bdev->bd_holder = NULL; 1472 bdev->bd_holder = NULL;
1469 if (!bdev->bd_contains->bd_holders) 1473 if (!bdev->bd_contains->bd_holders)
1470 bdev->bd_contains->bd_holder = NULL; 1474 bdev->bd_contains->bd_holder = NULL;
1471 1475
1472 spin_unlock(&bdev_lock); 1476 spin_unlock(&bdev_lock);
1473 1477
1474 /* 1478 /*
1475 * If this was the last claim, remove holder link and 1479 * If this was the last claim, remove holder link and
1476 * unblock evpoll if it was a write holder. 1480 * unblock evpoll if it was a write holder.
1477 */ 1481 */
1478 if (bdev_free && bdev->bd_write_holder) { 1482 if (bdev_free && bdev->bd_write_holder) {
1479 disk_unblock_events(bdev->bd_disk); 1483 disk_unblock_events(bdev->bd_disk);
1480 bdev->bd_write_holder = false; 1484 bdev->bd_write_holder = false;
1481 } 1485 }
1482 } 1486 }
1483 1487
1484 /* 1488 /*
1485 * Trigger event checking and tell drivers to flush MEDIA_CHANGE 1489 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
1486 * event. This is to ensure detection of media removal commanded 1490 * event. This is to ensure detection of media removal commanded
1487 * from userland - e.g. eject(1). 1491 * from userland - e.g. eject(1).
1488 */ 1492 */
1489 disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); 1493 disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
1490 1494
1491 mutex_unlock(&bdev->bd_mutex); 1495 mutex_unlock(&bdev->bd_mutex);
1492 1496
1493 return __blkdev_put(bdev, mode, 0); 1497 return __blkdev_put(bdev, mode, 0);
1494 } 1498 }
1495 EXPORT_SYMBOL(blkdev_put); 1499 EXPORT_SYMBOL(blkdev_put);
1496 1500
1497 static int blkdev_close(struct inode * inode, struct file * filp) 1501 static int blkdev_close(struct inode * inode, struct file * filp)
1498 { 1502 {
1499 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1503 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1500 1504
1501 return blkdev_put(bdev, filp->f_mode); 1505 return blkdev_put(bdev, filp->f_mode);
1502 } 1506 }
1503 1507
1504 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) 1508 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1505 { 1509 {
1506 struct block_device *bdev = I_BDEV(file->f_mapping->host); 1510 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1507 fmode_t mode = file->f_mode; 1511 fmode_t mode = file->f_mode;
1508 1512
1509 /* 1513 /*
1510 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have 1514 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1511 * to update it before every ioctl. 1515 * to update it before every ioctl.
1512 */ 1516 */
1513 if (file->f_flags & O_NDELAY) 1517 if (file->f_flags & O_NDELAY)
1514 mode |= FMODE_NDELAY; 1518 mode |= FMODE_NDELAY;
1515 else 1519 else
1516 mode &= ~FMODE_NDELAY; 1520 mode &= ~FMODE_NDELAY;
1517 1521
1518 return blkdev_ioctl(bdev, mode, cmd, arg); 1522 return blkdev_ioctl(bdev, mode, cmd, arg);
1519 } 1523 }
1520 1524
1521 /* 1525 /*
1522 * Write data to the block device. Only intended for the block device itself 1526 * Write data to the block device. Only intended for the block device itself
1523 * and the raw driver which basically is a fake block device. 1527 * and the raw driver which basically is a fake block device.
1524 * 1528 *
1525 * Does not take i_mutex for the write and thus is not for general purpose 1529 * Does not take i_mutex for the write and thus is not for general purpose
1526 * use. 1530 * use.
1527 */ 1531 */
1528 ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, 1532 ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1529 unsigned long nr_segs, loff_t pos) 1533 unsigned long nr_segs, loff_t pos)
1530 { 1534 {
1531 struct file *file = iocb->ki_filp; 1535 struct file *file = iocb->ki_filp;
1532 ssize_t ret; 1536 ssize_t ret;
1533 1537
1534 BUG_ON(iocb->ki_pos != pos); 1538 BUG_ON(iocb->ki_pos != pos);
1535 1539
1536 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1540 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1537 if (ret > 0 || ret == -EIOCBQUEUED) { 1541 if (ret > 0 || ret == -EIOCBQUEUED) {
1538 ssize_t err; 1542 ssize_t err;
1539 1543
1540 err = generic_write_sync(file, pos, ret); 1544 err = generic_write_sync(file, pos, ret);
1541 if (err < 0 && ret > 0) 1545 if (err < 0 && ret > 0)
1542 ret = err; 1546 ret = err;
1543 } 1547 }
1544 return ret; 1548 return ret;
1545 } 1549 }
1546 EXPORT_SYMBOL_GPL(blkdev_aio_write); 1550 EXPORT_SYMBOL_GPL(blkdev_aio_write);
1547 1551
1548 /* 1552 /*
1549 * Try to release a page associated with block device when the system 1553 * Try to release a page associated with block device when the system
1550 * is under memory pressure. 1554 * is under memory pressure.
1551 */ 1555 */
1552 static int blkdev_releasepage(struct page *page, gfp_t wait) 1556 static int blkdev_releasepage(struct page *page, gfp_t wait)
1553 { 1557 {
1554 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; 1558 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1555 1559
1556 if (super && super->s_op->bdev_try_to_free_page) 1560 if (super && super->s_op->bdev_try_to_free_page)
1557 return super->s_op->bdev_try_to_free_page(super, page, wait); 1561 return super->s_op->bdev_try_to_free_page(super, page, wait);
1558 1562
1559 return try_to_free_buffers(page); 1563 return try_to_free_buffers(page);
1560 } 1564 }
1561 1565
1562 static const struct address_space_operations def_blk_aops = { 1566 static const struct address_space_operations def_blk_aops = {
1563 .readpage = blkdev_readpage, 1567 .readpage = blkdev_readpage,
1564 .writepage = blkdev_writepage, 1568 .writepage = blkdev_writepage,
1565 .write_begin = blkdev_write_begin, 1569 .write_begin = blkdev_write_begin,
1566 .write_end = blkdev_write_end, 1570 .write_end = blkdev_write_end,
1567 .writepages = generic_writepages, 1571 .writepages = generic_writepages,
1568 .releasepage = blkdev_releasepage, 1572 .releasepage = blkdev_releasepage,
1569 .direct_IO = blkdev_direct_IO, 1573 .direct_IO = blkdev_direct_IO,
1570 }; 1574 };
1571 1575
1572 const struct file_operations def_blk_fops = { 1576 const struct file_operations def_blk_fops = {
1573 .open = blkdev_open, 1577 .open = blkdev_open,
1574 .release = blkdev_close, 1578 .release = blkdev_close,
1575 .llseek = block_llseek, 1579 .llseek = block_llseek,
1576 .read = do_sync_read, 1580 .read = do_sync_read,
1577 .write = do_sync_write, 1581 .write = do_sync_write,
1578 .aio_read = generic_file_aio_read, 1582 .aio_read = generic_file_aio_read,
1579 .aio_write = blkdev_aio_write, 1583 .aio_write = blkdev_aio_write,
1580 .mmap = generic_file_mmap, 1584 .mmap = generic_file_mmap,
1581 .fsync = blkdev_fsync, 1585 .fsync = blkdev_fsync,
1582 .unlocked_ioctl = block_ioctl, 1586 .unlocked_ioctl = block_ioctl,
1583 #ifdef CONFIG_COMPAT 1587 #ifdef CONFIG_COMPAT
1584 .compat_ioctl = compat_blkdev_ioctl, 1588 .compat_ioctl = compat_blkdev_ioctl,
1585 #endif 1589 #endif
1586 .splice_read = generic_file_splice_read, 1590 .splice_read = generic_file_splice_read,
1587 .splice_write = generic_file_splice_write, 1591 .splice_write = generic_file_splice_write,
1588 }; 1592 };
1589 1593
1590 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) 1594 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
1591 { 1595 {
1592 int res; 1596 int res;
1593 mm_segment_t old_fs = get_fs(); 1597 mm_segment_t old_fs = get_fs();
1594 set_fs(KERNEL_DS); 1598 set_fs(KERNEL_DS);
1595 res = blkdev_ioctl(bdev, 0, cmd, arg); 1599 res = blkdev_ioctl(bdev, 0, cmd, arg);
1596 set_fs(old_fs); 1600 set_fs(old_fs);
1597 return res; 1601 return res;
1598 } 1602 }
1599 1603
1600 EXPORT_SYMBOL(ioctl_by_bdev); 1604 EXPORT_SYMBOL(ioctl_by_bdev);
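Editorial note: ioctl_by_bdev() temporarily switches to KERNEL_DS so that the copy_{to,from}_user() calls inside blkdev_ioctl() accept a kernel pointer in @arg. A hedged sketch of that pattern; BLKGETSIZE64 is a real block ioctl, but the helper itself is hypothetical and its return-value handling is omitted for brevity.

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/types.h>

/* Hypothetical: query the device size in bytes via the ioctl path
 * instead of reading bdev->bd_inode->i_size directly. */
static u64 example_size_via_ioctl(struct block_device *bdev)
{
	u64 bytes = 0;

	/* arg points into kernel memory; ioctl_by_bdev() makes that legal */
	ioctl_by_bdev(bdev, BLKGETSIZE64, (unsigned long)&bytes);
	return bytes;
}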
1601 1605
1602 /** 1606 /**
1603 * lookup_bdev - lookup a struct block_device by name 1607 * lookup_bdev - lookup a struct block_device by name
1604 * @pathname: special file representing the block device 1608 * @pathname: special file representing the block device
1605 * 1609 *
1606 * Get a reference to the blockdevice at @pathname in the current 1610 * Get a reference to the blockdevice at @pathname in the current
1607 * namespace if possible and return it. Return ERR_PTR(error) 1611 * namespace if possible and return it. Return ERR_PTR(error)
1608 * otherwise. 1612 * otherwise.
1609 */ 1613 */
1610 struct block_device *lookup_bdev(const char *pathname) 1614 struct block_device *lookup_bdev(const char *pathname)
1611 { 1615 {
1612 struct block_device *bdev; 1616 struct block_device *bdev;
1613 struct inode *inode; 1617 struct inode *inode;
1614 struct path path; 1618 struct path path;
1615 int error; 1619 int error;
1616 1620
1617 if (!pathname || !*pathname) 1621 if (!pathname || !*pathname)
1618 return ERR_PTR(-EINVAL); 1622 return ERR_PTR(-EINVAL);
1619 1623
1620 error = kern_path(pathname, LOOKUP_FOLLOW, &path); 1624 error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1621 if (error) 1625 if (error)
1622 return ERR_PTR(error); 1626 return ERR_PTR(error);
1623 1627
1624 inode = path.dentry->d_inode; 1628 inode = path.dentry->d_inode;
1625 error = -ENOTBLK; 1629 error = -ENOTBLK;
1626 if (!S_ISBLK(inode->i_mode)) 1630 if (!S_ISBLK(inode->i_mode))
1627 goto fail; 1631 goto fail;
1628 error = -EACCES; 1632 error = -EACCES;
1629 if (path.mnt->mnt_flags & MNT_NODEV) 1633 if (path.mnt->mnt_flags & MNT_NODEV)
1630 goto fail; 1634 goto fail;
1631 error = -ENOMEM; 1635 error = -ENOMEM;
1632 bdev = bd_acquire(inode); 1636 bdev = bd_acquire(inode);
1633 if (!bdev) 1637 if (!bdev)
1634 goto fail; 1638 goto fail;
1635 out: 1639 out:
1636 path_put(&path); 1640 path_put(&path);
1637 return bdev; 1641 return bdev;
1638 fail: 1642 fail:
1639 bdev = ERR_PTR(error); 1643 bdev = ERR_PTR(error);
1640 goto out; 1644 goto out;
1641 } 1645 }
1642 EXPORT_SYMBOL(lookup_bdev); 1646 EXPORT_SYMBOL(lookup_bdev);
1643 1647
1644 int __invalidate_device(struct block_device *bdev, bool kill_dirty) 1648 int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1645 { 1649 {
1646 struct super_block *sb = get_super(bdev); 1650 struct super_block *sb = get_super(bdev);
1647 int res = 0; 1651 int res = 0;
1648 1652
1649 if (sb) { 1653 if (sb) {
1650 /* 1654 /*
1651 * no need to lock the super, get_super holds the 1655 * no need to lock the super, get_super holds the
1652 * read mutex so the filesystem cannot go away 1656 * read mutex so the filesystem cannot go away
1653 * under us (->put_super runs with the write lock 1657 * under us (->put_super runs with the write lock
1654 * held). 1658 * held).
1655 */ 1659 */
1656 shrink_dcache_sb(sb); 1660 shrink_dcache_sb(sb);
1657 res = invalidate_inodes(sb, kill_dirty); 1661 res = invalidate_inodes(sb, kill_dirty);
1658 drop_super(sb); 1662 drop_super(sb);
1659 } 1663 }
1660 invalidate_bdev(bdev); 1664 invalidate_bdev(bdev);
1661 return res; 1665 return res;
1662 } 1666 }
1663 EXPORT_SYMBOL(__invalidate_device); 1667 EXPORT_SYMBOL(__invalidate_device);
fs/btrfs/extent_io.c
1 #include <linux/bitops.h> 1 #include <linux/bitops.h>
2 #include <linux/slab.h> 2 #include <linux/slab.h>
3 #include <linux/bio.h> 3 #include <linux/bio.h>
4 #include <linux/mm.h> 4 #include <linux/mm.h>
5 #include <linux/pagemap.h> 5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h> 6 #include <linux/page-flags.h>
7 #include <linux/module.h> 7 #include <linux/module.h>
8 #include <linux/spinlock.h> 8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h> 9 #include <linux/blkdev.h>
10 #include <linux/swap.h> 10 #include <linux/swap.h>
11 #include <linux/writeback.h> 11 #include <linux/writeback.h>
12 #include <linux/pagevec.h> 12 #include <linux/pagevec.h>
13 #include <linux/prefetch.h> 13 #include <linux/prefetch.h>
14 #include <linux/cleancache.h> 14 #include <linux/cleancache.h>
15 #include "extent_io.h" 15 #include "extent_io.h"
16 #include "extent_map.h" 16 #include "extent_map.h"
17 #include "compat.h" 17 #include "compat.h"
18 #include "ctree.h" 18 #include "ctree.h"
19 #include "btrfs_inode.h" 19 #include "btrfs_inode.h"
20 20
21 static struct kmem_cache *extent_state_cache; 21 static struct kmem_cache *extent_state_cache;
22 static struct kmem_cache *extent_buffer_cache; 22 static struct kmem_cache *extent_buffer_cache;
23 23
24 static LIST_HEAD(buffers); 24 static LIST_HEAD(buffers);
25 static LIST_HEAD(states); 25 static LIST_HEAD(states);
26 26
27 #define LEAK_DEBUG 0 27 #define LEAK_DEBUG 0
28 #if LEAK_DEBUG 28 #if LEAK_DEBUG
29 static DEFINE_SPINLOCK(leak_lock); 29 static DEFINE_SPINLOCK(leak_lock);
30 #endif 30 #endif
31 31
32 #define BUFFER_LRU_MAX 64 32 #define BUFFER_LRU_MAX 64
33 33
34 struct tree_entry { 34 struct tree_entry {
35 u64 start; 35 u64 start;
36 u64 end; 36 u64 end;
37 struct rb_node rb_node; 37 struct rb_node rb_node;
38 }; 38 };
39 39
40 struct extent_page_data { 40 struct extent_page_data {
41 struct bio *bio; 41 struct bio *bio;
42 struct extent_io_tree *tree; 42 struct extent_io_tree *tree;
43 get_extent_t *get_extent; 43 get_extent_t *get_extent;
44 44
45 /* tells writepage not to lock the state bits for this range 45 /* tells writepage not to lock the state bits for this range
46 * it still does the unlocking 46 * it still does the unlocking
47 */ 47 */
48 unsigned int extent_locked:1; 48 unsigned int extent_locked:1;
49 49
50 /* tells the submit_bio code to use a WRITE_SYNC */ 50 /* tells the submit_bio code to use a WRITE_SYNC */
51 unsigned int sync_io:1; 51 unsigned int sync_io:1;
52 }; 52 };
53 53
54 int __init extent_io_init(void) 54 int __init extent_io_init(void)
55 { 55 {
56 extent_state_cache = kmem_cache_create("extent_state", 56 extent_state_cache = kmem_cache_create("extent_state",
57 sizeof(struct extent_state), 0, 57 sizeof(struct extent_state), 0,
58 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 58 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
59 if (!extent_state_cache) 59 if (!extent_state_cache)
60 return -ENOMEM; 60 return -ENOMEM;
61 61
62 extent_buffer_cache = kmem_cache_create("extent_buffers", 62 extent_buffer_cache = kmem_cache_create("extent_buffers",
63 sizeof(struct extent_buffer), 0, 63 sizeof(struct extent_buffer), 0,
64 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 64 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
65 if (!extent_buffer_cache) 65 if (!extent_buffer_cache)
66 goto free_state_cache; 66 goto free_state_cache;
67 return 0; 67 return 0;
68 68
69 free_state_cache: 69 free_state_cache:
70 kmem_cache_destroy(extent_state_cache); 70 kmem_cache_destroy(extent_state_cache);
71 return -ENOMEM; 71 return -ENOMEM;
72 } 72 }
73 73
74 void extent_io_exit(void) 74 void extent_io_exit(void)
75 { 75 {
76 struct extent_state *state; 76 struct extent_state *state;
77 struct extent_buffer *eb; 77 struct extent_buffer *eb;
78 78
79 while (!list_empty(&states)) { 79 while (!list_empty(&states)) {
80 state = list_entry(states.next, struct extent_state, leak_list); 80 state = list_entry(states.next, struct extent_state, leak_list);
81 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 81 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
82 "state %lu in tree %p refs %d\n", 82 "state %lu in tree %p refs %d\n",
83 (unsigned long long)state->start, 83 (unsigned long long)state->start,
84 (unsigned long long)state->end, 84 (unsigned long long)state->end,
85 state->state, state->tree, atomic_read(&state->refs)); 85 state->state, state->tree, atomic_read(&state->refs));
86 list_del(&state->leak_list); 86 list_del(&state->leak_list);
87 kmem_cache_free(extent_state_cache, state); 87 kmem_cache_free(extent_state_cache, state);
88 88
89 } 89 }
90 90
91 while (!list_empty(&buffers)) { 91 while (!list_empty(&buffers)) {
92 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 92 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
93 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 93 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
94 "refs %d\n", (unsigned long long)eb->start, 94 "refs %d\n", (unsigned long long)eb->start,
95 eb->len, atomic_read(&eb->refs)); 95 eb->len, atomic_read(&eb->refs));
96 list_del(&eb->leak_list); 96 list_del(&eb->leak_list);
97 kmem_cache_free(extent_buffer_cache, eb); 97 kmem_cache_free(extent_buffer_cache, eb);
98 } 98 }
99 if (extent_state_cache) 99 if (extent_state_cache)
100 kmem_cache_destroy(extent_state_cache); 100 kmem_cache_destroy(extent_state_cache);
101 if (extent_buffer_cache) 101 if (extent_buffer_cache)
102 kmem_cache_destroy(extent_buffer_cache); 102 kmem_cache_destroy(extent_buffer_cache);
103 } 103 }
104 104
105 void extent_io_tree_init(struct extent_io_tree *tree, 105 void extent_io_tree_init(struct extent_io_tree *tree,
106 struct address_space *mapping) 106 struct address_space *mapping)
107 { 107 {
108 tree->state = RB_ROOT; 108 tree->state = RB_ROOT;
109 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); 109 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
110 tree->ops = NULL; 110 tree->ops = NULL;
111 tree->dirty_bytes = 0; 111 tree->dirty_bytes = 0;
112 spin_lock_init(&tree->lock); 112 spin_lock_init(&tree->lock);
113 spin_lock_init(&tree->buffer_lock); 113 spin_lock_init(&tree->buffer_lock);
114 tree->mapping = mapping; 114 tree->mapping = mapping;
115 } 115 }
116 116
117 static struct extent_state *alloc_extent_state(gfp_t mask) 117 static struct extent_state *alloc_extent_state(gfp_t mask)
118 { 118 {
119 struct extent_state *state; 119 struct extent_state *state;
120 #if LEAK_DEBUG 120 #if LEAK_DEBUG
121 unsigned long flags; 121 unsigned long flags;
122 #endif 122 #endif
123 123
124 state = kmem_cache_alloc(extent_state_cache, mask); 124 state = kmem_cache_alloc(extent_state_cache, mask);
125 if (!state) 125 if (!state)
126 return state; 126 return state;
127 state->state = 0; 127 state->state = 0;
128 state->private = 0; 128 state->private = 0;
129 state->tree = NULL; 129 state->tree = NULL;
130 #if LEAK_DEBUG 130 #if LEAK_DEBUG
131 spin_lock_irqsave(&leak_lock, flags); 131 spin_lock_irqsave(&leak_lock, flags);
132 list_add(&state->leak_list, &states); 132 list_add(&state->leak_list, &states);
133 spin_unlock_irqrestore(&leak_lock, flags); 133 spin_unlock_irqrestore(&leak_lock, flags);
134 #endif 134 #endif
135 atomic_set(&state->refs, 1); 135 atomic_set(&state->refs, 1);
136 init_waitqueue_head(&state->wq); 136 init_waitqueue_head(&state->wq);
137 return state; 137 return state;
138 } 138 }
139 139
140 void free_extent_state(struct extent_state *state) 140 void free_extent_state(struct extent_state *state)
141 { 141 {
142 if (!state) 142 if (!state)
143 return; 143 return;
144 if (atomic_dec_and_test(&state->refs)) { 144 if (atomic_dec_and_test(&state->refs)) {
145 #if LEAK_DEBUG 145 #if LEAK_DEBUG
146 unsigned long flags; 146 unsigned long flags;
147 #endif 147 #endif
148 WARN_ON(state->tree); 148 WARN_ON(state->tree);
149 #if LEAK_DEBUG 149 #if LEAK_DEBUG
150 spin_lock_irqsave(&leak_lock, flags); 150 spin_lock_irqsave(&leak_lock, flags);
151 list_del(&state->leak_list); 151 list_del(&state->leak_list);
152 spin_unlock_irqrestore(&leak_lock, flags); 152 spin_unlock_irqrestore(&leak_lock, flags);
153 #endif 153 #endif
154 kmem_cache_free(extent_state_cache, state); 154 kmem_cache_free(extent_state_cache, state);
155 } 155 }
156 } 156 }
157 157
158 static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 158 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
159 struct rb_node *node) 159 struct rb_node *node)
160 { 160 {
161 struct rb_node **p = &root->rb_node; 161 struct rb_node **p = &root->rb_node;
162 struct rb_node *parent = NULL; 162 struct rb_node *parent = NULL;
163 struct tree_entry *entry; 163 struct tree_entry *entry;
164 164
165 while (*p) { 165 while (*p) {
166 parent = *p; 166 parent = *p;
167 entry = rb_entry(parent, struct tree_entry, rb_node); 167 entry = rb_entry(parent, struct tree_entry, rb_node);
168 168
169 if (offset < entry->start) 169 if (offset < entry->start)
170 p = &(*p)->rb_left; 170 p = &(*p)->rb_left;
171 else if (offset > entry->end) 171 else if (offset > entry->end)
172 p = &(*p)->rb_right; 172 p = &(*p)->rb_right;
173 else 173 else
174 return parent; 174 return parent;
175 } 175 }
176 176
177 entry = rb_entry(node, struct tree_entry, rb_node); 177 entry = rb_entry(node, struct tree_entry, rb_node);
178 rb_link_node(node, parent, p); 178 rb_link_node(node, parent, p);
179 rb_insert_color(node, root); 179 rb_insert_color(node, root);
180 return NULL; 180 return NULL;
181 } 181 }
182 182
183 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 183 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
184 struct rb_node **prev_ret, 184 struct rb_node **prev_ret,
185 struct rb_node **next_ret) 185 struct rb_node **next_ret)
186 { 186 {
187 struct rb_root *root = &tree->state; 187 struct rb_root *root = &tree->state;
188 struct rb_node *n = root->rb_node; 188 struct rb_node *n = root->rb_node;
189 struct rb_node *prev = NULL; 189 struct rb_node *prev = NULL;
190 struct rb_node *orig_prev = NULL; 190 struct rb_node *orig_prev = NULL;
191 struct tree_entry *entry; 191 struct tree_entry *entry;
192 struct tree_entry *prev_entry = NULL; 192 struct tree_entry *prev_entry = NULL;
193 193
194 while (n) { 194 while (n) {
195 entry = rb_entry(n, struct tree_entry, rb_node); 195 entry = rb_entry(n, struct tree_entry, rb_node);
196 prev = n; 196 prev = n;
197 prev_entry = entry; 197 prev_entry = entry;
198 198
199 if (offset < entry->start) 199 if (offset < entry->start)
200 n = n->rb_left; 200 n = n->rb_left;
201 else if (offset > entry->end) 201 else if (offset > entry->end)
202 n = n->rb_right; 202 n = n->rb_right;
203 else 203 else
204 return n; 204 return n;
205 } 205 }
206 206
207 if (prev_ret) { 207 if (prev_ret) {
208 orig_prev = prev; 208 orig_prev = prev;
209 while (prev && offset > prev_entry->end) { 209 while (prev && offset > prev_entry->end) {
210 prev = rb_next(prev); 210 prev = rb_next(prev);
211 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 211 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
212 } 212 }
213 *prev_ret = prev; 213 *prev_ret = prev;
214 prev = orig_prev; 214 prev = orig_prev;
215 } 215 }
216 216
217 if (next_ret) { 217 if (next_ret) {
218 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 218 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
219 while (prev && offset < prev_entry->start) { 219 while (prev && offset < prev_entry->start) {
220 prev = rb_prev(prev); 220 prev = rb_prev(prev);
221 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 221 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
222 } 222 }
223 *next_ret = prev; 223 *next_ret = prev;
224 } 224 }
225 return NULL; 225 return NULL;
226 } 226 }
227 227
228 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 228 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
229 u64 offset) 229 u64 offset)
230 { 230 {
231 struct rb_node *prev = NULL; 231 struct rb_node *prev = NULL;
232 struct rb_node *ret; 232 struct rb_node *ret;
233 233
234 ret = __etree_search(tree, offset, &prev, NULL); 234 ret = __etree_search(tree, offset, &prev, NULL);
235 if (!ret) 235 if (!ret)
236 return prev; 236 return prev;
237 return ret; 237 return ret;
238 } 238 }
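Editorial note: tree_search() returns the entry whose [start, end] range contains @offset or, failing that, the first entry that starts after it (the 'prev' pointer left behind by __etree_search()). A hedged, user-space analogue of that lookup rule over a plain sorted, non-overlapping array, just to make the semantics concrete; it is not the rb-tree implementation itself.

#include <stdio.h>

struct range { unsigned long long start, end; };

/* Conceptual analogue of tree_search(): index of the range containing
 * offset, else of the next range after it, else -1. Assumes the array
 * is sorted and non-overlapping, like the extent state tree. */
static int range_search(const struct range *r, int n, unsigned long long off)
{
	int i;

	for (i = 0; i < n; i++) {
		if (off < r[i].start)
			return i;	/* first range past the offset */
		if (off <= r[i].end)
			return i;	/* range containing the offset */
	}
	return -1;
}

int main(void)
{
	const struct range r[] = { { 0, 4095 }, { 8192, 12287 } };

	printf("%d\n", range_search(r, 2, 100));   /* 0: inside [0,4095]    */
	printf("%d\n", range_search(r, 2, 5000));  /* 1: next is [8192,...] */
	printf("%d\n", range_search(r, 2, 20000)); /* -1: past the end      */
	return 0;
}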
239 239
240 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 240 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
241 struct extent_state *other) 241 struct extent_state *other)
242 { 242 {
243 if (tree->ops && tree->ops->merge_extent_hook) 243 if (tree->ops && tree->ops->merge_extent_hook)
244 tree->ops->merge_extent_hook(tree->mapping->host, new, 244 tree->ops->merge_extent_hook(tree->mapping->host, new,
245 other); 245 other);
246 } 246 }
247 247
248 /* 248 /*
249 * utility function to look for merge candidates inside a given range. 249 * utility function to look for merge candidates inside a given range.
250 * Any extents with matching state are merged together into a single 250 * Any extents with matching state are merged together into a single
251 * extent in the tree. Extents with EXTENT_IOBITS in their state field 251 * extent in the tree. Extents with EXTENT_IOBITS in their state field
252 * are not merged because the end_io handlers need to be able to do 252 * are not merged because the end_io handlers need to be able to do
253 * operations on them without sleeping (or doing allocations/splits). 253 * operations on them without sleeping (or doing allocations/splits).
254 * 254 *
255 * This should be called with the tree lock held. 255 * This should be called with the tree lock held.
256 */ 256 */
257 static int merge_state(struct extent_io_tree *tree, 257 static int merge_state(struct extent_io_tree *tree,
258 struct extent_state *state) 258 struct extent_state *state)
259 { 259 {
260 struct extent_state *other; 260 struct extent_state *other;
261 struct rb_node *other_node; 261 struct rb_node *other_node;
262 262
263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
264 return 0; 264 return 0;
265 265
266 other_node = rb_prev(&state->rb_node); 266 other_node = rb_prev(&state->rb_node);
267 if (other_node) { 267 if (other_node) {
268 other = rb_entry(other_node, struct extent_state, rb_node); 268 other = rb_entry(other_node, struct extent_state, rb_node);
269 if (other->end == state->start - 1 && 269 if (other->end == state->start - 1 &&
270 other->state == state->state) { 270 other->state == state->state) {
271 merge_cb(tree, state, other); 271 merge_cb(tree, state, other);
272 state->start = other->start; 272 state->start = other->start;
273 other->tree = NULL; 273 other->tree = NULL;
274 rb_erase(&other->rb_node, &tree->state); 274 rb_erase(&other->rb_node, &tree->state);
275 free_extent_state(other); 275 free_extent_state(other);
276 } 276 }
277 } 277 }
278 other_node = rb_next(&state->rb_node); 278 other_node = rb_next(&state->rb_node);
279 if (other_node) { 279 if (other_node) {
280 other = rb_entry(other_node, struct extent_state, rb_node); 280 other = rb_entry(other_node, struct extent_state, rb_node);
281 if (other->start == state->end + 1 && 281 if (other->start == state->end + 1 &&
282 other->state == state->state) { 282 other->state == state->state) {
283 merge_cb(tree, state, other); 283 merge_cb(tree, state, other);
284 other->start = state->start; 284 other->start = state->start;
285 state->tree = NULL; 285 state->tree = NULL;
286 rb_erase(&state->rb_node, &tree->state); 286 rb_erase(&state->rb_node, &tree->state);
287 free_extent_state(state); 287 free_extent_state(state);
288 state = NULL; 288 state = NULL;
289 } 289 }
290 } 290 }
291 291
292 return 0; 292 return 0;
293 } 293 }
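Editorial note: merge_state() only coalesces neighbours that are byte-adjacent (other->end == state->start - 1, or the mirror case on the right) and carry exactly the same state bits. A small standalone illustration of that adjacency test, outside the rb-tree machinery; the state values are arbitrary placeholders.

#include <stdio.h>

struct ext { unsigned long long start, end; unsigned long state; };

/* Same condition merge_state() applies to the previous neighbour:
 * byte-adjacent and identical state bits. */
static int can_merge(const struct ext *prev, const struct ext *cur)
{
	return prev->end == cur->start - 1 && prev->state == cur->state;
}

int main(void)
{
	struct ext a = { 0, 4095, 0x1 };     /* same bit as b -> mergeable  */
	struct ext b = { 4096, 8191, 0x1 };
	struct ext c = { 8192, 12287, 0x3 }; /* extra bit set -> no merge   */

	printf("%d %d\n", can_merge(&a, &b), can_merge(&b, &c)); /* 1 0 */
	return 0;
}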
294 294
295 static int set_state_cb(struct extent_io_tree *tree, 295 static int set_state_cb(struct extent_io_tree *tree,
296 struct extent_state *state, int *bits) 296 struct extent_state *state, int *bits)
297 { 297 {
298 if (tree->ops && tree->ops->set_bit_hook) { 298 if (tree->ops && tree->ops->set_bit_hook) {
299 return tree->ops->set_bit_hook(tree->mapping->host, 299 return tree->ops->set_bit_hook(tree->mapping->host,
300 state, bits); 300 state, bits);
301 } 301 }
302 302
303 return 0; 303 return 0;
304 } 304 }
305 305
306 static void clear_state_cb(struct extent_io_tree *tree, 306 static void clear_state_cb(struct extent_io_tree *tree,
307 struct extent_state *state, int *bits) 307 struct extent_state *state, int *bits)
308 { 308 {
309 if (tree->ops && tree->ops->clear_bit_hook) 309 if (tree->ops && tree->ops->clear_bit_hook)
310 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 310 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
311 } 311 }
312 312
313 /* 313 /*
314 * insert an extent_state struct into the tree. 'bits' are set on the 314 * insert an extent_state struct into the tree. 'bits' are set on the
315 * struct before it is inserted. 315 * struct before it is inserted.
316 * 316 *
317 * This may return -EEXIST if the extent is already there, in which case the 317 * This may return -EEXIST if the extent is already there, in which case the
318 * state struct is freed. 318 * state struct is freed.
319 * 319 *
320 * The tree lock is not taken internally. This is a utility function and 320 * The tree lock is not taken internally. This is a utility function and
321 * probably isn't what you want to call (see set/clear_extent_bit). 321 * probably isn't what you want to call (see set/clear_extent_bit).
322 */ 322 */
323 static int insert_state(struct extent_io_tree *tree, 323 static int insert_state(struct extent_io_tree *tree,
324 struct extent_state *state, u64 start, u64 end, 324 struct extent_state *state, u64 start, u64 end,
325 int *bits) 325 int *bits)
326 { 326 {
327 struct rb_node *node; 327 struct rb_node *node;
328 int bits_to_set = *bits & ~EXTENT_CTLBITS; 328 int bits_to_set = *bits & ~EXTENT_CTLBITS;
329 int ret; 329 int ret;
330 330
331 if (end < start) { 331 if (end < start) {
332 printk(KERN_ERR "btrfs end < start %llu %llu\n", 332 printk(KERN_ERR "btrfs end < start %llu %llu\n",
333 (unsigned long long)end, 333 (unsigned long long)end,
334 (unsigned long long)start); 334 (unsigned long long)start);
335 WARN_ON(1); 335 WARN_ON(1);
336 } 336 }
337 state->start = start; 337 state->start = start;
338 state->end = end; 338 state->end = end;
339 ret = set_state_cb(tree, state, bits); 339 ret = set_state_cb(tree, state, bits);
340 if (ret) 340 if (ret)
341 return ret; 341 return ret;
342 342
343 if (bits_to_set & EXTENT_DIRTY) 343 if (bits_to_set & EXTENT_DIRTY)
344 tree->dirty_bytes += end - start + 1; 344 tree->dirty_bytes += end - start + 1;
345 state->state |= bits_to_set; 345 state->state |= bits_to_set;
346 node = tree_insert(&tree->state, end, &state->rb_node); 346 node = tree_insert(&tree->state, end, &state->rb_node);
347 if (node) { 347 if (node) {
348 struct extent_state *found; 348 struct extent_state *found;
349 found = rb_entry(node, struct extent_state, rb_node); 349 found = rb_entry(node, struct extent_state, rb_node);
350 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 350 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
351 "%llu %llu\n", (unsigned long long)found->start, 351 "%llu %llu\n", (unsigned long long)found->start,
352 (unsigned long long)found->end, 352 (unsigned long long)found->end,
353 (unsigned long long)start, (unsigned long long)end); 353 (unsigned long long)start, (unsigned long long)end);
354 free_extent_state(state); 354 free_extent_state(state);
355 return -EEXIST; 355 return -EEXIST;
356 } 356 }
357 state->tree = tree; 357 state->tree = tree;
358 merge_state(tree, state); 358 merge_state(tree, state);
359 return 0; 359 return 0;
360 } 360 }
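The bookkeeping insert_state() does before the tree insert is easy to lose among the error paths. Below is a minimal userspace sketch of just that accounting, assuming invented SK_* bit values and a trimmed-down state struct rather than the kernel's definitions: control bits are masked off, and dirty_bytes grows by the inclusive range length when the dirty bit is being set.

/* Standalone sketch (not kernel code): the accounting insert_state() does
 * before inserting into the tree, with invented bit values. */
#include <stdio.h>
#include <stdint.h>

#define SK_EXTENT_DIRTY    (1 << 0)   /* hypothetical stand-in */
#define SK_EXTENT_CTLBITS  (1 << 7)   /* "control" bits never stored in ->state */

struct sk_state { uint64_t start, end; unsigned int state; };

static uint64_t sk_dirty_bytes;

static void sk_insert(struct sk_state *s, uint64_t start, uint64_t end, int bits)
{
        int bits_to_set = bits & ~SK_EXTENT_CTLBITS;

        s->start = start;
        s->end = end;
        if (bits_to_set & SK_EXTENT_DIRTY)
                sk_dirty_bytes += end - start + 1;      /* ranges are inclusive */
        s->state |= bits_to_set;
}

int main(void)
{
        struct sk_state s = { 0, 0, 0 };

        sk_insert(&s, 4096, 8191, SK_EXTENT_DIRTY | SK_EXTENT_CTLBITS);
        printf("state 0x%x dirty_bytes %llu\n", s.state,
               (unsigned long long)sk_dirty_bytes);     /* state 0x1 dirty_bytes 4096 */
        return 0;
}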
361 361
362 static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, 362 static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
363 u64 split) 363 u64 split)
364 { 364 {
365 if (tree->ops && tree->ops->split_extent_hook) 365 if (tree->ops && tree->ops->split_extent_hook)
366 return tree->ops->split_extent_hook(tree->mapping->host, 366 return tree->ops->split_extent_hook(tree->mapping->host,
367 orig, split); 367 orig, split);
368 return 0; 368 return 0;
369 } 369 }
370 370
371 /* 371 /*
372 * split a given extent state struct in two, inserting the preallocated 372 * split a given extent state struct in two, inserting the preallocated
373 * struct 'prealloc' as the newly created second half. 'split' indicates an 373 * struct 'prealloc' as the newly created second half. 'split' indicates an
374 * offset inside 'orig' where it should be split. 374 * offset inside 'orig' where it should be split.
375 * 375 *
376 * Before calling, 376 * Before calling,
377 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 377 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
378 * are two extent state structs in the tree: 378 * are two extent state structs in the tree:
379 * prealloc: [orig->start, split - 1] 379 * prealloc: [orig->start, split - 1]
380 * orig: [ split, orig->end ] 380 * orig: [ split, orig->end ]
381 * 381 *
382 * The tree locks are not taken by this function. They need to be held 382 * The tree locks are not taken by this function. They need to be held
383 * by the caller. 383 * by the caller.
384 */ 384 */
385 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 385 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
386 struct extent_state *prealloc, u64 split) 386 struct extent_state *prealloc, u64 split)
387 { 387 {
388 struct rb_node *node; 388 struct rb_node *node;
389 389
390 split_cb(tree, orig, split); 390 split_cb(tree, orig, split);
391 391
392 prealloc->start = orig->start; 392 prealloc->start = orig->start;
393 prealloc->end = split - 1; 393 prealloc->end = split - 1;
394 prealloc->state = orig->state; 394 prealloc->state = orig->state;
395 orig->start = split; 395 orig->start = split;
396 396
397 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 397 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
398 if (node) { 398 if (node) {
399 free_extent_state(prealloc); 399 free_extent_state(prealloc);
400 return -EEXIST; 400 return -EEXIST;
401 } 401 }
402 prealloc->tree = tree; 402 prealloc->tree = tree;
403 return 0; 403 return 0;
404 } 404 }
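A standalone sketch of the split arithmetic documented above, under the assumption of a simplified sk_state struct (not the kernel's extent_state): the preallocated struct takes the left half [start, split - 1], the original keeps [split, end], both ends inclusive.

/* Standalone sketch (not kernel code) of the split arithmetic. */
#include <stdio.h>
#include <stdint.h>

struct sk_state { uint64_t start, end; unsigned int state; };

static void sk_split(struct sk_state *orig, struct sk_state *prealloc, uint64_t split)
{
        prealloc->start = orig->start;      /* left half goes to the prealloc */
        prealloc->end = split - 1;
        prealloc->state = orig->state;      /* it inherits the bits */
        orig->start = split;                /* original keeps the right half */
}

int main(void)
{
        struct sk_state orig = { 0, 16383, 0x1 };
        struct sk_state left = { 0, 0, 0 };

        sk_split(&orig, &left, 4096);
        printf("left  [%llu, %llu]\n", (unsigned long long)left.start,
               (unsigned long long)left.end);   /* [0, 4095] */
        printf("right [%llu, %llu]\n", (unsigned long long)orig.start,
               (unsigned long long)orig.end);   /* [4096, 16383] */
        return 0;
}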
405 405
406 /* 406 /*
407 * utility function to clear some bits in an extent state struct. 407 * utility function to clear some bits in an extent state struct.
408 * it will optionally wake up anyone waiting on this state (wake == 1), or 408 * it will optionally wake up anyone waiting on this state (wake == 1), or
408 * it will optionally wake up anyone waiting on this state (wake == 1), or 408 * it will optionally wake up anyone waiting on this state (wake == 1), or
409 * forcibly remove the state from the tree (delete == 1). 409 * forcibly remove the state from the tree (delete == 1).
410 * 410 *
411 * If no bits are set on the state struct after clearing things, the 411 * If no bits are set on the state struct after clearing things, the
412 * struct is freed and removed from the tree 412 * struct is freed and removed from the tree
413 */ 413 */
414 static int clear_state_bit(struct extent_io_tree *tree, 414 static int clear_state_bit(struct extent_io_tree *tree,
415 struct extent_state *state, 415 struct extent_state *state,
416 int *bits, int wake) 416 int *bits, int wake)
417 { 417 {
418 int bits_to_clear = *bits & ~EXTENT_CTLBITS; 418 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
419 int ret = state->state & bits_to_clear; 419 int ret = state->state & bits_to_clear;
420 420
421 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 421 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
422 u64 range = state->end - state->start + 1; 422 u64 range = state->end - state->start + 1;
423 WARN_ON(range > tree->dirty_bytes); 423 WARN_ON(range > tree->dirty_bytes);
424 tree->dirty_bytes -= range; 424 tree->dirty_bytes -= range;
425 } 425 }
426 clear_state_cb(tree, state, bits); 426 clear_state_cb(tree, state, bits);
427 state->state &= ~bits_to_clear; 427 state->state &= ~bits_to_clear;
428 if (wake) 428 if (wake)
429 wake_up(&state->wq); 429 wake_up(&state->wq);
430 if (state->state == 0) { 430 if (state->state == 0) {
431 if (state->tree) { 431 if (state->tree) {
432 rb_erase(&state->rb_node, &tree->state); 432 rb_erase(&state->rb_node, &tree->state);
433 state->tree = NULL; 433 state->tree = NULL;
434 free_extent_state(state); 434 free_extent_state(state);
435 } else { 435 } else {
436 WARN_ON(1); 436 WARN_ON(1);
437 } 437 }
438 } else { 438 } else {
439 merge_state(tree, state); 439 merge_state(tree, state);
440 } 440 }
441 return ret; 441 return ret;
442 } 442 }
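The return convention of clear_state_bit() (non-zero if any of the requested bits were actually set) and the dirty-byte accounting can be exercised in isolation. A hedged userspace sketch follows; the SK_* bit values and the sk_dirty_bytes counter are illustrative stand-ins, not kernel symbols.

/* Standalone sketch (not kernel code) of clear_state_bit()'s return value
 * and dirty accounting. */
#include <stdio.h>
#include <stdint.h>

#define SK_EXTENT_DIRTY     (1 << 0)   /* hypothetical bit values */
#define SK_EXTENT_UPTODATE  (1 << 1)

struct sk_state { uint64_t start, end; unsigned int state; };

static uint64_t sk_dirty_bytes = 8192;

static int sk_clear(struct sk_state *s, int bits_to_clear)
{
        int was_set = s->state & bits_to_clear;     /* what the caller gets back */

        if ((bits_to_clear & SK_EXTENT_DIRTY) && (s->state & SK_EXTENT_DIRTY))
                sk_dirty_bytes -= s->end - s->start + 1;
        s->state &= ~bits_to_clear;
        return was_set;
}

int main(void)
{
        struct sk_state s = { 0, 4095, SK_EXTENT_DIRTY };

        printf("ret %d\n", sk_clear(&s, SK_EXTENT_DIRTY));      /* non-zero: bit was set */
        printf("ret %d\n", sk_clear(&s, SK_EXTENT_UPTODATE));   /* 0: nothing was set */
        printf("dirty_bytes %llu\n",
               (unsigned long long)sk_dirty_bytes);             /* 4096 */
        return 0;
}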
443 443
444 static struct extent_state * 444 static struct extent_state *
445 alloc_extent_state_atomic(struct extent_state *prealloc) 445 alloc_extent_state_atomic(struct extent_state *prealloc)
446 { 446 {
447 if (!prealloc) 447 if (!prealloc)
448 prealloc = alloc_extent_state(GFP_ATOMIC); 448 prealloc = alloc_extent_state(GFP_ATOMIC);
449 449
450 return prealloc; 450 return prealloc;
451 } 451 }
452 452
453 /* 453 /*
454 * clear some bits on a range in the tree. This may require splitting 454 * clear some bits on a range in the tree. This may require splitting
455 * or inserting elements in the tree, so the gfp mask is used to 455 * or inserting elements in the tree, so the gfp mask is used to
456 * indicate which allocations are allowed and whether sleeping is permitted. 456 * indicate which allocations are allowed and whether sleeping is permitted.
457 * 457 *
458 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 458 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
459 * the given range from the tree regardless of state (i.e. for truncate). 459 * the given range from the tree regardless of state (i.e. for truncate).
460 * 460 *
461 * the range [start, end] is inclusive. 461 * the range [start, end] is inclusive.
462 * 462 *
463 * This takes the tree lock, and returns < 0 on error, > 0 if any of the 463 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
464 * bits were already set, or zero if none of the bits were already set. 464 * bits were already set, or zero if none of the bits were already set.
465 */ 465 */
466 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 466 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
467 int bits, int wake, int delete, 467 int bits, int wake, int delete,
468 struct extent_state **cached_state, 468 struct extent_state **cached_state,
469 gfp_t mask) 469 gfp_t mask)
470 { 470 {
471 struct extent_state *state; 471 struct extent_state *state;
472 struct extent_state *cached; 472 struct extent_state *cached;
473 struct extent_state *prealloc = NULL; 473 struct extent_state *prealloc = NULL;
474 struct rb_node *next_node; 474 struct rb_node *next_node;
475 struct rb_node *node; 475 struct rb_node *node;
476 u64 last_end; 476 u64 last_end;
477 int err; 477 int err;
478 int set = 0; 478 int set = 0;
479 int clear = 0; 479 int clear = 0;
480 480
481 if (delete) 481 if (delete)
482 bits |= ~EXTENT_CTLBITS; 482 bits |= ~EXTENT_CTLBITS;
483 bits |= EXTENT_FIRST_DELALLOC; 483 bits |= EXTENT_FIRST_DELALLOC;
484 484
485 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 485 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
486 clear = 1; 486 clear = 1;
487 again: 487 again:
488 if (!prealloc && (mask & __GFP_WAIT)) { 488 if (!prealloc && (mask & __GFP_WAIT)) {
489 prealloc = alloc_extent_state(mask); 489 prealloc = alloc_extent_state(mask);
490 if (!prealloc) 490 if (!prealloc)
491 return -ENOMEM; 491 return -ENOMEM;
492 } 492 }
493 493
494 spin_lock(&tree->lock); 494 spin_lock(&tree->lock);
495 if (cached_state) { 495 if (cached_state) {
496 cached = *cached_state; 496 cached = *cached_state;
497 497
498 if (clear) { 498 if (clear) {
499 *cached_state = NULL; 499 *cached_state = NULL;
500 cached_state = NULL; 500 cached_state = NULL;
501 } 501 }
502 502
503 if (cached && cached->tree && cached->start == start) { 503 if (cached && cached->tree && cached->start == start) {
504 if (clear) 504 if (clear)
505 atomic_dec(&cached->refs); 505 atomic_dec(&cached->refs);
506 state = cached; 506 state = cached;
507 goto hit_next; 507 goto hit_next;
508 } 508 }
509 if (clear) 509 if (clear)
510 free_extent_state(cached); 510 free_extent_state(cached);
511 } 511 }
512 /* 512 /*
513 * this search will find the extents that end after 513 * this search will find the extents that end after
514 * our range starts 514 * our range starts
515 */ 515 */
516 node = tree_search(tree, start); 516 node = tree_search(tree, start);
517 if (!node) 517 if (!node)
518 goto out; 518 goto out;
519 state = rb_entry(node, struct extent_state, rb_node); 519 state = rb_entry(node, struct extent_state, rb_node);
520 hit_next: 520 hit_next:
521 if (state->start > end) 521 if (state->start > end)
522 goto out; 522 goto out;
523 WARN_ON(state->end < start); 523 WARN_ON(state->end < start);
524 last_end = state->end; 524 last_end = state->end;
525 525
526 /* 526 /*
527 * | ---- desired range ---- | 527 * | ---- desired range ---- |
528 * | state | or 528 * | state | or
529 * | ------------- state -------------- | 529 * | ------------- state -------------- |
530 * 530 *
531 * We need to split the extent we found, and may flip 531 * We need to split the extent we found, and may flip
532 * bits on second half. 532 * bits on second half.
533 * 533 *
534 * If the extent we found extends past our range, we 534 * If the extent we found extends past our range, we
535 * just split and search again. It'll get split again 535 * just split and search again. It'll get split again
536 * the next time though. 536 * the next time though.
537 * 537 *
538 * If the extent we found is inside our range, we clear 538 * If the extent we found is inside our range, we clear
539 * the desired bit on it. 539 * the desired bit on it.
540 */ 540 */
541 541
542 if (state->start < start) { 542 if (state->start < start) {
543 prealloc = alloc_extent_state_atomic(prealloc); 543 prealloc = alloc_extent_state_atomic(prealloc);
544 BUG_ON(!prealloc); 544 BUG_ON(!prealloc);
545 err = split_state(tree, state, prealloc, start); 545 err = split_state(tree, state, prealloc, start);
546 BUG_ON(err == -EEXIST); 546 BUG_ON(err == -EEXIST);
547 prealloc = NULL; 547 prealloc = NULL;
548 if (err) 548 if (err)
549 goto out; 549 goto out;
550 if (state->end <= end) { 550 if (state->end <= end) {
551 set |= clear_state_bit(tree, state, &bits, wake); 551 set |= clear_state_bit(tree, state, &bits, wake);
552 if (last_end == (u64)-1) 552 if (last_end == (u64)-1)
553 goto out; 553 goto out;
554 start = last_end + 1; 554 start = last_end + 1;
555 } 555 }
556 goto search_again; 556 goto search_again;
557 } 557 }
558 /* 558 /*
559 * | ---- desired range ---- | 559 * | ---- desired range ---- |
560 * | state | 560 * | state |
561 * We need to split the extent, and clear the bit 561 * We need to split the extent, and clear the bit
562 * on the first half 562 * on the first half
563 */ 563 */
564 if (state->start <= end && state->end > end) { 564 if (state->start <= end && state->end > end) {
565 prealloc = alloc_extent_state_atomic(prealloc); 565 prealloc = alloc_extent_state_atomic(prealloc);
566 BUG_ON(!prealloc); 566 BUG_ON(!prealloc);
567 err = split_state(tree, state, prealloc, end + 1); 567 err = split_state(tree, state, prealloc, end + 1);
568 BUG_ON(err == -EEXIST); 568 BUG_ON(err == -EEXIST);
569 if (wake) 569 if (wake)
570 wake_up(&state->wq); 570 wake_up(&state->wq);
571 571
572 set |= clear_state_bit(tree, prealloc, &bits, wake); 572 set |= clear_state_bit(tree, prealloc, &bits, wake);
573 573
574 prealloc = NULL; 574 prealloc = NULL;
575 goto out; 575 goto out;
576 } 576 }
577 577
578 if (state->end < end && prealloc && !need_resched()) 578 if (state->end < end && prealloc && !need_resched())
579 next_node = rb_next(&state->rb_node); 579 next_node = rb_next(&state->rb_node);
580 else 580 else
581 next_node = NULL; 581 next_node = NULL;
582 582
583 set |= clear_state_bit(tree, state, &bits, wake); 583 set |= clear_state_bit(tree, state, &bits, wake);
584 if (last_end == (u64)-1) 584 if (last_end == (u64)-1)
585 goto out; 585 goto out;
586 start = last_end + 1; 586 start = last_end + 1;
587 if (start <= end && next_node) { 587 if (start <= end && next_node) {
588 state = rb_entry(next_node, struct extent_state, 588 state = rb_entry(next_node, struct extent_state,
589 rb_node); 589 rb_node);
590 if (state->start == start) 590 if (state->start == start)
591 goto hit_next; 591 goto hit_next;
592 } 592 }
593 goto search_again; 593 goto search_again;
594 594
595 out: 595 out:
596 spin_unlock(&tree->lock); 596 spin_unlock(&tree->lock);
597 if (prealloc) 597 if (prealloc)
598 free_extent_state(prealloc); 598 free_extent_state(prealloc);
599 599
600 return set; 600 return set;
601 601
602 search_again: 602 search_again:
603 if (start > end) 603 if (start > end)
604 goto out; 604 goto out;
605 spin_unlock(&tree->lock); 605 spin_unlock(&tree->lock);
606 if (mask & __GFP_WAIT) 606 if (mask & __GFP_WAIT)
607 cond_resched(); 607 cond_resched();
608 goto again; 608 goto again;
609 } 609 }
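clear_extent_bit() spends most of its body deciding how the state it found overlaps the desired [start, end], matching the range diagrams in the comments. The sketch below classifies the same three interesting cases; the enum names are invented for illustration, and the real function additionally handles merging, caching and re-searching.

/* Standalone sketch (not kernel code): which of the three range cases a
 * found state falls into, relative to the desired [start, end]. */
#include <stdio.h>
#include <stdint.h>

enum sk_overlap { SK_NO_OVERLAP, SK_SPLIT_AT_START, SK_SPLIT_AFTER_END, SK_FULLY_INSIDE };

static enum sk_overlap sk_classify(uint64_t s_start, uint64_t s_end,
                                   uint64_t start, uint64_t end)
{
        if (s_start > end || s_end < start)
                return SK_NO_OVERLAP;
        if (s_start < start)
                return SK_SPLIT_AT_START;    /* split at 'start', keep working on the rest */
        if (s_end > end)
                return SK_SPLIT_AFTER_END;   /* split at 'end' + 1, touch only the first half */
        return SK_FULLY_INSIDE;              /* clear/set the bits directly */
}

int main(void)
{
        printf("%d\n", sk_classify(0, 8191, 4096, 12287));   /* SK_SPLIT_AT_START */
        printf("%d\n", sk_classify(4096, 16383, 0, 8191));   /* SK_SPLIT_AFTER_END */
        printf("%d\n", sk_classify(4096, 8191, 0, 16383));   /* SK_FULLY_INSIDE */
        return 0;
}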
610 610
611 static int wait_on_state(struct extent_io_tree *tree, 611 static int wait_on_state(struct extent_io_tree *tree,
612 struct extent_state *state) 612 struct extent_state *state)
613 __releases(tree->lock) 613 __releases(tree->lock)
614 __acquires(tree->lock) 614 __acquires(tree->lock)
615 { 615 {
616 DEFINE_WAIT(wait); 616 DEFINE_WAIT(wait);
617 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 617 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
618 spin_unlock(&tree->lock); 618 spin_unlock(&tree->lock);
619 schedule(); 619 schedule();
620 spin_lock(&tree->lock); 620 spin_lock(&tree->lock);
621 finish_wait(&state->wq, &wait); 621 finish_wait(&state->wq, &wait);
622 return 0; 622 return 0;
623 } 623 }
624 624
625 /* 625 /*
626 * waits for one or more bits to clear on a range in the state tree. 626 * waits for one or more bits to clear on a range in the state tree.
627 * The range [start, end] is inclusive. 627 * The range [start, end] is inclusive.
628 * The tree lock is taken by this function 628 * The tree lock is taken by this function
629 */ 629 */
630 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) 630 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
631 { 631 {
632 struct extent_state *state; 632 struct extent_state *state;
633 struct rb_node *node; 633 struct rb_node *node;
634 634
635 spin_lock(&tree->lock); 635 spin_lock(&tree->lock);
636 again: 636 again:
637 while (1) { 637 while (1) {
638 /* 638 /*
639 * this search will find all the extents that end after 639 * this search will find all the extents that end after
640 * our range starts 640 * our range starts
641 */ 641 */
642 node = tree_search(tree, start); 642 node = tree_search(tree, start);
643 if (!node) 643 if (!node)
644 break; 644 break;
645 645
646 state = rb_entry(node, struct extent_state, rb_node); 646 state = rb_entry(node, struct extent_state, rb_node);
647 647
648 if (state->start > end) 648 if (state->start > end)
649 goto out; 649 goto out;
650 650
651 if (state->state & bits) { 651 if (state->state & bits) {
652 start = state->start; 652 start = state->start;
653 atomic_inc(&state->refs); 653 atomic_inc(&state->refs);
654 wait_on_state(tree, state); 654 wait_on_state(tree, state);
655 free_extent_state(state); 655 free_extent_state(state);
656 goto again; 656 goto again;
657 } 657 }
658 start = state->end + 1; 658 start = state->end + 1;
659 659
660 if (start > end) 660 if (start > end)
661 break; 661 break;
662 662
663 if (need_resched()) { 663 if (need_resched()) {
664 spin_unlock(&tree->lock); 664 spin_unlock(&tree->lock);
665 cond_resched(); 665 cond_resched();
666 spin_lock(&tree->lock); 666 spin_lock(&tree->lock);
667 } 667 }
668 } 668 }
669 out: 669 out:
670 spin_unlock(&tree->lock); 670 spin_unlock(&tree->lock);
671 return 0; 671 return 0;
672 } 672 }
673 673
674 static int set_state_bits(struct extent_io_tree *tree, 674 static int set_state_bits(struct extent_io_tree *tree,
675 struct extent_state *state, 675 struct extent_state *state,
676 int *bits) 676 int *bits)
677 { 677 {
678 int ret; 678 int ret;
679 int bits_to_set = *bits & ~EXTENT_CTLBITS; 679 int bits_to_set = *bits & ~EXTENT_CTLBITS;
680 680
681 ret = set_state_cb(tree, state, bits); 681 ret = set_state_cb(tree, state, bits);
682 if (ret) 682 if (ret)
683 return ret; 683 return ret;
684 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 684 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
685 u64 range = state->end - state->start + 1; 685 u64 range = state->end - state->start + 1;
686 tree->dirty_bytes += range; 686 tree->dirty_bytes += range;
687 } 687 }
688 state->state |= bits_to_set; 688 state->state |= bits_to_set;
689 689
690 return 0; 690 return 0;
691 } 691 }
692 692
693 static void cache_state(struct extent_state *state, 693 static void cache_state(struct extent_state *state,
694 struct extent_state **cached_ptr) 694 struct extent_state **cached_ptr)
695 { 695 {
696 if (cached_ptr && !(*cached_ptr)) { 696 if (cached_ptr && !(*cached_ptr)) {
697 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 697 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
698 *cached_ptr = state; 698 *cached_ptr = state;
699 atomic_inc(&state->refs); 699 atomic_inc(&state->refs);
700 } 700 }
701 } 701 }
702 } 702 }
703 703
704 static void uncache_state(struct extent_state **cached_ptr) 704 static void uncache_state(struct extent_state **cached_ptr)
705 { 705 {
706 if (cached_ptr && (*cached_ptr)) { 706 if (cached_ptr && (*cached_ptr)) {
707 struct extent_state *state = *cached_ptr; 707 struct extent_state *state = *cached_ptr;
708 *cached_ptr = NULL; 708 *cached_ptr = NULL;
709 free_extent_state(state); 709 free_extent_state(state);
710 } 710 }
711 } 711 }
712 712
713 /* 713 /*
714 * set some bits on a range in the tree. This may require allocations or 714 * set some bits on a range in the tree. This may require allocations or
715 * sleeping, so the gfp mask is used to indicate what is allowed. 715 * sleeping, so the gfp mask is used to indicate what is allowed.
716 * 716 *
717 * If any of the exclusive bits are set, this will fail with -EEXIST if some 717 * If any of the exclusive bits are set, this will fail with -EEXIST if some
718 * part of the range already has the desired bits set. The start of the 718 * part of the range already has the desired bits set. The start of the
719 * existing range is returned in failed_start in this case. 719 * existing range is returned in failed_start in this case.
720 * 720 *
721 * [start, end] is inclusive. This takes the tree lock. 721 * [start, end] is inclusive. This takes the tree lock.
722 */ 722 */
723 723
724 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 724 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
725 int bits, int exclusive_bits, u64 *failed_start, 725 int bits, int exclusive_bits, u64 *failed_start,
726 struct extent_state **cached_state, gfp_t mask) 726 struct extent_state **cached_state, gfp_t mask)
727 { 727 {
728 struct extent_state *state; 728 struct extent_state *state;
729 struct extent_state *prealloc = NULL; 729 struct extent_state *prealloc = NULL;
730 struct rb_node *node; 730 struct rb_node *node;
731 int err = 0; 731 int err = 0;
732 u64 last_start; 732 u64 last_start;
733 u64 last_end; 733 u64 last_end;
734 734
735 bits |= EXTENT_FIRST_DELALLOC; 735 bits |= EXTENT_FIRST_DELALLOC;
736 again: 736 again:
737 if (!prealloc && (mask & __GFP_WAIT)) { 737 if (!prealloc && (mask & __GFP_WAIT)) {
738 prealloc = alloc_extent_state(mask); 738 prealloc = alloc_extent_state(mask);
739 BUG_ON(!prealloc); 739 BUG_ON(!prealloc);
740 } 740 }
741 741
742 spin_lock(&tree->lock); 742 spin_lock(&tree->lock);
743 if (cached_state && *cached_state) { 743 if (cached_state && *cached_state) {
744 state = *cached_state; 744 state = *cached_state;
745 if (state->start == start && state->tree) { 745 if (state->start == start && state->tree) {
746 node = &state->rb_node; 746 node = &state->rb_node;
747 goto hit_next; 747 goto hit_next;
748 } 748 }
749 } 749 }
750 /* 750 /*
751 * this search will find all the extents that end after 751 * this search will find all the extents that end after
752 * our range starts. 752 * our range starts.
753 */ 753 */
754 node = tree_search(tree, start); 754 node = tree_search(tree, start);
755 if (!node) { 755 if (!node) {
756 prealloc = alloc_extent_state_atomic(prealloc); 756 prealloc = alloc_extent_state_atomic(prealloc);
757 BUG_ON(!prealloc); 757 BUG_ON(!prealloc);
758 err = insert_state(tree, prealloc, start, end, &bits); 758 err = insert_state(tree, prealloc, start, end, &bits);
759 prealloc = NULL; 759 prealloc = NULL;
760 BUG_ON(err == -EEXIST); 760 BUG_ON(err == -EEXIST);
761 goto out; 761 goto out;
762 } 762 }
763 state = rb_entry(node, struct extent_state, rb_node); 763 state = rb_entry(node, struct extent_state, rb_node);
764 hit_next: 764 hit_next:
765 last_start = state->start; 765 last_start = state->start;
766 last_end = state->end; 766 last_end = state->end;
767 767
768 /* 768 /*
769 * | ---- desired range ---- | 769 * | ---- desired range ---- |
770 * | state | 770 * | state |
771 * 771 *
772 * Just lock what we found and keep going 772 * Just lock what we found and keep going
773 */ 773 */
774 if (state->start == start && state->end <= end) { 774 if (state->start == start && state->end <= end) {
775 struct rb_node *next_node; 775 struct rb_node *next_node;
776 if (state->state & exclusive_bits) { 776 if (state->state & exclusive_bits) {
777 *failed_start = state->start; 777 *failed_start = state->start;
778 err = -EEXIST; 778 err = -EEXIST;
779 goto out; 779 goto out;
780 } 780 }
781 781
782 err = set_state_bits(tree, state, &bits); 782 err = set_state_bits(tree, state, &bits);
783 if (err) 783 if (err)
784 goto out; 784 goto out;
785 785
786 next_node = rb_next(node); 786 next_node = rb_next(node);
787 cache_state(state, cached_state); 787 cache_state(state, cached_state);
788 merge_state(tree, state); 788 merge_state(tree, state);
789 if (last_end == (u64)-1) 789 if (last_end == (u64)-1)
790 goto out; 790 goto out;
791 791
792 start = last_end + 1; 792 start = last_end + 1;
793 if (next_node && start < end && prealloc && !need_resched()) { 793 if (next_node && start < end && prealloc && !need_resched()) {
794 state = rb_entry(next_node, struct extent_state, 794 state = rb_entry(next_node, struct extent_state,
795 rb_node); 795 rb_node);
796 if (state->start == start) 796 if (state->start == start)
797 goto hit_next; 797 goto hit_next;
798 } 798 }
799 goto search_again; 799 goto search_again;
800 } 800 }
801 801
802 /* 802 /*
803 * | ---- desired range ---- | 803 * | ---- desired range ---- |
804 * | state | 804 * | state |
805 * or 805 * or
806 * | ------------- state -------------- | 806 * | ------------- state -------------- |
807 * 807 *
808 * We need to split the extent we found, and may flip bits on 808 * We need to split the extent we found, and may flip bits on
809 * second half. 809 * second half.
810 * 810 *
811 * If the extent we found extends past our 811 * If the extent we found extends past our
812 * range, we just split and search again. It'll get split 812 * range, we just split and search again. It'll get split
813 * again the next time though. 813 * again the next time though.
814 * 814 *
815 * If the extent we found is inside our range, we set the 815 * If the extent we found is inside our range, we set the
816 * desired bit on it. 816 * desired bit on it.
817 */ 817 */
818 if (state->start < start) { 818 if (state->start < start) {
819 if (state->state & exclusive_bits) { 819 if (state->state & exclusive_bits) {
820 *failed_start = start; 820 *failed_start = start;
821 err = -EEXIST; 821 err = -EEXIST;
822 goto out; 822 goto out;
823 } 823 }
824 824
825 prealloc = alloc_extent_state_atomic(prealloc); 825 prealloc = alloc_extent_state_atomic(prealloc);
826 BUG_ON(!prealloc); 826 BUG_ON(!prealloc);
827 err = split_state(tree, state, prealloc, start); 827 err = split_state(tree, state, prealloc, start);
828 BUG_ON(err == -EEXIST); 828 BUG_ON(err == -EEXIST);
829 prealloc = NULL; 829 prealloc = NULL;
830 if (err) 830 if (err)
831 goto out; 831 goto out;
832 if (state->end <= end) { 832 if (state->end <= end) {
833 err = set_state_bits(tree, state, &bits); 833 err = set_state_bits(tree, state, &bits);
834 if (err) 834 if (err)
835 goto out; 835 goto out;
836 cache_state(state, cached_state); 836 cache_state(state, cached_state);
837 merge_state(tree, state); 837 merge_state(tree, state);
838 if (last_end == (u64)-1) 838 if (last_end == (u64)-1)
839 goto out; 839 goto out;
840 start = last_end + 1; 840 start = last_end + 1;
841 } 841 }
842 goto search_again; 842 goto search_again;
843 } 843 }
844 /* 844 /*
845 * | ---- desired range ---- | 845 * | ---- desired range ---- |
846 * | state | or | state | 846 * | state | or | state |
847 * 847 *
848 * There's a hole, we need to insert something in it and 848 * There's a hole, we need to insert something in it and
849 * ignore the extent we found. 849 * ignore the extent we found.
850 */ 850 */
851 if (state->start > start) { 851 if (state->start > start) {
852 u64 this_end; 852 u64 this_end;
853 if (end < last_start) 853 if (end < last_start)
854 this_end = end; 854 this_end = end;
855 else 855 else
856 this_end = last_start - 1; 856 this_end = last_start - 1;
857 857
858 prealloc = alloc_extent_state_atomic(prealloc); 858 prealloc = alloc_extent_state_atomic(prealloc);
859 BUG_ON(!prealloc); 859 BUG_ON(!prealloc);
860 860
861 /* 861 /*
862 * Avoid freeing 'prealloc' if it can be merged with 862 * Avoid freeing 'prealloc' if it can be merged with
863 * the later extent. 863 * the later extent.
864 */ 864 */
865 atomic_inc(&prealloc->refs); 865 atomic_inc(&prealloc->refs);
866 err = insert_state(tree, prealloc, start, this_end, 866 err = insert_state(tree, prealloc, start, this_end,
867 &bits); 867 &bits);
868 BUG_ON(err == -EEXIST); 868 BUG_ON(err == -EEXIST);
869 if (err) { 869 if (err) {
870 free_extent_state(prealloc); 870 free_extent_state(prealloc);
871 prealloc = NULL; 871 prealloc = NULL;
872 goto out; 872 goto out;
873 } 873 }
874 cache_state(prealloc, cached_state); 874 cache_state(prealloc, cached_state);
875 free_extent_state(prealloc); 875 free_extent_state(prealloc);
876 prealloc = NULL; 876 prealloc = NULL;
877 start = this_end + 1; 877 start = this_end + 1;
878 goto search_again; 878 goto search_again;
879 } 879 }
880 /* 880 /*
881 * | ---- desired range ---- | 881 * | ---- desired range ---- |
882 * | state | 882 * | state |
883 * We need to split the extent, and set the bit 883 * We need to split the extent, and set the bit
884 * on the first half 884 * on the first half
885 */ 885 */
886 if (state->start <= end && state->end > end) { 886 if (state->start <= end && state->end > end) {
887 if (state->state & exclusive_bits) { 887 if (state->state & exclusive_bits) {
888 *failed_start = start; 888 *failed_start = start;
889 err = -EEXIST; 889 err = -EEXIST;
890 goto out; 890 goto out;
891 } 891 }
892 892
893 prealloc = alloc_extent_state_atomic(prealloc); 893 prealloc = alloc_extent_state_atomic(prealloc);
894 BUG_ON(!prealloc); 894 BUG_ON(!prealloc);
895 err = split_state(tree, state, prealloc, end + 1); 895 err = split_state(tree, state, prealloc, end + 1);
896 BUG_ON(err == -EEXIST); 896 BUG_ON(err == -EEXIST);
897 897
898 err = set_state_bits(tree, prealloc, &bits); 898 err = set_state_bits(tree, prealloc, &bits);
899 if (err) { 899 if (err) {
900 prealloc = NULL; 900 prealloc = NULL;
901 goto out; 901 goto out;
902 } 902 }
903 cache_state(prealloc, cached_state); 903 cache_state(prealloc, cached_state);
904 merge_state(tree, prealloc); 904 merge_state(tree, prealloc);
905 prealloc = NULL; 905 prealloc = NULL;
906 goto out; 906 goto out;
907 } 907 }
908 908
909 goto search_again; 909 goto search_again;
910 910
911 out: 911 out:
912 spin_unlock(&tree->lock); 912 spin_unlock(&tree->lock);
913 if (prealloc) 913 if (prealloc)
914 free_extent_state(prealloc); 914 free_extent_state(prealloc);
915 915
916 return err; 916 return err;
917 917
918 search_again: 918 search_again:
919 if (start > end) 919 if (start > end)
920 goto out; 920 goto out;
921 spin_unlock(&tree->lock); 921 spin_unlock(&tree->lock);
922 if (mask & __GFP_WAIT) 922 if (mask & __GFP_WAIT)
923 cond_resched(); 923 cond_resched();
924 goto again; 924 goto again;
925 } 925 }
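The exclusive-bits contract described in the comment above (-EEXIST plus failed_start when part of the range already carries one of the exclusive bits) can be shown in a few lines of userspace C. SK_EXTENT_LOCKED and the sk_state struct are assumptions of the sketch only.

/* Standalone sketch (not kernel code) of the exclusive-bits check. */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define SK_EXTENT_LOCKED  (1 << 1)   /* hypothetical bit value */

struct sk_state { uint64_t start, end; unsigned int state; };

static int sk_try_set(struct sk_state *s, int bits, int exclusive_bits,
                      uint64_t *failed_start)
{
        if (s->state & exclusive_bits) {
                *failed_start = s->start;    /* tell the caller where the conflict begins */
                return -EEXIST;
        }
        s->state |= bits;
        return 0;
}

int main(void)
{
        struct sk_state s = { 4096, 8191, SK_EXTENT_LOCKED };
        uint64_t failed_start = 0;
        int err = sk_try_set(&s, SK_EXTENT_LOCKED, SK_EXTENT_LOCKED, &failed_start);

        printf("err %d failed_start %llu\n", err,
               (unsigned long long)failed_start);   /* err is -EEXIST, failed_start 4096 */
        return 0;
}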
926 926
927 /* wrappers around set/clear extent bit */ 927 /* wrappers around set/clear extent bit */
928 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 928 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
929 gfp_t mask) 929 gfp_t mask)
930 { 930 {
931 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, 931 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
932 NULL, mask); 932 NULL, mask);
933 } 933 }
934 934
935 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 935 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
936 int bits, gfp_t mask) 936 int bits, gfp_t mask)
937 { 937 {
938 return set_extent_bit(tree, start, end, bits, 0, NULL, 938 return set_extent_bit(tree, start, end, bits, 0, NULL,
939 NULL, mask); 939 NULL, mask);
940 } 940 }
941 941
942 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 942 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
943 int bits, gfp_t mask) 943 int bits, gfp_t mask)
944 { 944 {
945 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 945 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
946 } 946 }
947 947
948 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 948 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
949 struct extent_state **cached_state, gfp_t mask) 949 struct extent_state **cached_state, gfp_t mask)
950 { 950 {
951 return set_extent_bit(tree, start, end, 951 return set_extent_bit(tree, start, end,
952 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 952 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
953 0, NULL, cached_state, mask); 953 0, NULL, cached_state, mask);
954 } 954 }
955 955
956 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 956 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
957 gfp_t mask) 957 gfp_t mask)
958 { 958 {
959 return clear_extent_bit(tree, start, end, 959 return clear_extent_bit(tree, start, end,
960 EXTENT_DIRTY | EXTENT_DELALLOC | 960 EXTENT_DIRTY | EXTENT_DELALLOC |
961 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); 961 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
962 } 962 }
963 963
964 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 964 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
965 gfp_t mask) 965 gfp_t mask)
966 { 966 {
967 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, 967 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
968 NULL, mask); 968 NULL, mask);
969 } 969 }
970 970
971 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 971 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
972 struct extent_state **cached_state, gfp_t mask) 972 struct extent_state **cached_state, gfp_t mask)
973 { 973 {
974 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 974 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
975 NULL, cached_state, mask); 975 NULL, cached_state, mask);
976 } 976 }
977 977
978 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 978 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
979 u64 end, struct extent_state **cached_state, 979 u64 end, struct extent_state **cached_state,
980 gfp_t mask) 980 gfp_t mask)
981 { 981 {
982 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 982 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
983 cached_state, mask); 983 cached_state, mask);
984 } 984 }
985 985
986 /* 986 /*
987 * either insert or lock state struct between start and end. Use mask to tell 987 * either insert or lock state struct between start and end. Use mask to tell
988 * us if waiting is desired. 988 * us if waiting is desired.
989 */ 989 */
990 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 990 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
991 int bits, struct extent_state **cached_state, gfp_t mask) 991 int bits, struct extent_state **cached_state, gfp_t mask)
992 { 992 {
993 int err; 993 int err;
994 u64 failed_start; 994 u64 failed_start;
995 while (1) { 995 while (1) {
996 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 996 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
997 EXTENT_LOCKED, &failed_start, 997 EXTENT_LOCKED, &failed_start,
998 cached_state, mask); 998 cached_state, mask);
999 if (err == -EEXIST && (mask & __GFP_WAIT)) { 999 if (err == -EEXIST && (mask & __GFP_WAIT)) {
1000 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1000 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1001 start = failed_start; 1001 start = failed_start;
1002 } else { 1002 } else {
1003 break; 1003 break;
1004 } 1004 }
1005 WARN_ON(start > end); 1005 WARN_ON(start > end);
1006 } 1006 }
1007 return err; 1007 return err;
1008 } 1008 }
1009 1009
1010 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 1010 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1011 { 1011 {
1012 return lock_extent_bits(tree, start, end, 0, NULL, mask); 1012 return lock_extent_bits(tree, start, end, 0, NULL, mask);
1013 } 1013 }
1014 1014
1015 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1015 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1016 gfp_t mask) 1016 gfp_t mask)
1017 { 1017 {
1018 int err; 1018 int err;
1019 u64 failed_start; 1019 u64 failed_start;
1020 1020
1021 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1021 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1022 &failed_start, NULL, mask); 1022 &failed_start, NULL, mask);
1023 if (err == -EEXIST) { 1023 if (err == -EEXIST) {
1024 if (failed_start > start) 1024 if (failed_start > start)
1025 clear_extent_bit(tree, start, failed_start - 1, 1025 clear_extent_bit(tree, start, failed_start - 1,
1026 EXTENT_LOCKED, 1, 0, NULL, mask); 1026 EXTENT_LOCKED, 1, 0, NULL, mask);
1027 return 0; 1027 return 0;
1028 } 1028 }
1029 return 1; 1029 return 1;
1030 } 1030 }
1031 1031
1032 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 1032 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1033 struct extent_state **cached, gfp_t mask) 1033 struct extent_state **cached, gfp_t mask)
1034 { 1034 {
1035 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, 1035 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1036 mask); 1036 mask);
1037 } 1037 }
1038 1038
1039 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 1039 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1040 { 1040 {
1041 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1041 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1042 mask); 1042 mask);
1043 } 1043 }
1044 1044
1045 /* 1045 /*
1046 * helper function to set both the pages and the extents in the tree to writeback 1046 * helper function to set both the pages and the extents in the tree to writeback
1047 */ 1047 */
1048 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1048 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1049 { 1049 {
1050 unsigned long index = start >> PAGE_CACHE_SHIFT; 1050 unsigned long index = start >> PAGE_CACHE_SHIFT;
1051 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1051 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1052 struct page *page; 1052 struct page *page;
1053 1053
1054 while (index <= end_index) { 1054 while (index <= end_index) {
1055 page = find_get_page(tree->mapping, index); 1055 page = find_get_page(tree->mapping, index);
1056 BUG_ON(!page); 1056 BUG_ON(!page);
1057 set_page_writeback(page); 1057 set_page_writeback(page);
1058 page_cache_release(page); 1058 page_cache_release(page);
1059 index++; 1059 index++;
1060 } 1060 }
1061 return 0; 1061 return 0;
1062 } 1062 }
1063 1063
1064 /* 1064 /*
1065 * find the first offset in the io tree with 'bits' set. zero is 1065 * find the first offset in the io tree with 'bits' set. zero is
1066 * returned if we find something, and *start_ret and *end_ret are 1066 * returned if we find something, and *start_ret and *end_ret are
1067 * set to reflect the state struct that was found. 1067 * set to reflect the state struct that was found.
1068 * 1068 *
1069 * If nothing was found, 1 is returned, < 0 on error 1069 * If nothing was found, 1 is returned, < 0 on error
1070 */ 1070 */
1071 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1071 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1072 u64 *start_ret, u64 *end_ret, int bits) 1072 u64 *start_ret, u64 *end_ret, int bits)
1073 { 1073 {
1074 struct rb_node *node; 1074 struct rb_node *node;
1075 struct extent_state *state; 1075 struct extent_state *state;
1076 int ret = 1; 1076 int ret = 1;
1077 1077
1078 spin_lock(&tree->lock); 1078 spin_lock(&tree->lock);
1079 /* 1079 /*
1080 * this search will find all the extents that end after 1080 * this search will find all the extents that end after
1081 * our range starts. 1081 * our range starts.
1082 */ 1082 */
1083 node = tree_search(tree, start); 1083 node = tree_search(tree, start);
1084 if (!node) 1084 if (!node)
1085 goto out; 1085 goto out;
1086 1086
1087 while (1) { 1087 while (1) {
1088 state = rb_entry(node, struct extent_state, rb_node); 1088 state = rb_entry(node, struct extent_state, rb_node);
1089 if (state->end >= start && (state->state & bits)) { 1089 if (state->end >= start && (state->state & bits)) {
1090 *start_ret = state->start; 1090 *start_ret = state->start;
1091 *end_ret = state->end; 1091 *end_ret = state->end;
1092 ret = 0; 1092 ret = 0;
1093 break; 1093 break;
1094 } 1094 }
1095 node = rb_next(node); 1095 node = rb_next(node);
1096 if (!node) 1096 if (!node)
1097 break; 1097 break;
1098 } 1098 }
1099 out: 1099 out:
1100 spin_unlock(&tree->lock); 1100 spin_unlock(&tree->lock);
1101 return ret; 1101 return ret;
1102 } 1102 }
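A sketch of the same scan over a sorted array instead of an rb-tree, to make the return convention concrete (0 plus the range on success, 1 when nothing at or after 'start' carries the bits). The array stand-in and sk_* names are not kernel code.

/* Standalone sketch (not kernel code) of the find-first-with-bits scan. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct sk_state { uint64_t start, end; unsigned int state; };

static int sk_find_first(const struct sk_state *v, size_t n, uint64_t start,
                         int bits, uint64_t *start_ret, uint64_t *end_ret)
{
        for (size_t i = 0; i < n; i++) {
                if (v[i].end >= start && (v[i].state & bits)) {
                        *start_ret = v[i].start;
                        *end_ret = v[i].end;
                        return 0;               /* found something */
                }
        }
        return 1;                               /* nothing found */
}

int main(void)
{
        const struct sk_state tree[] = {
                { 0,    4095,  0x2 },
                { 4096, 8191,  0x1 },
                { 8192, 12287, 0x1 },
        };
        uint64_t s, e;

        if (!sk_find_first(tree, 3, 1000, 0x1, &s, &e))
                printf("[%llu, %llu]\n", (unsigned long long)s,
                       (unsigned long long)e);  /* [4096, 8191] */
        return 0;
}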
1103 1103
1104 /* find the first state struct with 'bits' set after 'start', and 1104 /* find the first state struct with 'bits' set after 'start', and
1105 * return it. tree->lock must be held. NULL will be returned if 1105 * return it. tree->lock must be held. NULL will be returned if
1106 * nothing was found after 'start' 1106 * nothing was found after 'start'
1107 */ 1107 */
1108 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 1108 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1109 u64 start, int bits) 1109 u64 start, int bits)
1110 { 1110 {
1111 struct rb_node *node; 1111 struct rb_node *node;
1112 struct extent_state *state; 1112 struct extent_state *state;
1113 1113
1114 /* 1114 /*
1115 * this search will find all the extents that end after 1115 * this search will find all the extents that end after
1116 * our range starts. 1116 * our range starts.
1117 */ 1117 */
1118 node = tree_search(tree, start); 1118 node = tree_search(tree, start);
1119 if (!node) 1119 if (!node)
1120 goto out; 1120 goto out;
1121 1121
1122 while (1) { 1122 while (1) {
1123 state = rb_entry(node, struct extent_state, rb_node); 1123 state = rb_entry(node, struct extent_state, rb_node);
1124 if (state->end >= start && (state->state & bits)) 1124 if (state->end >= start && (state->state & bits))
1125 return state; 1125 return state;
1126 1126
1127 node = rb_next(node); 1127 node = rb_next(node);
1128 if (!node) 1128 if (!node)
1129 break; 1129 break;
1130 } 1130 }
1131 out: 1131 out:
1132 return NULL; 1132 return NULL;
1133 } 1133 }
1134 1134
1135 /* 1135 /*
1136 * find a contiguous range of bytes in the file marked as delalloc, not 1136 * find a contiguous range of bytes in the file marked as delalloc, not
1137 * more than 'max_bytes'. start and end are used to return the range. 1137 * more than 'max_bytes'. start and end are used to return the range.
1138 * 1138 *
1139 * 1 is returned if we find something, 0 if nothing was in the tree 1139 * 1 is returned if we find something, 0 if nothing was in the tree
1140 */ 1140 */
1141 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1141 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1142 u64 *start, u64 *end, u64 max_bytes, 1142 u64 *start, u64 *end, u64 max_bytes,
1143 struct extent_state **cached_state) 1143 struct extent_state **cached_state)
1144 { 1144 {
1145 struct rb_node *node; 1145 struct rb_node *node;
1146 struct extent_state *state; 1146 struct extent_state *state;
1147 u64 cur_start = *start; 1147 u64 cur_start = *start;
1148 u64 found = 0; 1148 u64 found = 0;
1149 u64 total_bytes = 0; 1149 u64 total_bytes = 0;
1150 1150
1151 spin_lock(&tree->lock); 1151 spin_lock(&tree->lock);
1152 1152
1153 /* 1153 /*
1154 * this search will find all the extents that end after 1154 * this search will find all the extents that end after
1155 * our range starts. 1155 * our range starts.
1156 */ 1156 */
1157 node = tree_search(tree, cur_start); 1157 node = tree_search(tree, cur_start);
1158 if (!node) { 1158 if (!node) {
1159 if (!found) 1159 if (!found)
1160 *end = (u64)-1; 1160 *end = (u64)-1;
1161 goto out; 1161 goto out;
1162 } 1162 }
1163 1163
1164 while (1) { 1164 while (1) {
1165 state = rb_entry(node, struct extent_state, rb_node); 1165 state = rb_entry(node, struct extent_state, rb_node);
1166 if (found && (state->start != cur_start || 1166 if (found && (state->start != cur_start ||
1167 (state->state & EXTENT_BOUNDARY))) { 1167 (state->state & EXTENT_BOUNDARY))) {
1168 goto out; 1168 goto out;
1169 } 1169 }
1170 if (!(state->state & EXTENT_DELALLOC)) { 1170 if (!(state->state & EXTENT_DELALLOC)) {
1171 if (!found) 1171 if (!found)
1172 *end = state->end; 1172 *end = state->end;
1173 goto out; 1173 goto out;
1174 } 1174 }
1175 if (!found) { 1175 if (!found) {
1176 *start = state->start; 1176 *start = state->start;
1177 *cached_state = state; 1177 *cached_state = state;
1178 atomic_inc(&state->refs); 1178 atomic_inc(&state->refs);
1179 } 1179 }
1180 found++; 1180 found++;
1181 *end = state->end; 1181 *end = state->end;
1182 cur_start = state->end + 1; 1182 cur_start = state->end + 1;
1183 node = rb_next(node); 1183 node = rb_next(node);
1184 if (!node) 1184 if (!node)
1185 break; 1185 break;
1186 total_bytes += state->end - state->start + 1; 1186 total_bytes += state->end - state->start + 1;
1187 if (total_bytes >= max_bytes) 1187 if (total_bytes >= max_bytes)
1188 break; 1188 break;
1189 } 1189 }
1190 out: 1190 out:
1191 spin_unlock(&tree->lock); 1191 spin_unlock(&tree->lock);
1192 return found; 1192 return found;
1193 } 1193 }
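A simplified model of the delalloc walk above: keep extending the run while ranges are contiguous and delalloc, and stop on a gap, a boundary bit, a non-delalloc range, or once max_bytes has been accumulated. The bit values and the array-backed "tree" are assumptions of the sketch, not kernel symbols.

/* Standalone sketch (not kernel code) of accumulating a contiguous
 * delalloc run starting at *start. Returns the number of ranges folded
 * into [*start, *end]. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define SK_DELALLOC  (1 << 0)   /* hypothetical bit values */
#define SK_BOUNDARY  (1 << 1)

struct sk_state { uint64_t start, end; unsigned int state; };

static uint64_t sk_find_delalloc(const struct sk_state *v, size_t n,
                                 uint64_t *start, uint64_t *end, uint64_t max_bytes)
{
        uint64_t cur = *start, found = 0, total = 0;

        for (size_t i = 0; i < n; i++) {
                if (v[i].end < cur)
                        continue;                 /* entirely before our range */
                if (found && (v[i].start != cur || (v[i].state & SK_BOUNDARY)))
                        break;                    /* gap or boundary ends the run */
                if (!(v[i].state & SK_DELALLOC))
                        break;
                if (!found)
                        *start = v[i].start;
                found++;
                *end = v[i].end;
                cur = v[i].end + 1;
                total += v[i].end - v[i].start + 1;
                if (total >= max_bytes)
                        break;
        }
        return found;
}

int main(void)
{
        const struct sk_state tree[] = {
                { 0,     4095,  SK_DELALLOC },
                { 4096,  8191,  SK_DELALLOC },
                { 12288, 16383, SK_DELALLOC },   /* gap before this one */
        };
        uint64_t s = 0, e = 0;

        printf("found %llu range [%llu, %llu]\n",
               (unsigned long long)sk_find_delalloc(tree, 3, &s, &e, ~0ULL),
               (unsigned long long)s, (unsigned long long)e);  /* 2, [0, 8191] */
        return 0;
}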
1194 1194
1195 static noinline int __unlock_for_delalloc(struct inode *inode, 1195 static noinline int __unlock_for_delalloc(struct inode *inode,
1196 struct page *locked_page, 1196 struct page *locked_page,
1197 u64 start, u64 end) 1197 u64 start, u64 end)
1198 { 1198 {
1199 int ret; 1199 int ret;
1200 struct page *pages[16]; 1200 struct page *pages[16];
1201 unsigned long index = start >> PAGE_CACHE_SHIFT; 1201 unsigned long index = start >> PAGE_CACHE_SHIFT;
1202 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1202 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1203 unsigned long nr_pages = end_index - index + 1; 1203 unsigned long nr_pages = end_index - index + 1;
1204 int i; 1204 int i;
1205 1205
1206 if (index == locked_page->index && end_index == index) 1206 if (index == locked_page->index && end_index == index)
1207 return 0; 1207 return 0;
1208 1208
1209 while (nr_pages > 0) { 1209 while (nr_pages > 0) {
1210 ret = find_get_pages_contig(inode->i_mapping, index, 1210 ret = find_get_pages_contig(inode->i_mapping, index,
1211 min_t(unsigned long, nr_pages, 1211 min_t(unsigned long, nr_pages,
1212 ARRAY_SIZE(pages)), pages); 1212 ARRAY_SIZE(pages)), pages);
1213 for (i = 0; i < ret; i++) { 1213 for (i = 0; i < ret; i++) {
1214 if (pages[i] != locked_page) 1214 if (pages[i] != locked_page)
1215 unlock_page(pages[i]); 1215 unlock_page(pages[i]);
1216 page_cache_release(pages[i]); 1216 page_cache_release(pages[i]);
1217 } 1217 }
1218 nr_pages -= ret; 1218 nr_pages -= ret;
1219 index += ret; 1219 index += ret;
1220 cond_resched(); 1220 cond_resched();
1221 } 1221 }
1222 return 0; 1222 return 0;
1223 } 1223 }
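The byte-to-page-index arithmetic used by __unlock_for_delalloc() (and by the other page walkers in this file) is a shift per endpoint plus an inclusive count. A sketch, assuming 4K pages in place of PAGE_CACHE_SHIFT:

/* Standalone sketch (not kernel code) of the page index math. */
#include <stdio.h>
#include <stdint.h>

#define SK_PAGE_SHIFT 12   /* assume 4K pages for the sketch */

int main(void)
{
        uint64_t start = 5000, end = 20000;
        unsigned long index = start >> SK_PAGE_SHIFT;       /* first page touched */
        unsigned long end_index = end >> SK_PAGE_SHIFT;     /* last page touched */
        unsigned long nr_pages = end_index - index + 1;

        /* byte range [5000, 20000] covers pages 1..4 on 4K pages */
        printf("pages %lu..%lu (%lu total)\n", index, end_index, nr_pages);
        return 0;
}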
1224 1224
1225 static noinline int lock_delalloc_pages(struct inode *inode, 1225 static noinline int lock_delalloc_pages(struct inode *inode,
1226 struct page *locked_page, 1226 struct page *locked_page,
1227 u64 delalloc_start, 1227 u64 delalloc_start,
1228 u64 delalloc_end) 1228 u64 delalloc_end)
1229 { 1229 {
1230 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; 1230 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1231 unsigned long start_index = index; 1231 unsigned long start_index = index;
1232 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; 1232 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1233 unsigned long pages_locked = 0; 1233 unsigned long pages_locked = 0;
1234 struct page *pages[16]; 1234 struct page *pages[16];
1235 unsigned long nrpages; 1235 unsigned long nrpages;
1236 int ret; 1236 int ret;
1237 int i; 1237 int i;
1238 1238
1239 /* the caller is responsible for locking the start index */ 1239 /* the caller is responsible for locking the start index */
1240 if (index == locked_page->index && index == end_index) 1240 if (index == locked_page->index && index == end_index)
1241 return 0; 1241 return 0;
1242 1242
1243 /* skip the page at the start index */ 1243 /* skip the page at the start index */
1244 nrpages = end_index - index + 1; 1244 nrpages = end_index - index + 1;
1245 while (nrpages > 0) { 1245 while (nrpages > 0) {
1246 ret = find_get_pages_contig(inode->i_mapping, index, 1246 ret = find_get_pages_contig(inode->i_mapping, index,
1247 min_t(unsigned long, 1247 min_t(unsigned long,
1248 nrpages, ARRAY_SIZE(pages)), pages); 1248 nrpages, ARRAY_SIZE(pages)), pages);
1249 if (ret == 0) { 1249 if (ret == 0) {
1250 ret = -EAGAIN; 1250 ret = -EAGAIN;
1251 goto done; 1251 goto done;
1252 } 1252 }
1253 /* now we have an array of pages, lock them all */ 1253 /* now we have an array of pages, lock them all */
1254 for (i = 0; i < ret; i++) { 1254 for (i = 0; i < ret; i++) {
1255 /* 1255 /*
1256 * the caller is taking responsibility for 1256 * the caller is taking responsibility for
1257 * locked_page 1257 * locked_page
1258 */ 1258 */
1259 if (pages[i] != locked_page) { 1259 if (pages[i] != locked_page) {
1260 lock_page(pages[i]); 1260 lock_page(pages[i]);
1261 if (!PageDirty(pages[i]) || 1261 if (!PageDirty(pages[i]) ||
1262 pages[i]->mapping != inode->i_mapping) { 1262 pages[i]->mapping != inode->i_mapping) {
1263 ret = -EAGAIN; 1263 ret = -EAGAIN;
1264 unlock_page(pages[i]); 1264 unlock_page(pages[i]);
1265 page_cache_release(pages[i]); 1265 page_cache_release(pages[i]);
1266 goto done; 1266 goto done;
1267 } 1267 }
1268 } 1268 }
1269 page_cache_release(pages[i]); 1269 page_cache_release(pages[i]);
1270 pages_locked++; 1270 pages_locked++;
1271 } 1271 }
1272 nrpages -= ret; 1272 nrpages -= ret;
1273 index += ret; 1273 index += ret;
1274 cond_resched(); 1274 cond_resched();
1275 } 1275 }
1276 ret = 0; 1276 ret = 0;
1277 done: 1277 done:
1278 if (ret && pages_locked) { 1278 if (ret && pages_locked) {
1279 __unlock_for_delalloc(inode, locked_page, 1279 __unlock_for_delalloc(inode, locked_page,
1280 delalloc_start, 1280 delalloc_start,
1281 ((u64)(start_index + pages_locked - 1)) << 1281 ((u64)(start_index + pages_locked - 1)) <<
1282 PAGE_CACHE_SHIFT); 1282 PAGE_CACHE_SHIFT);
1283 } 1283 }
1284 return ret; 1284 return ret;
1285 } 1285 }
1286 1286
1287 /* 1287 /*
1288 * find a contiguous range of bytes in the file marked as delalloc, not 1288 * find a contiguous range of bytes in the file marked as delalloc, not
1289 * more than 'max_bytes'. start and end are used to return the range. 1289 * more than 'max_bytes'. start and end are used to return the range.
1290 * 1290 *
1291 * 1 is returned if we find something, 0 if nothing was in the tree 1291 * 1 is returned if we find something, 0 if nothing was in the tree
1292 */ 1292 */
1293 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1293 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1294 struct extent_io_tree *tree, 1294 struct extent_io_tree *tree,
1295 struct page *locked_page, 1295 struct page *locked_page,
1296 u64 *start, u64 *end, 1296 u64 *start, u64 *end,
1297 u64 max_bytes) 1297 u64 max_bytes)
1298 { 1298 {
1299 u64 delalloc_start; 1299 u64 delalloc_start;
1300 u64 delalloc_end; 1300 u64 delalloc_end;
1301 u64 found; 1301 u64 found;
1302 struct extent_state *cached_state = NULL; 1302 struct extent_state *cached_state = NULL;
1303 int ret; 1303 int ret;
1304 int loops = 0; 1304 int loops = 0;
1305 1305
1306 again: 1306 again:
1307 /* step one, find a bunch of delalloc bytes starting at start */ 1307 /* step one, find a bunch of delalloc bytes starting at start */
1308 delalloc_start = *start; 1308 delalloc_start = *start;
1309 delalloc_end = 0; 1309 delalloc_end = 0;
1310 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1310 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1311 max_bytes, &cached_state); 1311 max_bytes, &cached_state);
1312 if (!found || delalloc_end <= *start) { 1312 if (!found || delalloc_end <= *start) {
1313 *start = delalloc_start; 1313 *start = delalloc_start;
1314 *end = delalloc_end; 1314 *end = delalloc_end;
1315 free_extent_state(cached_state); 1315 free_extent_state(cached_state);
1316 return found; 1316 return found;
1317 } 1317 }
1318 1318
1319 /* 1319 /*
1320 * start comes from the offset of locked_page. We have to lock 1320 * start comes from the offset of locked_page. We have to lock
1321 * pages in order, so we can't process delalloc bytes before 1321 * pages in order, so we can't process delalloc bytes before
1322 * locked_page 1322 * locked_page
1323 */ 1323 */
1324 if (delalloc_start < *start) 1324 if (delalloc_start < *start)
1325 delalloc_start = *start; 1325 delalloc_start = *start;
1326 1326
1327 /* 1327 /*
1328 * make sure to limit the number of pages we try to lock down 1328 * make sure to limit the number of pages we try to lock down
1329 * if we're looping. 1329 * if we're looping.
1330 */ 1330 */
1331 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1331 if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1332 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1332 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1333 1333
1334 /* step two, lock all the pages after the page that has start */ 1334 /* step two, lock all the pages after the page that has start */
1335 ret = lock_delalloc_pages(inode, locked_page, 1335 ret = lock_delalloc_pages(inode, locked_page,
1336 delalloc_start, delalloc_end); 1336 delalloc_start, delalloc_end);
1337 if (ret == -EAGAIN) { 1337 if (ret == -EAGAIN) {
1338 /* some of the pages are gone, let's avoid looping by 1338 /* some of the pages are gone, let's avoid looping by
1339 * shortening the size of the delalloc range we're searching 1339 * shortening the size of the delalloc range we're searching
1340 */ 1340 */
1341 free_extent_state(cached_state); 1341 free_extent_state(cached_state);
1342 if (!loops) { 1342 if (!loops) {
1343 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1343 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1344 max_bytes = PAGE_CACHE_SIZE - offset; 1344 max_bytes = PAGE_CACHE_SIZE - offset;
1345 loops = 1; 1345 loops = 1;
1346 goto again; 1346 goto again;
1347 } else { 1347 } else {
1348 found = 0; 1348 found = 0;
1349 goto out_failed; 1349 goto out_failed;
1350 } 1350 }
1351 } 1351 }
1352 BUG_ON(ret); 1352 BUG_ON(ret);
1353 1353
1354 /* step three, lock the state bits for the whole range */ 1354 /* step three, lock the state bits for the whole range */
1355 lock_extent_bits(tree, delalloc_start, delalloc_end, 1355 lock_extent_bits(tree, delalloc_start, delalloc_end,
1356 0, &cached_state, GFP_NOFS); 1356 0, &cached_state, GFP_NOFS);
1357 1357
1358 /* then test to make sure it is all still delalloc */ 1358 /* then test to make sure it is all still delalloc */
1359 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1359 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1360 EXTENT_DELALLOC, 1, cached_state); 1360 EXTENT_DELALLOC, 1, cached_state);
1361 if (!ret) { 1361 if (!ret) {
1362 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1362 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1363 &cached_state, GFP_NOFS); 1363 &cached_state, GFP_NOFS);
1364 __unlock_for_delalloc(inode, locked_page, 1364 __unlock_for_delalloc(inode, locked_page,
1365 delalloc_start, delalloc_end); 1365 delalloc_start, delalloc_end);
1366 cond_resched(); 1366 cond_resched();
1367 goto again; 1367 goto again;
1368 } 1368 }
1369 free_extent_state(cached_state); 1369 free_extent_state(cached_state);
1370 *start = delalloc_start; 1370 *start = delalloc_start;
1371 *end = delalloc_end; 1371 *end = delalloc_end;
1372 out_failed: 1372 out_failed:
1373 return found; 1373 return found;
1374 } 1374 }
1375 1375
1376 int extent_clear_unlock_delalloc(struct inode *inode, 1376 int extent_clear_unlock_delalloc(struct inode *inode,
1377 struct extent_io_tree *tree, 1377 struct extent_io_tree *tree,
1378 u64 start, u64 end, struct page *locked_page, 1378 u64 start, u64 end, struct page *locked_page,
1379 unsigned long op) 1379 unsigned long op)
1380 { 1380 {
1381 int ret; 1381 int ret;
1382 struct page *pages[16]; 1382 struct page *pages[16];
1383 unsigned long index = start >> PAGE_CACHE_SHIFT; 1383 unsigned long index = start >> PAGE_CACHE_SHIFT;
1384 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1384 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1385 unsigned long nr_pages = end_index - index + 1; 1385 unsigned long nr_pages = end_index - index + 1;
1386 int i; 1386 int i;
1387 int clear_bits = 0; 1387 int clear_bits = 0;
1388 1388
1389 if (op & EXTENT_CLEAR_UNLOCK) 1389 if (op & EXTENT_CLEAR_UNLOCK)
1390 clear_bits |= EXTENT_LOCKED; 1390 clear_bits |= EXTENT_LOCKED;
1391 if (op & EXTENT_CLEAR_DIRTY) 1391 if (op & EXTENT_CLEAR_DIRTY)
1392 clear_bits |= EXTENT_DIRTY; 1392 clear_bits |= EXTENT_DIRTY;
1393 1393
1394 if (op & EXTENT_CLEAR_DELALLOC) 1394 if (op & EXTENT_CLEAR_DELALLOC)
1395 clear_bits |= EXTENT_DELALLOC; 1395 clear_bits |= EXTENT_DELALLOC;
1396 1396
1397 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1397 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1398 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1398 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1399 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1399 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1400 EXTENT_SET_PRIVATE2))) 1400 EXTENT_SET_PRIVATE2)))
1401 return 0; 1401 return 0;
1402 1402
1403 while (nr_pages > 0) { 1403 while (nr_pages > 0) {
1404 ret = find_get_pages_contig(inode->i_mapping, index, 1404 ret = find_get_pages_contig(inode->i_mapping, index,
1405 min_t(unsigned long, 1405 min_t(unsigned long,
1406 nr_pages, ARRAY_SIZE(pages)), pages); 1406 nr_pages, ARRAY_SIZE(pages)), pages);
1407 for (i = 0; i < ret; i++) { 1407 for (i = 0; i < ret; i++) {
1408 1408
1409 if (op & EXTENT_SET_PRIVATE2) 1409 if (op & EXTENT_SET_PRIVATE2)
1410 SetPagePrivate2(pages[i]); 1410 SetPagePrivate2(pages[i]);
1411 1411
1412 if (pages[i] == locked_page) { 1412 if (pages[i] == locked_page) {
1413 page_cache_release(pages[i]); 1413 page_cache_release(pages[i]);
1414 continue; 1414 continue;
1415 } 1415 }
1416 if (op & EXTENT_CLEAR_DIRTY) 1416 if (op & EXTENT_CLEAR_DIRTY)
1417 clear_page_dirty_for_io(pages[i]); 1417 clear_page_dirty_for_io(pages[i]);
1418 if (op & EXTENT_SET_WRITEBACK) 1418 if (op & EXTENT_SET_WRITEBACK)
1419 set_page_writeback(pages[i]); 1419 set_page_writeback(pages[i]);
1420 if (op & EXTENT_END_WRITEBACK) 1420 if (op & EXTENT_END_WRITEBACK)
1421 end_page_writeback(pages[i]); 1421 end_page_writeback(pages[i]);
1422 if (op & EXTENT_CLEAR_UNLOCK_PAGE) 1422 if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1423 unlock_page(pages[i]); 1423 unlock_page(pages[i]);
1424 page_cache_release(pages[i]); 1424 page_cache_release(pages[i]);
1425 } 1425 }
1426 nr_pages -= ret; 1426 nr_pages -= ret;
1427 index += ret; 1427 index += ret;
1428 cond_resched(); 1428 cond_resched();
1429 } 1429 }
1430 return 0; 1430 return 0;
1431 } 1431 }
1432 1432
1433 /* 1433 /*
1434 * count the number of bytes in the tree that have a given bit(s) 1434 * count the number of bytes in the tree that have a given bit(s)
1435 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1435 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1436 * cached. The total number found is returned. 1436 * cached. The total number found is returned.
1437 */ 1437 */
1438 u64 count_range_bits(struct extent_io_tree *tree, 1438 u64 count_range_bits(struct extent_io_tree *tree,
1439 u64 *start, u64 search_end, u64 max_bytes, 1439 u64 *start, u64 search_end, u64 max_bytes,
1440 unsigned long bits, int contig) 1440 unsigned long bits, int contig)
1441 { 1441 {
1442 struct rb_node *node; 1442 struct rb_node *node;
1443 struct extent_state *state; 1443 struct extent_state *state;
1444 u64 cur_start = *start; 1444 u64 cur_start = *start;
1445 u64 total_bytes = 0; 1445 u64 total_bytes = 0;
1446 u64 last = 0; 1446 u64 last = 0;
1447 int found = 0; 1447 int found = 0;
1448 1448
1449 if (search_end <= cur_start) { 1449 if (search_end <= cur_start) {
1450 WARN_ON(1); 1450 WARN_ON(1);
1451 return 0; 1451 return 0;
1452 } 1452 }
1453 1453
1454 spin_lock(&tree->lock); 1454 spin_lock(&tree->lock);
1455 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1455 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1456 total_bytes = tree->dirty_bytes; 1456 total_bytes = tree->dirty_bytes;
1457 goto out; 1457 goto out;
1458 } 1458 }
1459 /* 1459 /*
1460 * this search will find all the extents that end after 1460 * this search will find all the extents that end after
1461 * our range starts. 1461 * our range starts.
1462 */ 1462 */
1463 node = tree_search(tree, cur_start); 1463 node = tree_search(tree, cur_start);
1464 if (!node) 1464 if (!node)
1465 goto out; 1465 goto out;
1466 1466
1467 while (1) { 1467 while (1) {
1468 state = rb_entry(node, struct extent_state, rb_node); 1468 state = rb_entry(node, struct extent_state, rb_node);
1469 if (state->start > search_end) 1469 if (state->start > search_end)
1470 break; 1470 break;
1471 if (contig && found && state->start > last + 1) 1471 if (contig && found && state->start > last + 1)
1472 break; 1472 break;
1473 if (state->end >= cur_start && (state->state & bits) == bits) { 1473 if (state->end >= cur_start && (state->state & bits) == bits) {
1474 total_bytes += min(search_end, state->end) + 1 - 1474 total_bytes += min(search_end, state->end) + 1 -
1475 max(cur_start, state->start); 1475 max(cur_start, state->start);
1476 if (total_bytes >= max_bytes) 1476 if (total_bytes >= max_bytes)
1477 break; 1477 break;
1478 if (!found) { 1478 if (!found) {
1479 *start = max(cur_start, state->start); 1479 *start = max(cur_start, state->start);
1480 found = 1; 1480 found = 1;
1481 } 1481 }
1482 last = state->end; 1482 last = state->end;
1483 } else if (contig && found) { 1483 } else if (contig && found) {
1484 break; 1484 break;
1485 } 1485 }
1486 node = rb_next(node); 1486 node = rb_next(node);
1487 if (!node) 1487 if (!node)
1488 break; 1488 break;
1489 } 1489 }
1490 out: 1490 out:
1491 spin_unlock(&tree->lock); 1491 spin_unlock(&tree->lock);
1492 return total_bytes; 1492 return total_bytes;
1493 } 1493 }
1494 1494
1495 /* 1495 /*
1496 * set the private field for a given byte offset in the tree. If there isn't 1496 * set the private field for a given byte offset in the tree. If there isn't
1497 * an extent_state there already, this does nothing. 1497 * an extent_state there already, this does nothing.
1498 */ 1498 */
1499 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1499 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1500 { 1500 {
1501 struct rb_node *node; 1501 struct rb_node *node;
1502 struct extent_state *state; 1502 struct extent_state *state;
1503 int ret = 0; 1503 int ret = 0;
1504 1504
1505 spin_lock(&tree->lock); 1505 spin_lock(&tree->lock);
1506 /* 1506 /*
1507 * this search will find all the extents that end after 1507 * this search will find all the extents that end after
1508 * our range starts. 1508 * our range starts.
1509 */ 1509 */
1510 node = tree_search(tree, start); 1510 node = tree_search(tree, start);
1511 if (!node) { 1511 if (!node) {
1512 ret = -ENOENT; 1512 ret = -ENOENT;
1513 goto out; 1513 goto out;
1514 } 1514 }
1515 state = rb_entry(node, struct extent_state, rb_node); 1515 state = rb_entry(node, struct extent_state, rb_node);
1516 if (state->start != start) { 1516 if (state->start != start) {
1517 ret = -ENOENT; 1517 ret = -ENOENT;
1518 goto out; 1518 goto out;
1519 } 1519 }
1520 state->private = private; 1520 state->private = private;
1521 out: 1521 out:
1522 spin_unlock(&tree->lock); 1522 spin_unlock(&tree->lock);
1523 return ret; 1523 return ret;
1524 } 1524 }
1525 1525
1526 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1526 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1527 { 1527 {
1528 struct rb_node *node; 1528 struct rb_node *node;
1529 struct extent_state *state; 1529 struct extent_state *state;
1530 int ret = 0; 1530 int ret = 0;
1531 1531
1532 spin_lock(&tree->lock); 1532 spin_lock(&tree->lock);
1533 /* 1533 /*
1534 * this search will find all the extents that end after 1534 * this search will find all the extents that end after
1535 * our range starts. 1535 * our range starts.
1536 */ 1536 */
1537 node = tree_search(tree, start); 1537 node = tree_search(tree, start);
1538 if (!node) { 1538 if (!node) {
1539 ret = -ENOENT; 1539 ret = -ENOENT;
1540 goto out; 1540 goto out;
1541 } 1541 }
1542 state = rb_entry(node, struct extent_state, rb_node); 1542 state = rb_entry(node, struct extent_state, rb_node);
1543 if (state->start != start) { 1543 if (state->start != start) {
1544 ret = -ENOENT; 1544 ret = -ENOENT;
1545 goto out; 1545 goto out;
1546 } 1546 }
1547 *private = state->private; 1547 *private = state->private;
1548 out: 1548 out:
1549 spin_unlock(&tree->lock); 1549 spin_unlock(&tree->lock);
1550 return ret; 1550 return ret;
1551 } 1551 }
1552 1552
1553 /* 1553 /*
1554 * searches a range in the state tree for a given mask. 1554 * searches a range in the state tree for a given mask.
1555 * If 'filled' == 1, this returns 1 only if every extent in the tree 1555 * If 'filled' == 1, this returns 1 only if every extent in the tree
1556 * has the bits set. Otherwise, 1 is returned if any bit in the 1556 * has the bits set. Otherwise, 1 is returned if any bit in the
1557 * range is found set. 1557 * range is found set.
1558 */ 1558 */
1559 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1559 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1560 int bits, int filled, struct extent_state *cached) 1560 int bits, int filled, struct extent_state *cached)
1561 { 1561 {
1562 struct extent_state *state = NULL; 1562 struct extent_state *state = NULL;
1563 struct rb_node *node; 1563 struct rb_node *node;
1564 int bitset = 0; 1564 int bitset = 0;
1565 1565
1566 spin_lock(&tree->lock); 1566 spin_lock(&tree->lock);
1567 if (cached && cached->tree && cached->start == start) 1567 if (cached && cached->tree && cached->start == start)
1568 node = &cached->rb_node; 1568 node = &cached->rb_node;
1569 else 1569 else
1570 node = tree_search(tree, start); 1570 node = tree_search(tree, start);
1571 while (node && start <= end) { 1571 while (node && start <= end) {
1572 state = rb_entry(node, struct extent_state, rb_node); 1572 state = rb_entry(node, struct extent_state, rb_node);
1573 1573
1574 if (filled && state->start > start) { 1574 if (filled && state->start > start) {
1575 bitset = 0; 1575 bitset = 0;
1576 break; 1576 break;
1577 } 1577 }
1578 1578
1579 if (state->start > end) 1579 if (state->start > end)
1580 break; 1580 break;
1581 1581
1582 if (state->state & bits) { 1582 if (state->state & bits) {
1583 bitset = 1; 1583 bitset = 1;
1584 if (!filled) 1584 if (!filled)
1585 break; 1585 break;
1586 } else if (filled) { 1586 } else if (filled) {
1587 bitset = 0; 1587 bitset = 0;
1588 break; 1588 break;
1589 } 1589 }
1590 1590
1591 if (state->end == (u64)-1) 1591 if (state->end == (u64)-1)
1592 break; 1592 break;
1593 1593
1594 start = state->end + 1; 1594 start = state->end + 1;
1595 if (start > end) 1595 if (start > end)
1596 break; 1596 break;
1597 node = rb_next(node); 1597 node = rb_next(node);
1598 if (!node) { 1598 if (!node) {
1599 if (filled) 1599 if (filled)
1600 bitset = 0; 1600 bitset = 0;
1601 break; 1601 break;
1602 } 1602 }
1603 } 1603 }
1604 spin_unlock(&tree->lock); 1604 spin_unlock(&tree->lock);
1605 return bitset; 1605 return bitset;
1606 } 1606 }
1607 1607
1608 /* 1608 /*
1609 * helper function to set a given page up to date if all the 1609 * helper function to set a given page up to date if all the
1610 * extents in the tree for that page are up to date 1610 * extents in the tree for that page are up to date
1611 */ 1611 */
1612 static int check_page_uptodate(struct extent_io_tree *tree, 1612 static int check_page_uptodate(struct extent_io_tree *tree,
1613 struct page *page) 1613 struct page *page)
1614 { 1614 {
1615 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1615 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1616 u64 end = start + PAGE_CACHE_SIZE - 1; 1616 u64 end = start + PAGE_CACHE_SIZE - 1;
1617 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1617 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1618 SetPageUptodate(page); 1618 SetPageUptodate(page);
1619 return 0; 1619 return 0;
1620 } 1620 }
1621 1621
1622 /* 1622 /*
1623 * helper function to unlock a page if all the extents in the tree 1623 * helper function to unlock a page if all the extents in the tree
1624 * for that page are unlocked 1624 * for that page are unlocked
1625 */ 1625 */
1626 static int check_page_locked(struct extent_io_tree *tree, 1626 static int check_page_locked(struct extent_io_tree *tree,
1627 struct page *page) 1627 struct page *page)
1628 { 1628 {
1629 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1629 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1630 u64 end = start + PAGE_CACHE_SIZE - 1; 1630 u64 end = start + PAGE_CACHE_SIZE - 1;
1631 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1631 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1632 unlock_page(page); 1632 unlock_page(page);
1633 return 0; 1633 return 0;
1634 } 1634 }
1635 1635
1636 /* 1636 /*
1637 * helper function to end page writeback if all the extents 1637 * helper function to end page writeback if all the extents
1638 * in the tree for that page are done with writeback 1638 * in the tree for that page are done with writeback
1639 */ 1639 */
1640 static int check_page_writeback(struct extent_io_tree *tree, 1640 static int check_page_writeback(struct extent_io_tree *tree,
1641 struct page *page) 1641 struct page *page)
1642 { 1642 {
1643 end_page_writeback(page); 1643 end_page_writeback(page);
1644 return 0; 1644 return 0;
1645 } 1645 }
1646 1646
1647 /* lots and lots of room for performance fixes in the end_bio funcs */ 1647 /* lots and lots of room for performance fixes in the end_bio funcs */
1648 1648
1649 /* 1649 /*
1650 * after a writepage IO is done, we need to: 1650 * after a writepage IO is done, we need to:
1651 * clear the uptodate bits on error 1651 * clear the uptodate bits on error
1652 * clear the writeback bits in the extent tree for this IO 1652 * clear the writeback bits in the extent tree for this IO
1653 * end_page_writeback if the page has no more pending IO 1653 * end_page_writeback if the page has no more pending IO
1654 * 1654 *
1655 * Scheduling is not allowed, so the extent state tree is expected 1655 * Scheduling is not allowed, so the extent state tree is expected
1656 * to have one and only one object corresponding to this IO. 1656 * to have one and only one object corresponding to this IO.
1657 */ 1657 */
1658 static void end_bio_extent_writepage(struct bio *bio, int err) 1658 static void end_bio_extent_writepage(struct bio *bio, int err)
1659 { 1659 {
1660 int uptodate = err == 0; 1660 int uptodate = err == 0;
1661 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1661 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1662 struct extent_io_tree *tree; 1662 struct extent_io_tree *tree;
1663 u64 start; 1663 u64 start;
1664 u64 end; 1664 u64 end;
1665 int whole_page; 1665 int whole_page;
1666 int ret; 1666 int ret;
1667 1667
1668 do { 1668 do {
1669 struct page *page = bvec->bv_page; 1669 struct page *page = bvec->bv_page;
1670 tree = &BTRFS_I(page->mapping->host)->io_tree; 1670 tree = &BTRFS_I(page->mapping->host)->io_tree;
1671 1671
1672 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1672 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1673 bvec->bv_offset; 1673 bvec->bv_offset;
1674 end = start + bvec->bv_len - 1; 1674 end = start + bvec->bv_len - 1;
1675 1675
1676 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1676 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1677 whole_page = 1; 1677 whole_page = 1;
1678 else 1678 else
1679 whole_page = 0; 1679 whole_page = 0;
1680 1680
1681 if (--bvec >= bio->bi_io_vec) 1681 if (--bvec >= bio->bi_io_vec)
1682 prefetchw(&bvec->bv_page->flags); 1682 prefetchw(&bvec->bv_page->flags);
1683 if (tree->ops && tree->ops->writepage_end_io_hook) { 1683 if (tree->ops && tree->ops->writepage_end_io_hook) {
1684 ret = tree->ops->writepage_end_io_hook(page, start, 1684 ret = tree->ops->writepage_end_io_hook(page, start,
1685 end, NULL, uptodate); 1685 end, NULL, uptodate);
1686 if (ret) 1686 if (ret)
1687 uptodate = 0; 1687 uptodate = 0;
1688 } 1688 }
1689 1689
1690 if (!uptodate && tree->ops && 1690 if (!uptodate && tree->ops &&
1691 tree->ops->writepage_io_failed_hook) { 1691 tree->ops->writepage_io_failed_hook) {
1692 ret = tree->ops->writepage_io_failed_hook(bio, page, 1692 ret = tree->ops->writepage_io_failed_hook(bio, page,
1693 start, end, NULL); 1693 start, end, NULL);
1694 if (ret == 0) { 1694 if (ret == 0) {
1695 uptodate = (err == 0); 1695 uptodate = (err == 0);
1696 continue; 1696 continue;
1697 } 1697 }
1698 } 1698 }
1699 1699
1700 if (!uptodate) { 1700 if (!uptodate) {
1701 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); 1701 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
1702 ClearPageUptodate(page); 1702 ClearPageUptodate(page);
1703 SetPageError(page); 1703 SetPageError(page);
1704 } 1704 }
1705 1705
1706 if (whole_page) 1706 if (whole_page)
1707 end_page_writeback(page); 1707 end_page_writeback(page);
1708 else 1708 else
1709 check_page_writeback(tree, page); 1709 check_page_writeback(tree, page);
1710 } while (bvec >= bio->bi_io_vec); 1710 } while (bvec >= bio->bi_io_vec);
1711 1711
1712 bio_put(bio); 1712 bio_put(bio);
1713 } 1713 }
1714 1714
1715 /* 1715 /*
1716 * after a readpage IO is done, we need to: 1716 * after a readpage IO is done, we need to:
1717 * clear the uptodate bits on error 1717 * clear the uptodate bits on error
1718 * set the uptodate bits if things worked 1718 * set the uptodate bits if things worked
1719 * set the page up to date if all extents in the tree are uptodate 1719 * set the page up to date if all extents in the tree are uptodate
1720 * clear the lock bit in the extent tree 1720 * clear the lock bit in the extent tree
1721 * unlock the page if there are no other extents locked for it 1721 * unlock the page if there are no other extents locked for it
1722 * 1722 *
1723 * Scheduling is not allowed, so the extent state tree is expected 1723 * Scheduling is not allowed, so the extent state tree is expected
1724 * to have one and only one object corresponding to this IO. 1724 * to have one and only one object corresponding to this IO.
1725 */ 1725 */
1726 static void end_bio_extent_readpage(struct bio *bio, int err) 1726 static void end_bio_extent_readpage(struct bio *bio, int err)
1727 { 1727 {
1728 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1728 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1729 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 1729 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
1730 struct bio_vec *bvec = bio->bi_io_vec; 1730 struct bio_vec *bvec = bio->bi_io_vec;
1731 struct extent_io_tree *tree; 1731 struct extent_io_tree *tree;
1732 u64 start; 1732 u64 start;
1733 u64 end; 1733 u64 end;
1734 int whole_page; 1734 int whole_page;
1735 int ret; 1735 int ret;
1736 1736
1737 if (err) 1737 if (err)
1738 uptodate = 0; 1738 uptodate = 0;
1739 1739
1740 do { 1740 do {
1741 struct page *page = bvec->bv_page; 1741 struct page *page = bvec->bv_page;
1742 struct extent_state *cached = NULL; 1742 struct extent_state *cached = NULL;
1743 struct extent_state *state; 1743 struct extent_state *state;
1744 1744
1745 tree = &BTRFS_I(page->mapping->host)->io_tree; 1745 tree = &BTRFS_I(page->mapping->host)->io_tree;
1746 1746
1747 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1747 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1748 bvec->bv_offset; 1748 bvec->bv_offset;
1749 end = start + bvec->bv_len - 1; 1749 end = start + bvec->bv_len - 1;
1750 1750
1751 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1751 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1752 whole_page = 1; 1752 whole_page = 1;
1753 else 1753 else
1754 whole_page = 0; 1754 whole_page = 0;
1755 1755
1756 if (++bvec <= bvec_end) 1756 if (++bvec <= bvec_end)
1757 prefetchw(&bvec->bv_page->flags); 1757 prefetchw(&bvec->bv_page->flags);
1758 1758
1759 spin_lock(&tree->lock); 1759 spin_lock(&tree->lock);
1760 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); 1760 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1761 if (state && state->start == start) { 1761 if (state && state->start == start) {
1762 /* 1762 /*
1763 * take a reference on the state, unlock will drop 1763 * take a reference on the state, unlock will drop
1764 * the ref 1764 * the ref
1765 */ 1765 */
1766 cache_state(state, &cached); 1766 cache_state(state, &cached);
1767 } 1767 }
1768 spin_unlock(&tree->lock); 1768 spin_unlock(&tree->lock);
1769 1769
1770 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1770 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1771 ret = tree->ops->readpage_end_io_hook(page, start, end, 1771 ret = tree->ops->readpage_end_io_hook(page, start, end,
1772 state); 1772 state);
1773 if (ret) 1773 if (ret)
1774 uptodate = 0; 1774 uptodate = 0;
1775 } 1775 }
1776 if (!uptodate && tree->ops && 1776 if (!uptodate && tree->ops &&
1777 tree->ops->readpage_io_failed_hook) { 1777 tree->ops->readpage_io_failed_hook) {
1778 ret = tree->ops->readpage_io_failed_hook(bio, page, 1778 ret = tree->ops->readpage_io_failed_hook(bio, page,
1779 start, end, NULL); 1779 start, end, NULL);
1780 if (ret == 0) { 1780 if (ret == 0) {
1781 uptodate = 1781 uptodate =
1782 test_bit(BIO_UPTODATE, &bio->bi_flags); 1782 test_bit(BIO_UPTODATE, &bio->bi_flags);
1783 if (err) 1783 if (err)
1784 uptodate = 0; 1784 uptodate = 0;
1785 uncache_state(&cached); 1785 uncache_state(&cached);
1786 continue; 1786 continue;
1787 } 1787 }
1788 } 1788 }
1789 1789
1790 if (uptodate) { 1790 if (uptodate) {
1791 set_extent_uptodate(tree, start, end, &cached, 1791 set_extent_uptodate(tree, start, end, &cached,
1792 GFP_ATOMIC); 1792 GFP_ATOMIC);
1793 } 1793 }
1794 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 1794 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1795 1795
1796 if (whole_page) { 1796 if (whole_page) {
1797 if (uptodate) { 1797 if (uptodate) {
1798 SetPageUptodate(page); 1798 SetPageUptodate(page);
1799 } else { 1799 } else {
1800 ClearPageUptodate(page); 1800 ClearPageUptodate(page);
1801 SetPageError(page); 1801 SetPageError(page);
1802 } 1802 }
1803 unlock_page(page); 1803 unlock_page(page);
1804 } else { 1804 } else {
1805 if (uptodate) { 1805 if (uptodate) {
1806 check_page_uptodate(tree, page); 1806 check_page_uptodate(tree, page);
1807 } else { 1807 } else {
1808 ClearPageUptodate(page); 1808 ClearPageUptodate(page);
1809 SetPageError(page); 1809 SetPageError(page);
1810 } 1810 }
1811 check_page_locked(tree, page); 1811 check_page_locked(tree, page);
1812 } 1812 }
1813 } while (bvec <= bvec_end); 1813 } while (bvec <= bvec_end);
1814 1814
1815 bio_put(bio); 1815 bio_put(bio);
1816 } 1816 }
1817 1817
1818 struct bio * 1818 struct bio *
1819 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1819 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1820 gfp_t gfp_flags) 1820 gfp_t gfp_flags)
1821 { 1821 {
1822 struct bio *bio; 1822 struct bio *bio;
1823 1823
1824 bio = bio_alloc(gfp_flags, nr_vecs); 1824 bio = bio_alloc(gfp_flags, nr_vecs);
1825 1825
1826 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 1826 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1827 while (!bio && (nr_vecs /= 2)) 1827 while (!bio && (nr_vecs /= 2))
1828 bio = bio_alloc(gfp_flags, nr_vecs); 1828 bio = bio_alloc(gfp_flags, nr_vecs);
1829 } 1829 }
1830 1830
1831 if (bio) { 1831 if (bio) {
1832 bio->bi_size = 0; 1832 bio->bi_size = 0;
1833 bio->bi_bdev = bdev; 1833 bio->bi_bdev = bdev;
1834 bio->bi_sector = first_sector; 1834 bio->bi_sector = first_sector;
1835 } 1835 }
1836 return bio; 1836 return bio;
1837 } 1837 }
1838 1838
1839 static int submit_one_bio(int rw, struct bio *bio, int mirror_num, 1839 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1840 unsigned long bio_flags) 1840 unsigned long bio_flags)
1841 { 1841 {
1842 int ret = 0; 1842 int ret = 0;
1843 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1843 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1844 struct page *page = bvec->bv_page; 1844 struct page *page = bvec->bv_page;
1845 struct extent_io_tree *tree = bio->bi_private; 1845 struct extent_io_tree *tree = bio->bi_private;
1846 u64 start; 1846 u64 start;
1847 1847
1848 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1848 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1849 1849
1850 bio->bi_private = NULL; 1850 bio->bi_private = NULL;
1851 1851
1852 bio_get(bio); 1852 bio_get(bio);
1853 1853
1854 if (tree->ops && tree->ops->submit_bio_hook) 1854 if (tree->ops && tree->ops->submit_bio_hook)
1855 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1855 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1856 mirror_num, bio_flags, start); 1856 mirror_num, bio_flags, start);
1857 else 1857 else
1858 submit_bio(rw, bio); 1858 submit_bio(rw, bio);
1859 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1859 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1860 ret = -EOPNOTSUPP; 1860 ret = -EOPNOTSUPP;
1861 bio_put(bio); 1861 bio_put(bio);
1862 return ret; 1862 return ret;
1863 } 1863 }
1864 1864
1865 static int submit_extent_page(int rw, struct extent_io_tree *tree, 1865 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1866 struct page *page, sector_t sector, 1866 struct page *page, sector_t sector,
1867 size_t size, unsigned long offset, 1867 size_t size, unsigned long offset,
1868 struct block_device *bdev, 1868 struct block_device *bdev,
1869 struct bio **bio_ret, 1869 struct bio **bio_ret,
1870 unsigned long max_pages, 1870 unsigned long max_pages,
1871 bio_end_io_t end_io_func, 1871 bio_end_io_t end_io_func,
1872 int mirror_num, 1872 int mirror_num,
1873 unsigned long prev_bio_flags, 1873 unsigned long prev_bio_flags,
1874 unsigned long bio_flags) 1874 unsigned long bio_flags)
1875 { 1875 {
1876 int ret = 0; 1876 int ret = 0;
1877 struct bio *bio; 1877 struct bio *bio;
1878 int nr; 1878 int nr;
1879 int contig = 0; 1879 int contig = 0;
1880 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 1880 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1881 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 1881 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1882 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 1882 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1883 1883
1884 if (bio_ret && *bio_ret) { 1884 if (bio_ret && *bio_ret) {
1885 bio = *bio_ret; 1885 bio = *bio_ret;
1886 if (old_compressed) 1886 if (old_compressed)
1887 contig = bio->bi_sector == sector; 1887 contig = bio->bi_sector == sector;
1888 else 1888 else
1889 contig = bio->bi_sector + (bio->bi_size >> 9) == 1889 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1890 sector; 1890 sector;
1891 1891
1892 if (prev_bio_flags != bio_flags || !contig || 1892 if (prev_bio_flags != bio_flags || !contig ||
1893 (tree->ops && tree->ops->merge_bio_hook && 1893 (tree->ops && tree->ops->merge_bio_hook &&
1894 tree->ops->merge_bio_hook(page, offset, page_size, bio, 1894 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1895 bio_flags)) || 1895 bio_flags)) ||
1896 bio_add_page(bio, page, page_size, offset) < page_size) { 1896 bio_add_page(bio, page, page_size, offset) < page_size) {
1897 ret = submit_one_bio(rw, bio, mirror_num, 1897 ret = submit_one_bio(rw, bio, mirror_num,
1898 prev_bio_flags); 1898 prev_bio_flags);
1899 bio = NULL; 1899 bio = NULL;
1900 } else { 1900 } else {
1901 return 0; 1901 return 0;
1902 } 1902 }
1903 } 1903 }
1904 if (this_compressed) 1904 if (this_compressed)
1905 nr = BIO_MAX_PAGES; 1905 nr = BIO_MAX_PAGES;
1906 else 1906 else
1907 nr = bio_get_nr_vecs(bdev); 1907 nr = bio_get_nr_vecs(bdev);
1908 1908
1909 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1909 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910 if (!bio) 1910 if (!bio)
1911 return -ENOMEM; 1911 return -ENOMEM;
1912 1912
1913 bio_add_page(bio, page, page_size, offset); 1913 bio_add_page(bio, page, page_size, offset);
1914 bio->bi_end_io = end_io_func; 1914 bio->bi_end_io = end_io_func;
1915 bio->bi_private = tree; 1915 bio->bi_private = tree;
1916 1916
1917 if (bio_ret) 1917 if (bio_ret)
1918 *bio_ret = bio; 1918 *bio_ret = bio;
1919 else 1919 else
1920 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 1920 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1921 1921
1922 return ret; 1922 return ret;
1923 } 1923 }
1924 1924
1925 void set_page_extent_mapped(struct page *page) 1925 void set_page_extent_mapped(struct page *page)
1926 { 1926 {
1927 if (!PagePrivate(page)) { 1927 if (!PagePrivate(page)) {
1928 SetPagePrivate(page); 1928 SetPagePrivate(page);
1929 page_cache_get(page); 1929 page_cache_get(page);
1930 set_page_private(page, EXTENT_PAGE_PRIVATE); 1930 set_page_private(page, EXTENT_PAGE_PRIVATE);
1931 } 1931 }
1932 } 1932 }
1933 1933
1934 static void set_page_extent_head(struct page *page, unsigned long len) 1934 static void set_page_extent_head(struct page *page, unsigned long len)
1935 { 1935 {
1936 WARN_ON(!PagePrivate(page)); 1936 WARN_ON(!PagePrivate(page));
1937 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1937 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1938 } 1938 }
1939 1939
1940 /* 1940 /*
1941 * basic readpage implementation. Locked extent state structs are inserted 1941 * basic readpage implementation. Locked extent state structs are inserted
1942 * into the tree that are removed when the IO is done (by the end_io 1942 * into the tree that are removed when the IO is done (by the end_io
1943 * handlers) 1943 * handlers)
1944 */ 1944 */
1945 static int __extent_read_full_page(struct extent_io_tree *tree, 1945 static int __extent_read_full_page(struct extent_io_tree *tree,
1946 struct page *page, 1946 struct page *page,
1947 get_extent_t *get_extent, 1947 get_extent_t *get_extent,
1948 struct bio **bio, int mirror_num, 1948 struct bio **bio, int mirror_num,
1949 unsigned long *bio_flags) 1949 unsigned long *bio_flags)
1950 { 1950 {
1951 struct inode *inode = page->mapping->host; 1951 struct inode *inode = page->mapping->host;
1952 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1952 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1953 u64 page_end = start + PAGE_CACHE_SIZE - 1; 1953 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1954 u64 end; 1954 u64 end;
1955 u64 cur = start; 1955 u64 cur = start;
1956 u64 extent_offset; 1956 u64 extent_offset;
1957 u64 last_byte = i_size_read(inode); 1957 u64 last_byte = i_size_read(inode);
1958 u64 block_start; 1958 u64 block_start;
1959 u64 cur_end; 1959 u64 cur_end;
1960 sector_t sector; 1960 sector_t sector;
1961 struct extent_map *em; 1961 struct extent_map *em;
1962 struct block_device *bdev; 1962 struct block_device *bdev;
1963 struct btrfs_ordered_extent *ordered; 1963 struct btrfs_ordered_extent *ordered;
1964 int ret; 1964 int ret;
1965 int nr = 0; 1965 int nr = 0;
1966 size_t pg_offset = 0; 1966 size_t pg_offset = 0;
1967 size_t iosize; 1967 size_t iosize;
1968 size_t disk_io_size; 1968 size_t disk_io_size;
1969 size_t blocksize = inode->i_sb->s_blocksize; 1969 size_t blocksize = inode->i_sb->s_blocksize;
1970 unsigned long this_bio_flag = 0; 1970 unsigned long this_bio_flag = 0;
1971 1971
1972 set_page_extent_mapped(page); 1972 set_page_extent_mapped(page);
1973 1973
1974 if (!PageUptodate(page)) { 1974 if (!PageUptodate(page)) {
1975 if (cleancache_get_page(page) == 0) { 1975 if (cleancache_get_page(page) == 0) {
1976 BUG_ON(blocksize != PAGE_SIZE); 1976 BUG_ON(blocksize != PAGE_SIZE);
1977 goto out; 1977 goto out;
1978 } 1978 }
1979 } 1979 }
1980 1980
1981 end = page_end; 1981 end = page_end;
1982 while (1) { 1982 while (1) {
1983 lock_extent(tree, start, end, GFP_NOFS); 1983 lock_extent(tree, start, end, GFP_NOFS);
1984 ordered = btrfs_lookup_ordered_extent(inode, start); 1984 ordered = btrfs_lookup_ordered_extent(inode, start);
1985 if (!ordered) 1985 if (!ordered)
1986 break; 1986 break;
1987 unlock_extent(tree, start, end, GFP_NOFS); 1987 unlock_extent(tree, start, end, GFP_NOFS);
1988 btrfs_start_ordered_extent(inode, ordered, 1); 1988 btrfs_start_ordered_extent(inode, ordered, 1);
1989 btrfs_put_ordered_extent(ordered); 1989 btrfs_put_ordered_extent(ordered);
1990 } 1990 }
1991 1991
1992 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 1992 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1993 char *userpage; 1993 char *userpage;
1994 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 1994 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1995 1995
1996 if (zero_offset) { 1996 if (zero_offset) {
1997 iosize = PAGE_CACHE_SIZE - zero_offset; 1997 iosize = PAGE_CACHE_SIZE - zero_offset;
1998 userpage = kmap_atomic(page, KM_USER0); 1998 userpage = kmap_atomic(page, KM_USER0);
1999 memset(userpage + zero_offset, 0, iosize); 1999 memset(userpage + zero_offset, 0, iosize);
2000 flush_dcache_page(page); 2000 flush_dcache_page(page);
2001 kunmap_atomic(userpage, KM_USER0); 2001 kunmap_atomic(userpage, KM_USER0);
2002 } 2002 }
2003 } 2003 }
2004 while (cur <= end) { 2004 while (cur <= end) {
2005 if (cur >= last_byte) { 2005 if (cur >= last_byte) {
2006 char *userpage; 2006 char *userpage;
2007 struct extent_state *cached = NULL; 2007 struct extent_state *cached = NULL;
2008 2008
2009 iosize = PAGE_CACHE_SIZE - pg_offset; 2009 iosize = PAGE_CACHE_SIZE - pg_offset;
2010 userpage = kmap_atomic(page, KM_USER0); 2010 userpage = kmap_atomic(page, KM_USER0);
2011 memset(userpage + pg_offset, 0, iosize); 2011 memset(userpage + pg_offset, 0, iosize);
2012 flush_dcache_page(page); 2012 flush_dcache_page(page);
2013 kunmap_atomic(userpage, KM_USER0); 2013 kunmap_atomic(userpage, KM_USER0);
2014 set_extent_uptodate(tree, cur, cur + iosize - 1, 2014 set_extent_uptodate(tree, cur, cur + iosize - 1,
2015 &cached, GFP_NOFS); 2015 &cached, GFP_NOFS);
2016 unlock_extent_cached(tree, cur, cur + iosize - 1, 2016 unlock_extent_cached(tree, cur, cur + iosize - 1,
2017 &cached, GFP_NOFS); 2017 &cached, GFP_NOFS);
2018 break; 2018 break;
2019 } 2019 }
2020 em = get_extent(inode, page, pg_offset, cur, 2020 em = get_extent(inode, page, pg_offset, cur,
2021 end - cur + 1, 0); 2021 end - cur + 1, 0);
2022 if (IS_ERR_OR_NULL(em)) { 2022 if (IS_ERR_OR_NULL(em)) {
2023 SetPageError(page); 2023 SetPageError(page);
2024 unlock_extent(tree, cur, end, GFP_NOFS); 2024 unlock_extent(tree, cur, end, GFP_NOFS);
2025 break; 2025 break;
2026 } 2026 }
2027 extent_offset = cur - em->start; 2027 extent_offset = cur - em->start;
2028 BUG_ON(extent_map_end(em) <= cur); 2028 BUG_ON(extent_map_end(em) <= cur);
2029 BUG_ON(end < cur); 2029 BUG_ON(end < cur);
2030 2030
2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2032 this_bio_flag = EXTENT_BIO_COMPRESSED; 2032 this_bio_flag = EXTENT_BIO_COMPRESSED;
2033 extent_set_compress_type(&this_bio_flag, 2033 extent_set_compress_type(&this_bio_flag,
2034 em->compress_type); 2034 em->compress_type);
2035 } 2035 }
2036 2036
2037 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2037 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2038 cur_end = min(extent_map_end(em) - 1, end); 2038 cur_end = min(extent_map_end(em) - 1, end);
2039 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2039 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2040 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2040 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2041 disk_io_size = em->block_len; 2041 disk_io_size = em->block_len;
2042 sector = em->block_start >> 9; 2042 sector = em->block_start >> 9;
2043 } else { 2043 } else {
2044 sector = (em->block_start + extent_offset) >> 9; 2044 sector = (em->block_start + extent_offset) >> 9;
2045 disk_io_size = iosize; 2045 disk_io_size = iosize;
2046 } 2046 }
2047 bdev = em->bdev; 2047 bdev = em->bdev;
2048 block_start = em->block_start; 2048 block_start = em->block_start;
2049 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2049 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2050 block_start = EXTENT_MAP_HOLE; 2050 block_start = EXTENT_MAP_HOLE;
2051 free_extent_map(em); 2051 free_extent_map(em);
2052 em = NULL; 2052 em = NULL;
2053 2053
2054 /* we've found a hole, just zero and go on */ 2054 /* we've found a hole, just zero and go on */
2055 if (block_start == EXTENT_MAP_HOLE) { 2055 if (block_start == EXTENT_MAP_HOLE) {
2056 char *userpage; 2056 char *userpage;
2057 struct extent_state *cached = NULL; 2057 struct extent_state *cached = NULL;
2058 2058
2059 userpage = kmap_atomic(page, KM_USER0); 2059 userpage = kmap_atomic(page, KM_USER0);
2060 memset(userpage + pg_offset, 0, iosize); 2060 memset(userpage + pg_offset, 0, iosize);
2061 flush_dcache_page(page); 2061 flush_dcache_page(page);
2062 kunmap_atomic(userpage, KM_USER0); 2062 kunmap_atomic(userpage, KM_USER0);
2063 2063
2064 set_extent_uptodate(tree, cur, cur + iosize - 1, 2064 set_extent_uptodate(tree, cur, cur + iosize - 1,
2065 &cached, GFP_NOFS); 2065 &cached, GFP_NOFS);
2066 unlock_extent_cached(tree, cur, cur + iosize - 1, 2066 unlock_extent_cached(tree, cur, cur + iosize - 1,
2067 &cached, GFP_NOFS); 2067 &cached, GFP_NOFS);
2068 cur = cur + iosize; 2068 cur = cur + iosize;
2069 pg_offset += iosize; 2069 pg_offset += iosize;
2070 continue; 2070 continue;
2071 } 2071 }
2072 /* the get_extent function already copied into the page */ 2072 /* the get_extent function already copied into the page */
2073 if (test_range_bit(tree, cur, cur_end, 2073 if (test_range_bit(tree, cur, cur_end,
2074 EXTENT_UPTODATE, 1, NULL)) { 2074 EXTENT_UPTODATE, 1, NULL)) {
2075 check_page_uptodate(tree, page); 2075 check_page_uptodate(tree, page);
2076 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2076 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2077 cur = cur + iosize; 2077 cur = cur + iosize;
2078 pg_offset += iosize; 2078 pg_offset += iosize;
2079 continue; 2079 continue;
2080 } 2080 }
2081 /* we have an inline extent but it didn't get marked up 2081 /* we have an inline extent but it didn't get marked up
2082 * to date. Error out 2082 * to date. Error out
2083 */ 2083 */
2084 if (block_start == EXTENT_MAP_INLINE) { 2084 if (block_start == EXTENT_MAP_INLINE) {
2085 SetPageError(page); 2085 SetPageError(page);
2086 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2086 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2087 cur = cur + iosize; 2087 cur = cur + iosize;
2088 pg_offset += iosize; 2088 pg_offset += iosize;
2089 continue; 2089 continue;
2090 } 2090 }
2091 2091
2092 ret = 0; 2092 ret = 0;
2093 if (tree->ops && tree->ops->readpage_io_hook) { 2093 if (tree->ops && tree->ops->readpage_io_hook) {
2094 ret = tree->ops->readpage_io_hook(page, cur, 2094 ret = tree->ops->readpage_io_hook(page, cur,
2095 cur + iosize - 1); 2095 cur + iosize - 1);
2096 } 2096 }
2097 if (!ret) { 2097 if (!ret) {
2098 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2098 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2099 pnr -= page->index; 2099 pnr -= page->index;
2100 ret = submit_extent_page(READ, tree, page, 2100 ret = submit_extent_page(READ, tree, page,
2101 sector, disk_io_size, pg_offset, 2101 sector, disk_io_size, pg_offset,
2102 bdev, bio, pnr, 2102 bdev, bio, pnr,
2103 end_bio_extent_readpage, mirror_num, 2103 end_bio_extent_readpage, mirror_num,
2104 *bio_flags, 2104 *bio_flags,
2105 this_bio_flag); 2105 this_bio_flag);
2106 nr++; 2106 nr++;
2107 *bio_flags = this_bio_flag; 2107 *bio_flags = this_bio_flag;
2108 } 2108 }
2109 if (ret) 2109 if (ret)
2110 SetPageError(page); 2110 SetPageError(page);
2111 cur = cur + iosize; 2111 cur = cur + iosize;
2112 pg_offset += iosize; 2112 pg_offset += iosize;
2113 } 2113 }
2114 out: 2114 out:
2115 if (!nr) { 2115 if (!nr) {
2116 if (!PageError(page)) 2116 if (!PageError(page))
2117 SetPageUptodate(page); 2117 SetPageUptodate(page);
2118 unlock_page(page); 2118 unlock_page(page);
2119 } 2119 }
2120 return 0; 2120 return 0;
2121 } 2121 }
2122 2122
2123 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2123 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2124 get_extent_t *get_extent) 2124 get_extent_t *get_extent)
2125 { 2125 {
2126 struct bio *bio = NULL; 2126 struct bio *bio = NULL;
2127 unsigned long bio_flags = 0; 2127 unsigned long bio_flags = 0;
2128 int ret; 2128 int ret;
2129 2129
2130 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2130 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2131 &bio_flags); 2131 &bio_flags);
2132 if (bio) 2132 if (bio)
2133 ret = submit_one_bio(READ, bio, 0, bio_flags); 2133 ret = submit_one_bio(READ, bio, 0, bio_flags);
2134 return ret; 2134 return ret;
2135 } 2135 }
2136 2136
2137 static noinline void update_nr_written(struct page *page, 2137 static noinline void update_nr_written(struct page *page,
2138 struct writeback_control *wbc, 2138 struct writeback_control *wbc,
2139 unsigned long nr_written) 2139 unsigned long nr_written)
2140 { 2140 {
2141 wbc->nr_to_write -= nr_written; 2141 wbc->nr_to_write -= nr_written;
2142 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2142 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2143 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2143 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2144 page->mapping->writeback_index = page->index + nr_written; 2144 page->mapping->writeback_index = page->index + nr_written;
2145 } 2145 }
2146 2146
2147 /* 2147 /*
2148 * the writepage semantics are similar to regular writepage. extent 2148 * the writepage semantics are similar to regular writepage. extent
2149 * records are inserted to lock ranges in the tree, and as dirty areas 2149 * records are inserted to lock ranges in the tree, and as dirty areas
2150 * are found, they are marked writeback. Then the lock bits are removed 2150 * are found, they are marked writeback. Then the lock bits are removed
2151 * and the end_io handler clears the writeback ranges 2151 * and the end_io handler clears the writeback ranges
2152 */ 2152 */
2153 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2153 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2154 void *data) 2154 void *data)
2155 { 2155 {
2156 struct inode *inode = page->mapping->host; 2156 struct inode *inode = page->mapping->host;
2157 struct extent_page_data *epd = data; 2157 struct extent_page_data *epd = data;
2158 struct extent_io_tree *tree = epd->tree; 2158 struct extent_io_tree *tree = epd->tree;
2159 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2159 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2160 u64 delalloc_start; 2160 u64 delalloc_start;
2161 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2161 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2162 u64 end; 2162 u64 end;
2163 u64 cur = start; 2163 u64 cur = start;
2164 u64 extent_offset; 2164 u64 extent_offset;
2165 u64 last_byte = i_size_read(inode); 2165 u64 last_byte = i_size_read(inode);
2166 u64 block_start; 2166 u64 block_start;
2167 u64 iosize; 2167 u64 iosize;
2168 sector_t sector; 2168 sector_t sector;
2169 struct extent_state *cached_state = NULL; 2169 struct extent_state *cached_state = NULL;
2170 struct extent_map *em; 2170 struct extent_map *em;
2171 struct block_device *bdev; 2171 struct block_device *bdev;
2172 int ret; 2172 int ret;
2173 int nr = 0; 2173 int nr = 0;
2174 size_t pg_offset = 0; 2174 size_t pg_offset = 0;
2175 size_t blocksize; 2175 size_t blocksize;
2176 loff_t i_size = i_size_read(inode); 2176 loff_t i_size = i_size_read(inode);
2177 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2177 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2178 u64 nr_delalloc; 2178 u64 nr_delalloc;
2179 u64 delalloc_end; 2179 u64 delalloc_end;
2180 int page_started; 2180 int page_started;
2181 int compressed; 2181 int compressed;
2182 int write_flags; 2182 int write_flags;
2183 unsigned long nr_written = 0; 2183 unsigned long nr_written = 0;
2184 2184
2185 if (wbc->sync_mode == WB_SYNC_ALL) 2185 if (wbc->sync_mode == WB_SYNC_ALL)
2186 write_flags = WRITE_SYNC; 2186 write_flags = WRITE_SYNC;
2187 else 2187 else
2188 write_flags = WRITE; 2188 write_flags = WRITE;
2189 2189
2190 trace___extent_writepage(page, inode, wbc); 2190 trace___extent_writepage(page, inode, wbc);
2191 2191
2192 WARN_ON(!PageLocked(page)); 2192 WARN_ON(!PageLocked(page));
2193 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2193 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2194 if (page->index > end_index || 2194 if (page->index > end_index ||
2195 (page->index == end_index && !pg_offset)) { 2195 (page->index == end_index && !pg_offset)) {
2196 page->mapping->a_ops->invalidatepage(page, 0); 2196 page->mapping->a_ops->invalidatepage(page, 0);
2197 unlock_page(page); 2197 unlock_page(page);
2198 return 0; 2198 return 0;
2199 } 2199 }
2200 2200
2201 if (page->index == end_index) { 2201 if (page->index == end_index) {
2202 char *userpage; 2202 char *userpage;
2203 2203
2204 userpage = kmap_atomic(page, KM_USER0); 2204 userpage = kmap_atomic(page, KM_USER0);
2205 memset(userpage + pg_offset, 0, 2205 memset(userpage + pg_offset, 0,
2206 PAGE_CACHE_SIZE - pg_offset); 2206 PAGE_CACHE_SIZE - pg_offset);
2207 kunmap_atomic(userpage, KM_USER0); 2207 kunmap_atomic(userpage, KM_USER0);
2208 flush_dcache_page(page); 2208 flush_dcache_page(page);
2209 } 2209 }
2210 pg_offset = 0; 2210 pg_offset = 0;
2211 2211
2212 set_page_extent_mapped(page); 2212 set_page_extent_mapped(page);
2213 2213
2214 delalloc_start = start; 2214 delalloc_start = start;
2215 delalloc_end = 0; 2215 delalloc_end = 0;
2216 page_started = 0; 2216 page_started = 0;
2217 if (!epd->extent_locked) { 2217 if (!epd->extent_locked) {
2218 u64 delalloc_to_write = 0; 2218 u64 delalloc_to_write = 0;
2219 /* 2219 /*
2220 * make sure the wbc mapping index is at least updated 2220 * make sure the wbc mapping index is at least updated
2221 * to this page. 2221 * to this page.
2222 */ 2222 */
2223 update_nr_written(page, wbc, 0); 2223 update_nr_written(page, wbc, 0);
2224 2224
2225 while (delalloc_end < page_end) { 2225 while (delalloc_end < page_end) {
2226 nr_delalloc = find_lock_delalloc_range(inode, tree, 2226 nr_delalloc = find_lock_delalloc_range(inode, tree,
2227 page, 2227 page,
2228 &delalloc_start, 2228 &delalloc_start,
2229 &delalloc_end, 2229 &delalloc_end,
2230 128 * 1024 * 1024); 2230 128 * 1024 * 1024);
2231 if (nr_delalloc == 0) { 2231 if (nr_delalloc == 0) {
2232 delalloc_start = delalloc_end + 1; 2232 delalloc_start = delalloc_end + 1;
2233 continue; 2233 continue;
2234 } 2234 }
2235 tree->ops->fill_delalloc(inode, page, delalloc_start, 2235 tree->ops->fill_delalloc(inode, page, delalloc_start,
2236 delalloc_end, &page_started, 2236 delalloc_end, &page_started,
2237 &nr_written); 2237 &nr_written);
2238 /* 2238 /*
2239 * delalloc_end is already one less than the total 2239 * delalloc_end is already one less than the total
2240 * length, so we don't subtract one from 2240 * length, so we don't subtract one from
2241 * PAGE_CACHE_SIZE 2241 * PAGE_CACHE_SIZE
2242 */ 2242 */
2243 delalloc_to_write += (delalloc_end - delalloc_start + 2243 delalloc_to_write += (delalloc_end - delalloc_start +
2244 PAGE_CACHE_SIZE) >> 2244 PAGE_CACHE_SIZE) >>
2245 PAGE_CACHE_SHIFT; 2245 PAGE_CACHE_SHIFT;
2246 delalloc_start = delalloc_end + 1; 2246 delalloc_start = delalloc_end + 1;
2247 } 2247 }
2248 if (wbc->nr_to_write < delalloc_to_write) { 2248 if (wbc->nr_to_write < delalloc_to_write) {
2249 int thresh = 8192; 2249 int thresh = 8192;
2250 2250
2251 if (delalloc_to_write < thresh * 2) 2251 if (delalloc_to_write < thresh * 2)
2252 thresh = delalloc_to_write; 2252 thresh = delalloc_to_write;
2253 wbc->nr_to_write = min_t(u64, delalloc_to_write, 2253 wbc->nr_to_write = min_t(u64, delalloc_to_write,
2254 thresh); 2254 thresh);
2255 } 2255 }
2256 2256
2257 /* did the fill delalloc function already unlock and start 2257 /* did the fill delalloc function already unlock and start
2258 * the IO? 2258 * the IO?
2259 */ 2259 */
2260 if (page_started) { 2260 if (page_started) {
2261 ret = 0; 2261 ret = 0;
2262 /* 2262 /*
2263 * we've unlocked the page, so we can't update 2263 * we've unlocked the page, so we can't update
2264 * the mapping's writeback index, just update 2264 * the mapping's writeback index, just update
2265 * nr_to_write. 2265 * nr_to_write.
2266 */ 2266 */
2267 wbc->nr_to_write -= nr_written; 2267 wbc->nr_to_write -= nr_written;
2268 goto done_unlocked; 2268 goto done_unlocked;
2269 } 2269 }
2270 } 2270 }
2271 if (tree->ops && tree->ops->writepage_start_hook) { 2271 if (tree->ops && tree->ops->writepage_start_hook) {
2272 ret = tree->ops->writepage_start_hook(page, start, 2272 ret = tree->ops->writepage_start_hook(page, start,
2273 page_end); 2273 page_end);
2274 if (ret == -EAGAIN) { 2274 if (ret == -EAGAIN) {
2275 redirty_page_for_writepage(wbc, page); 2275 redirty_page_for_writepage(wbc, page);
2276 update_nr_written(page, wbc, nr_written); 2276 update_nr_written(page, wbc, nr_written);
2277 unlock_page(page); 2277 unlock_page(page);
2278 ret = 0; 2278 ret = 0;
2279 goto done_unlocked; 2279 goto done_unlocked;
2280 } 2280 }
2281 } 2281 }
2282 2282
2283 /* 2283 /*
2284 * we don't want to touch the inode after unlocking the page, 2284 * we don't want to touch the inode after unlocking the page,
2285 * so we update the mapping writeback index now 2285 * so we update the mapping writeback index now
2286 */ 2286 */
2287 update_nr_written(page, wbc, nr_written + 1); 2287 update_nr_written(page, wbc, nr_written + 1);
2288 2288
2289 end = page_end; 2289 end = page_end;
2290 if (last_byte <= start) { 2290 if (last_byte <= start) {
2291 if (tree->ops && tree->ops->writepage_end_io_hook) 2291 if (tree->ops && tree->ops->writepage_end_io_hook)
2292 tree->ops->writepage_end_io_hook(page, start, 2292 tree->ops->writepage_end_io_hook(page, start,
2293 page_end, NULL, 1); 2293 page_end, NULL, 1);
2294 goto done; 2294 goto done;
2295 } 2295 }
2296 2296
2297 blocksize = inode->i_sb->s_blocksize; 2297 blocksize = inode->i_sb->s_blocksize;
2298 2298
2299 while (cur <= end) { 2299 while (cur <= end) {
2300 if (cur >= last_byte) { 2300 if (cur >= last_byte) {
2301 if (tree->ops && tree->ops->writepage_end_io_hook) 2301 if (tree->ops && tree->ops->writepage_end_io_hook)
2302 tree->ops->writepage_end_io_hook(page, cur, 2302 tree->ops->writepage_end_io_hook(page, cur,
2303 page_end, NULL, 1); 2303 page_end, NULL, 1);
2304 break; 2304 break;
2305 } 2305 }
2306 em = epd->get_extent(inode, page, pg_offset, cur, 2306 em = epd->get_extent(inode, page, pg_offset, cur,
2307 end - cur + 1, 1); 2307 end - cur + 1, 1);
2308 if (IS_ERR_OR_NULL(em)) { 2308 if (IS_ERR_OR_NULL(em)) {
2309 SetPageError(page); 2309 SetPageError(page);
2310 break; 2310 break;
2311 } 2311 }
2312 2312
2313 extent_offset = cur - em->start; 2313 extent_offset = cur - em->start;
2314 BUG_ON(extent_map_end(em) <= cur); 2314 BUG_ON(extent_map_end(em) <= cur);
2315 BUG_ON(end < cur); 2315 BUG_ON(end < cur);
2316 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2316 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2317 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2317 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2318 sector = (em->block_start + extent_offset) >> 9; 2318 sector = (em->block_start + extent_offset) >> 9;
2319 bdev = em->bdev; 2319 bdev = em->bdev;
2320 block_start = em->block_start; 2320 block_start = em->block_start;
2321 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 2321 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2322 free_extent_map(em); 2322 free_extent_map(em);
2323 em = NULL; 2323 em = NULL;
2324 2324
2325 /* 2325 /*
2326 * compressed and inline extents are written through other 2326 * compressed and inline extents are written through other
2327 * paths in the FS 2327 * paths in the FS
2328 */ 2328 */
2329 if (compressed || block_start == EXTENT_MAP_HOLE || 2329 if (compressed || block_start == EXTENT_MAP_HOLE ||
2330 block_start == EXTENT_MAP_INLINE) { 2330 block_start == EXTENT_MAP_INLINE) {
2331 /* 2331 /*
2332 * end_io notification does not happen here for 2332 * end_io notification does not happen here for
2333 * compressed extents 2333 * compressed extents
2334 */ 2334 */
2335 if (!compressed && tree->ops && 2335 if (!compressed && tree->ops &&
2336 tree->ops->writepage_end_io_hook) 2336 tree->ops->writepage_end_io_hook)
2337 tree->ops->writepage_end_io_hook(page, cur, 2337 tree->ops->writepage_end_io_hook(page, cur,
2338 cur + iosize - 1, 2338 cur + iosize - 1,
2339 NULL, 1); 2339 NULL, 1);
2340 else if (compressed) { 2340 else if (compressed) {
2341 /* we don't want to end_page_writeback on 2341 /* we don't want to end_page_writeback on
2342 * a compressed extent. this happens 2342 * a compressed extent. this happens
2343 * elsewhere 2343 * elsewhere
2344 */ 2344 */
2345 nr++; 2345 nr++;
2346 } 2346 }
2347 2347
2348 cur += iosize; 2348 cur += iosize;
2349 pg_offset += iosize; 2349 pg_offset += iosize;
2350 continue; 2350 continue;
2351 } 2351 }
2352 /* leave this out until we have a page_mkwrite call */ 2352 /* leave this out until we have a page_mkwrite call */
2353 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2353 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2354 EXTENT_DIRTY, 0, NULL)) { 2354 EXTENT_DIRTY, 0, NULL)) {
2355 cur = cur + iosize; 2355 cur = cur + iosize;
2356 pg_offset += iosize; 2356 pg_offset += iosize;
2357 continue; 2357 continue;
2358 } 2358 }
2359 2359
2360 if (tree->ops && tree->ops->writepage_io_hook) { 2360 if (tree->ops && tree->ops->writepage_io_hook) {
2361 ret = tree->ops->writepage_io_hook(page, cur, 2361 ret = tree->ops->writepage_io_hook(page, cur,
2362 cur + iosize - 1); 2362 cur + iosize - 1);
2363 } else { 2363 } else {
2364 ret = 0; 2364 ret = 0;
2365 } 2365 }
2366 if (ret) { 2366 if (ret) {
2367 SetPageError(page); 2367 SetPageError(page);
2368 } else { 2368 } else {
2369 unsigned long max_nr = end_index + 1; 2369 unsigned long max_nr = end_index + 1;
2370 2370
2371 set_range_writeback(tree, cur, cur + iosize - 1); 2371 set_range_writeback(tree, cur, cur + iosize - 1);
2372 if (!PageWriteback(page)) { 2372 if (!PageWriteback(page)) {
2373 printk(KERN_ERR "btrfs warning page %lu not " 2373 printk(KERN_ERR "btrfs warning page %lu not "
2374 "writeback, cur %llu end %llu\n", 2374 "writeback, cur %llu end %llu\n",
2375 page->index, (unsigned long long)cur, 2375 page->index, (unsigned long long)cur,
2376 (unsigned long long)end); 2376 (unsigned long long)end);
2377 } 2377 }
2378 2378
2379 ret = submit_extent_page(write_flags, tree, page, 2379 ret = submit_extent_page(write_flags, tree, page,
2380 sector, iosize, pg_offset, 2380 sector, iosize, pg_offset,
2381 bdev, &epd->bio, max_nr, 2381 bdev, &epd->bio, max_nr,
2382 end_bio_extent_writepage, 2382 end_bio_extent_writepage,
2383 0, 0, 0); 2383 0, 0, 0);
2384 if (ret) 2384 if (ret)
2385 SetPageError(page); 2385 SetPageError(page);
2386 } 2386 }
2387 cur = cur + iosize; 2387 cur = cur + iosize;
2388 pg_offset += iosize; 2388 pg_offset += iosize;
2389 nr++; 2389 nr++;
2390 } 2390 }
2391 done: 2391 done:
2392 if (nr == 0) { 2392 if (nr == 0) {
2393 /* make sure the mapping tag for page dirty gets cleared */ 2393 /* make sure the mapping tag for page dirty gets cleared */
2394 set_page_writeback(page); 2394 set_page_writeback(page);
2395 end_page_writeback(page); 2395 end_page_writeback(page);
2396 } 2396 }
2397 unlock_page(page); 2397 unlock_page(page);
2398 2398
2399 done_unlocked: 2399 done_unlocked:
2400 2400
2401 /* drop our reference on any cached states */ 2401 /* drop our reference on any cached states */
2402 free_extent_state(cached_state); 2402 free_extent_state(cached_state);
2403 return 0; 2403 return 0;
2404 } 2404 }
2405 2405
2406 /** 2406 /**
2407 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 2407 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2408 * @mapping: address space structure to write 2408 * @mapping: address space structure to write
2409 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2409 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2410 * @writepage: function called for each page 2410 * @writepage: function called for each page
2411 * @data: data passed to writepage function 2411 * @data: data passed to writepage function
2412 * 2412 *
2413 * If a page is already under I/O, write_cache_pages() skips it, even 2413 * If a page is already under I/O, write_cache_pages() skips it, even
2414 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2414 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2415 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2415 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2416 * and msync() need to guarantee that all the data which was dirty at the time 2416 * and msync() need to guarantee that all the data which was dirty at the time
2417 * the call was made get new I/O started against them. If wbc->sync_mode is 2417 * the call was made get new I/O started against them. If wbc->sync_mode is
2418 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2418 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2419 * existing IO to complete. 2419 * existing IO to complete.
2420 */ 2420 */
2421 static int extent_write_cache_pages(struct extent_io_tree *tree, 2421 static int extent_write_cache_pages(struct extent_io_tree *tree,
2422 struct address_space *mapping, 2422 struct address_space *mapping,
2423 struct writeback_control *wbc, 2423 struct writeback_control *wbc,
2424 writepage_t writepage, void *data, 2424 writepage_t writepage, void *data,
2425 void (*flush_fn)(void *)) 2425 void (*flush_fn)(void *))
2426 { 2426 {
2427 int ret = 0; 2427 int ret = 0;
2428 int done = 0; 2428 int done = 0;
2429 int nr_to_write_done = 0; 2429 int nr_to_write_done = 0;
2430 struct pagevec pvec; 2430 struct pagevec pvec;
2431 int nr_pages; 2431 int nr_pages;
2432 pgoff_t index; 2432 pgoff_t index;
2433 pgoff_t end; /* Inclusive */ 2433 pgoff_t end; /* Inclusive */
2434 int scanned = 0; 2434 int scanned = 0;
2435 2435
2436 pagevec_init(&pvec, 0); 2436 pagevec_init(&pvec, 0);
2437 if (wbc->range_cyclic) { 2437 if (wbc->range_cyclic) {
2438 index = mapping->writeback_index; /* Start from prev offset */ 2438 index = mapping->writeback_index; /* Start from prev offset */
2439 end = -1; 2439 end = -1;
2440 } else { 2440 } else {
2441 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2441 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2442 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2442 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2443 scanned = 1; 2443 scanned = 1;
2444 } 2444 }
2445 retry: 2445 retry:
2446 while (!done && !nr_to_write_done && (index <= end) && 2446 while (!done && !nr_to_write_done && (index <= end) &&
2447 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2447 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2448 PAGECACHE_TAG_DIRTY, min(end - index, 2448 PAGECACHE_TAG_DIRTY, min(end - index,
2449 (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2449 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2450 unsigned i; 2450 unsigned i;
2451 2451
2452 scanned = 1; 2452 scanned = 1;
2453 for (i = 0; i < nr_pages; i++) { 2453 for (i = 0; i < nr_pages; i++) {
2454 struct page *page = pvec.pages[i]; 2454 struct page *page = pvec.pages[i];
2455 2455
2456 /* 2456 /*
2457 * At this point we hold neither mapping->tree_lock nor 2457 * At this point we hold neither mapping->tree_lock nor
2458 * lock on the page itself: the page may be truncated or 2458 * lock on the page itself: the page may be truncated or
2459 * invalidated (changing page->mapping to NULL), or even 2459 * invalidated (changing page->mapping to NULL), or even
2460 * swizzled back from swapper_space to tmpfs file 2460 * swizzled back from swapper_space to tmpfs file
2461 * mapping 2461 * mapping
2462 */ 2462 */
2463 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2463 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2464 tree->ops->write_cache_pages_lock_hook(page); 2464 tree->ops->write_cache_pages_lock_hook(page);
2465 else 2465 else
2466 lock_page(page); 2466 lock_page(page);
2467 2467
2468 if (unlikely(page->mapping != mapping)) { 2468 if (unlikely(page->mapping != mapping)) {
2469 unlock_page(page); 2469 unlock_page(page);
2470 continue; 2470 continue;
2471 } 2471 }
2472 2472
2473 if (!wbc->range_cyclic && page->index > end) { 2473 if (!wbc->range_cyclic && page->index > end) {
2474 done = 1; 2474 done = 1;
2475 unlock_page(page); 2475 unlock_page(page);
2476 continue; 2476 continue;
2477 } 2477 }
2478 2478
2479 if (wbc->sync_mode != WB_SYNC_NONE) { 2479 if (wbc->sync_mode != WB_SYNC_NONE) {
2480 if (PageWriteback(page)) 2480 if (PageWriteback(page))
2481 flush_fn(data); 2481 flush_fn(data);
2482 wait_on_page_writeback(page); 2482 wait_on_page_writeback(page);
2483 } 2483 }
2484 2484
2485 if (PageWriteback(page) || 2485 if (PageWriteback(page) ||
2486 !clear_page_dirty_for_io(page)) { 2486 !clear_page_dirty_for_io(page)) {
2487 unlock_page(page); 2487 unlock_page(page);
2488 continue; 2488 continue;
2489 } 2489 }
2490 2490
2491 ret = (*writepage)(page, wbc, data); 2491 ret = (*writepage)(page, wbc, data);
2492 2492
2493 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 2493 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2494 unlock_page(page); 2494 unlock_page(page);
2495 ret = 0; 2495 ret = 0;
2496 } 2496 }
2497 if (ret) 2497 if (ret)
2498 done = 1; 2498 done = 1;
2499 2499
2500 /* 2500 /*
2501 * the filesystem may choose to bump up nr_to_write. 2501 * the filesystem may choose to bump up nr_to_write.
2502 * We have to make sure to honor the new nr_to_write 2502 * We have to make sure to honor the new nr_to_write
2503 * at any time 2503 * at any time
2504 */ 2504 */
2505 nr_to_write_done = wbc->nr_to_write <= 0; 2505 nr_to_write_done = wbc->nr_to_write <= 0;
2506 } 2506 }
2507 pagevec_release(&pvec); 2507 pagevec_release(&pvec);
2508 cond_resched(); 2508 cond_resched();
2509 } 2509 }
2510 if (!scanned && !done) { 2510 if (!scanned && !done) {
2511 /* 2511 /*
2512 * We hit the last page and there is more work to be done: wrap 2512 * We hit the last page and there is more work to be done: wrap
2513 * back to the start of the file 2513 * back to the start of the file
2514 */ 2514 */
2515 scanned = 1; 2515 scanned = 1;
2516 index = 0; 2516 index = 0;
2517 goto retry; 2517 goto retry;
2518 } 2518 }
2519 return ret; 2519 return ret;
2520 } 2520 }
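
The batch size handed to pagevec_lookup_tag() above is the only subtle arithmetic in this loop: it requests at most a full pagevec but clamps the request so nothing past the inclusive 'end' index is pulled in, and the retry label restarts the walk from index 0 when a range_cyclic pass ran off the end of the file with work still pending. A minimal userspace sketch of just that batch-size expression, assuming PAGEVEC_SIZE is 14 as in kernels of this era:

#include <stdio.h>

/* Hypothetical stand-ins: pgoff_t and PAGEVEC_SIZE (14 in kernels of
 * this era); used here only to show the batch-size arithmetic. */
typedef unsigned long pgoff_t;
#define PAGEVEC_SIZE 14

static pgoff_t batch_request(pgoff_t index, pgoff_t end)
{
	/* Same expression as the pagevec_lookup_tag() call above: ask for
	 * a full pagevec, but never for pages past the inclusive 'end'. */
	pgoff_t cap = (end - index) < (pgoff_t)(PAGEVEC_SIZE - 1) ?
			(end - index) : (pgoff_t)(PAGEVEC_SIZE - 1);
	return cap + 1;
}

int main(void)
{
	printf("%lu\n", batch_request(100, 1000));	/* 14: far from the end */
	printf("%lu\n", batch_request(998, 1000));	/* 3: only pages 998..1000 remain */
	return 0;
}
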
2521 2521
2522 static void flush_epd_write_bio(struct extent_page_data *epd) 2522 static void flush_epd_write_bio(struct extent_page_data *epd)
2523 { 2523 {
2524 if (epd->bio) { 2524 if (epd->bio) {
2525 if (epd->sync_io) 2525 if (epd->sync_io)
2526 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); 2526 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2527 else 2527 else
2528 submit_one_bio(WRITE, epd->bio, 0, 0); 2528 submit_one_bio(WRITE, epd->bio, 0, 0);
2529 epd->bio = NULL; 2529 epd->bio = NULL;
2530 } 2530 }
2531 } 2531 }
2532 2532
2533 static noinline void flush_write_bio(void *data) 2533 static noinline void flush_write_bio(void *data)
2534 { 2534 {
2535 struct extent_page_data *epd = data; 2535 struct extent_page_data *epd = data;
2536 flush_epd_write_bio(epd); 2536 flush_epd_write_bio(epd);
2537 } 2537 }
2538 2538
2539 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2539 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2540 get_extent_t *get_extent, 2540 get_extent_t *get_extent,
2541 struct writeback_control *wbc) 2541 struct writeback_control *wbc)
2542 { 2542 {
2543 int ret; 2543 int ret;
2544 struct address_space *mapping = page->mapping; 2544 struct address_space *mapping = page->mapping;
2545 struct extent_page_data epd = { 2545 struct extent_page_data epd = {
2546 .bio = NULL, 2546 .bio = NULL,
2547 .tree = tree, 2547 .tree = tree,
2548 .get_extent = get_extent, 2548 .get_extent = get_extent,
2549 .extent_locked = 0, 2549 .extent_locked = 0,
2550 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2550 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2551 }; 2551 };
2552 struct writeback_control wbc_writepages = { 2552 struct writeback_control wbc_writepages = {
2553 .sync_mode = wbc->sync_mode, 2553 .sync_mode = wbc->sync_mode,
2554 .older_than_this = NULL,
2555 .nr_to_write = 64, 2554 .nr_to_write = 64,
2556 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2555 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2557 .range_end = (loff_t)-1, 2556 .range_end = (loff_t)-1,
2558 }; 2557 };
2559 2558
2560 ret = __extent_writepage(page, wbc, &epd); 2559 ret = __extent_writepage(page, wbc, &epd);
2561 2560
2562 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2561 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2563 __extent_writepage, &epd, flush_write_bio); 2562 __extent_writepage, &epd, flush_write_bio);
2564 flush_epd_write_bio(&epd); 2563 flush_epd_write_bio(&epd);
2565 return ret; 2564 return ret;
2566 } 2565 }
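
After writing the requested page itself, extent_write_full_page() reuses the same extent_page_data to opportunistically push up to 64 more dirty pages that follow it, which is why wbc_writepages.range_start is set one page past the page just written. A small sketch of that offset arithmetic, with a 4 KiB PAGE_CACHE_SIZE assumed purely for illustration:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12			/* 4 KiB pages, assumed for illustration */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long index = 25;		/* page->index of the page being written */
	unsigned long page_off = index << PAGE_CACHE_SHIFT;	/* page_offset(page) */

	/* The follow-on pass starts one page later, so __extent_writepage()
	 * is not asked to redo the page that was just written. */
	unsigned long range_start = page_off + PAGE_CACHE_SIZE;

	printf("page offset %lu, range_start %lu\n", page_off, range_start);
	/* page offset 102400, range_start 106496 */
	return 0;
}
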
2567 2566
2568 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 2567 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2569 u64 start, u64 end, get_extent_t *get_extent, 2568 u64 start, u64 end, get_extent_t *get_extent,
2570 int mode) 2569 int mode)
2571 { 2570 {
2572 int ret = 0; 2571 int ret = 0;
2573 struct address_space *mapping = inode->i_mapping; 2572 struct address_space *mapping = inode->i_mapping;
2574 struct page *page; 2573 struct page *page;
2575 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 2574 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2576 PAGE_CACHE_SHIFT; 2575 PAGE_CACHE_SHIFT;
2577 2576
2578 struct extent_page_data epd = { 2577 struct extent_page_data epd = {
2579 .bio = NULL, 2578 .bio = NULL,
2580 .tree = tree, 2579 .tree = tree,
2581 .get_extent = get_extent, 2580 .get_extent = get_extent,
2582 .extent_locked = 1, 2581 .extent_locked = 1,
2583 .sync_io = mode == WB_SYNC_ALL, 2582 .sync_io = mode == WB_SYNC_ALL,
2584 }; 2583 };
2585 struct writeback_control wbc_writepages = { 2584 struct writeback_control wbc_writepages = {
2586 .sync_mode = mode, 2585 .sync_mode = mode,
2587 .older_than_this = NULL,
2588 .nr_to_write = nr_pages * 2, 2586 .nr_to_write = nr_pages * 2,
2589 .range_start = start, 2587 .range_start = start,
2590 .range_end = end + 1, 2588 .range_end = end + 1,
2591 }; 2589 };
2592 2590
2593 while (start <= end) { 2591 while (start <= end) {
2594 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 2592 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2595 if (clear_page_dirty_for_io(page)) 2593 if (clear_page_dirty_for_io(page))
2596 ret = __extent_writepage(page, &wbc_writepages, &epd); 2594 ret = __extent_writepage(page, &wbc_writepages, &epd);
2597 else { 2595 else {
2598 if (tree->ops && tree->ops->writepage_end_io_hook) 2596 if (tree->ops && tree->ops->writepage_end_io_hook)
2599 tree->ops->writepage_end_io_hook(page, start, 2597 tree->ops->writepage_end_io_hook(page, start,
2600 start + PAGE_CACHE_SIZE - 1, 2598 start + PAGE_CACHE_SIZE - 1,
2601 NULL, 1); 2599 NULL, 1);
2602 unlock_page(page); 2600 unlock_page(page);
2603 } 2601 }
2604 page_cache_release(page); 2602 page_cache_release(page);
2605 start += PAGE_CACHE_SIZE; 2603 start += PAGE_CACHE_SIZE;
2606 } 2604 }
2607 2605
2608 flush_epd_write_bio(&epd); 2606 flush_epd_write_bio(&epd);
2609 return ret; 2607 return ret;
2610 } 2608 }
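
Because 'end' is inclusive here, the nr_pages computation above is a round-up of the byte length (end - start + 1) to whole pages, and nr_to_write is then given twice that much headroom. A standalone sketch of the arithmetic, again assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12			/* 4 KiB pages, assumed for illustration */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

/* 'end' is inclusive, so adding PAGE_CACHE_SIZE before the shift rounds
 * the byte count (end - start + 1) up to whole pages. */
static unsigned long locked_range_pages(unsigned long start, unsigned long end)
{
	return (end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT;
}

int main(void)
{
	printf("%lu\n", locked_range_pages(0, 4095));	/* 1 */
	printf("%lu\n", locked_range_pages(0, 8191));	/* 2 */
	printf("%lu\n", locked_range_pages(0, 8192));	/* 3: one byte spills into a third page */
	return 0;
}
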
2611 2609
2612 int extent_writepages(struct extent_io_tree *tree, 2610 int extent_writepages(struct extent_io_tree *tree,
2613 struct address_space *mapping, 2611 struct address_space *mapping,
2614 get_extent_t *get_extent, 2612 get_extent_t *get_extent,
2615 struct writeback_control *wbc) 2613 struct writeback_control *wbc)
2616 { 2614 {
2617 int ret = 0; 2615 int ret = 0;
2618 struct extent_page_data epd = { 2616 struct extent_page_data epd = {
2619 .bio = NULL, 2617 .bio = NULL,
2620 .tree = tree, 2618 .tree = tree,
2621 .get_extent = get_extent, 2619 .get_extent = get_extent,
2622 .extent_locked = 0, 2620 .extent_locked = 0,
2623 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2621 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2624 }; 2622 };
2625 2623
2626 ret = extent_write_cache_pages(tree, mapping, wbc, 2624 ret = extent_write_cache_pages(tree, mapping, wbc,
2627 __extent_writepage, &epd, 2625 __extent_writepage, &epd,
2628 flush_write_bio); 2626 flush_write_bio);
2629 flush_epd_write_bio(&epd); 2627 flush_epd_write_bio(&epd);
2630 return ret; 2628 return ret;
2631 } 2629 }
2632 2630
2633 int extent_readpages(struct extent_io_tree *tree, 2631 int extent_readpages(struct extent_io_tree *tree,
2634 struct address_space *mapping, 2632 struct address_space *mapping,
2635 struct list_head *pages, unsigned nr_pages, 2633 struct list_head *pages, unsigned nr_pages,
2636 get_extent_t get_extent) 2634 get_extent_t get_extent)
2637 { 2635 {
2638 struct bio *bio = NULL; 2636 struct bio *bio = NULL;
2639 unsigned page_idx; 2637 unsigned page_idx;
2640 unsigned long bio_flags = 0; 2638 unsigned long bio_flags = 0;
2641 2639
2642 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2640 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2643 struct page *page = list_entry(pages->prev, struct page, lru); 2641 struct page *page = list_entry(pages->prev, struct page, lru);
2644 2642
2645 prefetchw(&page->flags); 2643 prefetchw(&page->flags);
2646 list_del(&page->lru); 2644 list_del(&page->lru);
2647 if (!add_to_page_cache_lru(page, mapping, 2645 if (!add_to_page_cache_lru(page, mapping,
2648 page->index, GFP_NOFS)) { 2646 page->index, GFP_NOFS)) {
2649 __extent_read_full_page(tree, page, get_extent, 2647 __extent_read_full_page(tree, page, get_extent,
2650 &bio, 0, &bio_flags); 2648 &bio, 0, &bio_flags);
2651 } 2649 }
2652 page_cache_release(page); 2650 page_cache_release(page);
2653 } 2651 }
2654 BUG_ON(!list_empty(pages)); 2652 BUG_ON(!list_empty(pages));
2655 if (bio) 2653 if (bio)
2656 submit_one_bio(READ, bio, 0, bio_flags); 2654 submit_one_bio(READ, bio, 0, bio_flags);
2657 return 0; 2655 return 0;
2658 } 2656 }
2659 2657
2660 /* 2658 /*
2661 * basic invalidatepage code, this waits on any locked or writeback 2659 * basic invalidatepage code, this waits on any locked or writeback
2662 * ranges corresponding to the page, and then deletes any extent state 2660 * ranges corresponding to the page, and then deletes any extent state
2663 * records from the tree 2661 * records from the tree
2664 */ 2662 */
2665 int extent_invalidatepage(struct extent_io_tree *tree, 2663 int extent_invalidatepage(struct extent_io_tree *tree,
2666 struct page *page, unsigned long offset) 2664 struct page *page, unsigned long offset)
2667 { 2665 {
2668 struct extent_state *cached_state = NULL; 2666 struct extent_state *cached_state = NULL;
2669 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2667 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2670 u64 end = start + PAGE_CACHE_SIZE - 1; 2668 u64 end = start + PAGE_CACHE_SIZE - 1;
2671 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2669 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2672 2670
2673 start += (offset + blocksize - 1) & ~(blocksize - 1); 2671 start += (offset + blocksize - 1) & ~(blocksize - 1);
2674 if (start > end) 2672 if (start > end)
2675 return 0; 2673 return 0;
2676 2674
2677 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); 2675 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2678 wait_on_page_writeback(page); 2676 wait_on_page_writeback(page);
2679 clear_extent_bit(tree, start, end, 2677 clear_extent_bit(tree, start, end,
2680 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2678 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2681 EXTENT_DO_ACCOUNTING, 2679 EXTENT_DO_ACCOUNTING,
2682 1, 1, &cached_state, GFP_NOFS); 2680 1, 1, &cached_state, GFP_NOFS);
2683 return 0; 2681 return 0;
2684 } 2682 }
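
The round-up at the top of extent_invalidatepage() finds the first block boundary at or after the truncation offset, so extent state is only dropped for blocks that are invalidated in full; if that boundary lands past the end of the page there is nothing to clear and the function returns early. A self-contained sketch of the power-of-two round-up, with invented offsets and block sizes:

#include <stdio.h>

/* Round 'offset' up to the next multiple of 'blocksize' (a power of two),
 * exactly as done before deciding which blocks can be invalidated. */
static unsigned long round_up_pow2(unsigned long offset, unsigned long blocksize)
{
	return (offset + blocksize - 1) & ~(blocksize - 1);
}

int main(void)
{
	printf("%lu\n", round_up_pow2(0, 4096));	/* 0: the whole page is going away */
	printf("%lu\n", round_up_pow2(1, 4096));	/* 4096: start moves past end, nothing to clear */
	printf("%lu\n", round_up_pow2(1500, 512));	/* 1536: first block invalidated in full */
	return 0;
}
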
2685 2683
2686 /* 2684 /*
2687 * a helper for releasepage, this tests for areas of the page that 2685 * a helper for releasepage, this tests for areas of the page that
2688 * are locked or under IO and drops the related state bits if it is safe 2686 * are locked or under IO and drops the related state bits if it is safe
2689 * to drop the page. 2687 * to drop the page.
2690 */ 2688 */
2691 int try_release_extent_state(struct extent_map_tree *map, 2689 int try_release_extent_state(struct extent_map_tree *map,
2692 struct extent_io_tree *tree, struct page *page, 2690 struct extent_io_tree *tree, struct page *page,
2693 gfp_t mask) 2691 gfp_t mask)
2694 { 2692 {
2695 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2693 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2696 u64 end = start + PAGE_CACHE_SIZE - 1; 2694 u64 end = start + PAGE_CACHE_SIZE - 1;
2697 int ret = 1; 2695 int ret = 1;
2698 2696
2699 if (test_range_bit(tree, start, end, 2697 if (test_range_bit(tree, start, end,
2700 EXTENT_IOBITS, 0, NULL)) 2698 EXTENT_IOBITS, 0, NULL))
2701 ret = 0; 2699 ret = 0;
2702 else { 2700 else {
2703 if ((mask & GFP_NOFS) == GFP_NOFS) 2701 if ((mask & GFP_NOFS) == GFP_NOFS)
2704 mask = GFP_NOFS; 2702 mask = GFP_NOFS;
2705 /* 2703 /*
2706 * at this point we can safely clear everything except the 2704 * at this point we can safely clear everything except the
2707 * locked bit and the nodatasum bit 2705 * locked bit and the nodatasum bit
2708 */ 2706 */
2709 ret = clear_extent_bit(tree, start, end, 2707 ret = clear_extent_bit(tree, start, end,
2710 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2708 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2711 0, 0, NULL, mask); 2709 0, 0, NULL, mask);
2712 2710
2713 /* if clear_extent_bit failed for enomem reasons, 2711 /* if clear_extent_bit failed for enomem reasons,
2714 * we can't allow the release to continue. 2712 * we can't allow the release to continue.
2715 */ 2713 */
2716 if (ret < 0) 2714 if (ret < 0)
2717 ret = 0; 2715 ret = 0;
2718 else 2716 else
2719 ret = 1; 2717 ret = 1;
2720 } 2718 }
2721 return ret; 2719 return ret;
2722 } 2720 }
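
The clear_extent_bit() call above passes the complement of the bits that must survive, which reads backwards at first glance: everything recorded for the range is cleared except EXTENT_LOCKED and EXTENT_NODATASUM. A toy illustration of that mask trick (the bit values below are invented, not the real EXTENT_* definitions):

#include <stdio.h>

/* Invented bit values -- not the real EXTENT_* definitions. */
#define LOCKED    (1u << 0)
#define DIRTY     (1u << 1)
#define UPTODATE  (1u << 2)
#define NODATASUM (1u << 3)

int main(void)
{
	unsigned int state = DIRTY | UPTODATE | NODATASUM;

	/* Passing ~(LOCKED | NODATASUM) as the bits to clear means
	 * "drop everything except these two". */
	unsigned int clear_mask = ~(LOCKED | NODATASUM);
	unsigned int after = state & ~clear_mask;

	printf("before 0x%x after 0x%x\n", state, after);	/* before 0xe after 0x8 */
	return 0;
}
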
2723 2721
2724 /* 2722 /*
2725 * a helper for releasepage. As long as there are no locked extents 2723 * a helper for releasepage. As long as there are no locked extents
2726 * in the range corresponding to the page, both state records and extent 2724 * in the range corresponding to the page, both state records and extent
2727 * map records are removed 2725 * map records are removed
2728 */ 2726 */
2729 int try_release_extent_mapping(struct extent_map_tree *map, 2727 int try_release_extent_mapping(struct extent_map_tree *map,
2730 struct extent_io_tree *tree, struct page *page, 2728 struct extent_io_tree *tree, struct page *page,
2731 gfp_t mask) 2729 gfp_t mask)
2732 { 2730 {
2733 struct extent_map *em; 2731 struct extent_map *em;
2734 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2732 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2735 u64 end = start + PAGE_CACHE_SIZE - 1; 2733 u64 end = start + PAGE_CACHE_SIZE - 1;
2736 2734
2737 if ((mask & __GFP_WAIT) && 2735 if ((mask & __GFP_WAIT) &&
2738 page->mapping->host->i_size > 16 * 1024 * 1024) { 2736 page->mapping->host->i_size > 16 * 1024 * 1024) {
2739 u64 len; 2737 u64 len;
2740 while (start <= end) { 2738 while (start <= end) {
2741 len = end - start + 1; 2739 len = end - start + 1;
2742 write_lock(&map->lock); 2740 write_lock(&map->lock);
2743 em = lookup_extent_mapping(map, start, len); 2741 em = lookup_extent_mapping(map, start, len);
2744 if (IS_ERR_OR_NULL(em)) { 2742 if (IS_ERR_OR_NULL(em)) {
2745 write_unlock(&map->lock); 2743 write_unlock(&map->lock);
2746 break; 2744 break;
2747 } 2745 }
2748 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 2746 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2749 em->start != start) { 2747 em->start != start) {
2750 write_unlock(&map->lock); 2748 write_unlock(&map->lock);
2751 free_extent_map(em); 2749 free_extent_map(em);
2752 break; 2750 break;
2753 } 2751 }
2754 if (!test_range_bit(tree, em->start, 2752 if (!test_range_bit(tree, em->start,
2755 extent_map_end(em) - 1, 2753 extent_map_end(em) - 1,
2756 EXTENT_LOCKED | EXTENT_WRITEBACK, 2754 EXTENT_LOCKED | EXTENT_WRITEBACK,
2757 0, NULL)) { 2755 0, NULL)) {
2758 remove_extent_mapping(map, em); 2756 remove_extent_mapping(map, em);
2759 /* once for the rb tree */ 2757 /* once for the rb tree */
2760 free_extent_map(em); 2758 free_extent_map(em);
2761 } 2759 }
2762 start = extent_map_end(em); 2760 start = extent_map_end(em);
2763 write_unlock(&map->lock); 2761 write_unlock(&map->lock);
2764 2762
2765 /* once for us */ 2763 /* once for us */
2766 free_extent_map(em); 2764 free_extent_map(em);
2767 } 2765 }
2768 } 2766 }
2769 return try_release_extent_state(map, tree, page, mask); 2767 return try_release_extent_state(map, tree, page, mask);
2770 } 2768 }
2771 2769
2772 /* 2770 /*
2773 * helper function for fiemap, which doesn't want to see any holes. 2771 * helper function for fiemap, which doesn't want to see any holes.
2774 * This maps until we find something past 'last' 2772 * This maps until we find something past 'last'
2775 */ 2773 */
2776 static struct extent_map *get_extent_skip_holes(struct inode *inode, 2774 static struct extent_map *get_extent_skip_holes(struct inode *inode,
2777 u64 offset, 2775 u64 offset,
2778 u64 last, 2776 u64 last,
2779 get_extent_t *get_extent) 2777 get_extent_t *get_extent)
2780 { 2778 {
2781 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 2779 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2782 struct extent_map *em; 2780 struct extent_map *em;
2783 u64 len; 2781 u64 len;
2784 2782
2785 if (offset >= last) 2783 if (offset >= last)
2786 return NULL; 2784 return NULL;
2787 2785
2788 while(1) { 2786 while(1) {
2789 len = last - offset; 2787 len = last - offset;
2790 if (len == 0) 2788 if (len == 0)
2791 break; 2789 break;
2792 len = (len + sectorsize - 1) & ~(sectorsize - 1); 2790 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2793 em = get_extent(inode, NULL, 0, offset, len, 0); 2791 em = get_extent(inode, NULL, 0, offset, len, 0);
2794 if (IS_ERR_OR_NULL(em)) 2792 if (IS_ERR_OR_NULL(em))
2795 return em; 2793 return em;
2796 2794
2797 /* if this isn't a hole return it */ 2795 /* if this isn't a hole return it */
2798 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 2796 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2799 em->block_start != EXTENT_MAP_HOLE) { 2797 em->block_start != EXTENT_MAP_HOLE) {
2800 return em; 2798 return em;
2801 } 2799 }
2802 2800
2803 /* this is a hole, advance to the next extent */ 2801 /* this is a hole, advance to the next extent */
2804 offset = extent_map_end(em); 2802 offset = extent_map_end(em);
2805 free_extent_map(em); 2803 free_extent_map(em);
2806 if (offset >= last) 2804 if (offset >= last)
2807 break; 2805 break;
2808 } 2806 }
2809 return NULL; 2807 return NULL;
2810 } 2808 }
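
get_extent_skip_holes() keeps asking for mappings and advancing past anything that is a hole until it either finds real data or walks off 'last'. The sketch below models that walk over a hand-built table; the struct and helper are invented stand-ins, not the btrfs extent-map API:

#include <stdio.h>
#include <stddef.h>

/* A toy stand-in for an extent map entry: [start, end) plus a hole flag.
 * This only models the skip-holes walk; it is not the btrfs API. */
struct toy_em {
	unsigned long long start, end;
	int is_hole;
};

/* Return the first non-hole entry covering 'offset', or NULL once the
 * walk reaches 'last' -- mirroring the loop in get_extent_skip_holes(). */
static const struct toy_em *skip_holes(const struct toy_em *map, int n,
				       unsigned long long offset,
				       unsigned long long last)
{
	int i;

	while (offset < last) {
		for (i = 0; i < n; i++)
			if (offset >= map[i].start && offset < map[i].end)
				break;
		if (i == n)
			return NULL;		/* nothing mapped here at all */
		if (!map[i].is_hole)
			return &map[i];		/* found real data */
		offset = map[i].end;		/* hole: advance past it and retry */
	}
	return NULL;
}

int main(void)
{
	static const struct toy_em map[] = {
		{ 0,     4096,  1 },		/* hole */
		{ 4096,  16384, 0 },		/* data */
		{ 16384, 32768, 1 },		/* hole */
	};
	const struct toy_em *em = skip_holes(map, 3, 0, 32768);

	if (em)
		printf("first data extent starts at %llu\n", em->start);	/* 4096 */
	return 0;
}
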
2811 2809
2812 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2810 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2813 __u64 start, __u64 len, get_extent_t *get_extent) 2811 __u64 start, __u64 len, get_extent_t *get_extent)
2814 { 2812 {
2815 int ret = 0; 2813 int ret = 0;
2816 u64 off = start; 2814 u64 off = start;
2817 u64 max = start + len; 2815 u64 max = start + len;
2818 u32 flags = 0; 2816 u32 flags = 0;
2819 u32 found_type; 2817 u32 found_type;
2820 u64 last; 2818 u64 last;
2821 u64 last_for_get_extent = 0; 2819 u64 last_for_get_extent = 0;
2822 u64 disko = 0; 2820 u64 disko = 0;
2823 u64 isize = i_size_read(inode); 2821 u64 isize = i_size_read(inode);
2824 struct btrfs_key found_key; 2822 struct btrfs_key found_key;
2825 struct extent_map *em = NULL; 2823 struct extent_map *em = NULL;
2826 struct extent_state *cached_state = NULL; 2824 struct extent_state *cached_state = NULL;
2827 struct btrfs_path *path; 2825 struct btrfs_path *path;
2828 struct btrfs_file_extent_item *item; 2826 struct btrfs_file_extent_item *item;
2829 int end = 0; 2827 int end = 0;
2830 u64 em_start = 0; 2828 u64 em_start = 0;
2831 u64 em_len = 0; 2829 u64 em_len = 0;
2832 u64 em_end = 0; 2830 u64 em_end = 0;
2833 unsigned long emflags; 2831 unsigned long emflags;
2834 2832
2835 if (len == 0) 2833 if (len == 0)
2836 return -EINVAL; 2834 return -EINVAL;
2837 2835
2838 path = btrfs_alloc_path(); 2836 path = btrfs_alloc_path();
2839 if (!path) 2837 if (!path)
2840 return -ENOMEM; 2838 return -ENOMEM;
2841 path->leave_spinning = 1; 2839 path->leave_spinning = 1;
2842 2840
2843 /* 2841 /*
2844 * lookup the last file extent. We're not using i_size here 2842 * lookup the last file extent. We're not using i_size here
2845 * because there might be preallocation past i_size 2843 * because there might be preallocation past i_size
2846 */ 2844 */
2847 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 2845 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2848 path, btrfs_ino(inode), -1, 0); 2846 path, btrfs_ino(inode), -1, 0);
2849 if (ret < 0) { 2847 if (ret < 0) {
2850 btrfs_free_path(path); 2848 btrfs_free_path(path);
2851 return ret; 2849 return ret;
2852 } 2850 }
2853 WARN_ON(!ret); 2851 WARN_ON(!ret);
2854 path->slots[0]--; 2852 path->slots[0]--;
2855 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2853 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2856 struct btrfs_file_extent_item); 2854 struct btrfs_file_extent_item);
2857 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 2855 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2858 found_type = btrfs_key_type(&found_key); 2856 found_type = btrfs_key_type(&found_key);
2859 2857
2860 /* No extents, but there might be delalloc bits */ 2858 /* No extents, but there might be delalloc bits */
2861 if (found_key.objectid != btrfs_ino(inode) || 2859 if (found_key.objectid != btrfs_ino(inode) ||
2862 found_type != BTRFS_EXTENT_DATA_KEY) { 2860 found_type != BTRFS_EXTENT_DATA_KEY) {
2863 /* have to trust i_size as the end */ 2861 /* have to trust i_size as the end */
2864 last = (u64)-1; 2862 last = (u64)-1;
2865 last_for_get_extent = isize; 2863 last_for_get_extent = isize;
2866 } else { 2864 } else {
2867 /* 2865 /*
2868 * remember the start of the last extent. There are a 2866 * remember the start of the last extent. There are a
2869 * bunch of different factors that go into the length of the 2867 * bunch of different factors that go into the length of the
2870 * extent, so it's much less complex to remember where it started 2868 * extent, so it's much less complex to remember where it started
2871 */ 2869 */
2872 last = found_key.offset; 2870 last = found_key.offset;
2873 last_for_get_extent = last + 1; 2871 last_for_get_extent = last + 1;
2874 } 2872 }
2875 btrfs_free_path(path); 2873 btrfs_free_path(path);
2876 2874
2877 /* 2875 /*
2878 * we might have some extents allocated but more delalloc past those 2876 * we might have some extents allocated but more delalloc past those
2879 * extents. so, we trust isize unless the start of the last extent is 2877 * extents. so, we trust isize unless the start of the last extent is
2880 * beyond isize 2878 * beyond isize
2881 */ 2879 */
2882 if (last < isize) { 2880 if (last < isize) {
2883 last = (u64)-1; 2881 last = (u64)-1;
2884 last_for_get_extent = isize; 2882 last_for_get_extent = isize;
2885 } 2883 }
2886 2884
2887 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2885 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2888 &cached_state, GFP_NOFS); 2886 &cached_state, GFP_NOFS);
2889 2887
2890 em = get_extent_skip_holes(inode, off, last_for_get_extent, 2888 em = get_extent_skip_holes(inode, off, last_for_get_extent,
2891 get_extent); 2889 get_extent);
2892 if (!em) 2890 if (!em)
2893 goto out; 2891 goto out;
2894 if (IS_ERR(em)) { 2892 if (IS_ERR(em)) {
2895 ret = PTR_ERR(em); 2893 ret = PTR_ERR(em);
2896 goto out; 2894 goto out;
2897 } 2895 }
2898 2896
2899 while (!end) { 2897 while (!end) {
2900 u64 offset_in_extent; 2898 u64 offset_in_extent;
2901 2899
2902 /* break if the extent we found is outside the range */ 2900 /* break if the extent we found is outside the range */
2903 if (em->start >= max || extent_map_end(em) < off) 2901 if (em->start >= max || extent_map_end(em) < off)
2904 break; 2902 break;
2905 2903
2906 /* 2904 /*
2907 * get_extent may return an extent that starts before our 2905 * get_extent may return an extent that starts before our
2908 * requested range. We have to make sure the ranges 2906 * requested range. We have to make sure the ranges
2909 * we return to fiemap always move forward and don't 2907 * we return to fiemap always move forward and don't
2910 * overlap, so adjust the offsets here 2908 * overlap, so adjust the offsets here
2911 */ 2909 */
2912 em_start = max(em->start, off); 2910 em_start = max(em->start, off);
2913 2911
2914 /* 2912 /*
2915 * record the offset from the start of the extent 2913 * record the offset from the start of the extent
2916 * for adjusting the disk offset below 2914 * for adjusting the disk offset below
2917 */ 2915 */
2918 offset_in_extent = em_start - em->start; 2916 offset_in_extent = em_start - em->start;
2919 em_end = extent_map_end(em); 2917 em_end = extent_map_end(em);
2920 em_len = em_end - em_start; 2918 em_len = em_end - em_start;
2921 emflags = em->flags; 2919 emflags = em->flags;
2922 disko = 0; 2920 disko = 0;
2923 flags = 0; 2921 flags = 0;
2924 2922
2925 /* 2923 /*
2926 * bump off for our next call to get_extent 2924 * bump off for our next call to get_extent
2927 */ 2925 */
2928 off = extent_map_end(em); 2926 off = extent_map_end(em);
2929 if (off >= max) 2927 if (off >= max)
2930 end = 1; 2928 end = 1;
2931 2929
2932 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2930 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2933 end = 1; 2931 end = 1;
2934 flags |= FIEMAP_EXTENT_LAST; 2932 flags |= FIEMAP_EXTENT_LAST;
2935 } else if (em->block_start == EXTENT_MAP_INLINE) { 2933 } else if (em->block_start == EXTENT_MAP_INLINE) {
2936 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2934 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2937 FIEMAP_EXTENT_NOT_ALIGNED); 2935 FIEMAP_EXTENT_NOT_ALIGNED);
2938 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 2936 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2939 flags |= (FIEMAP_EXTENT_DELALLOC | 2937 flags |= (FIEMAP_EXTENT_DELALLOC |
2940 FIEMAP_EXTENT_UNKNOWN); 2938 FIEMAP_EXTENT_UNKNOWN);
2941 } else { 2939 } else {
2942 disko = em->block_start + offset_in_extent; 2940 disko = em->block_start + offset_in_extent;
2943 } 2941 }
2944 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2942 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2945 flags |= FIEMAP_EXTENT_ENCODED; 2943 flags |= FIEMAP_EXTENT_ENCODED;
2946 2944
2947 free_extent_map(em); 2945 free_extent_map(em);
2948 em = NULL; 2946 em = NULL;
2949 if ((em_start >= last) || em_len == (u64)-1 || 2947 if ((em_start >= last) || em_len == (u64)-1 ||
2950 (last == (u64)-1 && isize <= em_end)) { 2948 (last == (u64)-1 && isize <= em_end)) {
2951 flags |= FIEMAP_EXTENT_LAST; 2949 flags |= FIEMAP_EXTENT_LAST;
2952 end = 1; 2950 end = 1;
2953 } 2951 }
2954 2952
2955 /* now scan forward to see if this is really the last extent. */ 2953 /* now scan forward to see if this is really the last extent. */
2956 em = get_extent_skip_holes(inode, off, last_for_get_extent, 2954 em = get_extent_skip_holes(inode, off, last_for_get_extent,
2957 get_extent); 2955 get_extent);
2958 if (IS_ERR(em)) { 2956 if (IS_ERR(em)) {
2959 ret = PTR_ERR(em); 2957 ret = PTR_ERR(em);
2960 goto out; 2958 goto out;
2961 } 2959 }
2962 if (!em) { 2960 if (!em) {
2963 flags |= FIEMAP_EXTENT_LAST; 2961 flags |= FIEMAP_EXTENT_LAST;
2964 end = 1; 2962 end = 1;
2965 } 2963 }
2966 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 2964 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2967 em_len, flags); 2965 em_len, flags);
2968 if (ret) 2966 if (ret)
2969 goto out_free; 2967 goto out_free;
2970 } 2968 }
2971 out_free: 2969 out_free:
2972 free_extent_map(em); 2970 free_extent_map(em);
2973 out: 2971 out:
2974 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 2972 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
2975 &cached_state, GFP_NOFS); 2973 &cached_state, GFP_NOFS);
2976 return ret; 2974 return ret;
2977 } 2975 }
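
The clamping in the fiemap loop is worth spelling out: when get_extent returns a mapping that starts before the requested range, the logical start is clamped to 'off' and the on-disk offset is advanced by the same amount, so the extents reported to fiemap never move backwards or overlap. A numeric sketch with invented values:

#include <stdio.h>

int main(void)
{
	/* Invented numbers: an extent map that begins before the range
	 * fiemap asked about. */
	unsigned long long em_block_start = 1048576;	/* on-disk start of the extent */
	unsigned long long em_start = 8192;		/* file offset where the extent begins */
	unsigned long long em_end   = 65536;		/* extent_map_end(em) */
	unsigned long long off      = 16384;		/* start of the requested range */

	unsigned long long clamped_start    = em_start > off ? em_start : off;	/* max(em->start, off) */
	unsigned long long offset_in_extent = clamped_start - em_start;
	unsigned long long clamped_len      = em_end - clamped_start;
	unsigned long long disko            = em_block_start + offset_in_extent;

	printf("logical %llu len %llu physical %llu\n",
	       clamped_start, clamped_len, disko);
	/* logical 16384 len 49152 physical 1056768 */
	return 0;
}
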
2978 2976
2979 static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2977 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2980 unsigned long i) 2978 unsigned long i)
2981 { 2979 {
2982 struct page *p; 2980 struct page *p;
2983 struct address_space *mapping; 2981 struct address_space *mapping;
2984 2982
2985 if (i == 0) 2983 if (i == 0)
2986 return eb->first_page; 2984 return eb->first_page;
2987 i += eb->start >> PAGE_CACHE_SHIFT; 2985 i += eb->start >> PAGE_CACHE_SHIFT;
2988 mapping = eb->first_page->mapping; 2986 mapping = eb->first_page->mapping;
2989 if (!mapping) 2987 if (!mapping)
2990 return NULL; 2988 return NULL;
2991 2989
2992 /* 2990 /*
2993 * extent_buffer_page is only called after pinning the page 2991 * extent_buffer_page is only called after pinning the page
2994 * by increasing the reference count. So we know the page must 2992 * by increasing the reference count. So we know the page must
2995 * be in the radix tree. 2993 * be in the radix tree.
2996 */ 2994 */
2997 rcu_read_lock(); 2995 rcu_read_lock();
2998 p = radix_tree_lookup(&mapping->page_tree, i); 2996 p = radix_tree_lookup(&mapping->page_tree, i);
2999 rcu_read_unlock(); 2997 rcu_read_unlock();
3000 2998
3001 return p; 2999 return p;
3002 } 3000 }
3003 3001
3004 static inline unsigned long num_extent_pages(u64 start, u64 len) 3002 static inline unsigned long num_extent_pages(u64 start, u64 len)
3005 { 3003 {
3006 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3004 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3007 (start >> PAGE_CACHE_SHIFT); 3005 (start >> PAGE_CACHE_SHIFT);
3008 } 3006 }
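
num_extent_pages() is the usual "index one past the last byte's page minus index of the first page" count, which is why an unaligned start can cost an extra page. A userspace copy of the expression, assuming 4 KiB pages for the sample values:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12			/* 4 KiB pages, assumed for the samples */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

/* Index one past the last byte's page minus the index of the first page. */
static unsigned long num_extent_pages(unsigned long long start, unsigned long long len)
{
	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
	       (start >> PAGE_CACHE_SHIFT);
}

int main(void)
{
	printf("%lu\n", num_extent_pages(0, 4096));	/* 1 */
	printf("%lu\n", num_extent_pages(4096, 16384));	/* 4 */
	printf("%lu\n", num_extent_pages(6144, 16384));	/* 5: unaligned start straddles an extra page */
	return 0;
}
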
3009 3007
3010 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 3008 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3011 u64 start, 3009 u64 start,
3012 unsigned long len, 3010 unsigned long len,
3013 gfp_t mask) 3011 gfp_t mask)
3014 { 3012 {
3015 struct extent_buffer *eb = NULL; 3013 struct extent_buffer *eb = NULL;
3016 #if LEAK_DEBUG 3014 #if LEAK_DEBUG
3017 unsigned long flags; 3015 unsigned long flags;
3018 #endif 3016 #endif
3019 3017
3020 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3018 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3021 if (eb == NULL) 3019 if (eb == NULL)
3022 return NULL; 3020 return NULL;
3023 eb->start = start; 3021 eb->start = start;
3024 eb->len = len; 3022 eb->len = len;
3025 spin_lock_init(&eb->lock); 3023 spin_lock_init(&eb->lock);
3026 init_waitqueue_head(&eb->lock_wq); 3024 init_waitqueue_head(&eb->lock_wq);
3027 3025
3028 #if LEAK_DEBUG 3026 #if LEAK_DEBUG
3029 spin_lock_irqsave(&leak_lock, flags); 3027 spin_lock_irqsave(&leak_lock, flags);
3030 list_add(&eb->leak_list, &buffers); 3028 list_add(&eb->leak_list, &buffers);
3031 spin_unlock_irqrestore(&leak_lock, flags); 3029 spin_unlock_irqrestore(&leak_lock, flags);
3032 #endif 3030 #endif
3033 atomic_set(&eb->refs, 1); 3031 atomic_set(&eb->refs, 1);
3034 3032
3035 return eb; 3033 return eb;
3036 } 3034 }
3037 3035
3038 static void __free_extent_buffer(struct extent_buffer *eb) 3036 static void __free_extent_buffer(struct extent_buffer *eb)
3039 { 3037 {
3040 #if LEAK_DEBUG 3038 #if LEAK_DEBUG
3041 unsigned long flags; 3039 unsigned long flags;
3042 spin_lock_irqsave(&leak_lock, flags); 3040 spin_lock_irqsave(&leak_lock, flags);
3043 list_del(&eb->leak_list); 3041 list_del(&eb->leak_list);
3044 spin_unlock_irqrestore(&leak_lock, flags); 3042 spin_unlock_irqrestore(&leak_lock, flags);
3045 #endif 3043 #endif
3046 kmem_cache_free(extent_buffer_cache, eb); 3044 kmem_cache_free(extent_buffer_cache, eb);
3047 } 3045 }
3048 3046
3049 /* 3047 /*
3050 * Helper for releasing extent buffer page. 3048 * Helper for releasing extent buffer page.
3051 */ 3049 */
3052 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 3050 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3053 unsigned long start_idx) 3051 unsigned long start_idx)
3054 { 3052 {
3055 unsigned long index; 3053 unsigned long index;
3056 struct page *page; 3054 struct page *page;
3057 3055
3058 if (!eb->first_page) 3056 if (!eb->first_page)
3059 return; 3057 return;
3060 3058
3061 index = num_extent_pages(eb->start, eb->len); 3059 index = num_extent_pages(eb->start, eb->len);
3062 if (start_idx >= index) 3060 if (start_idx >= index)
3063 return; 3061 return;
3064 3062
3065 do { 3063 do {
3066 index--; 3064 index--;
3067 page = extent_buffer_page(eb, index); 3065 page = extent_buffer_page(eb, index);
3068 if (page) 3066 if (page)
3069 page_cache_release(page); 3067 page_cache_release(page);
3070 } while (index != start_idx); 3068 } while (index != start_idx);
3071 } 3069 }
3072 3070
3073 /* 3071 /*
3074 * Helper for releasing the extent buffer. 3072 * Helper for releasing the extent buffer.
3075 */ 3073 */
3076 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 3074 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3077 { 3075 {
3078 btrfs_release_extent_buffer_page(eb, 0); 3076 btrfs_release_extent_buffer_page(eb, 0);
3079 __free_extent_buffer(eb); 3077 __free_extent_buffer(eb);
3080 } 3078 }
3081 3079
3082 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3080 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3083 u64 start, unsigned long len, 3081 u64 start, unsigned long len,
3084 struct page *page0) 3082 struct page *page0)
3085 { 3083 {
3086 unsigned long num_pages = num_extent_pages(start, len); 3084 unsigned long num_pages = num_extent_pages(start, len);
3087 unsigned long i; 3085 unsigned long i;
3088 unsigned long index = start >> PAGE_CACHE_SHIFT; 3086 unsigned long index = start >> PAGE_CACHE_SHIFT;
3089 struct extent_buffer *eb; 3087 struct extent_buffer *eb;
3090 struct extent_buffer *exists = NULL; 3088 struct extent_buffer *exists = NULL;
3091 struct page *p; 3089 struct page *p;
3092 struct address_space *mapping = tree->mapping; 3090 struct address_space *mapping = tree->mapping;
3093 int uptodate = 1; 3091 int uptodate = 1;
3094 int ret; 3092 int ret;
3095 3093
3096 rcu_read_lock(); 3094 rcu_read_lock();
3097 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3095 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3098 if (eb && atomic_inc_not_zero(&eb->refs)) { 3096 if (eb && atomic_inc_not_zero(&eb->refs)) {
3099 rcu_read_unlock(); 3097 rcu_read_unlock();
3100 mark_page_accessed(eb->first_page); 3098 mark_page_accessed(eb->first_page);
3101 return eb; 3099 return eb;
3102 } 3100 }
3103 rcu_read_unlock(); 3101 rcu_read_unlock();
3104 3102
3105 eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); 3103 eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3106 if (!eb) 3104 if (!eb)
3107 return NULL; 3105 return NULL;
3108 3106
3109 if (page0) { 3107 if (page0) {
3110 eb->first_page = page0; 3108 eb->first_page = page0;
3111 i = 1; 3109 i = 1;
3112 index++; 3110 index++;
3113 page_cache_get(page0); 3111 page_cache_get(page0);
3114 mark_page_accessed(page0); 3112 mark_page_accessed(page0);
3115 set_page_extent_mapped(page0); 3113 set_page_extent_mapped(page0);
3116 set_page_extent_head(page0, len); 3114 set_page_extent_head(page0, len);
3117 uptodate = PageUptodate(page0); 3115 uptodate = PageUptodate(page0);
3118 } else { 3116 } else {
3119 i = 0; 3117 i = 0;
3120 } 3118 }
3121 for (; i < num_pages; i++, index++) { 3119 for (; i < num_pages; i++, index++) {
3122 p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); 3120 p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
3123 if (!p) { 3121 if (!p) {
3124 WARN_ON(1); 3122 WARN_ON(1);
3125 goto free_eb; 3123 goto free_eb;
3126 } 3124 }
3127 set_page_extent_mapped(p); 3125 set_page_extent_mapped(p);
3128 mark_page_accessed(p); 3126 mark_page_accessed(p);
3129 if (i == 0) { 3127 if (i == 0) {
3130 eb->first_page = p; 3128 eb->first_page = p;
3131 set_page_extent_head(p, len); 3129 set_page_extent_head(p, len);
3132 } else { 3130 } else {
3133 set_page_private(p, EXTENT_PAGE_PRIVATE); 3131 set_page_private(p, EXTENT_PAGE_PRIVATE);
3134 } 3132 }
3135 if (!PageUptodate(p)) 3133 if (!PageUptodate(p))
3136 uptodate = 0; 3134 uptodate = 0;
3137 3135
3138 /* 3136 /*
3139 * see below about how we avoid a nasty race with release page 3137 * see below about how we avoid a nasty race with release page
3140 * and why we unlock later 3138 * and why we unlock later
3141 */ 3139 */
3142 if (i != 0) 3140 if (i != 0)
3143 unlock_page(p); 3141 unlock_page(p);
3144 } 3142 }
3145 if (uptodate) 3143 if (uptodate)
3146 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3144 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3147 3145
3148 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 3146 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3149 if (ret) 3147 if (ret)
3150 goto free_eb; 3148 goto free_eb;
3151 3149
3152 spin_lock(&tree->buffer_lock); 3150 spin_lock(&tree->buffer_lock);
3153 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); 3151 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3154 if (ret == -EEXIST) { 3152 if (ret == -EEXIST) {
3155 exists = radix_tree_lookup(&tree->buffer, 3153 exists = radix_tree_lookup(&tree->buffer,
3156 start >> PAGE_CACHE_SHIFT); 3154 start >> PAGE_CACHE_SHIFT);
3157 /* add one reference for the caller */ 3155 /* add one reference for the caller */
3158 atomic_inc(&exists->refs); 3156 atomic_inc(&exists->refs);
3159 spin_unlock(&tree->buffer_lock); 3157 spin_unlock(&tree->buffer_lock);
3160 radix_tree_preload_end(); 3158 radix_tree_preload_end();
3161 goto free_eb; 3159 goto free_eb;
3162 } 3160 }
3163 /* add one reference for the tree */ 3161 /* add one reference for the tree */
3164 atomic_inc(&eb->refs); 3162 atomic_inc(&eb->refs);
3165 spin_unlock(&tree->buffer_lock); 3163 spin_unlock(&tree->buffer_lock);
3166 radix_tree_preload_end(); 3164 radix_tree_preload_end();
3167 3165
3168 /* 3166 /*
3169 * there is a race where release page may have 3167 * there is a race where release page may have
3170 * tried to find this extent buffer in the radix 3168 * tried to find this extent buffer in the radix
3171 * but failed. It will tell the VM it is safe to 3169 * but failed. It will tell the VM it is safe to
3172 * reclaim the page, and it will clear the page private bit. 3170 * reclaim the page, and it will clear the page private bit.
3173 * We must make sure to set the page private bit properly 3171 * We must make sure to set the page private bit properly
3174 * after the extent buffer is in the radix tree so 3172 * after the extent buffer is in the radix tree so
3175 * it doesn't get lost 3173 * it doesn't get lost
3176 */ 3174 */
3177 set_page_extent_mapped(eb->first_page); 3175 set_page_extent_mapped(eb->first_page);
3178 set_page_extent_head(eb->first_page, eb->len); 3176 set_page_extent_head(eb->first_page, eb->len);
3179 if (!page0) 3177 if (!page0)
3180 unlock_page(eb->first_page); 3178 unlock_page(eb->first_page);
3181 return eb; 3179 return eb;
3182 3180
3183 free_eb: 3181 free_eb:
3184 if (eb->first_page && !page0) 3182 if (eb->first_page && !page0)
3185 unlock_page(eb->first_page); 3183 unlock_page(eb->first_page);
3186 3184
3187 if (!atomic_dec_and_test(&eb->refs)) 3185 if (!atomic_dec_and_test(&eb->refs))
3188 return exists; 3186 return exists;
3189 btrfs_release_extent_buffer(eb); 3187 btrfs_release_extent_buffer(eb);
3190 return exists; 3188 return exists;
3191 } 3189 }
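
The fast-path lookup at the top of alloc_extent_buffer() (and find_extent_buffer() below) only returns a buffer when atomic_inc_not_zero() succeeds, so an extent buffer whose final reference is concurrently being dropped is treated as missing rather than resurrected; the -EEXIST loser instead takes its reference on the winner while still holding tree->buffer_lock. A userspace model of that "only while non-zero" reference rule using C11 atomics (the type and function names are invented for illustration):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* A minimal userspace model of the "take a reference only while the count
 * is still non-zero" rule; the names here are invented for illustration. */
struct obj {
	atomic_int refs;
};

static bool get_ref_not_zero(struct obj *o)
{
	int old = atomic_load(&o->refs);

	while (old != 0) {
		/* On failure 'old' is reloaded with the current count. */
		if (atomic_compare_exchange_weak(&o->refs, &old, old + 1))
			return true;
	}
	return false;
}

int main(void)
{
	struct obj live  = { ATOMIC_VAR_INIT(1) };	/* still referenced by the tree */
	struct obj dying = { ATOMIC_VAR_INIT(0) };	/* last reference already dropped */

	printf("live:  %s\n", get_ref_not_zero(&live)  ? "got reference" : "refused");
	printf("dying: %s\n", get_ref_not_zero(&dying) ? "got reference" : "refused");
	return 0;
}
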
3192 3190
3193 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3191 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3194 u64 start, unsigned long len) 3192 u64 start, unsigned long len)
3195 { 3193 {
3196 struct extent_buffer *eb; 3194 struct extent_buffer *eb;
3197 3195
3198 rcu_read_lock(); 3196 rcu_read_lock();
3199 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3197 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3200 if (eb && atomic_inc_not_zero(&eb->refs)) { 3198 if (eb && atomic_inc_not_zero(&eb->refs)) {
3201 rcu_read_unlock(); 3199 rcu_read_unlock();
3202 mark_page_accessed(eb->first_page); 3200 mark_page_accessed(eb->first_page);
3203 return eb; 3201 return eb;
3204 } 3202 }
3205 rcu_read_unlock(); 3203 rcu_read_unlock();
3206 3204
3207 return NULL; 3205 return NULL;
3208 } 3206 }
3209 3207
3210 void free_extent_buffer(struct extent_buffer *eb) 3208 void free_extent_buffer(struct extent_buffer *eb)
3211 { 3209 {
3212 if (!eb) 3210 if (!eb)
3213 return; 3211 return;
3214 3212
3215 if (!atomic_dec_and_test(&eb->refs)) 3213 if (!atomic_dec_and_test(&eb->refs))
3216 return; 3214 return;
3217 3215
3218 WARN_ON(1); 3216 WARN_ON(1);
3219 } 3217 }
3220 3218
3221 int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3219 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3222 struct extent_buffer *eb) 3220 struct extent_buffer *eb)
3223 { 3221 {
3224 unsigned long i; 3222 unsigned long i;
3225 unsigned long num_pages; 3223 unsigned long num_pages;
3226 struct page *page; 3224 struct page *page;
3227 3225
3228 num_pages = num_extent_pages(eb->start, eb->len); 3226 num_pages = num_extent_pages(eb->start, eb->len);
3229 3227
3230 for (i = 0; i < num_pages; i++) { 3228 for (i = 0; i < num_pages; i++) {
3231 page = extent_buffer_page(eb, i); 3229 page = extent_buffer_page(eb, i);
3232 if (!PageDirty(page)) 3230 if (!PageDirty(page))
3233 continue; 3231 continue;
3234 3232
3235 lock_page(page); 3233 lock_page(page);
3236 WARN_ON(!PagePrivate(page)); 3234 WARN_ON(!PagePrivate(page));
3237 3235
3238 set_page_extent_mapped(page); 3236 set_page_extent_mapped(page);
3239 if (i == 0) 3237 if (i == 0)
3240 set_page_extent_head(page, eb->len); 3238 set_page_extent_head(page, eb->len);
3241 3239
3242 clear_page_dirty_for_io(page); 3240 clear_page_dirty_for_io(page);
3243 spin_lock_irq(&page->mapping->tree_lock); 3241 spin_lock_irq(&page->mapping->tree_lock);
3244 if (!PageDirty(page)) { 3242 if (!PageDirty(page)) {
3245 radix_tree_tag_clear(&page->mapping->page_tree, 3243 radix_tree_tag_clear(&page->mapping->page_tree,
3246 page_index(page), 3244 page_index(page),
3247 PAGECACHE_TAG_DIRTY); 3245 PAGECACHE_TAG_DIRTY);
3248 } 3246 }
3249 spin_unlock_irq(&page->mapping->tree_lock); 3247 spin_unlock_irq(&page->mapping->tree_lock);
3250 unlock_page(page); 3248 unlock_page(page);
3251 } 3249 }
3252 return 0; 3250 return 0;
3253 } 3251 }
3254 3252
3255 int set_extent_buffer_dirty(struct extent_io_tree *tree, 3253 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3256 struct extent_buffer *eb) 3254 struct extent_buffer *eb)
3257 { 3255 {
3258 unsigned long i; 3256 unsigned long i;
3259 unsigned long num_pages; 3257 unsigned long num_pages;
3260 int was_dirty = 0; 3258 int was_dirty = 0;
3261 3259
3262 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3260 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3263 num_pages = num_extent_pages(eb->start, eb->len); 3261 num_pages = num_extent_pages(eb->start, eb->len);
3264 for (i = 0; i < num_pages; i++) 3262 for (i = 0; i < num_pages; i++)
3265 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3263 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3266 return was_dirty; 3264 return was_dirty;
3267 } 3265 }
3268 3266
3269 int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3267 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3270 struct extent_buffer *eb, 3268 struct extent_buffer *eb,
3271 struct extent_state **cached_state) 3269 struct extent_state **cached_state)
3272 { 3270 {
3273 unsigned long i; 3271 unsigned long i;
3274 struct page *page; 3272 struct page *page;
3275 unsigned long num_pages; 3273 unsigned long num_pages;
3276 3274
3277 num_pages = num_extent_pages(eb->start, eb->len); 3275 num_pages = num_extent_pages(eb->start, eb->len);
3278 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3276 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3279 3277
3280 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3278 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3281 cached_state, GFP_NOFS); 3279 cached_state, GFP_NOFS);
3282 for (i = 0; i < num_pages; i++) { 3280 for (i = 0; i < num_pages; i++) {
3283 page = extent_buffer_page(eb, i); 3281 page = extent_buffer_page(eb, i);
3284 if (page) 3282 if (page)
3285 ClearPageUptodate(page); 3283 ClearPageUptodate(page);
3286 } 3284 }
3287 return 0; 3285 return 0;
3288 } 3286 }
3289 3287
3290 int set_extent_buffer_uptodate(struct extent_io_tree *tree, 3288 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3291 struct extent_buffer *eb) 3289 struct extent_buffer *eb)
3292 { 3290 {
3293 unsigned long i; 3291 unsigned long i;
3294 struct page *page; 3292 struct page *page;
3295 unsigned long num_pages; 3293 unsigned long num_pages;
3296 3294
3297 num_pages = num_extent_pages(eb->start, eb->len); 3295 num_pages = num_extent_pages(eb->start, eb->len);
3298 3296
3299 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3297 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3300 NULL, GFP_NOFS); 3298 NULL, GFP_NOFS);
3301 for (i = 0; i < num_pages; i++) { 3299 for (i = 0; i < num_pages; i++) {
3302 page = extent_buffer_page(eb, i); 3300 page = extent_buffer_page(eb, i);
3303 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3301 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3304 ((i == num_pages - 1) && 3302 ((i == num_pages - 1) &&
3305 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 3303 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3306 check_page_uptodate(tree, page); 3304 check_page_uptodate(tree, page);
3307 continue; 3305 continue;
3308 } 3306 }
3309 SetPageUptodate(page); 3307 SetPageUptodate(page);
3310 } 3308 }
3311 return 0; 3309 return 0;
3312 } 3310 }
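
The special-casing in set_extent_buffer_uptodate() exists because the first or last page is only partially covered by the buffer when eb->start or eb->start + eb->len is not page aligned, and such a shared page cannot simply be marked uptodate. A sketch of just those two alignment tests, with invented values:

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL			/* 4 KiB pages, assumed for illustration */

int main(void)
{
	/* A metadata buffer that is not page aligned on either side. */
	unsigned long long start = 18432;	/* 16 KiB + 2 KiB */
	unsigned long long len   = 4096;

	int first_partial = (start & (PAGE_CACHE_SIZE - 1)) != 0;
	int last_partial  = ((start + len) & (PAGE_CACHE_SIZE - 1)) != 0;

	printf("first page shared: %d, last page shared: %d\n",
	       first_partial, last_partial);	/* 1, 1 */
	return 0;
}
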
3313 3311
3314 int extent_range_uptodate(struct extent_io_tree *tree, 3312 int extent_range_uptodate(struct extent_io_tree *tree,
3315 u64 start, u64 end) 3313 u64 start, u64 end)
3316 { 3314 {
3317 struct page *page; 3315 struct page *page;
3318 int ret; 3316 int ret;
3319 int pg_uptodate = 1; 3317 int pg_uptodate = 1;
3320 int uptodate; 3318 int uptodate;
3321 unsigned long index; 3319 unsigned long index;
3322 3320
3323 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); 3321 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
3324 if (ret) 3322 if (ret)
3325 return 1; 3323 return 1;
3326 while (start <= end) { 3324 while (start <= end) {
3327 index = start >> PAGE_CACHE_SHIFT; 3325 index = start >> PAGE_CACHE_SHIFT;
3328 page = find_get_page(tree->mapping, index); 3326 page = find_get_page(tree->mapping, index);
3329 uptodate = PageUptodate(page); 3327 uptodate = PageUptodate(page);
3330 page_cache_release(page); 3328 page_cache_release(page);
3331 if (!uptodate) { 3329 if (!uptodate) {
3332 pg_uptodate = 0; 3330 pg_uptodate = 0;
3333 break; 3331 break;
3334 } 3332 }
3335 start += PAGE_CACHE_SIZE; 3333 start += PAGE_CACHE_SIZE;
3336 } 3334 }
3337 return pg_uptodate; 3335 return pg_uptodate;
3338 } 3336 }
3339 3337
3340 int extent_buffer_uptodate(struct extent_io_tree *tree, 3338 int extent_buffer_uptodate(struct extent_io_tree *tree,
3341 struct extent_buffer *eb, 3339 struct extent_buffer *eb,
3342 struct extent_state *cached_state) 3340 struct extent_state *cached_state)
3343 { 3341 {
3344 int ret = 0; 3342 int ret = 0;
3345 unsigned long num_pages; 3343 unsigned long num_pages;
3346 unsigned long i; 3344 unsigned long i;
3347 struct page *page; 3345 struct page *page;
3348 int pg_uptodate = 1; 3346 int pg_uptodate = 1;
3349 3347
3350 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3348 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3351 return 1; 3349 return 1;
3352 3350
3353 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3351 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3354 EXTENT_UPTODATE, 1, cached_state); 3352 EXTENT_UPTODATE, 1, cached_state);
3355 if (ret) 3353 if (ret)
3356 return ret; 3354 return ret;
3357 3355
3358 num_pages = num_extent_pages(eb->start, eb->len); 3356 num_pages = num_extent_pages(eb->start, eb->len);
3359 for (i = 0; i < num_pages; i++) { 3357 for (i = 0; i < num_pages; i++) {
3360 page = extent_buffer_page(eb, i); 3358 page = extent_buffer_page(eb, i);
3361 if (!PageUptodate(page)) { 3359 if (!PageUptodate(page)) {
3362 pg_uptodate = 0; 3360 pg_uptodate = 0;
3363 break; 3361 break;
3364 } 3362 }
3365 } 3363 }
3366 return pg_uptodate; 3364 return pg_uptodate;
3367 } 3365 }
3368 3366
3369 int read_extent_buffer_pages(struct extent_io_tree *tree, 3367 int read_extent_buffer_pages(struct extent_io_tree *tree,
3370 struct extent_buffer *eb, 3368 struct extent_buffer *eb,
3371 u64 start, int wait, 3369 u64 start, int wait,
3372 get_extent_t *get_extent, int mirror_num) 3370 get_extent_t *get_extent, int mirror_num)
3373 { 3371 {
3374 unsigned long i; 3372 unsigned long i;
3375 unsigned long start_i; 3373 unsigned long start_i;
3376 struct page *page; 3374 struct page *page;
3377 int err; 3375 int err;
3378 int ret = 0; 3376 int ret = 0;
3379 int locked_pages = 0; 3377 int locked_pages = 0;
3380 int all_uptodate = 1; 3378 int all_uptodate = 1;
3381 int inc_all_pages = 0; 3379 int inc_all_pages = 0;
3382 unsigned long num_pages; 3380 unsigned long num_pages;
3383 struct bio *bio = NULL; 3381 struct bio *bio = NULL;
3384 unsigned long bio_flags = 0; 3382 unsigned long bio_flags = 0;
3385 3383
3386 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3384 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3387 return 0; 3385 return 0;
3388 3386
3389 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3387 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3390 EXTENT_UPTODATE, 1, NULL)) { 3388 EXTENT_UPTODATE, 1, NULL)) {
3391 return 0; 3389 return 0;
3392 } 3390 }
3393 3391
3394 if (start) { 3392 if (start) {
3395 WARN_ON(start < eb->start); 3393 WARN_ON(start < eb->start);
3396 start_i = (start >> PAGE_CACHE_SHIFT) - 3394 start_i = (start >> PAGE_CACHE_SHIFT) -
3397 (eb->start >> PAGE_CACHE_SHIFT); 3395 (eb->start >> PAGE_CACHE_SHIFT);
3398 } else { 3396 } else {
3399 start_i = 0; 3397 start_i = 0;
3400 } 3398 }
3401 3399
3402 num_pages = num_extent_pages(eb->start, eb->len); 3400 num_pages = num_extent_pages(eb->start, eb->len);
3403 for (i = start_i; i < num_pages; i++) { 3401 for (i = start_i; i < num_pages; i++) {
3404 page = extent_buffer_page(eb, i); 3402 page = extent_buffer_page(eb, i);
3405 if (!wait) { 3403 if (!wait) {
3406 if (!trylock_page(page)) 3404 if (!trylock_page(page))
3407 goto unlock_exit; 3405 goto unlock_exit;
3408 } else { 3406 } else {
3409 lock_page(page); 3407 lock_page(page);
3410 } 3408 }
3411 locked_pages++; 3409 locked_pages++;
3412 if (!PageUptodate(page)) 3410 if (!PageUptodate(page))
3413 all_uptodate = 0; 3411 all_uptodate = 0;
3414 } 3412 }
3415 if (all_uptodate) { 3413 if (all_uptodate) {
3416 if (start_i == 0) 3414 if (start_i == 0)
3417 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3415 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3418 goto unlock_exit; 3416 goto unlock_exit;
3419 } 3417 }
3420 3418
3421 for (i = start_i; i < num_pages; i++) { 3419 for (i = start_i; i < num_pages; i++) {
3422 page = extent_buffer_page(eb, i); 3420 page = extent_buffer_page(eb, i);
3423 3421
3424 WARN_ON(!PagePrivate(page)); 3422 WARN_ON(!PagePrivate(page));
3425 3423
3426 set_page_extent_mapped(page); 3424 set_page_extent_mapped(page);
3427 if (i == 0) 3425 if (i == 0)
3428 set_page_extent_head(page, eb->len); 3426 set_page_extent_head(page, eb->len);
3429 3427
3430 if (inc_all_pages) 3428 if (inc_all_pages)
3431 page_cache_get(page); 3429 page_cache_get(page);
3432 if (!PageUptodate(page)) { 3430 if (!PageUptodate(page)) {
3433 if (start_i == 0) 3431 if (start_i == 0)
3434 inc_all_pages = 1; 3432 inc_all_pages = 1;
3435 ClearPageError(page); 3433 ClearPageError(page);
3436 err = __extent_read_full_page(tree, page, 3434 err = __extent_read_full_page(tree, page,
3437 get_extent, &bio, 3435 get_extent, &bio,
3438 mirror_num, &bio_flags); 3436 mirror_num, &bio_flags);
3439 if (err) 3437 if (err)
3440 ret = err; 3438 ret = err;
3441 } else { 3439 } else {
3442 unlock_page(page); 3440 unlock_page(page);
3443 } 3441 }
3444 } 3442 }
3445 3443
3446 if (bio) 3444 if (bio)
3447 submit_one_bio(READ, bio, mirror_num, bio_flags); 3445 submit_one_bio(READ, bio, mirror_num, bio_flags);
3448 3446
3449 if (ret || !wait) 3447 if (ret || !wait)
3450 return ret; 3448 return ret;
3451 3449
3452 for (i = start_i; i < num_pages; i++) { 3450 for (i = start_i; i < num_pages; i++) {
3453 page = extent_buffer_page(eb, i); 3451 page = extent_buffer_page(eb, i);
3454 wait_on_page_locked(page); 3452 wait_on_page_locked(page);
3455 if (!PageUptodate(page)) 3453 if (!PageUptodate(page))
3456 ret = -EIO; 3454 ret = -EIO;
3457 } 3455 }
3458 3456
3459 if (!ret) 3457 if (!ret)
3460 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3458 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3461 return ret; 3459 return ret;
3462 3460
3463 unlock_exit: 3461 unlock_exit:
3464 i = start_i; 3462 i = start_i;
3465 while (locked_pages > 0) { 3463 while (locked_pages > 0) {
3466 page = extent_buffer_page(eb, i); 3464 page = extent_buffer_page(eb, i);
3467 i++; 3465 i++;
3468 unlock_page(page); 3466 unlock_page(page);
3469 locked_pages--; 3467 locked_pages--;
3470 } 3468 }
3471 return ret; 3469 return ret;
3472 } 3470 }
3473 3471
3474 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 3472 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3475 unsigned long start, 3473 unsigned long start,
3476 unsigned long len) 3474 unsigned long len)
3477 { 3475 {
3478 size_t cur; 3476 size_t cur;
3479 size_t offset; 3477 size_t offset;
3480 struct page *page; 3478 struct page *page;
3481 char *kaddr; 3479 char *kaddr;
3482 char *dst = (char *)dstv; 3480 char *dst = (char *)dstv;
3483 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3481 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3484 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3482 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3485 3483
3486 WARN_ON(start > eb->len); 3484 WARN_ON(start > eb->len);
3487 WARN_ON(start + len > eb->start + eb->len); 3485 WARN_ON(start + len > eb->start + eb->len);
3488 3486
3489 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3487 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3490 3488
3491 while (len > 0) { 3489 while (len > 0) {
3492 page = extent_buffer_page(eb, i); 3490 page = extent_buffer_page(eb, i);
3493 3491
3494 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3492 cur = min(len, (PAGE_CACHE_SIZE - offset));
3495 kaddr = kmap_atomic(page, KM_USER1); 3493 kaddr = kmap_atomic(page, KM_USER1);
3496 memcpy(dst, kaddr + offset, cur); 3494 memcpy(dst, kaddr + offset, cur);
3497 kunmap_atomic(kaddr, KM_USER1); 3495 kunmap_atomic(kaddr, KM_USER1);
3498 3496
3499 dst += cur; 3497 dst += cur;
3500 len -= cur; 3498 len -= cur;
3501 offset = 0; 3499 offset = 0;
3502 i++; 3500 i++;
3503 } 3501 }
3504 } 3502 }
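
read_extent_buffer() above and the write/compare/memset helpers that follow all share the same page-chunking arithmetic: the buffer's byte offset within its first page comes from masking eb->start with the page size, the page index and in-page offset come from shifting and masking the sum, and each loop iteration handles at most the rest of the current page. A minimal user-space sketch of that arithmetic, assuming 4 KiB pages and plain arrays in place of page-cache pages (eb_start, start and len are made-up values):

/* Illustrative only: the page/offset arithmetic shared by read_extent_buffer()
 * and the write/memcmp/memset helpers, run over plain arrays. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

int main(void)
{
        static char pages[4][PAGE_SIZE];        /* stand-in for extent_buffer_page(eb, i) */
        char dst[5000];
        uint64_t eb_start = 6144;               /* hypothetical eb->start on disk */
        unsigned long start = 3000;             /* byte offset inside the buffer */
        unsigned long len = sizeof(dst);

        size_t start_offset = eb_start & (PAGE_SIZE - 1);              /* 2048 */
        unsigned long i = (start_offset + start) >> PAGE_SHIFT;        /* first page index */
        size_t offset = (start_offset + start) & (PAGE_SIZE - 1);      /* offset in that page */
        char *out = dst;

        while (len > 0) {
                size_t cur = len < PAGE_SIZE - offset ? len : PAGE_SIZE - offset;

                /* the kernel kmap_atomic()s the page here; we just memcpy */
                memcpy(out, pages[i] + offset, cur);
                printf("page %lu, offset %zu, copied %zu bytes\n", i, offset, cur);

                out += cur;
                len -= cur;
                offset = 0;     /* later pages are consumed from their start */
                i++;
        }
        return 0;
}

With these values the loop copies 3144 bytes from page 1 starting at offset 952, then 1856 bytes from the start of page 2.
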
3505 3503
3506 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 3504 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3507 unsigned long min_len, char **token, char **map, 3505 unsigned long min_len, char **token, char **map,
3508 unsigned long *map_start, 3506 unsigned long *map_start,
3509 unsigned long *map_len, int km) 3507 unsigned long *map_len, int km)
3510 { 3508 {
3511 size_t offset = start & (PAGE_CACHE_SIZE - 1); 3509 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3512 char *kaddr; 3510 char *kaddr;
3513 struct page *p; 3511 struct page *p;
3514 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3512 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3515 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3513 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3516 unsigned long end_i = (start_offset + start + min_len - 1) >> 3514 unsigned long end_i = (start_offset + start + min_len - 1) >>
3517 PAGE_CACHE_SHIFT; 3515 PAGE_CACHE_SHIFT;
3518 3516
3519 if (i != end_i) 3517 if (i != end_i)
3520 return -EINVAL; 3518 return -EINVAL;
3521 3519
3522 if (i == 0) { 3520 if (i == 0) {
3523 offset = start_offset; 3521 offset = start_offset;
3524 *map_start = 0; 3522 *map_start = 0;
3525 } else { 3523 } else {
3526 offset = 0; 3524 offset = 0;
3527 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 3525 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3528 } 3526 }
3529 3527
3530 if (start + min_len > eb->len) { 3528 if (start + min_len > eb->len) {
3531 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 3529 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3532 "wanted %lu %lu\n", (unsigned long long)eb->start, 3530 "wanted %lu %lu\n", (unsigned long long)eb->start,
3533 eb->len, start, min_len); 3531 eb->len, start, min_len);
3534 WARN_ON(1); 3532 WARN_ON(1);
3535 return -EINVAL; 3533 return -EINVAL;
3536 } 3534 }
3537 3535
3538 p = extent_buffer_page(eb, i); 3536 p = extent_buffer_page(eb, i);
3539 kaddr = kmap_atomic(p, km); 3537 kaddr = kmap_atomic(p, km);
3540 *token = kaddr; 3538 *token = kaddr;
3541 *map = kaddr + offset; 3539 *map = kaddr + offset;
3542 *map_len = PAGE_CACHE_SIZE - offset; 3540 *map_len = PAGE_CACHE_SIZE - offset;
3543 return 0; 3541 return 0;
3544 } 3542 }
3545 3543
3546 int map_extent_buffer(struct extent_buffer *eb, unsigned long start, 3544 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3547 unsigned long min_len, 3545 unsigned long min_len,
3548 char **token, char **map, 3546 char **token, char **map,
3549 unsigned long *map_start, 3547 unsigned long *map_start,
3550 unsigned long *map_len, int km) 3548 unsigned long *map_len, int km)
3551 { 3549 {
3552 int err; 3550 int err;
3553 int save = 0; 3551 int save = 0;
3554 if (eb->map_token) { 3552 if (eb->map_token) {
3555 unmap_extent_buffer(eb, eb->map_token, km); 3553 unmap_extent_buffer(eb, eb->map_token, km);
3556 eb->map_token = NULL; 3554 eb->map_token = NULL;
3557 save = 1; 3555 save = 1;
3558 } 3556 }
3559 err = map_private_extent_buffer(eb, start, min_len, token, map, 3557 err = map_private_extent_buffer(eb, start, min_len, token, map,
3560 map_start, map_len, km); 3558 map_start, map_len, km);
3561 if (!err && save) { 3559 if (!err && save) {
3562 eb->map_token = *token; 3560 eb->map_token = *token;
3563 eb->kaddr = *map; 3561 eb->kaddr = *map;
3564 eb->map_start = *map_start; 3562 eb->map_start = *map_start;
3565 eb->map_len = *map_len; 3563 eb->map_len = *map_len;
3566 } 3564 }
3567 return err; 3565 return err;
3568 } 3566 }
3569 3567
3570 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) 3568 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3571 { 3569 {
3572 kunmap_atomic(token, km); 3570 kunmap_atomic(token, km);
3573 } 3571 }
3574 3572
3575 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 3573 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3576 unsigned long start, 3574 unsigned long start,
3577 unsigned long len) 3575 unsigned long len)
3578 { 3576 {
3579 size_t cur; 3577 size_t cur;
3580 size_t offset; 3578 size_t offset;
3581 struct page *page; 3579 struct page *page;
3582 char *kaddr; 3580 char *kaddr;
3583 char *ptr = (char *)ptrv; 3581 char *ptr = (char *)ptrv;
3584 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3582 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3585 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3583 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3586 int ret = 0; 3584 int ret = 0;
3587 3585
3588 WARN_ON(start > eb->len); 3586 WARN_ON(start > eb->len);
3589 WARN_ON(start + len > eb->start + eb->len); 3587 WARN_ON(start + len > eb->start + eb->len);
3590 3588
3591 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3589 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3592 3590
3593 while (len > 0) { 3591 while (len > 0) {
3594 page = extent_buffer_page(eb, i); 3592 page = extent_buffer_page(eb, i);
3595 3593
3596 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3594 cur = min(len, (PAGE_CACHE_SIZE - offset));
3597 3595
3598 kaddr = kmap_atomic(page, KM_USER0); 3596 kaddr = kmap_atomic(page, KM_USER0);
3599 ret = memcmp(ptr, kaddr + offset, cur); 3597 ret = memcmp(ptr, kaddr + offset, cur);
3600 kunmap_atomic(kaddr, KM_USER0); 3598 kunmap_atomic(kaddr, KM_USER0);
3601 if (ret) 3599 if (ret)
3602 break; 3600 break;
3603 3601
3604 ptr += cur; 3602 ptr += cur;
3605 len -= cur; 3603 len -= cur;
3606 offset = 0; 3604 offset = 0;
3607 i++; 3605 i++;
3608 } 3606 }
3609 return ret; 3607 return ret;
3610 } 3608 }
3611 3609
3612 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 3610 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3613 unsigned long start, unsigned long len) 3611 unsigned long start, unsigned long len)
3614 { 3612 {
3615 size_t cur; 3613 size_t cur;
3616 size_t offset; 3614 size_t offset;
3617 struct page *page; 3615 struct page *page;
3618 char *kaddr; 3616 char *kaddr;
3619 char *src = (char *)srcv; 3617 char *src = (char *)srcv;
3620 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3618 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3621 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3619 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3622 3620
3623 WARN_ON(start > eb->len); 3621 WARN_ON(start > eb->len);
3624 WARN_ON(start + len > eb->start + eb->len); 3622 WARN_ON(start + len > eb->start + eb->len);
3625 3623
3626 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3624 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3627 3625
3628 while (len > 0) { 3626 while (len > 0) {
3629 page = extent_buffer_page(eb, i); 3627 page = extent_buffer_page(eb, i);
3630 WARN_ON(!PageUptodate(page)); 3628 WARN_ON(!PageUptodate(page));
3631 3629
3632 cur = min(len, PAGE_CACHE_SIZE - offset); 3630 cur = min(len, PAGE_CACHE_SIZE - offset);
3633 kaddr = kmap_atomic(page, KM_USER1); 3631 kaddr = kmap_atomic(page, KM_USER1);
3634 memcpy(kaddr + offset, src, cur); 3632 memcpy(kaddr + offset, src, cur);
3635 kunmap_atomic(kaddr, KM_USER1); 3633 kunmap_atomic(kaddr, KM_USER1);
3636 3634
3637 src += cur; 3635 src += cur;
3638 len -= cur; 3636 len -= cur;
3639 offset = 0; 3637 offset = 0;
3640 i++; 3638 i++;
3641 } 3639 }
3642 } 3640 }
3643 3641
3644 void memset_extent_buffer(struct extent_buffer *eb, char c, 3642 void memset_extent_buffer(struct extent_buffer *eb, char c,
3645 unsigned long start, unsigned long len) 3643 unsigned long start, unsigned long len)
3646 { 3644 {
3647 size_t cur; 3645 size_t cur;
3648 size_t offset; 3646 size_t offset;
3649 struct page *page; 3647 struct page *page;
3650 char *kaddr; 3648 char *kaddr;
3651 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3649 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3652 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3650 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3653 3651
3654 WARN_ON(start > eb->len); 3652 WARN_ON(start > eb->len);
3655 WARN_ON(start + len > eb->start + eb->len); 3653 WARN_ON(start + len > eb->start + eb->len);
3656 3654
3657 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3655 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3658 3656
3659 while (len > 0) { 3657 while (len > 0) {
3660 page = extent_buffer_page(eb, i); 3658 page = extent_buffer_page(eb, i);
3661 WARN_ON(!PageUptodate(page)); 3659 WARN_ON(!PageUptodate(page));
3662 3660
3663 cur = min(len, PAGE_CACHE_SIZE - offset); 3661 cur = min(len, PAGE_CACHE_SIZE - offset);
3664 kaddr = kmap_atomic(page, KM_USER0); 3662 kaddr = kmap_atomic(page, KM_USER0);
3665 memset(kaddr + offset, c, cur); 3663 memset(kaddr + offset, c, cur);
3666 kunmap_atomic(kaddr, KM_USER0); 3664 kunmap_atomic(kaddr, KM_USER0);
3667 3665
3668 len -= cur; 3666 len -= cur;
3669 offset = 0; 3667 offset = 0;
3670 i++; 3668 i++;
3671 } 3669 }
3672 } 3670 }
3673 3671
3674 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 3672 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3675 unsigned long dst_offset, unsigned long src_offset, 3673 unsigned long dst_offset, unsigned long src_offset,
3676 unsigned long len) 3674 unsigned long len)
3677 { 3675 {
3678 u64 dst_len = dst->len; 3676 u64 dst_len = dst->len;
3679 size_t cur; 3677 size_t cur;
3680 size_t offset; 3678 size_t offset;
3681 struct page *page; 3679 struct page *page;
3682 char *kaddr; 3680 char *kaddr;
3683 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3681 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3684 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3682 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3685 3683
3686 WARN_ON(src->len != dst_len); 3684 WARN_ON(src->len != dst_len);
3687 3685
3688 offset = (start_offset + dst_offset) & 3686 offset = (start_offset + dst_offset) &
3689 ((unsigned long)PAGE_CACHE_SIZE - 1); 3687 ((unsigned long)PAGE_CACHE_SIZE - 1);
3690 3688
3691 while (len > 0) { 3689 while (len > 0) {
3692 page = extent_buffer_page(dst, i); 3690 page = extent_buffer_page(dst, i);
3693 WARN_ON(!PageUptodate(page)); 3691 WARN_ON(!PageUptodate(page));
3694 3692
3695 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3693 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3696 3694
3697 kaddr = kmap_atomic(page, KM_USER0); 3695 kaddr = kmap_atomic(page, KM_USER0);
3698 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3696 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3699 kunmap_atomic(kaddr, KM_USER0); 3697 kunmap_atomic(kaddr, KM_USER0);
3700 3698
3701 src_offset += cur; 3699 src_offset += cur;
3702 len -= cur; 3700 len -= cur;
3703 offset = 0; 3701 offset = 0;
3704 i++; 3702 i++;
3705 } 3703 }
3706 } 3704 }
3707 3705
3708 static void move_pages(struct page *dst_page, struct page *src_page, 3706 static void move_pages(struct page *dst_page, struct page *src_page,
3709 unsigned long dst_off, unsigned long src_off, 3707 unsigned long dst_off, unsigned long src_off,
3710 unsigned long len) 3708 unsigned long len)
3711 { 3709 {
3712 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3710 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3713 if (dst_page == src_page) { 3711 if (dst_page == src_page) {
3714 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3712 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3715 } else { 3713 } else {
3716 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 3714 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3717 char *p = dst_kaddr + dst_off + len; 3715 char *p = dst_kaddr + dst_off + len;
3718 char *s = src_kaddr + src_off + len; 3716 char *s = src_kaddr + src_off + len;
3719 3717
3720 while (len--) 3718 while (len--)
3721 *--p = *--s; 3719 *--p = *--s;
3722 3720
3723 kunmap_atomic(src_kaddr, KM_USER1); 3721 kunmap_atomic(src_kaddr, KM_USER1);
3724 } 3722 }
3725 kunmap_atomic(dst_kaddr, KM_USER0); 3723 kunmap_atomic(dst_kaddr, KM_USER0);
3726 } 3724 }
3727 3725
3728 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 3726 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3729 { 3727 {
3730 unsigned long distance = (src > dst) ? src - dst : dst - src; 3728 unsigned long distance = (src > dst) ? src - dst : dst - src;
3731 return distance < len; 3729 return distance < len;
3732 } 3730 }
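
move_pages() and areas_overlap() above encode the classic overlapping-copy rule: two ranges overlap when they are closer than len, and in that case copying byte-by-byte from the tail keeps source bytes from being clobbered before they are read (the non-overlapping case takes the plain memcpy() path in copy_pages()). A stand-alone sketch of the same rule over an ordinary buffer; the helper names and data are invented for illustration:

/* Illustrative only: why an overlapping forward copy corrupts data and why
 * copying from the tail (as move_pages() does) is safe when the destination
 * sits above the source. */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
        unsigned long distance = (src > dst) ? src - dst : dst - src;
        return distance < len;
}

static void move_bytes(char *buf, unsigned long dst, unsigned long src, unsigned long len)
{
        if (!areas_overlap(src, dst, len)) {
                memcpy(buf + dst, buf + src, len);      /* fast path, no overlap */
                return;
        }
        /* overlap, destination above source: copy backwards so every source
         * byte is read before the copy reaches and overwrites it */
        char *p = buf + dst + len;
        char *s = buf + src + len;
        while (len--)
                *--p = *--s;
}

int main(void)
{
        char buf[16] = "ABCDEFGH";
        move_bytes(buf, 2, 0, 6);       /* shift "ABCDEF" right by two; 4 bytes overlap */
        printf("%s\n", buf);            /* prints "ABABCDEF" */
        return 0;
}
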
3733 3731
3734 static void copy_pages(struct page *dst_page, struct page *src_page, 3732 static void copy_pages(struct page *dst_page, struct page *src_page,
3735 unsigned long dst_off, unsigned long src_off, 3733 unsigned long dst_off, unsigned long src_off,
3736 unsigned long len) 3734 unsigned long len)
3737 { 3735 {
3738 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3736 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3739 char *src_kaddr; 3737 char *src_kaddr;
3740 3738
3741 if (dst_page != src_page) { 3739 if (dst_page != src_page) {
3742 src_kaddr = kmap_atomic(src_page, KM_USER1); 3740 src_kaddr = kmap_atomic(src_page, KM_USER1);
3743 } else { 3741 } else {
3744 src_kaddr = dst_kaddr; 3742 src_kaddr = dst_kaddr;
3745 BUG_ON(areas_overlap(src_off, dst_off, len)); 3743 BUG_ON(areas_overlap(src_off, dst_off, len));
3746 } 3744 }
3747 3745
3748 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3746 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3749 kunmap_atomic(dst_kaddr, KM_USER0); 3747 kunmap_atomic(dst_kaddr, KM_USER0);
3750 if (dst_page != src_page) 3748 if (dst_page != src_page)
3751 kunmap_atomic(src_kaddr, KM_USER1); 3749 kunmap_atomic(src_kaddr, KM_USER1);
3752 } 3750 }
3753 3751
3754 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3752 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3755 unsigned long src_offset, unsigned long len) 3753 unsigned long src_offset, unsigned long len)
3756 { 3754 {
3757 size_t cur; 3755 size_t cur;
3758 size_t dst_off_in_page; 3756 size_t dst_off_in_page;
3759 size_t src_off_in_page; 3757 size_t src_off_in_page;
3760 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3758 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3761 unsigned long dst_i; 3759 unsigned long dst_i;
3762 unsigned long src_i; 3760 unsigned long src_i;
3763 3761
3764 if (src_offset + len > dst->len) { 3762 if (src_offset + len > dst->len) {
3765 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3763 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3766 "len %lu dst len %lu\n", src_offset, len, dst->len); 3764 "len %lu dst len %lu\n", src_offset, len, dst->len);
3767 BUG_ON(1); 3765 BUG_ON(1);
3768 } 3766 }
3769 if (dst_offset + len > dst->len) { 3767 if (dst_offset + len > dst->len) {
3770 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3768 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3771 "len %lu dst len %lu\n", dst_offset, len, dst->len); 3769 "len %lu dst len %lu\n", dst_offset, len, dst->len);
3772 BUG_ON(1); 3770 BUG_ON(1);
3773 } 3771 }
3774 3772
3775 while (len > 0) { 3773 while (len > 0) {
3776 dst_off_in_page = (start_offset + dst_offset) & 3774 dst_off_in_page = (start_offset + dst_offset) &
3777 ((unsigned long)PAGE_CACHE_SIZE - 1); 3775 ((unsigned long)PAGE_CACHE_SIZE - 1);
3778 src_off_in_page = (start_offset + src_offset) & 3776 src_off_in_page = (start_offset + src_offset) &
3779 ((unsigned long)PAGE_CACHE_SIZE - 1); 3777 ((unsigned long)PAGE_CACHE_SIZE - 1);
3780 3778
3781 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3779 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3782 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 3780 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3783 3781
3784 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 3782 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3785 src_off_in_page)); 3783 src_off_in_page));
3786 cur = min_t(unsigned long, cur, 3784 cur = min_t(unsigned long, cur,
3787 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 3785 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3788 3786
3789 copy_pages(extent_buffer_page(dst, dst_i), 3787 copy_pages(extent_buffer_page(dst, dst_i),
3790 extent_buffer_page(dst, src_i), 3788 extent_buffer_page(dst, src_i),
3791 dst_off_in_page, src_off_in_page, cur); 3789 dst_off_in_page, src_off_in_page, cur);
3792 3790
3793 src_offset += cur; 3791 src_offset += cur;
3794 dst_offset += cur; 3792 dst_offset += cur;
3795 len -= cur; 3793 len -= cur;
3796 } 3794 }
3797 } 3795 }
3798 3796
3799 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3797 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3800 unsigned long src_offset, unsigned long len) 3798 unsigned long src_offset, unsigned long len)
3801 { 3799 {
3802 size_t cur; 3800 size_t cur;
3803 size_t dst_off_in_page; 3801 size_t dst_off_in_page;
3804 size_t src_off_in_page; 3802 size_t src_off_in_page;
3805 unsigned long dst_end = dst_offset + len - 1; 3803 unsigned long dst_end = dst_offset + len - 1;
3806 unsigned long src_end = src_offset + len - 1; 3804 unsigned long src_end = src_offset + len - 1;
3807 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3805 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3808 unsigned long dst_i; 3806 unsigned long dst_i;
3809 unsigned long src_i; 3807 unsigned long src_i;
3810 3808
3811 if (src_offset + len > dst->len) { 3809 if (src_offset + len > dst->len) {
3812 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3810 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3813 "len %lu len %lu\n", src_offset, len, dst->len); 3811 "len %lu len %lu\n", src_offset, len, dst->len);
3814 BUG_ON(1); 3812 BUG_ON(1);
3815 } 3813 }
3816 if (dst_offset + len > dst->len) { 3814 if (dst_offset + len > dst->len) {
3817 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3815 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3818 "len %lu len %lu\n", dst_offset, len, dst->len); 3816 "len %lu len %lu\n", dst_offset, len, dst->len);
3819 BUG_ON(1); 3817 BUG_ON(1);
3820 } 3818 }
3821 if (!areas_overlap(src_offset, dst_offset, len)) { 3819 if (!areas_overlap(src_offset, dst_offset, len)) {
3822 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3820 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3823 return; 3821 return;
3824 } 3822 }
3825 while (len > 0) { 3823 while (len > 0) {
3826 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; 3824 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3827 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; 3825 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3828 3826
3829 dst_off_in_page = (start_offset + dst_end) & 3827 dst_off_in_page = (start_offset + dst_end) &
3830 ((unsigned long)PAGE_CACHE_SIZE - 1); 3828 ((unsigned long)PAGE_CACHE_SIZE - 1);
3831 src_off_in_page = (start_offset + src_end) & 3829 src_off_in_page = (start_offset + src_end) &
3832 ((unsigned long)PAGE_CACHE_SIZE - 1); 3830 ((unsigned long)PAGE_CACHE_SIZE - 1);
3833 3831
3834 cur = min_t(unsigned long, len, src_off_in_page + 1); 3832 cur = min_t(unsigned long, len, src_off_in_page + 1);
3835 cur = min(cur, dst_off_in_page + 1); 3833 cur = min(cur, dst_off_in_page + 1);
3836 move_pages(extent_buffer_page(dst, dst_i), 3834 move_pages(extent_buffer_page(dst, dst_i),
3837 extent_buffer_page(dst, src_i), 3835 extent_buffer_page(dst, src_i),
3838 dst_off_in_page - cur + 1, 3836 dst_off_in_page - cur + 1,
3839 src_off_in_page - cur + 1, cur); 3837 src_off_in_page - cur + 1, cur);
3840 3838
3841 dst_end -= cur; 3839 dst_end -= cur;
3842 src_end -= cur; 3840 src_end -= cur;
3843 len -= cur; 3841 len -= cur;
3844 } 3842 }
3845 } 3843 }
3846 3844
3847 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 3845 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3848 { 3846 {
3849 struct extent_buffer *eb = 3847 struct extent_buffer *eb =
3850 container_of(head, struct extent_buffer, rcu_head); 3848 container_of(head, struct extent_buffer, rcu_head);
3851 3849
3852 btrfs_release_extent_buffer(eb); 3850 btrfs_release_extent_buffer(eb);
3853 } 3851 }
3854 3852
3855 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3853 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3856 { 3854 {
3857 u64 start = page_offset(page); 3855 u64 start = page_offset(page);
3858 struct extent_buffer *eb; 3856 struct extent_buffer *eb;
3859 int ret = 1; 3857 int ret = 1;
3860 3858
3861 spin_lock(&tree->buffer_lock); 3859 spin_lock(&tree->buffer_lock);
3862 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3860 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3863 if (!eb) { 3861 if (!eb) {
3864 spin_unlock(&tree->buffer_lock); 3862 spin_unlock(&tree->buffer_lock);
3865 return ret; 3863 return ret;
3866 } 3864 }
3867 3865
3868 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3866 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3869 ret = 0; 3867 ret = 0;
3870 goto out; 3868 goto out;
3871 } 3869 }
3872 3870
3873 /* 3871 /*
3874 * set @eb->refs to 0 if it is already 1, and then release the @eb. 3872 * set @eb->refs to 0 if it is already 1, and then release the @eb.
3875 * Otherwise, leave it alone. 3873 * Otherwise, leave it alone.
3876 */ 3874 */
3877 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { 3875 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
3878 ret = 0; 3876 ret = 0;
3879 goto out; 3877 goto out;
3880 } 3878 }
3881 3879
3882 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3880 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3883 out: 3881 out:
3884 spin_unlock(&tree->buffer_lock); 3882 spin_unlock(&tree->buffer_lock);
3885 3883
3886 /* at this point we can safely release the extent buffer */ 3884 /* at this point we can safely release the extent buffer */
3887 if (atomic_read(&eb->refs) == 0) 3885 if (atomic_read(&eb->refs) == 0)
3888 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 3886 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
3889 return ret; 3887 return ret;
3890 } 3888 }
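
try_release_extent_buffer() above tears the buffer down only when it can atomically flip the reference count from exactly 1 to 0; if any other holder exists the cmpxchg fails and the page stays. A hedged user-space sketch of that last-reference pattern using C11 atomics (struct and function names are invented; the kernel defers the actual free to call_rcu(), this sketch frees nothing):

/* Illustrative only: the "release it only if we hold the last reference"
 * pattern from try_release_extent_buffer(), in C11 atomics. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct buffer {
        atomic_int refs;
        /* ... payload ... */
};

static bool try_release(struct buffer *b)
{
        int expected = 1;

        /* Succeeds only if refs was exactly 1: we were the sole holder,
         * and refs is now 0 so nobody else can start using the object. */
        if (!atomic_compare_exchange_strong(&b->refs, &expected, 0))
                return false;

        /* safe to tear the object down here */
        return true;
}

int main(void)
{
        struct buffer b;

        atomic_init(&b.refs, 2);
        printf("%s\n", try_release(&b) ? "released" : "still in use");  /* still in use */

        atomic_fetch_sub(&b.refs, 1);           /* the other holder drops its ref */
        printf("%s\n", try_release(&b) ? "released" : "still in use");  /* released */
        return 0;
}
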
3891 3889
1 /* 1 /*
2 * linux/fs/ext4/inode.c 2 * linux/fs/ext4/inode.c
3 * 3 *
4 * Copyright (C) 1992, 1993, 1994, 1995 4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr) 5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal 6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 * 8 *
9 * from 9 * from
10 * 10 *
11 * linux/fs/minix/inode.c 11 * linux/fs/minix/inode.c
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * Goal-directed block allocation by Stephen Tweedie 15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998 16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by 17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995 18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz) 20 * (jj@sunsite.ms.mff.cuni.cz)
21 * 21 *
22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
23 */ 23 */
24 24
25 #include <linux/module.h> 25 #include <linux/module.h>
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/time.h> 27 #include <linux/time.h>
28 #include <linux/jbd2.h> 28 #include <linux/jbd2.h>
29 #include <linux/highuid.h> 29 #include <linux/highuid.h>
30 #include <linux/pagemap.h> 30 #include <linux/pagemap.h>
31 #include <linux/quotaops.h> 31 #include <linux/quotaops.h>
32 #include <linux/string.h> 32 #include <linux/string.h>
33 #include <linux/buffer_head.h> 33 #include <linux/buffer_head.h>
34 #include <linux/writeback.h> 34 #include <linux/writeback.h>
35 #include <linux/pagevec.h> 35 #include <linux/pagevec.h>
36 #include <linux/mpage.h> 36 #include <linux/mpage.h>
37 #include <linux/namei.h> 37 #include <linux/namei.h>
38 #include <linux/uio.h> 38 #include <linux/uio.h>
39 #include <linux/bio.h> 39 #include <linux/bio.h>
40 #include <linux/workqueue.h> 40 #include <linux/workqueue.h>
41 #include <linux/kernel.h> 41 #include <linux/kernel.h>
42 #include <linux/printk.h> 42 #include <linux/printk.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <linux/ratelimit.h> 44 #include <linux/ratelimit.h>
45 45
46 #include "ext4_jbd2.h" 46 #include "ext4_jbd2.h"
47 #include "xattr.h" 47 #include "xattr.h"
48 #include "acl.h" 48 #include "acl.h"
49 #include "ext4_extents.h" 49 #include "ext4_extents.h"
50 50
51 #include <trace/events/ext4.h> 51 #include <trace/events/ext4.h>
52 52
53 #define MPAGE_DA_EXTENT_TAIL 0x01 53 #define MPAGE_DA_EXTENT_TAIL 0x01
54 54
55 static inline int ext4_begin_ordered_truncate(struct inode *inode, 55 static inline int ext4_begin_ordered_truncate(struct inode *inode,
56 loff_t new_size) 56 loff_t new_size)
57 { 57 {
58 trace_ext4_begin_ordered_truncate(inode, new_size); 58 trace_ext4_begin_ordered_truncate(inode, new_size);
59 /* 59 /*
60 * If jinode is zero, then we never opened the file for 60 * If jinode is zero, then we never opened the file for
61 * writing, so there's no need to call 61 * writing, so there's no need to call
62 * jbd2_journal_begin_ordered_truncate() since there's no 62 * jbd2_journal_begin_ordered_truncate() since there's no
63 * outstanding writes we need to flush. 63 * outstanding writes we need to flush.
64 */ 64 */
65 if (!EXT4_I(inode)->jinode) 65 if (!EXT4_I(inode)->jinode)
66 return 0; 66 return 0;
67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), 67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
68 EXT4_I(inode)->jinode, 68 EXT4_I(inode)->jinode,
69 new_size); 69 new_size);
70 } 70 }
71 71
72 static void ext4_invalidatepage(struct page *page, unsigned long offset); 72 static void ext4_invalidatepage(struct page *page, unsigned long offset);
73 static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 73 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
74 struct buffer_head *bh_result, int create); 74 struct buffer_head *bh_result, int create);
75 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); 75 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
76 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); 76 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
77 static int __ext4_journalled_writepage(struct page *page, unsigned int len); 77 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
78 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); 78 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
79 79
80 /* 80 /*
81 * Test whether an inode is a fast symlink. 81 * Test whether an inode is a fast symlink.
82 */ 82 */
83 static int ext4_inode_is_fast_symlink(struct inode *inode) 83 static int ext4_inode_is_fast_symlink(struct inode *inode)
84 { 84 {
85 int ea_blocks = EXT4_I(inode)->i_file_acl ? 85 int ea_blocks = EXT4_I(inode)->i_file_acl ?
86 (inode->i_sb->s_blocksize >> 9) : 0; 86 (inode->i_sb->s_blocksize >> 9) : 0;
87 87
88 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 88 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
89 } 89 }
90 90
91 /* 91 /*
92 * Work out how many blocks we need to proceed with the next chunk of a 92 * Work out how many blocks we need to proceed with the next chunk of a
93 * truncate transaction. 93 * truncate transaction.
94 */ 94 */
95 static unsigned long blocks_for_truncate(struct inode *inode) 95 static unsigned long blocks_for_truncate(struct inode *inode)
96 { 96 {
97 ext4_lblk_t needed; 97 ext4_lblk_t needed;
98 98
99 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 99 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
100 100
101 /* Give ourselves just enough room to cope with inodes in which 101 /* Give ourselves just enough room to cope with inodes in which
102 * i_blocks is corrupt: we've seen disk corruptions in the past 102 * i_blocks is corrupt: we've seen disk corruptions in the past
103 * which resulted in random data in an inode which looked enough 103 * which resulted in random data in an inode which looked enough
104 * like a regular file for ext4 to try to delete it. Things 104 * like a regular file for ext4 to try to delete it. Things
105 * will go a bit crazy if that happens, but at least we should 105 * will go a bit crazy if that happens, but at least we should
106 * try not to panic the whole kernel. */ 106 * try not to panic the whole kernel. */
107 if (needed < 2) 107 if (needed < 2)
108 needed = 2; 108 needed = 2;
109 109
110 /* But we need to bound the transaction so we don't overflow the 110 /* But we need to bound the transaction so we don't overflow the
111 * journal. */ 111 * journal. */
112 if (needed > EXT4_MAX_TRANS_DATA) 112 if (needed > EXT4_MAX_TRANS_DATA)
113 needed = EXT4_MAX_TRANS_DATA; 113 needed = EXT4_MAX_TRANS_DATA;
114 114
115 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 115 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
116 } 116 }
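
blocks_for_truncate() above estimates journal credits from i_blocks: since i_blocks counts 512-byte units, shifting by (blocksize_bits - 9) converts it to filesystem blocks, which is then clamped to the range [2, EXT4_MAX_TRANS_DATA] and added to the fixed per-transaction cost. A worked sketch of that arithmetic, with assumed stand-in values for the two ext4 constants:

/* Illustrative only: the credit estimate from blocks_for_truncate(),
 * with made-up values standing in for the ext4 constants. */
#include <stdio.h>

#define BLOCKSIZE_BITS          12      /* 4 KiB filesystem blocks */
#define MAX_TRANS_DATA          64      /* stand-in for EXT4_MAX_TRANS_DATA */
#define DATA_TRANS_BLOCKS       32      /* stand-in for EXT4_DATA_TRANS_BLOCKS() */

int main(void)
{
        unsigned long long i_blocks = 2097152;  /* a 1 GiB file, counted in 512-byte units */
        unsigned long needed = i_blocks >> (BLOCKSIZE_BITS - 9);        /* 262144 fs blocks */

        if (needed < 2)                 /* guard against a corrupt, too-small i_blocks */
                needed = 2;
        if (needed > MAX_TRANS_DATA)    /* bound the transaction so the journal fits */
                needed = MAX_TRANS_DATA;

        printf("reserve %lu journal credits\n", DATA_TRANS_BLOCKS + needed);    /* 96 */
        return 0;
}
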
117 117
118 /* 118 /*
119 * Truncate transactions can be complex and absolutely huge. So we need to 119 * Truncate transactions can be complex and absolutely huge. So we need to
120 * be able to restart the transaction at a convenient checkpoint to make 120 * be able to restart the transaction at a convenient checkpoint to make
121 * sure we don't overflow the journal. 121 * sure we don't overflow the journal.
122 * 122 *
123 * start_transaction gets us a new handle for a truncate transaction, 123 * start_transaction gets us a new handle for a truncate transaction,
124 * and extend_transaction tries to extend the existing one a bit. If 124 * and extend_transaction tries to extend the existing one a bit. If
125 * extend fails, we need to propagate the failure up and restart the 125 * extend fails, we need to propagate the failure up and restart the
126 * transaction in the top-level truncate loop. --sct 126 * transaction in the top-level truncate loop. --sct
127 */ 127 */
128 static handle_t *start_transaction(struct inode *inode) 128 static handle_t *start_transaction(struct inode *inode)
129 { 129 {
130 handle_t *result; 130 handle_t *result;
131 131
132 result = ext4_journal_start(inode, blocks_for_truncate(inode)); 132 result = ext4_journal_start(inode, blocks_for_truncate(inode));
133 if (!IS_ERR(result)) 133 if (!IS_ERR(result))
134 return result; 134 return result;
135 135
136 ext4_std_error(inode->i_sb, PTR_ERR(result)); 136 ext4_std_error(inode->i_sb, PTR_ERR(result));
137 return result; 137 return result;
138 } 138 }
139 139
140 /* 140 /*
141 * Try to extend this transaction for the purposes of truncation. 141 * Try to extend this transaction for the purposes of truncation.
142 * 142 *
143 * Returns 0 if we managed to create more room. If we can't create more 143 * Returns 0 if we managed to create more room. If we can't create more
144 * room and the transaction must be restarted, we return 1. 144 * room and the transaction must be restarted, we return 1.
145 */ 145 */
146 static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 146 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
147 { 147 {
148 if (!ext4_handle_valid(handle)) 148 if (!ext4_handle_valid(handle))
149 return 0; 149 return 0;
150 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) 150 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
151 return 0; 151 return 0;
152 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 152 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
153 return 0; 153 return 0;
154 return 1; 154 return 1;
155 } 155 }
156 156
157 /* 157 /*
158 * Restart the transaction associated with *handle. This does a commit, 158 * Restart the transaction associated with *handle. This does a commit,
159 * so before we call here everything must be consistently dirtied against 159 * so before we call here everything must be consistently dirtied against
160 * this transaction. 160 * this transaction.
161 */ 161 */
162 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, 162 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
163 int nblocks) 163 int nblocks)
164 { 164 {
165 int ret; 165 int ret;
166 166
167 /* 167 /*
168 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this 168 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
169 * moment, get_block can be called only for blocks inside i_size since 169 * moment, get_block can be called only for blocks inside i_size since
170 * page cache has been already dropped and writes are blocked by 170 * page cache has been already dropped and writes are blocked by
171 * i_mutex. So we can safely drop the i_data_sem here. 171 * i_mutex. So we can safely drop the i_data_sem here.
172 */ 172 */
173 BUG_ON(EXT4_JOURNAL(inode) == NULL); 173 BUG_ON(EXT4_JOURNAL(inode) == NULL);
174 jbd_debug(2, "restarting handle %p\n", handle); 174 jbd_debug(2, "restarting handle %p\n", handle);
175 up_write(&EXT4_I(inode)->i_data_sem); 175 up_write(&EXT4_I(inode)->i_data_sem);
176 ret = ext4_journal_restart(handle, nblocks); 176 ret = ext4_journal_restart(handle, nblocks);
177 down_write(&EXT4_I(inode)->i_data_sem); 177 down_write(&EXT4_I(inode)->i_data_sem);
178 ext4_discard_preallocations(inode); 178 ext4_discard_preallocations(inode);
179 179
180 return ret; 180 return ret;
181 } 181 }
182 182
183 /* 183 /*
184 * Called at the last iput() if i_nlink is zero. 184 * Called at the last iput() if i_nlink is zero.
185 */ 185 */
186 void ext4_evict_inode(struct inode *inode) 186 void ext4_evict_inode(struct inode *inode)
187 { 187 {
188 handle_t *handle; 188 handle_t *handle;
189 int err; 189 int err;
190 190
191 trace_ext4_evict_inode(inode); 191 trace_ext4_evict_inode(inode);
192 if (inode->i_nlink) { 192 if (inode->i_nlink) {
193 truncate_inode_pages(&inode->i_data, 0); 193 truncate_inode_pages(&inode->i_data, 0);
194 goto no_delete; 194 goto no_delete;
195 } 195 }
196 196
197 if (!is_bad_inode(inode)) 197 if (!is_bad_inode(inode))
198 dquot_initialize(inode); 198 dquot_initialize(inode);
199 199
200 if (ext4_should_order_data(inode)) 200 if (ext4_should_order_data(inode))
201 ext4_begin_ordered_truncate(inode, 0); 201 ext4_begin_ordered_truncate(inode, 0);
202 truncate_inode_pages(&inode->i_data, 0); 202 truncate_inode_pages(&inode->i_data, 0);
203 203
204 if (is_bad_inode(inode)) 204 if (is_bad_inode(inode))
205 goto no_delete; 205 goto no_delete;
206 206
207 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); 207 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
208 if (IS_ERR(handle)) { 208 if (IS_ERR(handle)) {
209 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 209 ext4_std_error(inode->i_sb, PTR_ERR(handle));
210 /* 210 /*
211 * If we're going to skip the normal cleanup, we still need to 211 * If we're going to skip the normal cleanup, we still need to
212 * make sure that the in-core orphan linked list is properly 212 * make sure that the in-core orphan linked list is properly
213 * cleaned up. 213 * cleaned up.
214 */ 214 */
215 ext4_orphan_del(NULL, inode); 215 ext4_orphan_del(NULL, inode);
216 goto no_delete; 216 goto no_delete;
217 } 217 }
218 218
219 if (IS_SYNC(inode)) 219 if (IS_SYNC(inode))
220 ext4_handle_sync(handle); 220 ext4_handle_sync(handle);
221 inode->i_size = 0; 221 inode->i_size = 0;
222 err = ext4_mark_inode_dirty(handle, inode); 222 err = ext4_mark_inode_dirty(handle, inode);
223 if (err) { 223 if (err) {
224 ext4_warning(inode->i_sb, 224 ext4_warning(inode->i_sb,
225 "couldn't mark inode dirty (err %d)", err); 225 "couldn't mark inode dirty (err %d)", err);
226 goto stop_handle; 226 goto stop_handle;
227 } 227 }
228 if (inode->i_blocks) 228 if (inode->i_blocks)
229 ext4_truncate(inode); 229 ext4_truncate(inode);
230 230
231 /* 231 /*
232 * ext4_ext_truncate() doesn't reserve any slop when it 232 * ext4_ext_truncate() doesn't reserve any slop when it
233 * restarts journal transactions; therefore there may not be 233 * restarts journal transactions; therefore there may not be
234 * enough credits left in the handle to remove the inode from 234 * enough credits left in the handle to remove the inode from
235 * the orphan list and set the dtime field. 235 * the orphan list and set the dtime field.
236 */ 236 */
237 if (!ext4_handle_has_enough_credits(handle, 3)) { 237 if (!ext4_handle_has_enough_credits(handle, 3)) {
238 err = ext4_journal_extend(handle, 3); 238 err = ext4_journal_extend(handle, 3);
239 if (err > 0) 239 if (err > 0)
240 err = ext4_journal_restart(handle, 3); 240 err = ext4_journal_restart(handle, 3);
241 if (err != 0) { 241 if (err != 0) {
242 ext4_warning(inode->i_sb, 242 ext4_warning(inode->i_sb,
243 "couldn't extend journal (err %d)", err); 243 "couldn't extend journal (err %d)", err);
244 stop_handle: 244 stop_handle:
245 ext4_journal_stop(handle); 245 ext4_journal_stop(handle);
246 ext4_orphan_del(NULL, inode); 246 ext4_orphan_del(NULL, inode);
247 goto no_delete; 247 goto no_delete;
248 } 248 }
249 } 249 }
250 250
251 /* 251 /*
252 * Kill off the orphan record which ext4_truncate created. 252 * Kill off the orphan record which ext4_truncate created.
253 * AKPM: I think this can be inside the above `if'. 253 * AKPM: I think this can be inside the above `if'.
254 * Note that ext4_orphan_del() has to be able to cope with the 254 * Note that ext4_orphan_del() has to be able to cope with the
255 * deletion of a non-existent orphan - this is because we don't 255 * deletion of a non-existent orphan - this is because we don't
256 * know if ext4_truncate() actually created an orphan record. 256 * know if ext4_truncate() actually created an orphan record.
257 * (Well, we could do this if we need to, but heck - it works) 257 * (Well, we could do this if we need to, but heck - it works)
258 */ 258 */
259 ext4_orphan_del(handle, inode); 259 ext4_orphan_del(handle, inode);
260 EXT4_I(inode)->i_dtime = get_seconds(); 260 EXT4_I(inode)->i_dtime = get_seconds();
261 261
262 /* 262 /*
263 * One subtle ordering requirement: if anything has gone wrong 263 * One subtle ordering requirement: if anything has gone wrong
264 * (transaction abort, IO errors, whatever), then we can still 264 * (transaction abort, IO errors, whatever), then we can still
265 * do these next steps (the fs will already have been marked as 265 * do these next steps (the fs will already have been marked as
266 * having errors), but we can't free the inode if the mark_dirty 266 * having errors), but we can't free the inode if the mark_dirty
267 * fails. 267 * fails.
268 */ 268 */
269 if (ext4_mark_inode_dirty(handle, inode)) 269 if (ext4_mark_inode_dirty(handle, inode))
270 /* If that failed, just do the required in-core inode clear. */ 270 /* If that failed, just do the required in-core inode clear. */
271 ext4_clear_inode(inode); 271 ext4_clear_inode(inode);
272 else 272 else
273 ext4_free_inode(handle, inode); 273 ext4_free_inode(handle, inode);
274 ext4_journal_stop(handle); 274 ext4_journal_stop(handle);
275 return; 275 return;
276 no_delete: 276 no_delete:
277 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ 277 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
278 } 278 }
279 279
280 typedef struct { 280 typedef struct {
281 __le32 *p; 281 __le32 *p;
282 __le32 key; 282 __le32 key;
283 struct buffer_head *bh; 283 struct buffer_head *bh;
284 } Indirect; 284 } Indirect;
285 285
286 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) 286 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
287 { 287 {
288 p->key = *(p->p = v); 288 p->key = *(p->p = v);
289 p->bh = bh; 289 p->bh = bh;
290 } 290 }
291 291
292 /** 292 /**
293 * ext4_block_to_path - parse the block number into array of offsets 293 * ext4_block_to_path - parse the block number into array of offsets
294 * @inode: inode in question (we are only interested in its superblock) 294 * @inode: inode in question (we are only interested in its superblock)
295 * @i_block: block number to be parsed 295 * @i_block: block number to be parsed
296 * @offsets: array to store the offsets in 296 * @offsets: array to store the offsets in
297 * @boundary: set this non-zero if the referred-to block is likely to be 297 * @boundary: set this non-zero if the referred-to block is likely to be
298 * followed (on disk) by an indirect block. 298 * followed (on disk) by an indirect block.
299 * 299 *
300 * To store the locations of a file's data ext4 uses a data structure common 300 * To store the locations of a file's data ext4 uses a data structure common
301 * for UNIX filesystems - a tree of pointers anchored in the inode, with 301 * for UNIX filesystems - a tree of pointers anchored in the inode, with
302 * data blocks at leaves and indirect blocks in intermediate nodes. 302 * data blocks at leaves and indirect blocks in intermediate nodes.
303 * This function translates the block number into path in that tree - 303 * This function translates the block number into path in that tree -
304 * return value is the path length and @offsets[n] is the offset of 304 * return value is the path length and @offsets[n] is the offset of
305 * pointer to (n+1)th node in the nth one. If @block is out of range 305 * pointer to (n+1)th node in the nth one. If @block is out of range
306 * (negative or too large) a warning is printed and zero is returned. 306 * (negative or too large) a warning is printed and zero is returned.
307 * 307 *
308 * Note: function doesn't find node addresses, so no IO is needed. All 308 * Note: function doesn't find node addresses, so no IO is needed. All
309 * we need to know is the capacity of indirect blocks (taken from the 309 * we need to know is the capacity of indirect blocks (taken from the
310 * inode->i_sb). 310 * inode->i_sb).
311 */ 311 */
312 312
313 /* 313 /*
314 * Portability note: the last comparison (check that we fit into triple 314 * Portability note: the last comparison (check that we fit into triple
315 * indirect block) is spelled differently, because otherwise on an 315 * indirect block) is spelled differently, because otherwise on an
316 * architecture with 32-bit longs and 8Kb pages we might get into trouble 316 * architecture with 32-bit longs and 8Kb pages we might get into trouble
317 * if our filesystem had 8Kb blocks. We might use long long, but that would 317 * if our filesystem had 8Kb blocks. We might use long long, but that would
318 * kill us on x86. Oh, well, at least the sign propagation does not matter - 318 * kill us on x86. Oh, well, at least the sign propagation does not matter -
319 * i_block would have to be negative in the very beginning, so we would not 319 * i_block would have to be negative in the very beginning, so we would not
320 * get there at all. 320 * get there at all.
321 */ 321 */
322 322
323 static int ext4_block_to_path(struct inode *inode, 323 static int ext4_block_to_path(struct inode *inode,
324 ext4_lblk_t i_block, 324 ext4_lblk_t i_block,
325 ext4_lblk_t offsets[4], int *boundary) 325 ext4_lblk_t offsets[4], int *boundary)
326 { 326 {
327 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 327 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
328 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 328 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
329 const long direct_blocks = EXT4_NDIR_BLOCKS, 329 const long direct_blocks = EXT4_NDIR_BLOCKS,
330 indirect_blocks = ptrs, 330 indirect_blocks = ptrs,
331 double_blocks = (1 << (ptrs_bits * 2)); 331 double_blocks = (1 << (ptrs_bits * 2));
332 int n = 0; 332 int n = 0;
333 int final = 0; 333 int final = 0;
334 334
335 if (i_block < direct_blocks) { 335 if (i_block < direct_blocks) {
336 offsets[n++] = i_block; 336 offsets[n++] = i_block;
337 final = direct_blocks; 337 final = direct_blocks;
338 } else if ((i_block -= direct_blocks) < indirect_blocks) { 338 } else if ((i_block -= direct_blocks) < indirect_blocks) {
339 offsets[n++] = EXT4_IND_BLOCK; 339 offsets[n++] = EXT4_IND_BLOCK;
340 offsets[n++] = i_block; 340 offsets[n++] = i_block;
341 final = ptrs; 341 final = ptrs;
342 } else if ((i_block -= indirect_blocks) < double_blocks) { 342 } else if ((i_block -= indirect_blocks) < double_blocks) {
343 offsets[n++] = EXT4_DIND_BLOCK; 343 offsets[n++] = EXT4_DIND_BLOCK;
344 offsets[n++] = i_block >> ptrs_bits; 344 offsets[n++] = i_block >> ptrs_bits;
345 offsets[n++] = i_block & (ptrs - 1); 345 offsets[n++] = i_block & (ptrs - 1);
346 final = ptrs; 346 final = ptrs;
347 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { 347 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
348 offsets[n++] = EXT4_TIND_BLOCK; 348 offsets[n++] = EXT4_TIND_BLOCK;
349 offsets[n++] = i_block >> (ptrs_bits * 2); 349 offsets[n++] = i_block >> (ptrs_bits * 2);
350 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); 350 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
351 offsets[n++] = i_block & (ptrs - 1); 351 offsets[n++] = i_block & (ptrs - 1);
352 final = ptrs; 352 final = ptrs;
353 } else { 353 } else {
354 ext4_warning(inode->i_sb, "block %lu > max in inode %lu", 354 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
355 i_block + direct_blocks + 355 i_block + direct_blocks +
356 indirect_blocks + double_blocks, inode->i_ino); 356 indirect_blocks + double_blocks, inode->i_ino);
357 } 357 }
358 if (boundary) 358 if (boundary)
359 *boundary = final - 1 - (i_block & (ptrs - 1)); 359 *boundary = final - 1 - (i_block & (ptrs - 1));
360 return n; 360 return n;
361 } 361 }
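
ext4_block_to_path() above decomposes a logical block number into at most four array indices: 12 direct slots first, then one, two or three levels of indirection, each level consuming ptrs_bits bits of the remaining block number. A self-contained sketch of the same decomposition, assuming 4 KiB blocks (1024 pointers per indirect block), as in the common case:

/* Illustrative only: the direct/indirect/double/triple decomposition
 * performed by ext4_block_to_path(), with assumed geometry. */
#include <stdio.h>

#define NDIR            12UL            /* direct slots in the inode */
#define PTRS_BITS       10              /* 1024 block pointers per 4 KiB block */
#define PTRS            (1UL << PTRS_BITS)

static int block_to_path(unsigned long b, unsigned long offsets[4])
{
        int n = 0;

        if (b < NDIR) {                                 /* direct block */
                offsets[n++] = b;
        } else if ((b -= NDIR) < PTRS) {                /* single indirect */
                offsets[n++] = 12;                      /* the IND slot */
                offsets[n++] = b;
        } else if ((b -= PTRS) < PTRS * PTRS) {         /* double indirect */
                offsets[n++] = 13;                      /* the DIND slot */
                offsets[n++] = b >> PTRS_BITS;
                offsets[n++] = b & (PTRS - 1);
        } else {                                        /* triple indirect */
                b -= PTRS * PTRS;
                offsets[n++] = 14;                      /* the TIND slot */
                offsets[n++] = b >> (2 * PTRS_BITS);
                offsets[n++] = (b >> PTRS_BITS) & (PTRS - 1);
                offsets[n++] = b & (PTRS - 1);
        }
        return n;       /* depth of the chain */
}

int main(void)
{
        unsigned long path[4];
        int depth = block_to_path(5000, path);  /* 5000 = 12 + 1024 + 3964 -> double indirect */

        printf("depth %d: %lu", depth, path[0]);
        for (int i = 1; i < depth; i++)
                printf(" -> %lu", path[i]);
        printf("\n");   /* prints: depth 3: 13 -> 3 -> 892 */
        return 0;
}
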
362 362
363 static int __ext4_check_blockref(const char *function, unsigned int line, 363 static int __ext4_check_blockref(const char *function, unsigned int line,
364 struct inode *inode, 364 struct inode *inode,
365 __le32 *p, unsigned int max) 365 __le32 *p, unsigned int max)
366 { 366 {
367 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 367 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
368 __le32 *bref = p; 368 __le32 *bref = p;
369 unsigned int blk; 369 unsigned int blk;
370 370
371 while (bref < p+max) { 371 while (bref < p+max) {
372 blk = le32_to_cpu(*bref++); 372 blk = le32_to_cpu(*bref++);
373 if (blk && 373 if (blk &&
374 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 374 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
375 blk, 1))) { 375 blk, 1))) {
376 es->s_last_error_block = cpu_to_le64(blk); 376 es->s_last_error_block = cpu_to_le64(blk);
377 ext4_error_inode(inode, function, line, blk, 377 ext4_error_inode(inode, function, line, blk,
378 "invalid block"); 378 "invalid block");
379 return -EIO; 379 return -EIO;
380 } 380 }
381 } 381 }
382 return 0; 382 return 0;
383 } 383 }
384 384
385 385
386 #define ext4_check_indirect_blockref(inode, bh) \ 386 #define ext4_check_indirect_blockref(inode, bh) \
387 __ext4_check_blockref(__func__, __LINE__, inode, \ 387 __ext4_check_blockref(__func__, __LINE__, inode, \
388 (__le32 *)(bh)->b_data, \ 388 (__le32 *)(bh)->b_data, \
389 EXT4_ADDR_PER_BLOCK((inode)->i_sb)) 389 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
390 390
391 #define ext4_check_inode_blockref(inode) \ 391 #define ext4_check_inode_blockref(inode) \
392 __ext4_check_blockref(__func__, __LINE__, inode, \ 392 __ext4_check_blockref(__func__, __LINE__, inode, \
393 EXT4_I(inode)->i_data, \ 393 EXT4_I(inode)->i_data, \
394 EXT4_NDIR_BLOCKS) 394 EXT4_NDIR_BLOCKS)
395 395
396 /** 396 /**
397 * ext4_get_branch - read the chain of indirect blocks leading to data 397 * ext4_get_branch - read the chain of indirect blocks leading to data
398 * @inode: inode in question 398 * @inode: inode in question
399 * @depth: depth of the chain (1 - direct pointer, etc.) 399 * @depth: depth of the chain (1 - direct pointer, etc.)
400 * @offsets: offsets of pointers in inode/indirect blocks 400 * @offsets: offsets of pointers in inode/indirect blocks
401 * @chain: place to store the result 401 * @chain: place to store the result
402 * @err: here we store the error value 402 * @err: here we store the error value
403 * 403 *
404 * Function fills the array of triples <key, p, bh> and returns %NULL 404 * Function fills the array of triples <key, p, bh> and returns %NULL
405 * if everything went OK or the pointer to the last filled triple 405 * if everything went OK or the pointer to the last filled triple
406 * (incomplete one) otherwise. Upon the return chain[i].key contains 406 * (incomplete one) otherwise. Upon the return chain[i].key contains
407 * the number of (i+1)-th block in the chain (as it is stored in memory, 407 * the number of (i+1)-th block in the chain (as it is stored in memory,
408 * i.e. little-endian 32-bit), chain[i].p contains the address of that 408 * i.e. little-endian 32-bit), chain[i].p contains the address of that
409 * number (it points into struct inode for i==0 and into the bh->b_data 409 * number (it points into struct inode for i==0 and into the bh->b_data
410 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect 410 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
411 * block for i>0 and NULL for i==0. In other words, it holds the block 411 * block for i>0 and NULL for i==0. In other words, it holds the block
412 * numbers of the chain, addresses they were taken from (and where we can 412 * numbers of the chain, addresses they were taken from (and where we can
413 * verify that chain did not change) and buffer_heads hosting these 413 * verify that chain did not change) and buffer_heads hosting these
414 * numbers. 414 * numbers.
415 * 415 *
416 * Function stops when it stumbles upon zero pointer (absent block) 416 * Function stops when it stumbles upon zero pointer (absent block)
417 * (pointer to last triple returned, *@err == 0) 417 * (pointer to last triple returned, *@err == 0)
418 * or when it gets an IO error reading an indirect block 418 * or when it gets an IO error reading an indirect block
419 * (ditto, *@err == -EIO) 419 * (ditto, *@err == -EIO)
420 * or when it reads all @depth-1 indirect blocks successfully and finds 420 * or when it reads all @depth-1 indirect blocks successfully and finds
421 * the whole chain, all the way to the data (returns %NULL, *err == 0). 421 * the whole chain, all the way to the data (returns %NULL, *err == 0).
422 * 422 *
423 * Need to be called with 423 * Need to be called with
424 * down_read(&EXT4_I(inode)->i_data_sem) 424 * down_read(&EXT4_I(inode)->i_data_sem)
425 */ 425 */
426 static Indirect *ext4_get_branch(struct inode *inode, int depth, 426 static Indirect *ext4_get_branch(struct inode *inode, int depth,
427 ext4_lblk_t *offsets, 427 ext4_lblk_t *offsets,
428 Indirect chain[4], int *err) 428 Indirect chain[4], int *err)
429 { 429 {
430 struct super_block *sb = inode->i_sb; 430 struct super_block *sb = inode->i_sb;
431 Indirect *p = chain; 431 Indirect *p = chain;
432 struct buffer_head *bh; 432 struct buffer_head *bh;
433 433
434 *err = 0; 434 *err = 0;
435 /* i_data is not going away, no lock needed */ 435 /* i_data is not going away, no lock needed */
436 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); 436 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
437 if (!p->key) 437 if (!p->key)
438 goto no_block; 438 goto no_block;
439 while (--depth) { 439 while (--depth) {
440 bh = sb_getblk(sb, le32_to_cpu(p->key)); 440 bh = sb_getblk(sb, le32_to_cpu(p->key));
441 if (unlikely(!bh)) 441 if (unlikely(!bh))
442 goto failure; 442 goto failure;
443 443
444 if (!bh_uptodate_or_lock(bh)) { 444 if (!bh_uptodate_or_lock(bh)) {
445 if (bh_submit_read(bh) < 0) { 445 if (bh_submit_read(bh) < 0) {
446 put_bh(bh); 446 put_bh(bh);
447 goto failure; 447 goto failure;
448 } 448 }
449 /* validate block references */ 449 /* validate block references */
450 if (ext4_check_indirect_blockref(inode, bh)) { 450 if (ext4_check_indirect_blockref(inode, bh)) {
451 put_bh(bh); 451 put_bh(bh);
452 goto failure; 452 goto failure;
453 } 453 }
454 } 454 }
455 455
456 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 456 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
457 /* Reader: end */ 457 /* Reader: end */
458 if (!p->key) 458 if (!p->key)
459 goto no_block; 459 goto no_block;
460 } 460 }
461 return NULL; 461 return NULL;
462 462
463 failure: 463 failure:
464 *err = -EIO; 464 *err = -EIO;
465 no_block: 465 no_block:
466 return p; 466 return p;
467 } 467 }
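
ext4_get_branch() above walks the tree one level at a time, recording a <key, p, bh> triple per level and stopping early at the first zero pointer (a hole) or read failure. A toy user-space model of that walk, with small arrays standing in for the inode's i_data and for indirect blocks that the kernel would read through sb_getblk():

/* Illustrative only: the <key, p, bh> chain walk of ext4_get_branch(),
 * modelled with in-memory arrays instead of buffer_heads. */
#include <stdio.h>
#include <stdint.h>

#define PTRS 4  /* pointers per "indirect block" in this toy model */

struct indirect {
        uint32_t *p;    /* address the block number was read from */
        uint32_t key;   /* the block number itself; 0 means a hole */
};

/* disk[n] plays the role of the indirect block whose block number is n */
static uint32_t disk[8][PTRS] = {
        [2] = { 0, 5, 0, 0 },   /* block 2: an indirect block pointing at block 5 */
};

static uint32_t inode_data[PTRS] = { 0, 0, 2, 0 };      /* i_data: slot 2 -> block 2 */

/* Returns NULL when the whole chain was found, or the last filled triple
 * when a zero pointer stops the walk early. */
static struct indirect *get_branch(int depth, const int *offsets,
                                   struct indirect chain[4])
{
        struct indirect *q = chain;

        q->p = &inode_data[offsets[0]];         /* level 0 lives in the inode */
        q->key = *q->p;
        if (!q->key)
                return q;
        while (--depth) {
                q[1].p = &disk[q->key][*++offsets];     /* "read" the next indirect block */
                q[1].key = *q[1].p;
                q++;
                if (!q->key)
                        return q;
        }
        return NULL;
}

int main(void)
{
        struct indirect chain[4];
        int offsets[2] = { 2, 1 };              /* as produced by block_to_path() */

        if (!get_branch(2, offsets, chain))
                printf("data block is %u\n", chain[1].key);     /* prints 5 */
        return 0;
}
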
468 468
469 /** 469 /**
470 * ext4_find_near - find a place for allocation with sufficient locality 470 * ext4_find_near - find a place for allocation with sufficient locality
471 * @inode: owner 471 * @inode: owner
472 * @ind: descriptor of indirect block. 472 * @ind: descriptor of indirect block.
473 * 473 *
474 * This function returns the preferred place for block allocation. 474 * This function returns the preferred place for block allocation.
475 * It is used when the heuristic for sequential allocation fails. 475 * It is used when the heuristic for sequential allocation fails.
476 * Rules are: 476 * Rules are:
477 * + if there is a block to the left of our position - allocate near it. 477 * + if there is a block to the left of our position - allocate near it.
478 * + if pointer will live in indirect block - allocate near that block. 478 * + if pointer will live in indirect block - allocate near that block.
479 * + if pointer will live in inode - allocate in the same 479 * + if pointer will live in inode - allocate in the same
480 * cylinder group. 480 * cylinder group.
481 * 481 *
482 * In the latter case we colour the starting block by the caller's PID to 482 * In the latter case we colour the starting block by the caller's PID to
483 * prevent it from clashing with concurrent allocations for a different inode 483 * prevent it from clashing with concurrent allocations for a different inode
484 * in the same block group. The PID is used here so that functionally related 484 * in the same block group. The PID is used here so that functionally related
485 * files will be close-by on-disk. 485 * files will be close-by on-disk.
486 * 486 *
487 * Caller must make sure that @ind is valid and will stay that way. 487 * Caller must make sure that @ind is valid and will stay that way.
488 */ 488 */
489 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 489 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
490 { 490 {
491 struct ext4_inode_info *ei = EXT4_I(inode); 491 struct ext4_inode_info *ei = EXT4_I(inode);
492 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; 492 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
493 __le32 *p; 493 __le32 *p;
494 ext4_fsblk_t bg_start; 494 ext4_fsblk_t bg_start;
495 ext4_fsblk_t last_block; 495 ext4_fsblk_t last_block;
496 ext4_grpblk_t colour; 496 ext4_grpblk_t colour;
497 ext4_group_t block_group; 497 ext4_group_t block_group;
498 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); 498 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
499 499
500 /* Try to find previous block */ 500 /* Try to find previous block */
501 for (p = ind->p - 1; p >= start; p--) { 501 for (p = ind->p - 1; p >= start; p--) {
502 if (*p) 502 if (*p)
503 return le32_to_cpu(*p); 503 return le32_to_cpu(*p);
504 } 504 }
505 505
506 /* No such thing, so let's try location of indirect block */ 506 /* No such thing, so let's try location of indirect block */
507 if (ind->bh) 507 if (ind->bh)
508 return ind->bh->b_blocknr; 508 return ind->bh->b_blocknr;
509 509
510 /* 510 /*
511 * Is it going to be referred to from the inode itself? OK, just put it 511 * Is it going to be referred to from the inode itself? OK, just put it
512 * into the same cylinder group then. 512 * into the same cylinder group then.
513 */ 513 */
514 block_group = ei->i_block_group; 514 block_group = ei->i_block_group;
515 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 515 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
516 block_group &= ~(flex_size-1); 516 block_group &= ~(flex_size-1);
517 if (S_ISREG(inode->i_mode)) 517 if (S_ISREG(inode->i_mode))
518 block_group++; 518 block_group++;
519 } 519 }
520 bg_start = ext4_group_first_block_no(inode->i_sb, block_group); 520 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
521 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 521 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
522 522
523 /* 523 /*
524 * If we are doing delayed allocation, we don't need to take 524 * If we are doing delayed allocation, we don't need to take
525 * colour into account. 525 * colour into account.
526 */ 526 */
527 if (test_opt(inode->i_sb, DELALLOC)) 527 if (test_opt(inode->i_sb, DELALLOC))
528 return bg_start; 528 return bg_start;
529 529
530 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 530 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
531 colour = (current->pid % 16) * 531 colour = (current->pid % 16) *
532 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 532 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
533 else 533 else
534 colour = (current->pid % 16) * ((last_block - bg_start) / 16); 534 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
535 return bg_start + colour; 535 return bg_start + colour;
536 } 536 }
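To make the colouring arithmetic above concrete, here is a minimal stand-alone sketch (userspace C, assuming 4KiB blocks so that EXT4_BLOCKS_PER_GROUP comes to 32768; the constant and helper names below are illustrative, not the kernel's):

#include <stdio.h>

/* Illustrative only: 4KiB blocks => 32768 blocks per group (assumed). */
#define BLOCKS_PER_GROUP 32768UL

/* Mirrors the colour computation in ext4_find_near() above. */
static unsigned long colour_for_pid(unsigned long pid)
{
	return (pid % 16) * (BLOCKS_PER_GROUP / 16);
}

int main(void)
{
	/* e.g. pid 1234: (1234 % 16) * 2048 = 2 * 2048 = 4096 */
	printf("colour = %lu\n", colour_for_pid(1234));
	return 0;
}

With these example values, concurrent allocators whose PIDs differ land 2048 blocks apart within the group, which is the clash-avoidance the comment describes.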
537 537
538 /** 538 /**
539 * ext4_find_goal - find a preferred place for allocation. 539 * ext4_find_goal - find a preferred place for allocation.
540 * @inode: owner 540 * @inode: owner
541 * @block: block we want 541 * @block: block we want
542 * @partial: pointer to the last triple within a chain 542 * @partial: pointer to the last triple within a chain
543 * 543 *
544 * Normally this function finds the preferred place for block allocation 544 * Normally this function finds the preferred place for block allocation
545 * and returns it. 545 * and returns it.
546 * Because this is only used for non-extent files, we limit the block nr 546 * Because this is only used for non-extent files, we limit the block nr
547 * to 32 bits. 547 * to 32 bits.
548 */ 548 */
549 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 549 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
550 Indirect *partial) 550 Indirect *partial)
551 { 551 {
552 ext4_fsblk_t goal; 552 ext4_fsblk_t goal;
553 553
554 /* 554 /*
555 * XXX need to get goal block from mballoc's data structures 555 * XXX need to get goal block from mballoc's data structures
556 */ 556 */
557 557
558 goal = ext4_find_near(inode, partial); 558 goal = ext4_find_near(inode, partial);
559 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 559 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
560 return goal; 560 return goal;
561 } 561 }
562 562
563 /** 563 /**
564 * ext4_blks_to_allocate - Look up the block map and count the number 564 * ext4_blks_to_allocate - Look up the block map and count the number
565 * of direct blocks that need to be allocated for the given branch. 565 * of direct blocks that need to be allocated for the given branch.
566 * 566 *
567 * @branch: chain of indirect blocks 567 * @branch: chain of indirect blocks
568 * @k: number of blocks needed for indirect blocks 568 * @k: number of blocks needed for indirect blocks
569 * @blks: number of data blocks to be mapped. 569 * @blks: number of data blocks to be mapped.
570 * @blocks_to_boundary: the offset in the indirect block 570 * @blocks_to_boundary: the offset in the indirect block
571 * 571 *
572 * return the total number of blocks to be allocated, including the 572 * return the total number of blocks to be allocated, including the
573 * direct and indirect blocks. 573 * direct and indirect blocks.
574 */ 574 */
575 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 575 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
576 int blocks_to_boundary) 576 int blocks_to_boundary)
577 { 577 {
578 unsigned int count = 0; 578 unsigned int count = 0;
579 579
580 /* 580 /*
581 * Simple case: the [t,d]indirect block(s) have not been allocated yet; 581 * Simple case: the [t,d]indirect block(s) have not been allocated yet;
582 * then it's clear that blocks on that path have not been allocated 582 * then it's clear that blocks on that path have not been allocated
583 */ 583 */
584 if (k > 0) { 584 if (k > 0) {
585 /* right now we don't handle cross boundary allocation */ 585 /* right now we don't handle cross boundary allocation */
586 if (blks < blocks_to_boundary + 1) 586 if (blks < blocks_to_boundary + 1)
587 count += blks; 587 count += blks;
588 else 588 else
589 count += blocks_to_boundary + 1; 589 count += blocks_to_boundary + 1;
590 return count; 590 return count;
591 } 591 }
592 592
593 count++; 593 count++;
594 while (count < blks && count <= blocks_to_boundary && 594 while (count < blks && count <= blocks_to_boundary &&
595 le32_to_cpu(*(branch[0].p + count)) == 0) { 595 le32_to_cpu(*(branch[0].p + count)) == 0) {
596 count++; 596 count++;
597 } 597 }
598 return count; 598 return count;
599 } 599 }
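A stand-alone mirror of the counting loop above (the k == 0 case), showing how many direct blocks get batched into a single allocation; the array contents and sizes are made-up example values, not taken from this diff:

#include <stdio.h>

/* Count how many direct blocks to allocate, starting from the first
 * missing slot and stopping at the first already-mapped entry or at
 * the indirect-block boundary. */
static unsigned count_direct(const unsigned *slots, unsigned blks,
			     unsigned blocks_to_boundary)
{
	unsigned count = 1;	/* the block that triggered allocation */

	while (count < blks && count <= blocks_to_boundary &&
	       slots[count] == 0)
		count++;
	return count;
}

int main(void)
{
	/* slot 0 missing, slots 1-2 free, slot 3 already mapped */
	unsigned slots[8] = { 0, 0, 0, 42, 0, 0, 0, 0 };

	/* request 8 blocks with 7 more before the boundary -> prints 3 */
	printf("%u\n", count_direct(slots, 8, 7));
	return 0;
}

The count stops at the first already-mapped entry or at the boundary, which is why a request for 8 blocks only yields 3 in this example.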
600 600
601 /** 601 /**
602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
603 * @handle: handle for this transaction 603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks 604 * @inode: inode which needs allocated blocks
605 * @iblock: the logical block to start allocation at 605 * @iblock: the logical block to start allocation at
606 * @goal: preferred physical block of allocation 606 * @goal: preferred physical block of allocation
607 * @indirect_blks: the number of blocks that need to be allocated for indirect 607 * @indirect_blks: the number of blocks that need to be allocated for indirect
608 * blocks 608 * blocks
609 * @blks: number of desired blocks 609 * @blks: number of desired blocks
610 * @new_blocks: on return it will store the new block numbers for 610 * @new_blocks: on return it will store the new block numbers for
611 * the indirect blocks(if needed) and the first direct block, 611 * the indirect blocks(if needed) and the first direct block,
612 * @err: on return it will store the error code 612 * @err: on return it will store the error code
613 * 613 *
614 * This function will return the number of blocks allocated as 614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters. 615 * requested by the passed-in parameters.
616 */ 616 */
617 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 617 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
618 ext4_lblk_t iblock, ext4_fsblk_t goal, 618 ext4_lblk_t iblock, ext4_fsblk_t goal,
619 int indirect_blks, int blks, 619 int indirect_blks, int blks,
620 ext4_fsblk_t new_blocks[4], int *err) 620 ext4_fsblk_t new_blocks[4], int *err)
621 { 621 {
622 struct ext4_allocation_request ar; 622 struct ext4_allocation_request ar;
623 int target, i; 623 int target, i;
624 unsigned long count = 0, blk_allocated = 0; 624 unsigned long count = 0, blk_allocated = 0;
625 int index = 0; 625 int index = 0;
626 ext4_fsblk_t current_block = 0; 626 ext4_fsblk_t current_block = 0;
627 int ret = 0; 627 int ret = 0;
628 628
629 /* 629 /*
630 * Here we try to allocate the requested multiple blocks at once, 630 * Here we try to allocate the requested multiple blocks at once,
631 * on a best-effort basis. 631 * on a best-effort basis.
632 * To build a branch, we should allocate blocks for 632 * To build a branch, we should allocate blocks for
633 * the indirect blocks (if not allocated yet), and at least 633 * the indirect blocks (if not allocated yet), and at least
634 * the first direct block of this branch. That's the 634 * the first direct block of this branch. That's the
635 * minimum number of blocks that need to be allocated (required). 635 * minimum number of blocks that need to be allocated (required).
636 */ 636 */
637 /* first we try to allocate the indirect blocks */ 637 /* first we try to allocate the indirect blocks */
638 target = indirect_blks; 638 target = indirect_blks;
639 while (target > 0) { 639 while (target > 0) {
640 count = target; 640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, goal, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 0, &count, err); 643 0, &count, err);
644 if (*err) 644 if (*err)
645 goto failed_out; 645 goto failed_out;
646 646
647 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { 647 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
648 EXT4_ERROR_INODE(inode, 648 EXT4_ERROR_INODE(inode,
649 "current_block %llu + count %lu > %d!", 649 "current_block %llu + count %lu > %d!",
650 current_block, count, 650 current_block, count,
651 EXT4_MAX_BLOCK_FILE_PHYS); 651 EXT4_MAX_BLOCK_FILE_PHYS);
652 *err = -EIO; 652 *err = -EIO;
653 goto failed_out; 653 goto failed_out;
654 } 654 }
655 655
656 target -= count; 656 target -= count;
657 /* allocate blocks for indirect blocks */ 657 /* allocate blocks for indirect blocks */
658 while (index < indirect_blks && count) { 658 while (index < indirect_blks && count) {
659 new_blocks[index++] = current_block++; 659 new_blocks[index++] = current_block++;
660 count--; 660 count--;
661 } 661 }
662 if (count > 0) { 662 if (count > 0) {
663 /* 663 /*
664 * save the new block number 664 * save the new block number
665 * for the first direct block 665 * for the first direct block
666 */ 666 */
667 new_blocks[index] = current_block; 667 new_blocks[index] = current_block;
668 printk(KERN_INFO "%s returned more blocks than " 668 printk(KERN_INFO "%s returned more blocks than "
669 "requested\n", __func__); 669 "requested\n", __func__);
670 WARN_ON(1); 670 WARN_ON(1);
671 break; 671 break;
672 } 672 }
673 } 673 }
674 674
675 target = blks - count ; 675 target = blks - count ;
676 blk_allocated = count; 676 blk_allocated = count;
677 if (!target) 677 if (!target)
678 goto allocated; 678 goto allocated;
679 /* Now allocate data blocks */ 679 /* Now allocate data blocks */
680 memset(&ar, 0, sizeof(ar)); 680 memset(&ar, 0, sizeof(ar));
681 ar.inode = inode; 681 ar.inode = inode;
682 ar.goal = goal; 682 ar.goal = goal;
683 ar.len = target; 683 ar.len = target;
684 ar.logical = iblock; 684 ar.logical = iblock;
685 if (S_ISREG(inode->i_mode)) 685 if (S_ISREG(inode->i_mode))
686 /* enable in-core preallocation only for regular files */ 686 /* enable in-core preallocation only for regular files */
687 ar.flags = EXT4_MB_HINT_DATA; 687 ar.flags = EXT4_MB_HINT_DATA;
688 688
689 current_block = ext4_mb_new_blocks(handle, &ar, err); 689 current_block = ext4_mb_new_blocks(handle, &ar, err);
690 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { 690 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
691 EXT4_ERROR_INODE(inode, 691 EXT4_ERROR_INODE(inode,
692 "current_block %llu + ar.len %d > %d!", 692 "current_block %llu + ar.len %d > %d!",
693 current_block, ar.len, 693 current_block, ar.len,
694 EXT4_MAX_BLOCK_FILE_PHYS); 694 EXT4_MAX_BLOCK_FILE_PHYS);
695 *err = -EIO; 695 *err = -EIO;
696 goto failed_out; 696 goto failed_out;
697 } 697 }
698 698
699 if (*err && (target == blks)) { 699 if (*err && (target == blks)) {
700 /* 700 /*
701 * if the allocation failed and we didn't allocate 701 * if the allocation failed and we didn't allocate
702 * any blocks before 702 * any blocks before
703 */ 703 */
704 goto failed_out; 704 goto failed_out;
705 } 705 }
706 if (!*err) { 706 if (!*err) {
707 if (target == blks) { 707 if (target == blks) {
708 /* 708 /*
709 * save the new block number 709 * save the new block number
710 * for the first direct block 710 * for the first direct block
711 */ 711 */
712 new_blocks[index] = current_block; 712 new_blocks[index] = current_block;
713 } 713 }
714 blk_allocated += ar.len; 714 blk_allocated += ar.len;
715 } 715 }
716 allocated: 716 allocated:
717 /* total number of blocks allocated for direct blocks */ 717 /* total number of blocks allocated for direct blocks */
718 ret = blk_allocated; 718 ret = blk_allocated;
719 *err = 0; 719 *err = 0;
720 return ret; 720 return ret;
721 failed_out: 721 failed_out:
722 for (i = 0; i < index; i++) 722 for (i = 0; i < index; i++)
723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
724 return ret; 724 return ret;
725 } 725 }
726 726
727 /** 727 /**
728 * ext4_alloc_branch - allocate and set up a chain of blocks. 728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction 729 * @handle: handle for this transaction
730 * @inode: owner 730 * @inode: owner
731 * @indirect_blks: number of allocated indirect blocks 731 * @indirect_blks: number of allocated indirect blocks
732 * @blks: number of allocated direct blocks 732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation 733 * @goal: preferred place for allocation
734 * @offsets: offsets (in the blocks) to store the pointers to next. 734 * @offsets: offsets (in the blocks) to store the pointers to next.
735 * @branch: place to store the chain in. 735 * @branch: place to store the chain in.
736 * 736 *
737 * This function allocates blocks, zeroes out all but the last one, 737 * This function allocates blocks, zeroes out all but the last one,
738 * links them into a chain and (if we are synchronous) writes them to disk. 738 * links them into a chain and (if we are synchronous) writes them to disk.
739 * In other words, it prepares a branch that can be spliced onto the 739 * In other words, it prepares a branch that can be spliced onto the
740 * inode. It stores the information about that chain in the branch[], in 740 * inode. It stores the information about that chain in the branch[], in
741 * the same format as ext4_get_branch() would do. We are calling it after 741 * the same format as ext4_get_branch() would do. We are calling it after
742 * we had read the existing part of chain and partial points to the last 742 * we had read the existing part of chain and partial points to the last
743 * triple of that (one with zero ->key). Upon the exit we have the same 743 * triple of that (one with zero ->key). Upon the exit we have the same
744 * picture as after the successful ext4_get_block(), except that in one 744 * picture as after the successful ext4_get_block(), except that in one
745 * place chain is disconnected - *branch->p is still zero (we did not 745 * place chain is disconnected - *branch->p is still zero (we did not
746 * set the last link), but branch->key contains the number that should 746 * set the last link), but branch->key contains the number that should
747 * be placed into *branch->p to fill that gap. 747 * be placed into *branch->p to fill that gap.
748 * 748 *
749 * If allocation fails we free all blocks we've allocated (and forget 749 * If allocation fails we free all blocks we've allocated (and forget
750 * their buffer_heads) and return the error value from the failed 750 * their buffer_heads) and return the error value from the failed
751 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 751 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
752 * as described above and return 0. 752 * as described above and return 0.
753 */ 753 */
754 static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 754 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 ext4_lblk_t iblock, int indirect_blks, 755 ext4_lblk_t iblock, int indirect_blks,
756 int *blks, ext4_fsblk_t goal, 756 int *blks, ext4_fsblk_t goal,
757 ext4_lblk_t *offsets, Indirect *branch) 757 ext4_lblk_t *offsets, Indirect *branch)
758 { 758 {
759 int blocksize = inode->i_sb->s_blocksize; 759 int blocksize = inode->i_sb->s_blocksize;
760 int i, n = 0; 760 int i, n = 0;
761 int err = 0; 761 int err = 0;
762 struct buffer_head *bh; 762 struct buffer_head *bh;
763 int num; 763 int num;
764 ext4_fsblk_t new_blocks[4]; 764 ext4_fsblk_t new_blocks[4];
765 ext4_fsblk_t current_block; 765 ext4_fsblk_t current_block;
766 766
767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
768 *blks, new_blocks, &err); 768 *blks, new_blocks, &err);
769 if (err) 769 if (err)
770 return err; 770 return err;
771 771
772 branch[0].key = cpu_to_le32(new_blocks[0]); 772 branch[0].key = cpu_to_le32(new_blocks[0]);
773 /* 773 /*
774 * metadata blocks and data blocks are allocated. 774 * metadata blocks and data blocks are allocated.
775 */ 775 */
776 for (n = 1; n <= indirect_blks; n++) { 776 for (n = 1; n <= indirect_blks; n++) {
777 /* 777 /*
778 * Get buffer_head for parent block, zero it out 778 * Get buffer_head for parent block, zero it out
779 * and set the pointer to new one, then send 779 * and set the pointer to new one, then send
780 * parent to disk. 780 * parent to disk.
781 */ 781 */
782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) { 783 if (unlikely(!bh)) {
784 err = -EIO; 784 err = -EIO;
785 goto failed; 785 goto failed;
786 } 786 }
787 787
788 branch[n].bh = bh; 788 branch[n].bh = bh;
789 lock_buffer(bh); 789 lock_buffer(bh);
790 BUFFER_TRACE(bh, "call get_create_access"); 790 BUFFER_TRACE(bh, "call get_create_access");
791 err = ext4_journal_get_create_access(handle, bh); 791 err = ext4_journal_get_create_access(handle, bh);
792 if (err) { 792 if (err) {
793 /* Don't brelse(bh) here; it's done in 793 /* Don't brelse(bh) here; it's done in
794 * ext4_journal_forget() below */ 794 * ext4_journal_forget() below */
795 unlock_buffer(bh); 795 unlock_buffer(bh);
796 goto failed; 796 goto failed;
797 } 797 }
798 798
799 memset(bh->b_data, 0, blocksize); 799 memset(bh->b_data, 0, blocksize);
800 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 800 branch[n].p = (__le32 *) bh->b_data + offsets[n];
801 branch[n].key = cpu_to_le32(new_blocks[n]); 801 branch[n].key = cpu_to_le32(new_blocks[n]);
802 *branch[n].p = branch[n].key; 802 *branch[n].p = branch[n].key;
803 if (n == indirect_blks) { 803 if (n == indirect_blks) {
804 current_block = new_blocks[n]; 804 current_block = new_blocks[n];
805 /* 805 /*
806 * End of chain, update the last new metablock of 806 * End of chain, update the last new metablock of
807 * the chain to point to the new allocated 807 * the chain to point to the new allocated
808 * data blocks numbers 808 * data blocks numbers
809 */ 809 */
810 for (i = 1; i < num; i++) 810 for (i = 1; i < num; i++)
811 *(branch[n].p + i) = cpu_to_le32(++current_block); 811 *(branch[n].p + i) = cpu_to_le32(++current_block);
812 } 812 }
813 BUFFER_TRACE(bh, "marking uptodate"); 813 BUFFER_TRACE(bh, "marking uptodate");
814 set_buffer_uptodate(bh); 814 set_buffer_uptodate(bh);
815 unlock_buffer(bh); 815 unlock_buffer(bh);
816 816
817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
818 err = ext4_handle_dirty_metadata(handle, inode, bh); 818 err = ext4_handle_dirty_metadata(handle, inode, bh);
819 if (err) 819 if (err)
820 goto failed; 820 goto failed;
821 } 821 }
822 *blks = num; 822 *blks = num;
823 return err; 823 return err;
824 failed: 824 failed:
825 /* Allocation failed, free what we already allocated */ 825 /* Allocation failed, free what we already allocated */
826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); 826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
827 for (i = 1; i <= n ; i++) { 827 for (i = 1; i <= n ; i++) {
828 /* 828 /*
829 * branch[i].bh is newly allocated, so there is no 829 * branch[i].bh is newly allocated, so there is no
830 * need to revoke the block, which is why we don't 830 * need to revoke the block, which is why we don't
831 * need to set EXT4_FREE_BLOCKS_METADATA. 831 * need to set EXT4_FREE_BLOCKS_METADATA.
832 */ 832 */
833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
834 EXT4_FREE_BLOCKS_FORGET); 834 EXT4_FREE_BLOCKS_FORGET);
835 } 835 }
836 for (i = n+1; i < indirect_blks; i++) 836 for (i = n+1; i < indirect_blks; i++)
837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
838 838
839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); 839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
840 840
841 return err; 841 return err;
842 } 842 }
843 843
844 /** 844 /**
845 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction 846 * @handle: handle for this transaction
847 * @inode: owner 847 * @inode: owner
848 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
849 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
850 * ext4_alloc_branch) 850 * ext4_alloc_branch)
851 * @where: location of missing link 851 * @where: location of missing link
852 * @num: number of indirect blocks we are adding 852 * @num: number of indirect blocks we are adding
853 * @blks: number of direct blocks we are adding 853 * @blks: number of direct blocks we are adding
854 * 854 *
855 * This function fills the missing link and does all housekeeping needed in 855 * This function fills the missing link and does all housekeeping needed in
856 * inode (->i_blocks, etc.). In case of success we end up with the full 856 * inode (->i_blocks, etc.). In case of success we end up with the full
857 * chain to new block and return 0. 857 * chain to new block and return 0.
858 */ 858 */
859 static int ext4_splice_branch(handle_t *handle, struct inode *inode, 859 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
860 ext4_lblk_t block, Indirect *where, int num, 860 ext4_lblk_t block, Indirect *where, int num,
861 int blks) 861 int blks)
862 { 862 {
863 int i; 863 int i;
864 int err = 0; 864 int err = 0;
865 ext4_fsblk_t current_block; 865 ext4_fsblk_t current_block;
866 866
867 /* 867 /*
868 * If we're splicing into a [td]indirect block (as opposed to the 868 * If we're splicing into a [td]indirect block (as opposed to the
869 * inode) then we need to get write access to the [td]indirect block 869 * inode) then we need to get write access to the [td]indirect block
870 * before the splice. 870 * before the splice.
871 */ 871 */
872 if (where->bh) { 872 if (where->bh) {
873 BUFFER_TRACE(where->bh, "get_write_access"); 873 BUFFER_TRACE(where->bh, "get_write_access");
874 err = ext4_journal_get_write_access(handle, where->bh); 874 err = ext4_journal_get_write_access(handle, where->bh);
875 if (err) 875 if (err)
876 goto err_out; 876 goto err_out;
877 } 877 }
878 /* That's it */ 878 /* That's it */
879 879
880 *where->p = where->key; 880 *where->p = where->key;
881 881
882 /* 882 /*
883 * Update the host buffer_head or inode to point to the just allocated 883 * Update the host buffer_head or inode to point to the just allocated
884 * direct blocks 884 * direct blocks
885 */ 885 */
886 if (num == 0 && blks > 1) { 886 if (num == 0 && blks > 1) {
887 current_block = le32_to_cpu(where->key) + 1; 887 current_block = le32_to_cpu(where->key) + 1;
888 for (i = 1; i < blks; i++) 888 for (i = 1; i < blks; i++)
889 *(where->p + i) = cpu_to_le32(current_block++); 889 *(where->p + i) = cpu_to_le32(current_block++);
890 } 890 }
891 891
892 /* We are done with atomic stuff, now do the rest of housekeeping */ 892 /* We are done with atomic stuff, now do the rest of housekeeping */
893 /* had we spliced it onto indirect block? */ 893 /* had we spliced it onto indirect block? */
894 if (where->bh) { 894 if (where->bh) {
895 /* 895 /*
896 * If we spliced it onto an indirect block, we haven't 896 * If we spliced it onto an indirect block, we haven't
897 * altered the inode. Note however that if it is being spliced 897 * altered the inode. Note however that if it is being spliced
898 * onto an indirect block at the very end of the file (the 898 * onto an indirect block at the very end of the file (the
899 * file is growing) then we *will* alter the inode to reflect 899 * file is growing) then we *will* alter the inode to reflect
900 * the new i_size. But that is not done here - it is done in 900 * the new i_size. But that is not done here - it is done in
901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
902 */ 902 */
903 jbd_debug(5, "splicing indirect only\n"); 903 jbd_debug(5, "splicing indirect only\n");
904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
905 err = ext4_handle_dirty_metadata(handle, inode, where->bh); 905 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
906 if (err) 906 if (err)
907 goto err_out; 907 goto err_out;
908 } else { 908 } else {
909 /* 909 /*
910 * OK, we spliced it into the inode itself on a direct block. 910 * OK, we spliced it into the inode itself on a direct block.
911 */ 911 */
912 ext4_mark_inode_dirty(handle, inode); 912 ext4_mark_inode_dirty(handle, inode);
913 jbd_debug(5, "splicing direct\n"); 913 jbd_debug(5, "splicing direct\n");
914 } 914 }
915 return err; 915 return err;
916 916
917 err_out: 917 err_out:
918 for (i = 1; i <= num; i++) { 918 for (i = 1; i <= num; i++) {
919 /* 919 /*
920 * branch[i].bh is newly allocated, so there is no 920 * branch[i].bh is newly allocated, so there is no
921 * need to revoke the block, which is why we don't 921 * need to revoke the block, which is why we don't
922 * need to set EXT4_FREE_BLOCKS_METADATA. 922 * need to set EXT4_FREE_BLOCKS_METADATA.
923 */ 923 */
924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
925 EXT4_FREE_BLOCKS_FORGET); 925 EXT4_FREE_BLOCKS_FORGET);
926 } 926 }
927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), 927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
928 blks, 0); 928 blks, 0);
929 929
930 return err; 930 return err;
931 } 931 }
932 932
933 /* 933 /*
934 * The ext4_ind_map_blocks() function handles non-extents inodes 934 * The ext4_ind_map_blocks() function handles non-extents inodes
935 * (i.e., using the traditional indirect/double-indirect i_blocks 935 * (i.e., using the traditional indirect/double-indirect i_blocks
936 * scheme) for ext4_map_blocks(). 936 * scheme) for ext4_map_blocks().
937 * 937 *
938 * Allocation strategy is simple: if we have to allocate something, we will 938 * Allocation strategy is simple: if we have to allocate something, we will
939 * have to go the whole way to leaf. So let's do it before attaching anything 939 * have to go the whole way to leaf. So let's do it before attaching anything
940 * to tree, set linkage between the newborn blocks, write them if sync is 940 * to tree, set linkage between the newborn blocks, write them if sync is
941 * required, recheck the path, free and repeat if check fails, otherwise 941 * required, recheck the path, free and repeat if check fails, otherwise
942 * set the last missing link (that will protect us from any truncate-generated 942 * set the last missing link (that will protect us from any truncate-generated
943 * removals - all blocks on the path are immune now) and possibly force the 943 * removals - all blocks on the path are immune now) and possibly force the
944 * write on the parent block. 944 * write on the parent block.
945 * That has a nice additional property: no special recovery from the failed 945 * That has a nice additional property: no special recovery from the failed
946 * allocations is needed - we simply release blocks and do not touch anything 946 * allocations is needed - we simply release blocks and do not touch anything
947 * reachable from inode. 947 * reachable from inode.
948 * 948 *
949 * `handle' can be NULL if create == 0. 949 * `handle' can be NULL if create == 0.
950 * 950 *
951 * return > 0, # of blocks mapped or allocated. 951 * return > 0, # of blocks mapped or allocated.
952 * return = 0, if plain lookup failed. 952 * return = 0, if plain lookup failed.
953 * return < 0, error case. 953 * return < 0, error case.
954 * 954 *
955 * The ext4_ind_map_blocks() function should be called with 955 * The ext4_ind_map_blocks() function should be called with
956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem 956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or 957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
959 * blocks. 959 * blocks.
960 */ 960 */
961 static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 961 static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
962 struct ext4_map_blocks *map, 962 struct ext4_map_blocks *map,
963 int flags) 963 int flags)
964 { 964 {
965 int err = -EIO; 965 int err = -EIO;
966 ext4_lblk_t offsets[4]; 966 ext4_lblk_t offsets[4];
967 Indirect chain[4]; 967 Indirect chain[4];
968 Indirect *partial; 968 Indirect *partial;
969 ext4_fsblk_t goal; 969 ext4_fsblk_t goal;
970 int indirect_blks; 970 int indirect_blks;
971 int blocks_to_boundary = 0; 971 int blocks_to_boundary = 0;
972 int depth; 972 int depth;
973 int count = 0; 973 int count = 0;
974 ext4_fsblk_t first_block = 0; 974 ext4_fsblk_t first_block = 0;
975 975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
979 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
980 &blocks_to_boundary); 980 &blocks_to_boundary);
981 981
982 if (depth == 0) 982 if (depth == 0)
983 goto out; 983 goto out;
984 984
985 partial = ext4_get_branch(inode, depth, offsets, chain, &err); 985 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
986 986
987 /* Simplest case - block found, no allocation needed */ 987 /* Simplest case - block found, no allocation needed */
988 if (!partial) { 988 if (!partial) {
989 first_block = le32_to_cpu(chain[depth - 1].key); 989 first_block = le32_to_cpu(chain[depth - 1].key);
990 count++; 990 count++;
991 /*map more blocks*/ 991 /*map more blocks*/
992 while (count < map->m_len && count <= blocks_to_boundary) { 992 while (count < map->m_len && count <= blocks_to_boundary) {
993 ext4_fsblk_t blk; 993 ext4_fsblk_t blk;
994 994
995 blk = le32_to_cpu(*(chain[depth-1].p + count)); 995 blk = le32_to_cpu(*(chain[depth-1].p + count));
996 996
997 if (blk == first_block + count) 997 if (blk == first_block + count)
998 count++; 998 count++;
999 else 999 else
1000 break; 1000 break;
1001 } 1001 }
1002 goto got_it; 1002 goto got_it;
1003 } 1003 }
1004 1004
1005 /* Next simple case - plain lookup or failed read of indirect block */ 1005 /* Next simple case - plain lookup or failed read of indirect block */
1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
1007 goto cleanup; 1007 goto cleanup;
1008 1008
1009 /* 1009 /*
1010 * Okay, we need to do block allocation. 1010 * Okay, we need to do block allocation.
1011 */ 1011 */
1012 goal = ext4_find_goal(inode, map->m_lblk, partial); 1012 goal = ext4_find_goal(inode, map->m_lblk, partial);
1013 1013
1014 /* the number of blocks need to allocate for [d,t]indirect blocks */ 1014 /* the number of blocks need to allocate for [d,t]indirect blocks */
1015 indirect_blks = (chain + depth) - partial - 1; 1015 indirect_blks = (chain + depth) - partial - 1;
1016 1016
1017 /* 1017 /*
1018 * Next look up the indirect map to count the total number of 1018 * Next look up the indirect map to count the total number of
1019 * direct blocks to allocate for this branch. 1019 * direct blocks to allocate for this branch.
1020 */ 1020 */
1021 count = ext4_blks_to_allocate(partial, indirect_blks, 1021 count = ext4_blks_to_allocate(partial, indirect_blks,
1022 map->m_len, blocks_to_boundary); 1022 map->m_len, blocks_to_boundary);
1023 /* 1023 /*
1024 * Block out ext4_truncate while we alter the tree 1024 * Block out ext4_truncate while we alter the tree
1025 */ 1025 */
1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, 1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
1027 &count, goal, 1027 &count, goal,
1028 offsets + (partial - chain), partial); 1028 offsets + (partial - chain), partial);
1029 1029
1030 /* 1030 /*
1031 * The ext4_splice_branch call will free and forget any buffers 1031 * The ext4_splice_branch call will free and forget any buffers
1032 * on the new chain if there is a failure, but that risks using 1032 * on the new chain if there is a failure, but that risks using
1033 * up transaction credits, especially for bitmaps where the 1033 * up transaction credits, especially for bitmaps where the
1034 * credits cannot be returned. Can we handle this somehow? We 1034 * credits cannot be returned. Can we handle this somehow? We
1035 * may need to return -EAGAIN upwards in the worst case. --sct 1035 * may need to return -EAGAIN upwards in the worst case. --sct
1036 */ 1036 */
1037 if (!err) 1037 if (!err)
1038 err = ext4_splice_branch(handle, inode, map->m_lblk, 1038 err = ext4_splice_branch(handle, inode, map->m_lblk,
1039 partial, indirect_blks, count); 1039 partial, indirect_blks, count);
1040 if (err) 1040 if (err)
1041 goto cleanup; 1041 goto cleanup;
1042 1042
1043 map->m_flags |= EXT4_MAP_NEW; 1043 map->m_flags |= EXT4_MAP_NEW;
1044 1044
1045 ext4_update_inode_fsync_trans(handle, inode, 1); 1045 ext4_update_inode_fsync_trans(handle, inode, 1);
1046 got_it: 1046 got_it:
1047 map->m_flags |= EXT4_MAP_MAPPED; 1047 map->m_flags |= EXT4_MAP_MAPPED;
1048 map->m_pblk = le32_to_cpu(chain[depth-1].key); 1048 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1049 map->m_len = count; 1049 map->m_len = count;
1050 if (count > blocks_to_boundary) 1050 if (count > blocks_to_boundary)
1051 map->m_flags |= EXT4_MAP_BOUNDARY; 1051 map->m_flags |= EXT4_MAP_BOUNDARY;
1052 err = count; 1052 err = count;
1053 /* Clean up and exit */ 1053 /* Clean up and exit */
1054 partial = chain + depth - 1; /* the whole chain */ 1054 partial = chain + depth - 1; /* the whole chain */
1055 cleanup: 1055 cleanup:
1056 while (partial > chain) { 1056 while (partial > chain) {
1057 BUFFER_TRACE(partial->bh, "call brelse"); 1057 BUFFER_TRACE(partial->bh, "call brelse");
1058 brelse(partial->bh); 1058 brelse(partial->bh);
1059 partial--; 1059 partial--;
1060 } 1060 }
1061 out: 1061 out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, 1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err); 1063 map->m_pblk, map->m_len, err);
1064 return err; 1064 return err;
1065 } 1065 }
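For orientation, a simplified stand-alone sketch of the logical-block-to-path split that ext4_block_to_path() performs for this indirect scheme (assuming 4KiB blocks, i.e. 1024 block numbers per indirect block; the constants are stand-ins and overflow checks are omitted):

#include <stdio.h>

#define NDIR		12	/* direct slots in the inode (EXT4_NDIR_BLOCKS) */
#define PER_BLOCK	1024	/* 4KiB block / 4-byte entries (assumed) */

/* Returns the depth (1..4) and fills offsets[] for a logical block. */
static int block_to_path(unsigned long block, unsigned long offsets[4])
{
	if (block < NDIR) {
		offsets[0] = block;
		return 1;
	}
	block -= NDIR;
	if (block < PER_BLOCK) {
		offsets[0] = NDIR;		/* indirect slot */
		offsets[1] = block;
		return 2;
	}
	block -= PER_BLOCK;
	if (block < (unsigned long)PER_BLOCK * PER_BLOCK) {
		offsets[0] = NDIR + 1;		/* double-indirect slot */
		offsets[1] = block / PER_BLOCK;
		offsets[2] = block % PER_BLOCK;
		return 3;
	}
	block -= (unsigned long)PER_BLOCK * PER_BLOCK;
	offsets[0] = NDIR + 2;			/* triple-indirect slot */
	offsets[1] = block / ((unsigned long)PER_BLOCK * PER_BLOCK);
	offsets[2] = (block / PER_BLOCK) % PER_BLOCK;
	offsets[3] = block % PER_BLOCK;
	return 4;
}

int main(void)
{
	unsigned long off[4];
	int depth = block_to_path(5000, off);	/* double-indirect case */

	printf("depth=%d offsets=%lu,%lu,%lu\n", depth, off[0], off[1], off[2]);
	return 0;
}

The depth returned here corresponds to the depth walked by ext4_get_branch() above: with these assumptions, logical block 5000 resolves through the double-indirect slot (offset 13), entry 3 of that block, and entry 892 of the next.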
1066 1066
1067 #ifdef CONFIG_QUOTA 1067 #ifdef CONFIG_QUOTA
1068 qsize_t *ext4_get_reserved_space(struct inode *inode) 1068 qsize_t *ext4_get_reserved_space(struct inode *inode)
1069 { 1069 {
1070 return &EXT4_I(inode)->i_reserved_quota; 1070 return &EXT4_I(inode)->i_reserved_quota;
1071 } 1071 }
1072 #endif 1072 #endif
1073 1073
1074 /* 1074 /*
1075 * Calculate the number of metadata blocks that need to be reserved 1075 * Calculate the number of metadata blocks that need to be reserved
1076 * to allocate a new block at @lblock for a non-extent-based file 1076 * to allocate a new block at @lblock for a non-extent-based file
1077 */ 1077 */
1078 static int ext4_indirect_calc_metadata_amount(struct inode *inode, 1078 static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1079 sector_t lblock) 1079 sector_t lblock)
1080 { 1080 {
1081 struct ext4_inode_info *ei = EXT4_I(inode); 1081 struct ext4_inode_info *ei = EXT4_I(inode);
1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); 1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1083 int blk_bits; 1083 int blk_bits;
1084 1084
1085 if (lblock < EXT4_NDIR_BLOCKS) 1085 if (lblock < EXT4_NDIR_BLOCKS)
1086 return 0; 1086 return 0;
1087 1087
1088 lblock -= EXT4_NDIR_BLOCKS; 1088 lblock -= EXT4_NDIR_BLOCKS;
1089 1089
1090 if (ei->i_da_metadata_calc_len && 1090 if (ei->i_da_metadata_calc_len &&
1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { 1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1092 ei->i_da_metadata_calc_len++; 1092 ei->i_da_metadata_calc_len++;
1093 return 0; 1093 return 0;
1094 } 1094 }
1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1096 ei->i_da_metadata_calc_len = 1; 1096 ei->i_da_metadata_calc_len = 1;
1097 blk_bits = order_base_2(lblock); 1097 blk_bits = order_base_2(lblock);
1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1099 } 1099 }
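A worked example of the reservation formula above, as a minimal userspace sketch (assuming 4KiB blocks, so EXT4_ADDR_PER_BLOCK_BITS is 10, with order_base_2() open-coded; the names here are stand-ins, not kernel definitions):

#include <stdio.h>

#define ADDR_PER_BLOCK_BITS	10	/* 4KiB blocks assumed */
#define NDIR_BLOCKS		12

/* ceil(log2(n)), matching the kernel's order_base_2() for n >= 1 */
static int order_base_2(unsigned long n)
{
	int bits = 0;

	while ((1UL << bits) < n)
		bits++;
	return bits;
}

int main(void)
{
	unsigned long lblock = 5000 - NDIR_BLOCKS;	/* as in the function above */
	int meta = order_base_2(lblock) / ADDR_PER_BLOCK_BITS + 1;

	/* order_base_2(4988) == 13, 13/10 + 1 == 2: up to one indirect
	 * plus one double-indirect block may need to be reserved. */
	printf("metadata blocks to reserve: %d\n", meta);
	return 0;
}

Two blocks matches the worst case at that offset under these assumptions: a new indirect block plus the double-indirect block it may hang off.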
1100 1100
1101 /* 1101 /*
1102 * Calculate the number of metadata blocks that need to be reserved 1102 * Calculate the number of metadata blocks that need to be reserved
1103 * to allocate a block located at @lblock 1103 * to allocate a block located at @lblock
1104 */ 1104 */
1105 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) 1105 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1106 { 1106 {
1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1108 return ext4_ext_calc_metadata_amount(inode, lblock); 1108 return ext4_ext_calc_metadata_amount(inode, lblock);
1109 1109
1110 return ext4_indirect_calc_metadata_amount(inode, lblock); 1110 return ext4_indirect_calc_metadata_amount(inode, lblock);
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * Called with i_data_sem down, which is important since we can call 1114 * Called with i_data_sem down, which is important since we can call
1115 * ext4_discard_preallocations() from here. 1115 * ext4_discard_preallocations() from here.
1116 */ 1116 */
1117 void ext4_da_update_reserve_space(struct inode *inode, 1117 void ext4_da_update_reserve_space(struct inode *inode,
1118 int used, int quota_claim) 1118 int used, int quota_claim)
1119 { 1119 {
1120 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1120 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1121 struct ext4_inode_info *ei = EXT4_I(inode); 1121 struct ext4_inode_info *ei = EXT4_I(inode);
1122 1122
1123 spin_lock(&ei->i_block_reservation_lock); 1123 spin_lock(&ei->i_block_reservation_lock);
1124 trace_ext4_da_update_reserve_space(inode, used); 1124 trace_ext4_da_update_reserve_space(inode, used);
1125 if (unlikely(used > ei->i_reserved_data_blocks)) { 1125 if (unlikely(used > ei->i_reserved_data_blocks)) {
1126 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 1126 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1127 "with only %d reserved data blocks\n", 1127 "with only %d reserved data blocks\n",
1128 __func__, inode->i_ino, used, 1128 __func__, inode->i_ino, used,
1129 ei->i_reserved_data_blocks); 1129 ei->i_reserved_data_blocks);
1130 WARN_ON(1); 1130 WARN_ON(1);
1131 used = ei->i_reserved_data_blocks; 1131 used = ei->i_reserved_data_blocks;
1132 } 1132 }
1133 1133
1134 /* Update per-inode reservations */ 1134 /* Update per-inode reservations */
1135 ei->i_reserved_data_blocks -= used; 1135 ei->i_reserved_data_blocks -= used;
1136 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1136 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1137 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1137 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1138 used + ei->i_allocated_meta_blocks); 1138 used + ei->i_allocated_meta_blocks);
1139 ei->i_allocated_meta_blocks = 0; 1139 ei->i_allocated_meta_blocks = 0;
1140 1140
1141 if (ei->i_reserved_data_blocks == 0) { 1141 if (ei->i_reserved_data_blocks == 0) {
1142 /* 1142 /*
1143 * We can release all of the reserved metadata blocks 1143 * We can release all of the reserved metadata blocks
1144 * only when we have written all of the delayed 1144 * only when we have written all of the delayed
1145 * allocation blocks. 1145 * allocation blocks.
1146 */ 1146 */
1147 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1147 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1148 ei->i_reserved_meta_blocks); 1148 ei->i_reserved_meta_blocks);
1149 ei->i_reserved_meta_blocks = 0; 1149 ei->i_reserved_meta_blocks = 0;
1150 ei->i_da_metadata_calc_len = 0; 1150 ei->i_da_metadata_calc_len = 0;
1151 } 1151 }
1152 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1152 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1153 1153
1154 /* Update quota subsystem for data blocks */ 1154 /* Update quota subsystem for data blocks */
1155 if (quota_claim) 1155 if (quota_claim)
1156 dquot_claim_block(inode, used); 1156 dquot_claim_block(inode, used);
1157 else { 1157 else {
1158 /* 1158 /*
1159 * We did fallocate with an offset that is already delayed 1159 * We did fallocate with an offset that is already delayed
1160 * allocated. So on delayed allocated writeback we should 1160 * allocated. So on delayed allocated writeback we should
1161 * not re-claim the quota for fallocated blocks. 1161 * not re-claim the quota for fallocated blocks.
1162 */ 1162 */
1163 dquot_release_reservation_block(inode, used); 1163 dquot_release_reservation_block(inode, used);
1164 } 1164 }
1165 1165
1166 /* 1166 /*
1167 * If we have done all the pending block allocations and if 1167 * If we have done all the pending block allocations and if
1168 * there aren't any writers on the inode, we can discard the 1168 * there aren't any writers on the inode, we can discard the
1169 * inode's preallocations. 1169 * inode's preallocations.
1170 */ 1170 */
1171 if ((ei->i_reserved_data_blocks == 0) && 1171 if ((ei->i_reserved_data_blocks == 0) &&
1172 (atomic_read(&inode->i_writecount) == 0)) 1172 (atomic_read(&inode->i_writecount) == 0))
1173 ext4_discard_preallocations(inode); 1173 ext4_discard_preallocations(inode);
1174 } 1174 }
1175 1175
1176 static int __check_block_validity(struct inode *inode, const char *func, 1176 static int __check_block_validity(struct inode *inode, const char *func,
1177 unsigned int line, 1177 unsigned int line,
1178 struct ext4_map_blocks *map) 1178 struct ext4_map_blocks *map)
1179 { 1179 {
1180 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, 1180 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1181 map->m_len)) { 1181 map->m_len)) {
1182 ext4_error_inode(inode, func, line, map->m_pblk, 1182 ext4_error_inode(inode, func, line, map->m_pblk,
1183 "lblock %lu mapped to illegal pblock " 1183 "lblock %lu mapped to illegal pblock "
1184 "(length %d)", (unsigned long) map->m_lblk, 1184 "(length %d)", (unsigned long) map->m_lblk,
1185 map->m_len); 1185 map->m_len);
1186 return -EIO; 1186 return -EIO;
1187 } 1187 }
1188 return 0; 1188 return 0;
1189 } 1189 }
1190 1190
1191 #define check_block_validity(inode, map) \ 1191 #define check_block_validity(inode, map) \
1192 __check_block_validity((inode), __func__, __LINE__, (map)) 1192 __check_block_validity((inode), __func__, __LINE__, (map))
1193 1193
1194 /* 1194 /*
1195 * Return the number of contiguous dirty pages in a given inode 1195 * Return the number of contiguous dirty pages in a given inode
1196 * starting at page frame idx. 1196 * starting at page frame idx.
1197 */ 1197 */
1198 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, 1198 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1199 unsigned int max_pages) 1199 unsigned int max_pages)
1200 { 1200 {
1201 struct address_space *mapping = inode->i_mapping; 1201 struct address_space *mapping = inode->i_mapping;
1202 pgoff_t index; 1202 pgoff_t index;
1203 struct pagevec pvec; 1203 struct pagevec pvec;
1204 pgoff_t num = 0; 1204 pgoff_t num = 0;
1205 int i, nr_pages, done = 0; 1205 int i, nr_pages, done = 0;
1206 1206
1207 if (max_pages == 0) 1207 if (max_pages == 0)
1208 return 0; 1208 return 0;
1209 pagevec_init(&pvec, 0); 1209 pagevec_init(&pvec, 0);
1210 while (!done) { 1210 while (!done) {
1211 index = idx; 1211 index = idx;
1212 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1212 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1213 PAGECACHE_TAG_DIRTY, 1213 PAGECACHE_TAG_DIRTY,
1214 (pgoff_t)PAGEVEC_SIZE); 1214 (pgoff_t)PAGEVEC_SIZE);
1215 if (nr_pages == 0) 1215 if (nr_pages == 0)
1216 break; 1216 break;
1217 for (i = 0; i < nr_pages; i++) { 1217 for (i = 0; i < nr_pages; i++) {
1218 struct page *page = pvec.pages[i]; 1218 struct page *page = pvec.pages[i];
1219 struct buffer_head *bh, *head; 1219 struct buffer_head *bh, *head;
1220 1220
1221 lock_page(page); 1221 lock_page(page);
1222 if (unlikely(page->mapping != mapping) || 1222 if (unlikely(page->mapping != mapping) ||
1223 !PageDirty(page) || 1223 !PageDirty(page) ||
1224 PageWriteback(page) || 1224 PageWriteback(page) ||
1225 page->index != idx) { 1225 page->index != idx) {
1226 done = 1; 1226 done = 1;
1227 unlock_page(page); 1227 unlock_page(page);
1228 break; 1228 break;
1229 } 1229 }
1230 if (page_has_buffers(page)) { 1230 if (page_has_buffers(page)) {
1231 bh = head = page_buffers(page); 1231 bh = head = page_buffers(page);
1232 do { 1232 do {
1233 if (!buffer_delay(bh) && 1233 if (!buffer_delay(bh) &&
1234 !buffer_unwritten(bh)) 1234 !buffer_unwritten(bh))
1235 done = 1; 1235 done = 1;
1236 bh = bh->b_this_page; 1236 bh = bh->b_this_page;
1237 } while (!done && (bh != head)); 1237 } while (!done && (bh != head));
1238 } 1238 }
1239 unlock_page(page); 1239 unlock_page(page);
1240 if (done) 1240 if (done)
1241 break; 1241 break;
1242 idx++; 1242 idx++;
1243 num++; 1243 num++;
1244 if (num >= max_pages) { 1244 if (num >= max_pages) {
1245 done = 1; 1245 done = 1;
1246 break; 1246 break;
1247 } 1247 }
1248 } 1248 }
1249 pagevec_release(&pvec); 1249 pagevec_release(&pvec);
1250 } 1250 }
1251 return num; 1251 return num;
1252 } 1252 }
1253 1253
1254 /* 1254 /*
1255 * The ext4_map_blocks() function tries to look up the requested blocks, 1255 * The ext4_map_blocks() function tries to look up the requested blocks,
1256 * and returns if the blocks are already mapped. 1256 * and returns if the blocks are already mapped.
1257 * 1257 *
1258 * Otherwise it takes the write lock of the i_data_sem, allocates blocks, 1258 * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
1259 * stores the allocated blocks in the result buffer head and marks it 1259 * stores the allocated blocks in the result buffer head and marks it
1260 * mapped. 1260 * mapped.
1261 * 1261 *
1262 * If the file is extent-based, it will call ext4_ext_map_blocks(); 1262 * If the file is extent-based, it will call ext4_ext_map_blocks();
1263 * otherwise, it calls ext4_ind_map_blocks() to handle indirect-mapped 1263 * otherwise, it calls ext4_ind_map_blocks() to handle indirect-mapped
1264 * files. 1264 * files.
1265 * 1265 *
1266 * On success, it returns the number of blocks being mapped or allocated. 1266 * On success, it returns the number of blocks being mapped or allocated.
1267 * If create==0 and the blocks are pre-allocated and uninitialized, 1267 * If create==0 and the blocks are pre-allocated and uninitialized,
1268 * the result buffer head is unmapped. If create==1, it will make sure 1268 * the result buffer head is unmapped. If create==1, it will make sure
1269 * the buffer head is mapped. 1269 * the buffer head is mapped.
1270 * 1270 *
1271 * It returns 0 if plain lookup failed (blocks have not been allocated); in 1271 * It returns 0 if plain lookup failed (blocks have not been allocated); in
1272 * that case, the buffer head is unmapped. 1272 * that case, the buffer head is unmapped.
1273 * 1273 *
1274 * It returns the error in case of allocation failure. 1274 * It returns the error in case of allocation failure.
1275 */ 1275 */
1276 int ext4_map_blocks(handle_t *handle, struct inode *inode, 1276 int ext4_map_blocks(handle_t *handle, struct inode *inode,
1277 struct ext4_map_blocks *map, int flags) 1277 struct ext4_map_blocks *map, int flags)
1278 { 1278 {
1279 int retval; 1279 int retval;
1280 1280
1281 map->m_flags = 0; 1281 map->m_flags = 0;
1282 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," 1282 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1283 "logical block %lu\n", inode->i_ino, flags, map->m_len, 1283 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1284 (unsigned long) map->m_lblk); 1284 (unsigned long) map->m_lblk);
1285 /* 1285 /*
1286 * Try to see if we can get the block without requesting a new 1286 * Try to see if we can get the block without requesting a new
1287 * file system block. 1287 * file system block.
1288 */ 1288 */
1289 down_read((&EXT4_I(inode)->i_data_sem)); 1289 down_read((&EXT4_I(inode)->i_data_sem));
1290 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 1290 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1291 retval = ext4_ext_map_blocks(handle, inode, map, 0); 1291 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1292 } else { 1292 } else {
1293 retval = ext4_ind_map_blocks(handle, inode, map, 0); 1293 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1294 } 1294 }
1295 up_read((&EXT4_I(inode)->i_data_sem)); 1295 up_read((&EXT4_I(inode)->i_data_sem));
1296 1296
1297 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1297 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1298 int ret = check_block_validity(inode, map); 1298 int ret = check_block_validity(inode, map);
1299 if (ret != 0) 1299 if (ret != 0)
1300 return ret; 1300 return ret;
1301 } 1301 }
1302 1302
1303 /* If it is only a block(s) look up */ 1303 /* If it is only a block(s) look up */
1304 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) 1304 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
1305 return retval; 1305 return retval;
1306 1306
1307 /* 1307 /*
1308 * Returns if the blocks have already been allocated. 1308 * Returns if the blocks have already been allocated.
1309 * 1309 *
1310 * Note that if blocks have been preallocated 1310 * Note that if blocks have been preallocated
1311 * ext4_ext_get_block() returns with create = 0 1311 * ext4_ext_get_block() returns with create = 0
1312 * and the buffer head unmapped. 1312 * and the buffer head unmapped.
1313 */ 1313 */
1314 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 1314 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1315 return retval; 1315 return retval;
1316 1316
1317 /* 1317 /*
1318 * When we call get_blocks without the create flag, the 1318 * When we call get_blocks without the create flag, the
1319 * BH_Unwritten flag could have gotten set if the blocks 1319 * BH_Unwritten flag could have gotten set if the blocks
1320 * requested were part of an uninitialized extent. We need to 1320 * requested were part of an uninitialized extent. We need to
1321 * clear this flag now that we are committed to convert all or 1321 * clear this flag now that we are committed to convert all or
1322 * part of the uninitialized extent to be an initialized 1322 * part of the uninitialized extent to be an initialized
1323 * extent. This is because we need to avoid the combination 1323 * extent. This is because we need to avoid the combination
1324 * of BH_Unwritten and BH_Mapped flags being simultaneously 1324 * of BH_Unwritten and BH_Mapped flags being simultaneously
1325 * set on the buffer_head. 1325 * set on the buffer_head.
1326 */ 1326 */
1327 map->m_flags &= ~EXT4_MAP_UNWRITTEN; 1327 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1328 1328
1329 /* 1329 /*
1330 * Allocating new blocks and/or writing to an uninitialized extent 1330 * Allocating new blocks and/or writing to an uninitialized extent
1331 * will possibly result in updating i_data, so we take 1331 * will possibly result in updating i_data, so we take
1332 * the write lock of i_data_sem, and call get_blocks() 1332 * the write lock of i_data_sem, and call get_blocks()
1333 * with create == 1 flag. 1333 * with create == 1 flag.
1334 */ 1334 */
1335 down_write((&EXT4_I(inode)->i_data_sem)); 1335 down_write((&EXT4_I(inode)->i_data_sem));
1336 1336
1337 /* 1337 /*
1338 * if the caller is from the delayed allocation writeout path, 1338 * if the caller is from the delayed allocation writeout path,
1339 * we have already reserved fs blocks for allocation, so 1339 * we have already reserved fs blocks for allocation, so
1340 * let the underlying get_block() function know to 1340 * let the underlying get_block() function know to
1341 * avoid double accounting. 1341 * avoid double accounting.
1342 */ 1342 */
1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1345 /* 1345 /*
1346 * We need to check for EXT4 here because migrate 1346 * We need to check for EXT4 here because migrate
1347 * could have changed the inode type in between 1347 * could have changed the inode type in between
1348 */ 1348 */
1349 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 1349 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1350 retval = ext4_ext_map_blocks(handle, inode, map, flags); 1350 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1351 } else { 1351 } else {
1352 retval = ext4_ind_map_blocks(handle, inode, map, flags); 1352 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1353 1353
1354 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { 1354 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1355 /* 1355 /*
1356 * We allocated new blocks which will result in 1356 * We allocated new blocks which will result in
1357 * i_data's format changing. Force the migrate 1357 * i_data's format changing. Force the migrate
1358 * to fail by clearing migrate flags 1358 * to fail by clearing migrate flags
1359 */ 1359 */
1360 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); 1360 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1361 } 1361 }
1362 1362
1363 /* 1363 /*
1364 * Update reserved blocks/metadata blocks after successful 1364 * Update reserved blocks/metadata blocks after successful
1365 * block allocation which had been deferred till now. We don't 1365 * block allocation which had been deferred till now. We don't
1366 * support fallocate for non-extent files, so we can update 1366 * support fallocate for non-extent files, so we can update
1367 * the reserved space here. 1367 * the reserved space here.
1368 */ 1368 */
1369 if ((retval > 0) && 1369 if ((retval > 0) &&
1370 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 1370 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
1371 ext4_da_update_reserve_space(inode, retval, 1); 1371 ext4_da_update_reserve_space(inode, retval, 1);
1372 } 1372 }
1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1375 1375
1376 up_write((&EXT4_I(inode)->i_data_sem)); 1376 up_write((&EXT4_I(inode)->i_data_sem));
1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1378 int ret = check_block_validity(inode, map); 1378 int ret = check_block_validity(inode, map);
1379 if (ret != 0) 1379 if (ret != 0)
1380 return ret; 1380 return ret;
1381 } 1381 }
1382 return retval; 1382 return retval;
1383 } 1383 }
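As a minimal sketch of the calling convention described above (any names not in this excerpt are assumptions): a delayed-allocation writeout caller opens a handle, fills an ext4_map_blocks descriptor, and passes EXT4_GET_BLOCKS_CREATE together with EXT4_GET_BLOCKS_DELALLOC_RESERVE, much as mpage_da_map_and_submit() does further down.

static int map_delalloc_range_sketch(handle_t *handle, struct inode *inode,
				     ext4_lblk_t lblk, unsigned int len)
{
	struct ext4_map_blocks map;
	int flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_DELALLOC_RESERVE;
	int blks;

	map.m_lblk = lblk;	/* first logical block to map */
	map.m_len = len;	/* number of blocks requested */

	/* Allocates the delayed blocks (or converts uninitialized
	 * extents) and credits the write-time reservation. */
	blks = ext4_map_blocks(handle, inode, &map, flags);
	if (blks <= 0)
		return blks;	/* 0: nothing mapped, <0: error such as -ENOSPC */

	/* map.m_pblk .. map.m_pblk + blks - 1 now back lblk onwards */
	return blks;
}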
1384 1384
1385 /* Maximum number of blocks we map for direct IO at once. */ 1385 /* Maximum number of blocks we map for direct IO at once. */
1386 #define DIO_MAX_BLOCKS 4096 1386 #define DIO_MAX_BLOCKS 4096
1387 1387
1388 static int _ext4_get_block(struct inode *inode, sector_t iblock, 1388 static int _ext4_get_block(struct inode *inode, sector_t iblock,
1389 struct buffer_head *bh, int flags) 1389 struct buffer_head *bh, int flags)
1390 { 1390 {
1391 handle_t *handle = ext4_journal_current_handle(); 1391 handle_t *handle = ext4_journal_current_handle();
1392 struct ext4_map_blocks map; 1392 struct ext4_map_blocks map;
1393 int ret = 0, started = 0; 1393 int ret = 0, started = 0;
1394 int dio_credits; 1394 int dio_credits;
1395 1395
1396 map.m_lblk = iblock; 1396 map.m_lblk = iblock;
1397 map.m_len = bh->b_size >> inode->i_blkbits; 1397 map.m_len = bh->b_size >> inode->i_blkbits;
1398 1398
1399 if (flags && !handle) { 1399 if (flags && !handle) {
1400 /* Direct IO write... */ 1400 /* Direct IO write... */
1401 if (map.m_len > DIO_MAX_BLOCKS) 1401 if (map.m_len > DIO_MAX_BLOCKS)
1402 map.m_len = DIO_MAX_BLOCKS; 1402 map.m_len = DIO_MAX_BLOCKS;
1403 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); 1403 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1404 handle = ext4_journal_start(inode, dio_credits); 1404 handle = ext4_journal_start(inode, dio_credits);
1405 if (IS_ERR(handle)) { 1405 if (IS_ERR(handle)) {
1406 ret = PTR_ERR(handle); 1406 ret = PTR_ERR(handle);
1407 return ret; 1407 return ret;
1408 } 1408 }
1409 started = 1; 1409 started = 1;
1410 } 1410 }
1411 1411
1412 ret = ext4_map_blocks(handle, inode, &map, flags); 1412 ret = ext4_map_blocks(handle, inode, &map, flags);
1413 if (ret > 0) { 1413 if (ret > 0) {
1414 map_bh(bh, inode->i_sb, map.m_pblk); 1414 map_bh(bh, inode->i_sb, map.m_pblk);
1415 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 1415 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1416 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 1416 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1417 ret = 0; 1417 ret = 0;
1418 } 1418 }
1419 if (started) 1419 if (started)
1420 ext4_journal_stop(handle); 1420 ext4_journal_stop(handle);
1421 return ret; 1421 return ret;
1422 } 1422 }
1423 1423
1424 int ext4_get_block(struct inode *inode, sector_t iblock, 1424 int ext4_get_block(struct inode *inode, sector_t iblock,
1425 struct buffer_head *bh, int create) 1425 struct buffer_head *bh, int create)
1426 { 1426 {
1427 return _ext4_get_block(inode, iblock, bh, 1427 return _ext4_get_block(inode, iblock, bh,
1428 create ? EXT4_GET_BLOCKS_CREATE : 0); 1428 create ? EXT4_GET_BLOCKS_CREATE : 0);
1429 } 1429 }
1430 1430
1431 /* 1431 /*
1432 * `handle' can be NULL if create is zero 1432 * `handle' can be NULL if create is zero
1433 */ 1433 */
1434 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1434 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1435 ext4_lblk_t block, int create, int *errp) 1435 ext4_lblk_t block, int create, int *errp)
1436 { 1436 {
1437 struct ext4_map_blocks map; 1437 struct ext4_map_blocks map;
1438 struct buffer_head *bh; 1438 struct buffer_head *bh;
1439 int fatal = 0, err; 1439 int fatal = 0, err;
1440 1440
1441 J_ASSERT(handle != NULL || create == 0); 1441 J_ASSERT(handle != NULL || create == 0);
1442 1442
1443 map.m_lblk = block; 1443 map.m_lblk = block;
1444 map.m_len = 1; 1444 map.m_len = 1;
1445 err = ext4_map_blocks(handle, inode, &map, 1445 err = ext4_map_blocks(handle, inode, &map,
1446 create ? EXT4_GET_BLOCKS_CREATE : 0); 1446 create ? EXT4_GET_BLOCKS_CREATE : 0);
1447 1447
1448 if (err < 0) 1448 if (err < 0)
1449 *errp = err; 1449 *errp = err;
1450 if (err <= 0) 1450 if (err <= 0)
1451 return NULL; 1451 return NULL;
1452 *errp = 0; 1452 *errp = 0;
1453 1453
1454 bh = sb_getblk(inode->i_sb, map.m_pblk); 1454 bh = sb_getblk(inode->i_sb, map.m_pblk);
1455 if (!bh) { 1455 if (!bh) {
1456 *errp = -EIO; 1456 *errp = -EIO;
1457 return NULL; 1457 return NULL;
1458 } 1458 }
1459 if (map.m_flags & EXT4_MAP_NEW) { 1459 if (map.m_flags & EXT4_MAP_NEW) {
1460 J_ASSERT(create != 0); 1460 J_ASSERT(create != 0);
1461 J_ASSERT(handle != NULL); 1461 J_ASSERT(handle != NULL);
1462 1462
1463 /* 1463 /*
1464 * Now that we do not always journal data, we should 1464 * Now that we do not always journal data, we should
1465 * keep in mind whether this should always journal the 1465 * keep in mind whether this should always journal the
1466 * new buffer as metadata. For now, regular file 1466 * new buffer as metadata. For now, regular file
1467 * writes use ext4_get_block instead, so it's not a 1467 * writes use ext4_get_block instead, so it's not a
1468 * problem. 1468 * problem.
1469 */ 1469 */
1470 lock_buffer(bh); 1470 lock_buffer(bh);
1471 BUFFER_TRACE(bh, "call get_create_access"); 1471 BUFFER_TRACE(bh, "call get_create_access");
1472 fatal = ext4_journal_get_create_access(handle, bh); 1472 fatal = ext4_journal_get_create_access(handle, bh);
1473 if (!fatal && !buffer_uptodate(bh)) { 1473 if (!fatal && !buffer_uptodate(bh)) {
1474 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1474 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1475 set_buffer_uptodate(bh); 1475 set_buffer_uptodate(bh);
1476 } 1476 }
1477 unlock_buffer(bh); 1477 unlock_buffer(bh);
1478 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1478 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1479 err = ext4_handle_dirty_metadata(handle, inode, bh); 1479 err = ext4_handle_dirty_metadata(handle, inode, bh);
1480 if (!fatal) 1480 if (!fatal)
1481 fatal = err; 1481 fatal = err;
1482 } else { 1482 } else {
1483 BUFFER_TRACE(bh, "not a new buffer"); 1483 BUFFER_TRACE(bh, "not a new buffer");
1484 } 1484 }
1485 if (fatal) { 1485 if (fatal) {
1486 *errp = fatal; 1486 *errp = fatal;
1487 brelse(bh); 1487 brelse(bh);
1488 bh = NULL; 1488 bh = NULL;
1489 } 1489 }
1490 return bh; 1490 return bh;
1491 } 1491 }
1492 1492
1493 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1493 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1494 ext4_lblk_t block, int create, int *err) 1494 ext4_lblk_t block, int create, int *err)
1495 { 1495 {
1496 struct buffer_head *bh; 1496 struct buffer_head *bh;
1497 1497
1498 bh = ext4_getblk(handle, inode, block, create, err); 1498 bh = ext4_getblk(handle, inode, block, create, err);
1499 if (!bh) 1499 if (!bh)
1500 return bh; 1500 return bh;
1501 if (buffer_uptodate(bh)) 1501 if (buffer_uptodate(bh))
1502 return bh; 1502 return bh;
1503 ll_rw_block(READ_META, 1, &bh); 1503 ll_rw_block(READ_META, 1, &bh);
1504 wait_on_buffer(bh); 1504 wait_on_buffer(bh);
1505 if (buffer_uptodate(bh)) 1505 if (buffer_uptodate(bh))
1506 return bh; 1506 return bh;
1507 put_bh(bh); 1507 put_bh(bh);
1508 *err = -EIO; 1508 *err = -EIO;
1509 return NULL; 1509 return NULL;
1510 } 1510 }
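Since `handle' may be NULL when create == 0, a read-only metadata lookup needs no transaction at all. A minimal sketch (the wrapper name is hypothetical):

static int peek_block_sketch(struct inode *inode, ext4_lblk_t blocknr)
{
	struct buffer_head *bh;
	int err = 0;

	/* create == 0, so a NULL handle is fine per the comment
	 * above ext4_getblk(). */
	bh = ext4_bread(NULL, inode, blocknr, 0, &err);
	if (!bh)
		return err;	/* 0 if the block is a hole, <0 on I/O error */

	/* ... inspect bh->b_data ... */
	brelse(bh);
	return 0;
}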
1511 1511
1512 static int walk_page_buffers(handle_t *handle, 1512 static int walk_page_buffers(handle_t *handle,
1513 struct buffer_head *head, 1513 struct buffer_head *head,
1514 unsigned from, 1514 unsigned from,
1515 unsigned to, 1515 unsigned to,
1516 int *partial, 1516 int *partial,
1517 int (*fn)(handle_t *handle, 1517 int (*fn)(handle_t *handle,
1518 struct buffer_head *bh)) 1518 struct buffer_head *bh))
1519 { 1519 {
1520 struct buffer_head *bh; 1520 struct buffer_head *bh;
1521 unsigned block_start, block_end; 1521 unsigned block_start, block_end;
1522 unsigned blocksize = head->b_size; 1522 unsigned blocksize = head->b_size;
1523 int err, ret = 0; 1523 int err, ret = 0;
1524 struct buffer_head *next; 1524 struct buffer_head *next;
1525 1525
1526 for (bh = head, block_start = 0; 1526 for (bh = head, block_start = 0;
1527 ret == 0 && (bh != head || !block_start); 1527 ret == 0 && (bh != head || !block_start);
1528 block_start = block_end, bh = next) { 1528 block_start = block_end, bh = next) {
1529 next = bh->b_this_page; 1529 next = bh->b_this_page;
1530 block_end = block_start + blocksize; 1530 block_end = block_start + blocksize;
1531 if (block_end <= from || block_start >= to) { 1531 if (block_end <= from || block_start >= to) {
1532 if (partial && !buffer_uptodate(bh)) 1532 if (partial && !buffer_uptodate(bh))
1533 *partial = 1; 1533 *partial = 1;
1534 continue; 1534 continue;
1535 } 1535 }
1536 err = (*fn)(handle, bh); 1536 err = (*fn)(handle, bh);
1537 if (!ret) 1537 if (!ret)
1538 ret = err; 1538 ret = err;
1539 } 1539 }
1540 return ret; 1540 return ret;
1541 } 1541 }
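walk_page_buffers() is a bounded iterator over a page's circular buffer list: callers supply a per-buffer callback with the signature int (*fn)(handle_t *, struct buffer_head *), as ext4_write_begin() and ext4_journalled_write_end() do below. A purely hypothetical callback, shown only to illustrate the contract:

static int count_unmapped_bh(handle_t *handle, struct buffer_head *bh)
{
	/* Returning non-zero stops the walk and is propagated as the
	 * return value of walk_page_buffers(). */
	return buffer_mapped(bh) ? 0 : -EIO;
}

	/* 'partial' is set if any buffer outside [from, to) is not uptodate. */
	err = walk_page_buffers(handle, page_buffers(page), from, to,
				&partial, count_unmapped_bh);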
1542 1542
1543 /* 1543 /*
1544 * To preserve ordering, it is essential that the hole instantiation and 1544 * To preserve ordering, it is essential that the hole instantiation and
1545 * the data write be encapsulated in a single transaction. We cannot 1545 * the data write be encapsulated in a single transaction. We cannot
1546 * close off a transaction and start a new one between the ext4_get_block() 1546 * close off a transaction and start a new one between the ext4_get_block()
1547 * and the commit_write(). So doing the jbd2_journal_start at the start of 1547 * and the commit_write(). So doing the jbd2_journal_start at the start of
1548 * prepare_write() is the right place. 1548 * prepare_write() is the right place.
1549 * 1549 *
1550 * Also, this function can nest inside ext4_writepage() -> 1550 * Also, this function can nest inside ext4_writepage() ->
1551 * block_write_full_page(). In that case, we *know* that ext4_writepage() 1551 * block_write_full_page(). In that case, we *know* that ext4_writepage()
1552 * has generated enough buffer credits to do the whole page. So we won't 1552 * has generated enough buffer credits to do the whole page. So we won't
1553 * block on the journal in that case, which is good, because the caller may 1553 * block on the journal in that case, which is good, because the caller may
1554 * be PF_MEMALLOC. 1554 * be PF_MEMALLOC.
1555 * 1555 *
1556 * By accident, ext4 can be reentered when a transaction is open via 1556 * By accident, ext4 can be reentered when a transaction is open via
1557 * quota file writes. If we were to commit the transaction while thus 1557 * quota file writes. If we were to commit the transaction while thus
1558 * reentered, there can be a deadlock - we would be holding a quota 1558 * reentered, there can be a deadlock - we would be holding a quota
1559 * lock, and the commit would never complete if another thread had a 1559 * lock, and the commit would never complete if another thread had a
1560 * transaction open and was blocking on the quota lock - a ranking 1560 * transaction open and was blocking on the quota lock - a ranking
1561 * violation. 1561 * violation.
1562 * 1562 *
1563 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start 1563 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1564 * will _not_ run commit under these circumstances because handle->h_ref 1564 * will _not_ run commit under these circumstances because handle->h_ref
1565 * is elevated. We'll still have enough credits for the tiny quotafile 1565 * is elevated. We'll still have enough credits for the tiny quotafile
1566 * write. 1566 * write.
1567 */ 1567 */
1568 static int do_journal_get_write_access(handle_t *handle, 1568 static int do_journal_get_write_access(handle_t *handle,
1569 struct buffer_head *bh) 1569 struct buffer_head *bh)
1570 { 1570 {
1571 int dirty = buffer_dirty(bh); 1571 int dirty = buffer_dirty(bh);
1572 int ret; 1572 int ret;
1573 1573
1574 if (!buffer_mapped(bh) || buffer_freed(bh)) 1574 if (!buffer_mapped(bh) || buffer_freed(bh))
1575 return 0; 1575 return 0;
1576 /* 1576 /*
1577 * __block_write_begin() could have dirtied some buffers. Clean 1577 * __block_write_begin() could have dirtied some buffers. Clean
1578 * the dirty bit as jbd2_journal_get_write_access() could complain 1578 * the dirty bit as jbd2_journal_get_write_access() could complain
1579 * otherwise about fs integrity issues. Setting of the dirty bit 1579 * otherwise about fs integrity issues. Setting of the dirty bit
1580 * by __block_write_begin() isn't a real problem here as we clear 1580 * by __block_write_begin() isn't a real problem here as we clear
1581 * the bit before releasing a page lock and thus writeback cannot 1581 * the bit before releasing a page lock and thus writeback cannot
1582 * ever write the buffer. 1582 * ever write the buffer.
1583 */ 1583 */
1584 if (dirty) 1584 if (dirty)
1585 clear_buffer_dirty(bh); 1585 clear_buffer_dirty(bh);
1586 ret = ext4_journal_get_write_access(handle, bh); 1586 ret = ext4_journal_get_write_access(handle, bh);
1587 if (!ret && dirty) 1587 if (!ret && dirty)
1588 ret = ext4_handle_dirty_metadata(handle, NULL, bh); 1588 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1589 return ret; 1589 return ret;
1590 } 1590 }
1591 1591
1592 /* 1592 /*
1593 * Truncate blocks that were not used by write. We have to truncate the 1593 * Truncate blocks that were not used by write. We have to truncate the
1594 * pagecache as well so that corresponding buffers get properly unmapped. 1594 * pagecache as well so that corresponding buffers get properly unmapped.
1595 */ 1595 */
1596 static void ext4_truncate_failed_write(struct inode *inode) 1596 static void ext4_truncate_failed_write(struct inode *inode)
1597 { 1597 {
1598 truncate_inode_pages(inode->i_mapping, inode->i_size); 1598 truncate_inode_pages(inode->i_mapping, inode->i_size);
1599 ext4_truncate(inode); 1599 ext4_truncate(inode);
1600 } 1600 }
1601 1601
1602 static int ext4_get_block_write(struct inode *inode, sector_t iblock, 1602 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1603 struct buffer_head *bh_result, int create); 1603 struct buffer_head *bh_result, int create);
1604 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1604 static int ext4_write_begin(struct file *file, struct address_space *mapping,
1605 loff_t pos, unsigned len, unsigned flags, 1605 loff_t pos, unsigned len, unsigned flags,
1606 struct page **pagep, void **fsdata) 1606 struct page **pagep, void **fsdata)
1607 { 1607 {
1608 struct inode *inode = mapping->host; 1608 struct inode *inode = mapping->host;
1609 int ret, needed_blocks; 1609 int ret, needed_blocks;
1610 handle_t *handle; 1610 handle_t *handle;
1611 int retries = 0; 1611 int retries = 0;
1612 struct page *page; 1612 struct page *page;
1613 pgoff_t index; 1613 pgoff_t index;
1614 unsigned from, to; 1614 unsigned from, to;
1615 1615
1616 trace_ext4_write_begin(inode, pos, len, flags); 1616 trace_ext4_write_begin(inode, pos, len, flags);
1617 /* 1617 /*
1618 * Reserve one more block for addition to the orphan list in case 1618 * Reserve one more block for addition to the orphan list in case
1619 * we allocate blocks but write fails for some reason 1619 * we allocate blocks but write fails for some reason
1620 */ 1620 */
1621 needed_blocks = ext4_writepage_trans_blocks(inode) + 1; 1621 needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1622 index = pos >> PAGE_CACHE_SHIFT; 1622 index = pos >> PAGE_CACHE_SHIFT;
1623 from = pos & (PAGE_CACHE_SIZE - 1); 1623 from = pos & (PAGE_CACHE_SIZE - 1);
1624 to = from + len; 1624 to = from + len;
1625 1625
1626 retry: 1626 retry:
1627 handle = ext4_journal_start(inode, needed_blocks); 1627 handle = ext4_journal_start(inode, needed_blocks);
1628 if (IS_ERR(handle)) { 1628 if (IS_ERR(handle)) {
1629 ret = PTR_ERR(handle); 1629 ret = PTR_ERR(handle);
1630 goto out; 1630 goto out;
1631 } 1631 }
1632 1632
1633 /* We cannot recurse into the filesystem as the transaction is already 1633 /* We cannot recurse into the filesystem as the transaction is already
1634 * started */ 1634 * started */
1635 flags |= AOP_FLAG_NOFS; 1635 flags |= AOP_FLAG_NOFS;
1636 1636
1637 page = grab_cache_page_write_begin(mapping, index, flags); 1637 page = grab_cache_page_write_begin(mapping, index, flags);
1638 if (!page) { 1638 if (!page) {
1639 ext4_journal_stop(handle); 1639 ext4_journal_stop(handle);
1640 ret = -ENOMEM; 1640 ret = -ENOMEM;
1641 goto out; 1641 goto out;
1642 } 1642 }
1643 *pagep = page; 1643 *pagep = page;
1644 1644
1645 if (ext4_should_dioread_nolock(inode)) 1645 if (ext4_should_dioread_nolock(inode))
1646 ret = __block_write_begin(page, pos, len, ext4_get_block_write); 1646 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
1647 else 1647 else
1648 ret = __block_write_begin(page, pos, len, ext4_get_block); 1648 ret = __block_write_begin(page, pos, len, ext4_get_block);
1649 1649
1650 if (!ret && ext4_should_journal_data(inode)) { 1650 if (!ret && ext4_should_journal_data(inode)) {
1651 ret = walk_page_buffers(handle, page_buffers(page), 1651 ret = walk_page_buffers(handle, page_buffers(page),
1652 from, to, NULL, do_journal_get_write_access); 1652 from, to, NULL, do_journal_get_write_access);
1653 } 1653 }
1654 1654
1655 if (ret) { 1655 if (ret) {
1656 unlock_page(page); 1656 unlock_page(page);
1657 page_cache_release(page); 1657 page_cache_release(page);
1658 /* 1658 /*
1659 * __block_write_begin may have instantiated a few blocks 1659 * __block_write_begin may have instantiated a few blocks
1660 * outside i_size. Trim these off again. Don't need 1660 * outside i_size. Trim these off again. Don't need
1661 * i_size_read because we hold i_mutex. 1661 * i_size_read because we hold i_mutex.
1662 * 1662 *
1663 * Add inode to orphan list in case we crash before 1663 * Add inode to orphan list in case we crash before
1664 * truncate finishes 1664 * truncate finishes
1665 */ 1665 */
1666 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1666 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1667 ext4_orphan_add(handle, inode); 1667 ext4_orphan_add(handle, inode);
1668 1668
1669 ext4_journal_stop(handle); 1669 ext4_journal_stop(handle);
1670 if (pos + len > inode->i_size) { 1670 if (pos + len > inode->i_size) {
1671 ext4_truncate_failed_write(inode); 1671 ext4_truncate_failed_write(inode);
1672 /* 1672 /*
1673 * If truncate failed early the inode might 1673 * If truncate failed early the inode might
1674 * still be on the orphan list; we need to 1674 * still be on the orphan list; we need to
1675 * make sure the inode is removed from the 1675 * make sure the inode is removed from the
1676 * orphan list in that case. 1676 * orphan list in that case.
1677 */ 1677 */
1678 if (inode->i_nlink) 1678 if (inode->i_nlink)
1679 ext4_orphan_del(NULL, inode); 1679 ext4_orphan_del(NULL, inode);
1680 } 1680 }
1681 } 1681 }
1682 1682
1683 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1683 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1684 goto retry; 1684 goto retry;
1685 out: 1685 out:
1686 return ret; 1686 return ret;
1687 } 1687 }
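The retry: label above implements a common ext4 idiom: an -ENOSPC from block allocation is retried for as long as ext4_should_retry_alloc() allows, giving the journal a chance to commit and free space. Stripped to its core (do_allocation() is a hypothetical stand-in for whichever step may fail):

static int alloc_with_retry_sketch(struct inode *inode)
{
	int retries = 0;
	int ret;

retry:
	ret = do_allocation(inode);	/* hypothetical allocation step */
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	return ret;
}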
1688 1688
1689 /* For write_end() in data=journal mode */ 1689 /* For write_end() in data=journal mode */
1690 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1690 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1691 { 1691 {
1692 if (!buffer_mapped(bh) || buffer_freed(bh)) 1692 if (!buffer_mapped(bh) || buffer_freed(bh))
1693 return 0; 1693 return 0;
1694 set_buffer_uptodate(bh); 1694 set_buffer_uptodate(bh);
1695 return ext4_handle_dirty_metadata(handle, NULL, bh); 1695 return ext4_handle_dirty_metadata(handle, NULL, bh);
1696 } 1696 }
1697 1697
1698 static int ext4_generic_write_end(struct file *file, 1698 static int ext4_generic_write_end(struct file *file,
1699 struct address_space *mapping, 1699 struct address_space *mapping,
1700 loff_t pos, unsigned len, unsigned copied, 1700 loff_t pos, unsigned len, unsigned copied,
1701 struct page *page, void *fsdata) 1701 struct page *page, void *fsdata)
1702 { 1702 {
1703 int i_size_changed = 0; 1703 int i_size_changed = 0;
1704 struct inode *inode = mapping->host; 1704 struct inode *inode = mapping->host;
1705 handle_t *handle = ext4_journal_current_handle(); 1705 handle_t *handle = ext4_journal_current_handle();
1706 1706
1707 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1707 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1708 1708
1709 /* 1709 /*
1710 * No need to use i_size_read() here, the i_size 1710 * No need to use i_size_read() here, the i_size
1711 * cannot change under us because we hold i_mutex. 1711 * cannot change under us because we hold i_mutex.
1712 * 1712 *
1713 * But it's important to update i_size while still holding page lock: 1713 * But it's important to update i_size while still holding page lock:
1714 * page writeout could otherwise come in and zero beyond i_size. 1714 * page writeout could otherwise come in and zero beyond i_size.
1715 */ 1715 */
1716 if (pos + copied > inode->i_size) { 1716 if (pos + copied > inode->i_size) {
1717 i_size_write(inode, pos + copied); 1717 i_size_write(inode, pos + copied);
1718 i_size_changed = 1; 1718 i_size_changed = 1;
1719 } 1719 }
1720 1720
1721 if (pos + copied > EXT4_I(inode)->i_disksize) { 1721 if (pos + copied > EXT4_I(inode)->i_disksize) {
1722 /* We need to mark inode dirty even if 1722 /* We need to mark inode dirty even if
1723 * new_i_size is less than inode->i_size 1723 * new_i_size is less than inode->i_size
1724 * but greater than i_disksize (hint: delalloc). 1724 * but greater than i_disksize (hint: delalloc).
1725 */ 1725 */
1726 ext4_update_i_disksize(inode, (pos + copied)); 1726 ext4_update_i_disksize(inode, (pos + copied));
1727 i_size_changed = 1; 1727 i_size_changed = 1;
1728 } 1728 }
1729 unlock_page(page); 1729 unlock_page(page);
1730 page_cache_release(page); 1730 page_cache_release(page);
1731 1731
1732 /* 1732 /*
1733 * Don't mark the inode dirty under page lock. First, it unnecessarily 1733 * Don't mark the inode dirty under page lock. First, it unnecessarily
1734 * makes the holding time of page lock longer. Second, it forces lock 1734 * makes the holding time of page lock longer. Second, it forces lock
1735 * ordering of page lock and transaction start for journaling 1735 * ordering of page lock and transaction start for journaling
1736 * filesystems. 1736 * filesystems.
1737 */ 1737 */
1738 if (i_size_changed) 1738 if (i_size_changed)
1739 ext4_mark_inode_dirty(handle, inode); 1739 ext4_mark_inode_dirty(handle, inode);
1740 1740
1741 return copied; 1741 return copied;
1742 } 1742 }
1743 1743
1744 /* 1744 /*
1745 * We need to pick up the new inode size which generic_commit_write gave us 1745 * We need to pick up the new inode size which generic_commit_write gave us
1746 * `file' can be NULL - eg, when called from page_symlink(). 1746 * `file' can be NULL - eg, when called from page_symlink().
1747 * 1747 *
1748 * ext4 never places buffers on inode->i_mapping->private_list; metadata 1748 * ext4 never places buffers on inode->i_mapping->private_list; metadata
1749 * buffers are managed internally. 1749 * buffers are managed internally.
1750 */ 1750 */
1751 static int ext4_ordered_write_end(struct file *file, 1751 static int ext4_ordered_write_end(struct file *file,
1752 struct address_space *mapping, 1752 struct address_space *mapping,
1753 loff_t pos, unsigned len, unsigned copied, 1753 loff_t pos, unsigned len, unsigned copied,
1754 struct page *page, void *fsdata) 1754 struct page *page, void *fsdata)
1755 { 1755 {
1756 handle_t *handle = ext4_journal_current_handle(); 1756 handle_t *handle = ext4_journal_current_handle();
1757 struct inode *inode = mapping->host; 1757 struct inode *inode = mapping->host;
1758 int ret = 0, ret2; 1758 int ret = 0, ret2;
1759 1759
1760 trace_ext4_ordered_write_end(inode, pos, len, copied); 1760 trace_ext4_ordered_write_end(inode, pos, len, copied);
1761 ret = ext4_jbd2_file_inode(handle, inode); 1761 ret = ext4_jbd2_file_inode(handle, inode);
1762 1762
1763 if (ret == 0) { 1763 if (ret == 0) {
1764 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1764 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1765 page, fsdata); 1765 page, fsdata);
1766 copied = ret2; 1766 copied = ret2;
1767 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1767 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1768 /* If we have allocated more blocks than we 1768 /* If we have allocated more blocks than we
1769 * copied, we will have blocks allocated outside 1769 * copied, we will have blocks allocated outside
1770 * inode->i_size, so truncate them. 1770 * inode->i_size, so truncate them.
1771 */ 1771 */
1772 ext4_orphan_add(handle, inode); 1772 ext4_orphan_add(handle, inode);
1773 if (ret2 < 0) 1773 if (ret2 < 0)
1774 ret = ret2; 1774 ret = ret2;
1775 } 1775 }
1776 ret2 = ext4_journal_stop(handle); 1776 ret2 = ext4_journal_stop(handle);
1777 if (!ret) 1777 if (!ret)
1778 ret = ret2; 1778 ret = ret2;
1779 1779
1780 if (pos + len > inode->i_size) { 1780 if (pos + len > inode->i_size) {
1781 ext4_truncate_failed_write(inode); 1781 ext4_truncate_failed_write(inode);
1782 /* 1782 /*
1783 * If truncate failed early the inode might still be 1783 * If truncate failed early the inode might still be
1784 * on the orphan list; we need to make sure the inode 1784 * on the orphan list; we need to make sure the inode
1785 * is removed from the orphan list in that case. 1785 * is removed from the orphan list in that case.
1786 */ 1786 */
1787 if (inode->i_nlink) 1787 if (inode->i_nlink)
1788 ext4_orphan_del(NULL, inode); 1788 ext4_orphan_del(NULL, inode);
1789 } 1789 }
1790 1790
1791 1791
1792 return ret ? ret : copied; 1792 return ret ? ret : copied;
1793 } 1793 }
1794 1794
1795 static int ext4_writeback_write_end(struct file *file, 1795 static int ext4_writeback_write_end(struct file *file,
1796 struct address_space *mapping, 1796 struct address_space *mapping,
1797 loff_t pos, unsigned len, unsigned copied, 1797 loff_t pos, unsigned len, unsigned copied,
1798 struct page *page, void *fsdata) 1798 struct page *page, void *fsdata)
1799 { 1799 {
1800 handle_t *handle = ext4_journal_current_handle(); 1800 handle_t *handle = ext4_journal_current_handle();
1801 struct inode *inode = mapping->host; 1801 struct inode *inode = mapping->host;
1802 int ret = 0, ret2; 1802 int ret = 0, ret2;
1803 1803
1804 trace_ext4_writeback_write_end(inode, pos, len, copied); 1804 trace_ext4_writeback_write_end(inode, pos, len, copied);
1805 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1805 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1806 page, fsdata); 1806 page, fsdata);
1807 copied = ret2; 1807 copied = ret2;
1808 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1808 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1809 /* If we have allocated more blocks than we 1809 /* If we have allocated more blocks than we
1810 * copied, we will have blocks allocated outside 1810 * copied, we will have blocks allocated outside
1811 * inode->i_size, so truncate them. 1811 * inode->i_size, so truncate them.
1812 */ 1812 */
1813 ext4_orphan_add(handle, inode); 1813 ext4_orphan_add(handle, inode);
1814 1814
1815 if (ret2 < 0) 1815 if (ret2 < 0)
1816 ret = ret2; 1816 ret = ret2;
1817 1817
1818 ret2 = ext4_journal_stop(handle); 1818 ret2 = ext4_journal_stop(handle);
1819 if (!ret) 1819 if (!ret)
1820 ret = ret2; 1820 ret = ret2;
1821 1821
1822 if (pos + len > inode->i_size) { 1822 if (pos + len > inode->i_size) {
1823 ext4_truncate_failed_write(inode); 1823 ext4_truncate_failed_write(inode);
1824 /* 1824 /*
1825 * If truncate failed early the inode might still be 1825 * If truncate failed early the inode might still be
1826 * on the orphan list; we need to make sure the inode 1826 * on the orphan list; we need to make sure the inode
1827 * is removed from the orphan list in that case. 1827 * is removed from the orphan list in that case.
1828 */ 1828 */
1829 if (inode->i_nlink) 1829 if (inode->i_nlink)
1830 ext4_orphan_del(NULL, inode); 1830 ext4_orphan_del(NULL, inode);
1831 } 1831 }
1832 1832
1833 return ret ? ret : copied; 1833 return ret ? ret : copied;
1834 } 1834 }
1835 1835
1836 static int ext4_journalled_write_end(struct file *file, 1836 static int ext4_journalled_write_end(struct file *file,
1837 struct address_space *mapping, 1837 struct address_space *mapping,
1838 loff_t pos, unsigned len, unsigned copied, 1838 loff_t pos, unsigned len, unsigned copied,
1839 struct page *page, void *fsdata) 1839 struct page *page, void *fsdata)
1840 { 1840 {
1841 handle_t *handle = ext4_journal_current_handle(); 1841 handle_t *handle = ext4_journal_current_handle();
1842 struct inode *inode = mapping->host; 1842 struct inode *inode = mapping->host;
1843 int ret = 0, ret2; 1843 int ret = 0, ret2;
1844 int partial = 0; 1844 int partial = 0;
1845 unsigned from, to; 1845 unsigned from, to;
1846 loff_t new_i_size; 1846 loff_t new_i_size;
1847 1847
1848 trace_ext4_journalled_write_end(inode, pos, len, copied); 1848 trace_ext4_journalled_write_end(inode, pos, len, copied);
1849 from = pos & (PAGE_CACHE_SIZE - 1); 1849 from = pos & (PAGE_CACHE_SIZE - 1);
1850 to = from + len; 1850 to = from + len;
1851 1851
1852 if (copied < len) { 1852 if (copied < len) {
1853 if (!PageUptodate(page)) 1853 if (!PageUptodate(page))
1854 copied = 0; 1854 copied = 0;
1855 page_zero_new_buffers(page, from+copied, to); 1855 page_zero_new_buffers(page, from+copied, to);
1856 } 1856 }
1857 1857
1858 ret = walk_page_buffers(handle, page_buffers(page), from, 1858 ret = walk_page_buffers(handle, page_buffers(page), from,
1859 to, &partial, write_end_fn); 1859 to, &partial, write_end_fn);
1860 if (!partial) 1860 if (!partial)
1861 SetPageUptodate(page); 1861 SetPageUptodate(page);
1862 new_i_size = pos + copied; 1862 new_i_size = pos + copied;
1863 if (new_i_size > inode->i_size) 1863 if (new_i_size > inode->i_size)
1864 i_size_write(inode, pos+copied); 1864 i_size_write(inode, pos+copied);
1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1866 if (new_i_size > EXT4_I(inode)->i_disksize) { 1866 if (new_i_size > EXT4_I(inode)->i_disksize) {
1867 ext4_update_i_disksize(inode, new_i_size); 1867 ext4_update_i_disksize(inode, new_i_size);
1868 ret2 = ext4_mark_inode_dirty(handle, inode); 1868 ret2 = ext4_mark_inode_dirty(handle, inode);
1869 if (!ret) 1869 if (!ret)
1870 ret = ret2; 1870 ret = ret2;
1871 } 1871 }
1872 1872
1873 unlock_page(page); 1873 unlock_page(page);
1874 page_cache_release(page); 1874 page_cache_release(page);
1875 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1875 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1876 /* If we have allocated more blocks than we 1876 /* If we have allocated more blocks than we
1877 * copied, we will have blocks allocated outside 1877 * copied, we will have blocks allocated outside
1878 * inode->i_size, so truncate them. 1878 * inode->i_size, so truncate them.
1879 */ 1879 */
1880 ext4_orphan_add(handle, inode); 1880 ext4_orphan_add(handle, inode);
1881 1881
1882 ret2 = ext4_journal_stop(handle); 1882 ret2 = ext4_journal_stop(handle);
1883 if (!ret) 1883 if (!ret)
1884 ret = ret2; 1884 ret = ret2;
1885 if (pos + len > inode->i_size) { 1885 if (pos + len > inode->i_size) {
1886 ext4_truncate_failed_write(inode); 1886 ext4_truncate_failed_write(inode);
1887 /* 1887 /*
1888 * If truncate failed early the inode might still be 1888 * If truncate failed early the inode might still be
1889 * on the orphan list; we need to make sure the inode 1889 * on the orphan list; we need to make sure the inode
1890 * is removed from the orphan list in that case. 1890 * is removed from the orphan list in that case.
1891 */ 1891 */
1892 if (inode->i_nlink) 1892 if (inode->i_nlink)
1893 ext4_orphan_del(NULL, inode); 1893 ext4_orphan_del(NULL, inode);
1894 } 1894 }
1895 1895
1896 return ret ? ret : copied; 1896 return ret ? ret : copied;
1897 } 1897 }
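The three ->write_end variants above correspond to the data=ordered, data=writeback and data=journal journalling modes; each is paired with ext4_write_begin() through the inode's address_space_operations. A sketch of that wiring for the ordered case (the table name is an assumption; the real tables are defined elsewhere in this file):

static const struct address_space_operations ext4_ordered_aops_sketch = {
	.write_begin	= ext4_write_begin,
	.write_end	= ext4_ordered_write_end,
	/* ...readpage, writepage, bmap, etc. omitted... */
};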
1898 1898
1899 /* 1899 /*
1900 * Reserve a single block located at lblock 1900 * Reserve a single block located at lblock
1901 */ 1901 */
1902 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) 1902 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1903 { 1903 {
1904 int retries = 0; 1904 int retries = 0;
1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1906 struct ext4_inode_info *ei = EXT4_I(inode); 1906 struct ext4_inode_info *ei = EXT4_I(inode);
1907 unsigned long md_needed; 1907 unsigned long md_needed;
1908 int ret; 1908 int ret;
1909 1909
1910 /* 1910 /*
1911 * Recalculate the number of metadata blocks to reserve 1911 * Recalculate the number of metadata blocks to reserve
1912 * in order to allocate nrblocks; the 1912 * in order to allocate nrblocks; the
1913 * worst case is one extent per block. 1913 * worst case is one extent per block.
1914 */ 1914 */
1915 repeat: 1915 repeat:
1916 spin_lock(&ei->i_block_reservation_lock); 1916 spin_lock(&ei->i_block_reservation_lock);
1917 md_needed = ext4_calc_metadata_amount(inode, lblock); 1917 md_needed = ext4_calc_metadata_amount(inode, lblock);
1918 trace_ext4_da_reserve_space(inode, md_needed); 1918 trace_ext4_da_reserve_space(inode, md_needed);
1919 spin_unlock(&ei->i_block_reservation_lock); 1919 spin_unlock(&ei->i_block_reservation_lock);
1920 1920
1921 /* 1921 /*
1922 * We will charge metadata quota at writeout time; this saves 1922 * We will charge metadata quota at writeout time; this saves
1923 * us from metadata over-estimation, though we may go over by 1923 * us from metadata over-estimation, though we may go over by
1924 * a small amount in the end. Here we just reserve for data. 1924 * a small amount in the end. Here we just reserve for data.
1925 */ 1925 */
1926 ret = dquot_reserve_block(inode, 1); 1926 ret = dquot_reserve_block(inode, 1);
1927 if (ret) 1927 if (ret)
1928 return ret; 1928 return ret;
1929 /* 1929 /*
1930 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1931 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1932 */ 1932 */
1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1934 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1936 yield(); 1936 yield();
1937 goto repeat; 1937 goto repeat;
1938 } 1938 }
1939 return -ENOSPC; 1939 return -ENOSPC;
1940 } 1940 }
1941 spin_lock(&ei->i_block_reservation_lock); 1941 spin_lock(&ei->i_block_reservation_lock);
1942 ei->i_reserved_data_blocks++; 1942 ei->i_reserved_data_blocks++;
1943 ei->i_reserved_meta_blocks += md_needed; 1943 ei->i_reserved_meta_blocks += md_needed;
1944 spin_unlock(&ei->i_block_reservation_lock); 1944 spin_unlock(&ei->i_block_reservation_lock);
1945 1945
1946 return 0; /* success */ 1946 return 0; /* success */
1947 } 1947 }
1948 1948
1949 static void ext4_da_release_space(struct inode *inode, int to_free) 1949 static void ext4_da_release_space(struct inode *inode, int to_free)
1950 { 1950 {
1951 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1951 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1952 struct ext4_inode_info *ei = EXT4_I(inode); 1952 struct ext4_inode_info *ei = EXT4_I(inode);
1953 1953
1954 if (!to_free) 1954 if (!to_free)
1955 return; /* Nothing to release, exit */ 1955 return; /* Nothing to release, exit */
1956 1956
1957 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1957 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1958 1958
1959 trace_ext4_da_release_space(inode, to_free); 1959 trace_ext4_da_release_space(inode, to_free);
1960 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1960 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1961 /* 1961 /*
1962 * if there aren't enough reserved blocks, then the 1962 * if there aren't enough reserved blocks, then the
1963 * counter is messed up somewhere. Since this 1963 * counter is messed up somewhere. Since this
1964 * function is called from invalidate page, it's 1964 * function is called from invalidate page, it's
1965 * harmless to return without any action. 1965 * harmless to return without any action.
1966 */ 1966 */
1967 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " 1967 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1968 "ino %lu, to_free %d with only %d reserved " 1968 "ino %lu, to_free %d with only %d reserved "
1969 "data blocks\n", inode->i_ino, to_free, 1969 "data blocks\n", inode->i_ino, to_free,
1970 ei->i_reserved_data_blocks); 1970 ei->i_reserved_data_blocks);
1971 WARN_ON(1); 1971 WARN_ON(1);
1972 to_free = ei->i_reserved_data_blocks; 1972 to_free = ei->i_reserved_data_blocks;
1973 } 1973 }
1974 ei->i_reserved_data_blocks -= to_free; 1974 ei->i_reserved_data_blocks -= to_free;
1975 1975
1976 if (ei->i_reserved_data_blocks == 0) { 1976 if (ei->i_reserved_data_blocks == 0) {
1977 /* 1977 /*
1978 * We can release all of the reserved metadata blocks 1978 * We can release all of the reserved metadata blocks
1979 * only when we have written all of the delayed 1979 * only when we have written all of the delayed
1980 * allocation blocks. 1980 * allocation blocks.
1981 */ 1981 */
1982 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1982 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1983 ei->i_reserved_meta_blocks); 1983 ei->i_reserved_meta_blocks);
1984 ei->i_reserved_meta_blocks = 0; 1984 ei->i_reserved_meta_blocks = 0;
1985 ei->i_da_metadata_calc_len = 0; 1985 ei->i_da_metadata_calc_len = 0;
1986 } 1986 }
1987 1987
1988 /* update fs dirty data blocks counter */ 1988 /* update fs dirty data blocks counter */
1989 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1989 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1990 1990
1991 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1991 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1992 1992
1993 dquot_release_reservation_block(inode, to_free); 1993 dquot_release_reservation_block(inode, to_free);
1994 } 1994 }
1995 1995
1996 static void ext4_da_page_release_reservation(struct page *page, 1996 static void ext4_da_page_release_reservation(struct page *page,
1997 unsigned long offset) 1997 unsigned long offset)
1998 { 1998 {
1999 int to_release = 0; 1999 int to_release = 0;
2000 struct buffer_head *head, *bh; 2000 struct buffer_head *head, *bh;
2001 unsigned int curr_off = 0; 2001 unsigned int curr_off = 0;
2002 2002
2003 head = page_buffers(page); 2003 head = page_buffers(page);
2004 bh = head; 2004 bh = head;
2005 do { 2005 do {
2006 unsigned int next_off = curr_off + bh->b_size; 2006 unsigned int next_off = curr_off + bh->b_size;
2007 2007
2008 if ((offset <= curr_off) && (buffer_delay(bh))) { 2008 if ((offset <= curr_off) && (buffer_delay(bh))) {
2009 to_release++; 2009 to_release++;
2010 clear_buffer_delay(bh); 2010 clear_buffer_delay(bh);
2011 } 2011 }
2012 curr_off = next_off; 2012 curr_off = next_off;
2013 } while ((bh = bh->b_this_page) != head); 2013 } while ((bh = bh->b_this_page) != head);
2014 ext4_da_release_space(page->mapping->host, to_release); 2014 ext4_da_release_space(page->mapping->host, to_release);
2015 } 2015 }
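Taken together, ext4_da_reserve_space() and the release paths form the delayed-allocation bookkeeping: a write that instantiates a delayed block reserves one data block plus an estimate of metadata up front, and the reservation is consumed either when the blocks are finally allocated (ext4_da_update_reserve_space(), called from ext4_map_blocks() above) or dropped when the page is thrown away, as here. An annotated trace for a single delayed block, assuming quota and free-block checks succeed and that ext4_claim_free_blocks() charges s_dirtyblocks_counter for the claimed blocks:

	ext4_da_reserve_space(inode, lblk);
		/* ei->i_reserved_data_blocks += 1
		 * ei->i_reserved_meta_blocks += md_needed
		 * sbi->s_dirtyblocks_counter += md_needed + 1 (via ext4_claim_free_blocks())
		 * quota: one block reserved via dquot_reserve_block() */

	/* ... page later invalidated while the block is still delayed ... */
	ext4_da_page_release_reservation(page, 0);
		/* counts the buffer_delay buffers and calls
		 * ext4_da_release_space(inode, 1):
		 *   ei->i_reserved_data_blocks -= 1
		 *   sbi->s_dirtyblocks_counter -= 1 (plus all reserved metadata
		 *                                    once no delayed data remains)
		 *   quota reservation dropped via dquot_release_reservation_block() */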
2016 2016
2017 /* 2017 /*
2018 * Delayed allocation stuff 2018 * Delayed allocation stuff
2019 */ 2019 */
2020 2020
2021 /* 2021 /*
2022 * mpage_da_submit_io - walks through the extent of pages and tries to write 2022 * mpage_da_submit_io - walks through the extent of pages and tries to write
2023 * them with the writepage() callback 2023 * them with the writepage() callback
2024 * 2024 *
2025 * @mpd->inode: inode 2025 * @mpd->inode: inode
2026 * @mpd->first_page: first page of the extent 2026 * @mpd->first_page: first page of the extent
2027 * @mpd->next_page: page after the last page of the extent 2027 * @mpd->next_page: page after the last page of the extent
2028 * 2028 *
2029 * By the time mpage_da_submit_io() is called we expect all blocks 2029 * By the time mpage_da_submit_io() is called we expect all blocks
2030 * to be allocated; this may be wrong if allocation failed. 2030 * to be allocated; this may be wrong if allocation failed.
2031 * 2031 *
2032 * As pages are already locked by write_cache_pages(), we can't use it 2032 * As pages are already locked by write_cache_pages(), we can't use it
2033 */ 2033 */
2034 static int mpage_da_submit_io(struct mpage_da_data *mpd, 2034 static int mpage_da_submit_io(struct mpage_da_data *mpd,
2035 struct ext4_map_blocks *map) 2035 struct ext4_map_blocks *map)
2036 { 2036 {
2037 struct pagevec pvec; 2037 struct pagevec pvec;
2038 unsigned long index, end; 2038 unsigned long index, end;
2039 int ret = 0, err, nr_pages, i; 2039 int ret = 0, err, nr_pages, i;
2040 struct inode *inode = mpd->inode; 2040 struct inode *inode = mpd->inode;
2041 struct address_space *mapping = inode->i_mapping; 2041 struct address_space *mapping = inode->i_mapping;
2042 loff_t size = i_size_read(inode); 2042 loff_t size = i_size_read(inode);
2043 unsigned int len, block_start; 2043 unsigned int len, block_start;
2044 struct buffer_head *bh, *page_bufs = NULL; 2044 struct buffer_head *bh, *page_bufs = NULL;
2045 int journal_data = ext4_should_journal_data(inode); 2045 int journal_data = ext4_should_journal_data(inode);
2046 sector_t pblock = 0, cur_logical = 0; 2046 sector_t pblock = 0, cur_logical = 0;
2047 struct ext4_io_submit io_submit; 2047 struct ext4_io_submit io_submit;
2048 2048
2049 BUG_ON(mpd->next_page <= mpd->first_page); 2049 BUG_ON(mpd->next_page <= mpd->first_page);
2050 memset(&io_submit, 0, sizeof(io_submit)); 2050 memset(&io_submit, 0, sizeof(io_submit));
2051 /* 2051 /*
2052 * We need to start from the first_page to the next_page - 1 2052 * We need to start from the first_page to the next_page - 1
2053 * to make sure we also write the mapped dirty buffer_heads. 2053 * to make sure we also write the mapped dirty buffer_heads.
2054 * If we look at mpd->b_blocknr we would only be looking 2054 * If we look at mpd->b_blocknr we would only be looking
2055 * at the currently mapped buffer_heads. 2055 * at the currently mapped buffer_heads.
2056 */ 2056 */
2057 index = mpd->first_page; 2057 index = mpd->first_page;
2058 end = mpd->next_page - 1; 2058 end = mpd->next_page - 1;
2059 2059
2060 pagevec_init(&pvec, 0); 2060 pagevec_init(&pvec, 0);
2061 while (index <= end) { 2061 while (index <= end) {
2062 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2062 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2063 if (nr_pages == 0) 2063 if (nr_pages == 0)
2064 break; 2064 break;
2065 for (i = 0; i < nr_pages; i++) { 2065 for (i = 0; i < nr_pages; i++) {
2066 int commit_write = 0, skip_page = 0; 2066 int commit_write = 0, skip_page = 0;
2067 struct page *page = pvec.pages[i]; 2067 struct page *page = pvec.pages[i];
2068 2068
2069 index = page->index; 2069 index = page->index;
2070 if (index > end) 2070 if (index > end)
2071 break; 2071 break;
2072 2072
2073 if (index == size >> PAGE_CACHE_SHIFT) 2073 if (index == size >> PAGE_CACHE_SHIFT)
2074 len = size & ~PAGE_CACHE_MASK; 2074 len = size & ~PAGE_CACHE_MASK;
2075 else 2075 else
2076 len = PAGE_CACHE_SIZE; 2076 len = PAGE_CACHE_SIZE;
2077 if (map) { 2077 if (map) {
2078 cur_logical = index << (PAGE_CACHE_SHIFT - 2078 cur_logical = index << (PAGE_CACHE_SHIFT -
2079 inode->i_blkbits); 2079 inode->i_blkbits);
2080 pblock = map->m_pblk + (cur_logical - 2080 pblock = map->m_pblk + (cur_logical -
2081 map->m_lblk); 2081 map->m_lblk);
2082 } 2082 }
2083 index++; 2083 index++;
2084 2084
2085 BUG_ON(!PageLocked(page)); 2085 BUG_ON(!PageLocked(page));
2086 BUG_ON(PageWriteback(page)); 2086 BUG_ON(PageWriteback(page));
2087 2087
2088 /* 2088 /*
2089 * If the page does not have buffers (for 2089 * If the page does not have buffers (for
2090 * whatever reason), try to create them using 2090 * whatever reason), try to create them using
2091 * __block_write_begin. If this fails, 2091 * __block_write_begin. If this fails,
2092 * skip the page and move on. 2092 * skip the page and move on.
2093 */ 2093 */
2094 if (!page_has_buffers(page)) { 2094 if (!page_has_buffers(page)) {
2095 if (__block_write_begin(page, 0, len, 2095 if (__block_write_begin(page, 0, len,
2096 noalloc_get_block_write)) { 2096 noalloc_get_block_write)) {
2097 skip_page: 2097 skip_page:
2098 unlock_page(page); 2098 unlock_page(page);
2099 continue; 2099 continue;
2100 } 2100 }
2101 commit_write = 1; 2101 commit_write = 1;
2102 } 2102 }
2103 2103
2104 bh = page_bufs = page_buffers(page); 2104 bh = page_bufs = page_buffers(page);
2105 block_start = 0; 2105 block_start = 0;
2106 do { 2106 do {
2107 if (!bh) 2107 if (!bh)
2108 goto skip_page; 2108 goto skip_page;
2109 if (map && (cur_logical >= map->m_lblk) && 2109 if (map && (cur_logical >= map->m_lblk) &&
2110 (cur_logical <= (map->m_lblk + 2110 (cur_logical <= (map->m_lblk +
2111 (map->m_len - 1)))) { 2111 (map->m_len - 1)))) {
2112 if (buffer_delay(bh)) { 2112 if (buffer_delay(bh)) {
2113 clear_buffer_delay(bh); 2113 clear_buffer_delay(bh);
2114 bh->b_blocknr = pblock; 2114 bh->b_blocknr = pblock;
2115 } 2115 }
2116 if (buffer_unwritten(bh) || 2116 if (buffer_unwritten(bh) ||
2117 buffer_mapped(bh)) 2117 buffer_mapped(bh))
2118 BUG_ON(bh->b_blocknr != pblock); 2118 BUG_ON(bh->b_blocknr != pblock);
2119 if (map->m_flags & EXT4_MAP_UNINIT) 2119 if (map->m_flags & EXT4_MAP_UNINIT)
2120 set_buffer_uninit(bh); 2120 set_buffer_uninit(bh);
2121 clear_buffer_unwritten(bh); 2121 clear_buffer_unwritten(bh);
2122 } 2122 }
2123 2123
2124 /* skip page if block allocation undone */ 2124 /* skip page if block allocation undone */
2125 if (buffer_delay(bh) || buffer_unwritten(bh)) 2125 if (buffer_delay(bh) || buffer_unwritten(bh))
2126 skip_page = 1; 2126 skip_page = 1;
2127 bh = bh->b_this_page; 2127 bh = bh->b_this_page;
2128 block_start += bh->b_size; 2128 block_start += bh->b_size;
2129 cur_logical++; 2129 cur_logical++;
2130 pblock++; 2130 pblock++;
2131 } while (bh != page_bufs); 2131 } while (bh != page_bufs);
2132 2132
2133 if (skip_page) 2133 if (skip_page)
2134 goto skip_page; 2134 goto skip_page;
2135 2135
2136 if (commit_write) 2136 if (commit_write)
2137 /* mark the buffer_heads as dirty & uptodate */ 2137 /* mark the buffer_heads as dirty & uptodate */
2138 block_commit_write(page, 0, len); 2138 block_commit_write(page, 0, len);
2139 2139
2140 clear_page_dirty_for_io(page); 2140 clear_page_dirty_for_io(page);
2141 /* 2141 /*
2142 * Delalloc doesn't support data journalling, 2142 * Delalloc doesn't support data journalling,
2143 * but eventually maybe we'll lift this 2143 * but eventually maybe we'll lift this
2144 * restriction. 2144 * restriction.
2145 */ 2145 */
2146 if (unlikely(journal_data && PageChecked(page))) 2146 if (unlikely(journal_data && PageChecked(page)))
2147 err = __ext4_journalled_writepage(page, len); 2147 err = __ext4_journalled_writepage(page, len);
2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) 2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2149 err = ext4_bio_write_page(&io_submit, page, 2149 err = ext4_bio_write_page(&io_submit, page,
2150 len, mpd->wbc); 2150 len, mpd->wbc);
2151 else 2151 else
2152 err = block_write_full_page(page, 2152 err = block_write_full_page(page,
2153 noalloc_get_block_write, mpd->wbc); 2153 noalloc_get_block_write, mpd->wbc);
2154 2154
2155 if (!err) 2155 if (!err)
2156 mpd->pages_written++; 2156 mpd->pages_written++;
2157 /* 2157 /*
2158 * In error case, we have to continue because 2158 * In error case, we have to continue because
2159 * remaining pages are still locked 2159 * remaining pages are still locked
2160 */ 2160 */
2161 if (ret == 0) 2161 if (ret == 0)
2162 ret = err; 2162 ret = err;
2163 } 2163 }
2164 pagevec_release(&pvec); 2164 pagevec_release(&pvec);
2165 } 2165 }
2166 ext4_io_submit(&io_submit); 2166 ext4_io_submit(&io_submit);
2167 return ret; 2167 return ret;
2168 } 2168 }
2169 2169
2170 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) 2170 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
2171 { 2171 {
2172 int nr_pages, i; 2172 int nr_pages, i;
2173 pgoff_t index, end; 2173 pgoff_t index, end;
2174 struct pagevec pvec; 2174 struct pagevec pvec;
2175 struct inode *inode = mpd->inode; 2175 struct inode *inode = mpd->inode;
2176 struct address_space *mapping = inode->i_mapping; 2176 struct address_space *mapping = inode->i_mapping;
2177 2177
2178 index = mpd->first_page; 2178 index = mpd->first_page;
2179 end = mpd->next_page - 1; 2179 end = mpd->next_page - 1;
2180 while (index <= end) { 2180 while (index <= end) {
2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2182 if (nr_pages == 0) 2182 if (nr_pages == 0)
2183 break; 2183 break;
2184 for (i = 0; i < nr_pages; i++) { 2184 for (i = 0; i < nr_pages; i++) {
2185 struct page *page = pvec.pages[i]; 2185 struct page *page = pvec.pages[i];
2186 if (page->index > end) 2186 if (page->index > end)
2187 break; 2187 break;
2188 BUG_ON(!PageLocked(page)); 2188 BUG_ON(!PageLocked(page));
2189 BUG_ON(PageWriteback(page)); 2189 BUG_ON(PageWriteback(page));
2190 block_invalidatepage(page, 0); 2190 block_invalidatepage(page, 0);
2191 ClearPageUptodate(page); 2191 ClearPageUptodate(page);
2192 unlock_page(page); 2192 unlock_page(page);
2193 } 2193 }
2194 index = pvec.pages[nr_pages - 1]->index + 1; 2194 index = pvec.pages[nr_pages - 1]->index + 1;
2195 pagevec_release(&pvec); 2195 pagevec_release(&pvec);
2196 } 2196 }
2197 return; 2197 return;
2198 } 2198 }
2199 2199
2200 static void ext4_print_free_blocks(struct inode *inode) 2200 static void ext4_print_free_blocks(struct inode *inode)
2201 { 2201 {
2202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2203 printk(KERN_CRIT "Total free blocks count %lld\n", 2203 printk(KERN_CRIT "Total free blocks count %lld\n",
2204 ext4_count_free_blocks(inode->i_sb)); 2204 ext4_count_free_blocks(inode->i_sb));
2205 printk(KERN_CRIT "Free/Dirty block details\n"); 2205 printk(KERN_CRIT "Free/Dirty block details\n");
2206 printk(KERN_CRIT "free_blocks=%lld\n", 2206 printk(KERN_CRIT "free_blocks=%lld\n",
2207 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); 2207 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2208 printk(KERN_CRIT "dirty_blocks=%lld\n", 2208 printk(KERN_CRIT "dirty_blocks=%lld\n",
2209 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2209 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2210 printk(KERN_CRIT "Block reservation details\n"); 2210 printk(KERN_CRIT "Block reservation details\n");
2211 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 2211 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2212 EXT4_I(inode)->i_reserved_data_blocks); 2212 EXT4_I(inode)->i_reserved_data_blocks);
2213 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", 2213 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2214 EXT4_I(inode)->i_reserved_meta_blocks); 2214 EXT4_I(inode)->i_reserved_meta_blocks);
2215 return; 2215 return;
2216 } 2216 }
2217 2217
2218 /* 2218 /*
2219 * mpage_da_map_and_submit - go through the given space, map the blocks 2219 * mpage_da_map_and_submit - go through the given space, map the blocks
2220 * if necessary, and then submit them for I/O 2220 * if necessary, and then submit them for I/O
2221 * 2221 *
2222 * @mpd - bh describing space 2222 * @mpd - bh describing space
2223 * 2223 *
2224 * The function skips space we know is already mapped to disk blocks. 2224 * The function skips space we know is already mapped to disk blocks.
2225 * 2225 *
2226 */ 2226 */
2227 static void mpage_da_map_and_submit(struct mpage_da_data *mpd) 2227 static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2228 { 2228 {
2229 int err, blks, get_blocks_flags; 2229 int err, blks, get_blocks_flags;
2230 struct ext4_map_blocks map, *mapp = NULL; 2230 struct ext4_map_blocks map, *mapp = NULL;
2231 sector_t next = mpd->b_blocknr; 2231 sector_t next = mpd->b_blocknr;
2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2234 handle_t *handle = NULL; 2234 handle_t *handle = NULL;
2235 2235
2236 /* 2236 /*
2237 * If the blocks are mapped already, or we couldn't accumulate 2237 * If the blocks are mapped already, or we couldn't accumulate
2238 * any blocks, then proceed immediately to the submission stage. 2238 * any blocks, then proceed immediately to the submission stage.
2239 */ 2239 */
2240 if ((mpd->b_size == 0) || 2240 if ((mpd->b_size == 0) ||
2241 ((mpd->b_state & (1 << BH_Mapped)) && 2241 ((mpd->b_state & (1 << BH_Mapped)) &&
2242 !(mpd->b_state & (1 << BH_Delay)) && 2242 !(mpd->b_state & (1 << BH_Delay)) &&
2243 !(mpd->b_state & (1 << BH_Unwritten)))) 2243 !(mpd->b_state & (1 << BH_Unwritten))))
2244 goto submit_io; 2244 goto submit_io;
2245 2245
2246 handle = ext4_journal_current_handle(); 2246 handle = ext4_journal_current_handle();
2247 BUG_ON(!handle); 2247 BUG_ON(!handle);
2248 2248
2249 /* 2249 /*
2250 * Call ext4_map_blocks() to allocate any delayed allocation 2250 * Call ext4_map_blocks() to allocate any delayed allocation
2251 * blocks, or to convert an uninitialized extent to be 2251 * blocks, or to convert an uninitialized extent to be
2252 * initialized (in the case where we have written into 2252 * initialized (in the case where we have written into
2253 * one or more preallocated blocks). 2253 * one or more preallocated blocks).
2254 * 2254 *
2255 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to 2255 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
2256 * indicate that we are on the delayed allocation path. This 2256 * indicate that we are on the delayed allocation path. This
2257 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2258 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2259 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2261 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2262 * 2262 *
2263 * If the blocks in question were delalloc blocks, set 2263 * If the blocks in question were delalloc blocks, set
2264 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2264 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2265 * variables are updated after the blocks have been allocated. 2265 * variables are updated after the blocks have been allocated.
2266 */ 2266 */
2267 map.m_lblk = next; 2267 map.m_lblk = next;
2268 map.m_len = max_blocks; 2268 map.m_len = max_blocks;
2269 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2269 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2270 if (ext4_should_dioread_nolock(mpd->inode)) 2270 if (ext4_should_dioread_nolock(mpd->inode))
2271 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2271 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2272 if (mpd->b_state & (1 << BH_Delay)) 2272 if (mpd->b_state & (1 << BH_Delay))
2273 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2273 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2274 2274
2275 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); 2275 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2276 if (blks < 0) { 2276 if (blks < 0) {
2277 struct super_block *sb = mpd->inode->i_sb; 2277 struct super_block *sb = mpd->inode->i_sb;
2278 2278
2279 err = blks; 2279 err = blks;
2280 /* 2280 /*
2281 * If get_block returns EAGAIN or ENOSPC and there 2281 * If get_block returns EAGAIN or ENOSPC and there
2282 * appear to be free blocks, we will just let 2282 * appear to be free blocks, we will just let
2283 * mpage_da_submit_io() unlock all of the pages. 2283 * mpage_da_submit_io() unlock all of the pages.
2284 */ 2284 */
2285 if (err == -EAGAIN) 2285 if (err == -EAGAIN)
2286 goto submit_io; 2286 goto submit_io;
2287 2287
2288 if (err == -ENOSPC && 2288 if (err == -ENOSPC &&
2289 ext4_count_free_blocks(sb)) { 2289 ext4_count_free_blocks(sb)) {
2290 mpd->retval = err; 2290 mpd->retval = err;
2291 goto submit_io; 2291 goto submit_io;
2292 } 2292 }
2293 2293
2294 /* 2294 /*
2295 * get block failure will cause us to loop in 2295 * get block failure will cause us to loop in
2296 * writepages, because a_ops->writepage won't be able 2296 * writepages, because a_ops->writepage won't be able
2297 * to make progress. The page will be redirtied by 2297 * to make progress. The page will be redirtied by
2298 * writepage and writepages will again try to write 2298 * writepage and writepages will again try to write
2299 * the same. 2299 * the same.
2300 */ 2300 */
2301 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { 2301 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2302 ext4_msg(sb, KERN_CRIT, 2302 ext4_msg(sb, KERN_CRIT,
2303 "delayed block allocation failed for inode %lu " 2303 "delayed block allocation failed for inode %lu "
2304 "at logical offset %llu with max blocks %zd " 2304 "at logical offset %llu with max blocks %zd "
2305 "with error %d", mpd->inode->i_ino, 2305 "with error %d", mpd->inode->i_ino,
2306 (unsigned long long) next, 2306 (unsigned long long) next,
2307 mpd->b_size >> mpd->inode->i_blkbits, err); 2307 mpd->b_size >> mpd->inode->i_blkbits, err);
2308 ext4_msg(sb, KERN_CRIT, 2308 ext4_msg(sb, KERN_CRIT,
2309 "This should not happen!! Data will be lost\n"); 2309 "This should not happen!! Data will be lost\n");
2310 if (err == -ENOSPC) 2310 if (err == -ENOSPC)
2311 ext4_print_free_blocks(mpd->inode); 2311 ext4_print_free_blocks(mpd->inode);
2312 } 2312 }
2313 /* invalidate all the pages */ 2313 /* invalidate all the pages */
2314 ext4_da_block_invalidatepages(mpd); 2314 ext4_da_block_invalidatepages(mpd);
2315 2315
2316 /* Mark this page range as having been completed */ 2316 /* Mark this page range as having been completed */
2317 mpd->io_done = 1; 2317 mpd->io_done = 1;
2318 return; 2318 return;
2319 } 2319 }
2320 BUG_ON(blks == 0); 2320 BUG_ON(blks == 0);
2321 2321
2322 mapp = &map; 2322 mapp = &map;
2323 if (map.m_flags & EXT4_MAP_NEW) { 2323 if (map.m_flags & EXT4_MAP_NEW) {
2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2325 int i; 2325 int i;
2326 2326
2327 for (i = 0; i < map.m_len; i++) 2327 for (i = 0; i < map.m_len; i++)
2328 unmap_underlying_metadata(bdev, map.m_pblk + i); 2328 unmap_underlying_metadata(bdev, map.m_pblk + i);
2329 } 2329 }
2330 2330
2331 if (ext4_should_order_data(mpd->inode)) { 2331 if (ext4_should_order_data(mpd->inode)) {
2332 err = ext4_jbd2_file_inode(handle, mpd->inode); 2332 err = ext4_jbd2_file_inode(handle, mpd->inode);
2333 if (err) 2333 if (err)
2334 /* This only happens if the journal is aborted */ 2334 /* This only happens if the journal is aborted */
2335 return; 2335 return;
2336 } 2336 }
2337 2337
2338 /* 2338 /*
2339 * Update on-disk size along with block allocation. 2339 * Update on-disk size along with block allocation.
2340 */ 2340 */
2341 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; 2341 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
2342 if (disksize > i_size_read(mpd->inode)) 2342 if (disksize > i_size_read(mpd->inode))
2343 disksize = i_size_read(mpd->inode); 2343 disksize = i_size_read(mpd->inode);
2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2345 ext4_update_i_disksize(mpd->inode, disksize); 2345 ext4_update_i_disksize(mpd->inode, disksize);
2346 err = ext4_mark_inode_dirty(handle, mpd->inode); 2346 err = ext4_mark_inode_dirty(handle, mpd->inode);
2347 if (err) 2347 if (err)
2348 ext4_error(mpd->inode->i_sb, 2348 ext4_error(mpd->inode->i_sb,
2349 "Failed to mark inode %lu dirty", 2349 "Failed to mark inode %lu dirty",
2350 mpd->inode->i_ino); 2350 mpd->inode->i_ino);
2351 } 2351 }
2352 2352
2353 submit_io: 2353 submit_io:
2354 mpage_da_submit_io(mpd, mapp); 2354 mpage_da_submit_io(mpd, mapp);
2355 mpd->io_done = 1; 2355 mpd->io_done = 1;
2356 } 2356 }
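The decision at the top of mpage_da_map_and_submit() is a bit test on the accumulated extent's buffer state: if the extent is empty, or it is already mapped and carries neither a delayed nor an unwritten block, no allocation is needed and the pages go straight to submission. The stand-alone sketch below is a simplified user-space model of just that predicate; the BH_* values are illustrative stand-ins, not the kernel's buffer-head bit numbers.

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-ins for the buffer-head state bits tested above. */
#define BH_MAPPED    (1u << 0)
#define BH_DELAY     (1u << 1)
#define BH_UNWRITTEN (1u << 2)

/*
 * Return nonzero when an accumulated extent needs no block allocation:
 * it is either empty, or already mapped with no delayed/unwritten blocks.
 */
static int can_skip_mapping(size_t b_size, unsigned int b_state)
{
	return (b_size == 0) ||
	       ((b_state & BH_MAPPED) &&
	        !(b_state & BH_DELAY) &&
	        !(b_state & BH_UNWRITTEN));
}

int main(void)
{
	/* Delayed-allocation extent: the block allocator must run first. */
	printf("delalloc extent: skip=%d\n",
	       can_skip_mapping(4096, BH_MAPPED | BH_DELAY));
	/* Fully mapped extent: proceed directly to submission. */
	printf("mapped extent:   skip=%d\n",
	       can_skip_mapping(4096, BH_MAPPED));
	return 0;
}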
2357 2357
2358 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2358 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
2359 (1 << BH_Delay) | (1 << BH_Unwritten)) 2359 (1 << BH_Delay) | (1 << BH_Unwritten))
2360 2360
2361 /* 2361 /*
2362 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 2362 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
2363 * 2363 *
2364 * @mpd->lbh - extent of blocks 2364 * @mpd->lbh - extent of blocks
2365 * @logical - logical number of the block in the file 2365 * @logical - logical number of the block in the file
2366 * @bh - bh of the block (used to access block's state) 2366 * @bh - bh of the block (used to access block's state)
2367 * 2367 *
2368 * This function is used to collect contiguous blocks in the same state 2368 * This function is used to collect contiguous blocks in the same state
2369 */ 2369 */
2370 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2370 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2371 sector_t logical, size_t b_size, 2371 sector_t logical, size_t b_size,
2372 unsigned long b_state) 2372 unsigned long b_state)
2373 { 2373 {
2374 sector_t next; 2374 sector_t next;
2375 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2375 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2376 2376
2377 /* 2377 /*
2378 * XXX Don't go larger than mballoc is willing to allocate 2378 * XXX Don't go larger than mballoc is willing to allocate
2379 * This is a stopgap solution. We eventually need to fold 2379 * This is a stopgap solution. We eventually need to fold
2380 * mpage_da_submit_io() into this function and then call 2380 * mpage_da_submit_io() into this function and then call
2381 * ext4_map_blocks() multiple times in a loop 2381 * ext4_map_blocks() multiple times in a loop
2382 */ 2382 */
2383 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) 2383 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2384 goto flush_it; 2384 goto flush_it;
2385 2385
2386 /* check if the reserved journal credits might overflow */ 2386 /* check if the reserved journal credits might overflow */
2387 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { 2387 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2388 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2388 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2389 /* 2389 /*
2390 * With the non-extent format we are limited by the journal 2390 * With the non-extent format we are limited by the journal
2391 * credits available. The total credits needed to insert 2391 * credits available. The total credits needed to insert
2392 * nrblocks contiguous blocks depend on 2392 * nrblocks contiguous blocks depend on
2393 * nrblocks. So limit nrblocks. 2393 * nrblocks. So limit nrblocks.
2394 */ 2394 */
2395 goto flush_it; 2395 goto flush_it;
2396 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 2396 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
2397 EXT4_MAX_TRANS_DATA) { 2397 EXT4_MAX_TRANS_DATA) {
2398 /* 2398 /*
2399 * Adding the new buffer_head would make it cross the 2399 * Adding the new buffer_head would make it cross the
2400 * allowed limit for which we have journal credit 2400 * allowed limit for which we have journal credit
2401 * reserved. So limit the new bh->b_size 2401 * reserved. So limit the new bh->b_size
2402 */ 2402 */
2403 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 2403 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
2404 mpd->inode->i_blkbits; 2404 mpd->inode->i_blkbits;
2405 /* we will do mpage_da_submit_io in the next loop */ 2405 /* we will do mpage_da_submit_io in the next loop */
2406 } 2406 }
2407 } 2407 }
2408 /* 2408 /*
2409 * First block in the extent 2409 * First block in the extent
2410 */ 2410 */
2411 if (mpd->b_size == 0) { 2411 if (mpd->b_size == 0) {
2412 mpd->b_blocknr = logical; 2412 mpd->b_blocknr = logical;
2413 mpd->b_size = b_size; 2413 mpd->b_size = b_size;
2414 mpd->b_state = b_state & BH_FLAGS; 2414 mpd->b_state = b_state & BH_FLAGS;
2415 return; 2415 return;
2416 } 2416 }
2417 2417
2418 next = mpd->b_blocknr + nrblocks; 2418 next = mpd->b_blocknr + nrblocks;
2419 /* 2419 /*
2420 * Can we merge the block to our big extent? 2420 * Can we merge the block to our big extent?
2421 */ 2421 */
2422 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { 2422 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2423 mpd->b_size += b_size; 2423 mpd->b_size += b_size;
2424 return; 2424 return;
2425 } 2425 }
2426 2426
2427 flush_it: 2427 flush_it:
2428 /* 2428 /*
2429 * We couldn't merge the block to our extent, so we 2429 * We couldn't merge the block to our extent, so we
2430 * need to flush current extent and start new one 2430 * need to flush current extent and start new one
2431 */ 2431 */
2432 mpage_da_map_and_submit(mpd); 2432 mpage_da_map_and_submit(mpd);
2433 return; 2433 return;
2434 } 2434 }
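The merge rule in mpage_add_bh_to_extent() boils down to "logically adjacent and in the same buffer state". The minimal user-space model below (a hypothetical struct and function, not the kernel's mpage_da_data) shows how the extent accumulates and when the caller has to flush:

#include <stdio.h>

/* Hypothetical, simplified stand-in for the accumulated extent. */
struct extent {
	unsigned long long blocknr;   /* first logical block       */
	unsigned long      nrblocks;  /* blocks accumulated so far */
	unsigned int       state;     /* shared buffer-state flags */
};

/* Try to merge one block; return 0 on success, -1 if the caller must flush. */
static int add_block(struct extent *e, unsigned long long logical,
		     unsigned int state)
{
	if (e->nrblocks == 0) {              /* first block of the extent */
		e->blocknr = logical;
		e->nrblocks = 1;
		e->state = state;
		return 0;
	}
	if (logical == e->blocknr + e->nrblocks && state == e->state) {
		e->nrblocks++;               /* contiguous and same state */
		return 0;
	}
	return -1;                           /* flush, then start a new extent */
}

int main(void)
{
	struct extent e = { 0 };

	add_block(&e, 100, 0x1);
	add_block(&e, 101, 0x1);             /* adjacent, same state: merges */
	printf("after 101: %lu blocks\n", e.nrblocks);
	printf("block 103 merges? %s\n",
	       add_block(&e, 103, 0x1) ? "no (flush)" : "yes");
	return 0;
}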
2435 2435
2436 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) 2436 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2437 { 2437 {
2438 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 2438 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
2439 } 2439 }
2440 2440
2441 /* 2441 /*
2442 * This is a special get_blocks_t callback which is used by 2442 * This is a special get_blocks_t callback which is used by
2443 * ext4_da_write_begin(). It will either return a mapped block or 2443 * ext4_da_write_begin(). It will either return a mapped block or
2444 * reserve space for a single block. 2444 * reserve space for a single block.
2445 * 2445 *
2446 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. 2446 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
2447 * We also have b_blocknr = -1 and b_bdev initialized properly 2447 * We also have b_blocknr = -1 and b_bdev initialized properly
2448 * 2448 *
2449 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. 2449 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
2450 * We also have b_blocknr = physical block mapping the unwritten extent and b_bdev 2450 * We also have b_blocknr = physical block mapping the unwritten extent and b_bdev
2451 * initialized properly. 2451 * initialized properly.
2452 */ 2452 */
2453 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2453 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2454 struct buffer_head *bh, int create) 2454 struct buffer_head *bh, int create)
2455 { 2455 {
2456 struct ext4_map_blocks map; 2456 struct ext4_map_blocks map;
2457 int ret = 0; 2457 int ret = 0;
2458 sector_t invalid_block = ~((sector_t) 0xffff); 2458 sector_t invalid_block = ~((sector_t) 0xffff);
2459 2459
2460 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) 2460 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2461 invalid_block = ~0; 2461 invalid_block = ~0;
2462 2462
2463 BUG_ON(create == 0); 2463 BUG_ON(create == 0);
2464 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 2464 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2465 2465
2466 map.m_lblk = iblock; 2466 map.m_lblk = iblock;
2467 map.m_len = 1; 2467 map.m_len = 1;
2468 2468
2469 /* 2469 /*
2470 * First, we need to know whether the block is already allocated; 2470 * First, we need to know whether the block is already allocated;
2471 * preallocated blocks are unmapped but should be treated 2471 * preallocated blocks are unmapped but should be treated
2472 * the same as allocated blocks. 2472 * the same as allocated blocks.
2473 */ 2473 */
2474 ret = ext4_map_blocks(NULL, inode, &map, 0); 2474 ret = ext4_map_blocks(NULL, inode, &map, 0);
2475 if (ret < 0) 2475 if (ret < 0)
2476 return ret; 2476 return ret;
2477 if (ret == 0) { 2477 if (ret == 0) {
2478 if (buffer_delay(bh)) 2478 if (buffer_delay(bh))
2479 return 0; /* Not sure this could or should happen */ 2479 return 0; /* Not sure this could or should happen */
2480 /* 2480 /*
2481 * XXX: __block_write_begin() unmaps passed block, is it OK? 2481 * XXX: __block_write_begin() unmaps passed block, is it OK?
2482 */ 2482 */
2483 ret = ext4_da_reserve_space(inode, iblock); 2483 ret = ext4_da_reserve_space(inode, iblock);
2484 if (ret) 2484 if (ret)
2485 /* not enough space to reserve */ 2485 /* not enough space to reserve */
2486 return ret; 2486 return ret;
2487 2487
2488 map_bh(bh, inode->i_sb, invalid_block); 2488 map_bh(bh, inode->i_sb, invalid_block);
2489 set_buffer_new(bh); 2489 set_buffer_new(bh);
2490 set_buffer_delay(bh); 2490 set_buffer_delay(bh);
2491 return 0; 2491 return 0;
2492 } 2492 }
2493 2493
2494 map_bh(bh, inode->i_sb, map.m_pblk); 2494 map_bh(bh, inode->i_sb, map.m_pblk);
2495 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 2495 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2496 2496
2497 if (buffer_unwritten(bh)) { 2497 if (buffer_unwritten(bh)) {
2498 /* A delayed write to unwritten bh should be marked 2498 /* A delayed write to unwritten bh should be marked
2499 * new and mapped. Mapped ensures that we don't do 2499 * new and mapped. Mapped ensures that we don't do
2500 * get_block multiple times when we write to the same 2500 * get_block multiple times when we write to the same
2501 * offset and new ensures that we do proper zero out 2501 * offset and new ensures that we do proper zero out
2502 * for partial write. 2502 * for partial write.
2503 */ 2503 */
2504 set_buffer_new(bh); 2504 set_buffer_new(bh);
2505 set_buffer_mapped(bh); 2505 set_buffer_mapped(bh);
2506 } 2506 }
2507 return 0; 2507 return 0;
2508 } 2508 }
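The invalid_block sentinel used above for delayed buffers is simply a block number that cannot belong to the filesystem: ~0xffff is tried first (presumably so the fake block number is easy to spot in debugging output), and the code falls back to ~0 only if the filesystem is so large that ~0xffff could be a real block. A quick user-space sketch of that selection, with an illustrative 64-bit sector type:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;   /* illustrative; models a 64-bit sector_t */

/* Pick a block number guaranteed to lie outside the filesystem. */
static sector_t pick_invalid_block(sector_t blocks_count)
{
	sector_t invalid = ~((sector_t)0xffff);

	if (invalid < blocks_count)  /* fs so big the sentinel could be real */
		invalid = ~(sector_t)0;
	return invalid;
}

int main(void)
{
	printf("small fs: %#llx\n",
	       (unsigned long long)pick_invalid_block(1ULL << 20));
	printf("huge fs:  %#llx\n",
	       (unsigned long long)pick_invalid_block(~((sector_t)0xffff) + 1));
	return 0;
}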
2509 2509
2510 /* 2510 /*
2511 * This function is used as a standard get_block_t callback function 2511 * This function is used as a standard get_block_t callback function
2512 * when there is no desire to allocate any blocks. It is used as a 2512 * when there is no desire to allocate any blocks. It is used as a
2513 * callback function for block_write_begin() and block_write_full_page(). 2513 * callback function for block_write_begin() and block_write_full_page().
2514 * These functions should only try to map a single block at a time. 2514 * These functions should only try to map a single block at a time.
2515 * 2515 *
2516 * Since this function doesn't do block allocations even if the caller 2516 * Since this function doesn't do block allocations even if the caller
2517 * requests it by passing in create=1, it is critically important that 2517 * requests it by passing in create=1, it is critically important that
2518 * any caller checks to make sure that any buffer heads returned 2518 * any caller checks to make sure that any buffer heads returned
2519 * by this function are either all already mapped or marked for 2519 * by this function are either all already mapped or marked for
2520 * delayed allocation before calling block_write_full_page(). Otherwise, 2520 * delayed allocation before calling block_write_full_page(). Otherwise,
2521 * b_blocknr could be left uninitialized, and the page write functions will 2521 * b_blocknr could be left uninitialized, and the page write functions will
2522 * be taken by surprise. 2522 * be taken by surprise.
2523 */ 2523 */
2524 static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2524 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2525 struct buffer_head *bh_result, int create) 2525 struct buffer_head *bh_result, int create)
2526 { 2526 {
2527 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2527 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2528 return _ext4_get_block(inode, iblock, bh_result, 0); 2528 return _ext4_get_block(inode, iblock, bh_result, 0);
2529 } 2529 }
2530 2530
2531 static int bget_one(handle_t *handle, struct buffer_head *bh) 2531 static int bget_one(handle_t *handle, struct buffer_head *bh)
2532 { 2532 {
2533 get_bh(bh); 2533 get_bh(bh);
2534 return 0; 2534 return 0;
2535 } 2535 }
2536 2536
2537 static int bput_one(handle_t *handle, struct buffer_head *bh) 2537 static int bput_one(handle_t *handle, struct buffer_head *bh)
2538 { 2538 {
2539 put_bh(bh); 2539 put_bh(bh);
2540 return 0; 2540 return 0;
2541 } 2541 }
2542 2542
2543 static int __ext4_journalled_writepage(struct page *page, 2543 static int __ext4_journalled_writepage(struct page *page,
2544 unsigned int len) 2544 unsigned int len)
2545 { 2545 {
2546 struct address_space *mapping = page->mapping; 2546 struct address_space *mapping = page->mapping;
2547 struct inode *inode = mapping->host; 2547 struct inode *inode = mapping->host;
2548 struct buffer_head *page_bufs; 2548 struct buffer_head *page_bufs;
2549 handle_t *handle = NULL; 2549 handle_t *handle = NULL;
2550 int ret = 0; 2550 int ret = 0;
2551 int err; 2551 int err;
2552 2552
2553 ClearPageChecked(page); 2553 ClearPageChecked(page);
2554 page_bufs = page_buffers(page); 2554 page_bufs = page_buffers(page);
2555 BUG_ON(!page_bufs); 2555 BUG_ON(!page_bufs);
2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
2557 /* As soon as we unlock the page, it can go away, but we have 2557 /* As soon as we unlock the page, it can go away, but we have
2558 * references to buffers so we are safe */ 2558 * references to buffers so we are safe */
2559 unlock_page(page); 2559 unlock_page(page);
2560 2560
2561 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2561 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
2562 if (IS_ERR(handle)) { 2562 if (IS_ERR(handle)) {
2563 ret = PTR_ERR(handle); 2563 ret = PTR_ERR(handle);
2564 goto out; 2564 goto out;
2565 } 2565 }
2566 2566
2567 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 2567 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2568 do_journal_get_write_access); 2568 do_journal_get_write_access);
2569 2569
2570 err = walk_page_buffers(handle, page_bufs, 0, len, NULL, 2570 err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2571 write_end_fn); 2571 write_end_fn);
2572 if (ret == 0) 2572 if (ret == 0)
2573 ret = err; 2573 ret = err;
2574 err = ext4_journal_stop(handle); 2574 err = ext4_journal_stop(handle);
2575 if (!ret) 2575 if (!ret)
2576 ret = err; 2576 ret = err;
2577 2577
2578 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2578 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2579 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 2579 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2580 out: 2580 out:
2581 return ret; 2581 return ret;
2582 } 2582 }
2583 2583
2584 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); 2584 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2585 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); 2585 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2586 2586
2587 /* 2587 /*
2588 * Note that we don't need to start a transaction unless we're journaling data 2588 * Note that we don't need to start a transaction unless we're journaling data
2589 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2589 * because we should have holes filled from ext4_page_mkwrite(). We even don't
2590 * need to file the inode to the transaction's list in ordered mode because if 2590 * need to file the inode to the transaction's list in ordered mode because if
2591 * we are writing back data added by write(), the inode is already there and if 2591 * we are writing back data added by write(), the inode is already there and if
2592 * we are writing back data modified via mmap(), no one guarantees in which 2592 * we are writing back data modified via mmap(), no one guarantees in which
2593 * transaction the data will hit the disk. In case we are journaling data, we 2593 * transaction the data will hit the disk. In case we are journaling data, we
2594 * cannot start transaction directly because transaction start ranks above page 2594 * cannot start transaction directly because transaction start ranks above page
2595 * lock so we have to do some magic. 2595 * lock so we have to do some magic.
2596 * 2596 *
2597 * This function can get called via... 2597 * This function can get called via...
2598 * - ext4_da_writepages after taking page lock (have journal handle) 2598 * - ext4_da_writepages after taking page lock (have journal handle)
2599 * - journal_submit_inode_data_buffers (no journal handle) 2599 * - journal_submit_inode_data_buffers (no journal handle)
2600 * - shrink_page_list via pdflush (no journal handle) 2600 * - shrink_page_list via pdflush (no journal handle)
2601 * - grab_page_cache when doing write_begin (have journal handle) 2601 * - grab_page_cache when doing write_begin (have journal handle)
2602 * 2602 *
2603 * We don't do any block allocation in this function. If we have a page with 2603 * We don't do any block allocation in this function. If we have a page with
2604 * multiple blocks we need to write those buffer_heads that are mapped. This 2604 * multiple blocks we need to write those buffer_heads that are mapped. This
2605 * is important for mmap-based writes. So if, with blocksize 1K, we do 2605 * is important for mmap-based writes. So if, with blocksize 1K, we do
2606 * truncate(f, 1024); 2606 * truncate(f, 1024);
2607 * a = mmap(f, 0, 4096); 2607 * a = mmap(f, 0, 4096);
2608 * a[0] = 'a'; 2608 * a[0] = 'a';
2609 * truncate(f, 4096); 2609 * truncate(f, 4096);
2610 * then in the page the first buffer_head is mapped via the page_mkwrite callback 2610 * then in the page the first buffer_head is mapped via the page_mkwrite callback
2611 * but the other buffer_heads would be unmapped but dirty (dirtied via 2611 * but the other buffer_heads would be unmapped but dirty (dirtied via
2612 * do_wp_page). So writepage should write the first block. If we modify 2612 * do_wp_page). So writepage should write the first block. If we modify
2613 * the mmap area beyond 1024 we will again get a page_fault and the 2613 * the mmap area beyond 1024 we will again get a page_fault and the
2614 * page_mkwrite callback will do the block allocation and mark the 2614 * page_mkwrite callback will do the block allocation and mark the
2615 * buffer_heads mapped. 2615 * buffer_heads mapped.
2616 * 2616 *
2617 * We redirty the page if we have any buffer_heads that are either delayed or 2617 * We redirty the page if we have any buffer_heads that are either delayed or
2618 * unwritten in the page. 2618 * unwritten in the page.
2619 * 2619 *
2620 * We can get recursively called as shown below. 2620 * We can get recursively called as shown below.
2621 * 2621 *
2622 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 2622 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2623 * ext4_writepage() 2623 * ext4_writepage()
2624 * 2624 *
2625 * But since we don't do any block allocation we should not deadlock. 2625 * But since we don't do any block allocation we should not deadlock.
2626 * The page also has the dirty flag cleared so we don't get a recursive page_lock. 2626 * The page also has the dirty flag cleared so we don't get a recursive page_lock.
2627 */ 2627 */
2628 static int ext4_writepage(struct page *page, 2628 static int ext4_writepage(struct page *page,
2629 struct writeback_control *wbc) 2629 struct writeback_control *wbc)
2630 { 2630 {
2631 int ret = 0, commit_write = 0; 2631 int ret = 0, commit_write = 0;
2632 loff_t size; 2632 loff_t size;
2633 unsigned int len; 2633 unsigned int len;
2634 struct buffer_head *page_bufs = NULL; 2634 struct buffer_head *page_bufs = NULL;
2635 struct inode *inode = page->mapping->host; 2635 struct inode *inode = page->mapping->host;
2636 2636
2637 trace_ext4_writepage(page); 2637 trace_ext4_writepage(page);
2638 size = i_size_read(inode); 2638 size = i_size_read(inode);
2639 if (page->index == size >> PAGE_CACHE_SHIFT) 2639 if (page->index == size >> PAGE_CACHE_SHIFT)
2640 len = size & ~PAGE_CACHE_MASK; 2640 len = size & ~PAGE_CACHE_MASK;
2641 else 2641 else
2642 len = PAGE_CACHE_SIZE; 2642 len = PAGE_CACHE_SIZE;
2643 2643
2644 /* 2644 /*
2645 * If the page does not have buffers (for whatever reason), 2645 * If the page does not have buffers (for whatever reason),
2646 * try to create them using __block_write_begin. If this 2646 * try to create them using __block_write_begin. If this
2647 * fails, redirty the page and move on. 2647 * fails, redirty the page and move on.
2648 */ 2648 */
2649 if (!page_has_buffers(page)) { 2649 if (!page_has_buffers(page)) {
2650 if (__block_write_begin(page, 0, len, 2650 if (__block_write_begin(page, 0, len,
2651 noalloc_get_block_write)) { 2651 noalloc_get_block_write)) {
2652 redirty_page: 2652 redirty_page:
2653 redirty_page_for_writepage(wbc, page); 2653 redirty_page_for_writepage(wbc, page);
2654 unlock_page(page); 2654 unlock_page(page);
2655 return 0; 2655 return 0;
2656 } 2656 }
2657 commit_write = 1; 2657 commit_write = 1;
2658 } 2658 }
2659 page_bufs = page_buffers(page); 2659 page_bufs = page_buffers(page);
2660 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2660 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2661 ext4_bh_delay_or_unwritten)) { 2661 ext4_bh_delay_or_unwritten)) {
2662 /* 2662 /*
2663 * We don't want to do block allocation, so redirty 2663 * We don't want to do block allocation, so redirty
2664 * the page and return. We may reach here when we do 2664 * the page and return. We may reach here when we do
2665 * a journal commit via journal_submit_inode_data_buffers. 2665 * a journal commit via journal_submit_inode_data_buffers.
2666 * We can also reach here via shrink_page_list 2666 * We can also reach here via shrink_page_list
2667 */ 2667 */
2668 goto redirty_page; 2668 goto redirty_page;
2669 } 2669 }
2670 if (commit_write) 2670 if (commit_write)
2671 /* now mark the buffer_heads as dirty and uptodate */ 2671 /* now mark the buffer_heads as dirty and uptodate */
2672 block_commit_write(page, 0, len); 2672 block_commit_write(page, 0, len);
2673 2673
2674 if (PageChecked(page) && ext4_should_journal_data(inode)) 2674 if (PageChecked(page) && ext4_should_journal_data(inode))
2675 /* 2675 /*
2676 * It's mmapped pagecache. Add buffers and journal it. There 2676 * It's mmapped pagecache. Add buffers and journal it. There
2677 * doesn't seem much point in redirtying the page here. 2677 * doesn't seem much point in redirtying the page here.
2678 */ 2678 */
2679 return __ext4_journalled_writepage(page, len); 2679 return __ext4_journalled_writepage(page, len);
2680 2680
2681 if (buffer_uninit(page_bufs)) { 2681 if (buffer_uninit(page_bufs)) {
2682 ext4_set_bh_endio(page_bufs, inode); 2682 ext4_set_bh_endio(page_bufs, inode);
2683 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2683 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2684 wbc, ext4_end_io_buffer_write); 2684 wbc, ext4_end_io_buffer_write);
2685 } else 2685 } else
2686 ret = block_write_full_page(page, noalloc_get_block_write, 2686 ret = block_write_full_page(page, noalloc_get_block_write,
2687 wbc); 2687 wbc);
2688 2688
2689 return ret; 2689 return ret;
2690 } 2690 }
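The mmap scenario sketched in the comment before ext4_writepage() can be reproduced from user space. The program below is only an illustration of that sequence; the file path is hypothetical, and to match the comment it would have to run on an ext4 filesystem created with a 1K block size. It makes no claim about the resulting internal buffer layout beyond what the comment above describes.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/ext4-writepage-demo", O_RDWR | O_CREAT, 0600);
	char *a;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ftruncate(fd, 1024) != 0)            /* truncate(f, 1024) */
		return 1;
	a = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (a == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	a[0] = 'a';                              /* page_mkwrite maps block 0 */
	if (ftruncate(fd, 4096) != 0)            /* truncate(f, 4096) */
		return 1;

	/* Writeback of this page must write only the one mapped block;
	 * touching, say, a[2048] later would fault again and let
	 * page_mkwrite allocate and map the remaining blocks. */
	munmap(a, 4096);
	close(fd);
	return 0;
}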
2691 2691
2692 /* 2692 /*
2693 * This is called via ext4_da_writepages() to 2693 * This is called via ext4_da_writepages() to
2694 * calculate the total number of credits to reserve to fit 2694 * calculate the total number of credits to reserve to fit
2695 * a single extent allocation into a single transaction, 2695 * a single extent allocation into a single transaction,
2696 * ext4_da_writepages() will loop calling this before 2696 * ext4_da_writepages() will loop calling this before
2697 * the block allocation. 2697 * the block allocation.
2698 */ 2698 */
2699 2699
2700 static int ext4_da_writepages_trans_blocks(struct inode *inode) 2700 static int ext4_da_writepages_trans_blocks(struct inode *inode)
2701 { 2701 {
2702 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 2702 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2703 2703
2704 /* 2704 /*
2705 * With the non-extent format the journal credits needed to 2705 * With the non-extent format the journal credits needed to
2706 * insert nrblocks contiguous blocks depend on the 2706 * insert nrblocks contiguous blocks depend on the
2707 * number of contiguous blocks. So we will limit the 2707 * number of contiguous blocks. So we will limit the
2708 * number of contiguous blocks to a sane value 2708 * number of contiguous blocks to a sane value
2709 */ 2709 */
2710 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && 2710 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2711 (max_blocks > EXT4_MAX_TRANS_DATA)) 2711 (max_blocks > EXT4_MAX_TRANS_DATA))
2712 max_blocks = EXT4_MAX_TRANS_DATA; 2712 max_blocks = EXT4_MAX_TRANS_DATA;
2713 2713
2714 return ext4_chunk_trans_blocks(inode, max_blocks); 2714 return ext4_chunk_trans_blocks(inode, max_blocks);
2715 } 2715 }
2716 2716
2717 /* 2717 /*
2718 * write_cache_pages_da - walk the list of dirty pages of the given 2718 * write_cache_pages_da - walk the list of dirty pages of the given
2719 * address space and accumulate pages that need writing, and call 2719 * address space and accumulate pages that need writing, and call
2720 * mpage_da_map_and_submit to map a single contiguous memory region 2720 * mpage_da_map_and_submit to map a single contiguous memory region
2721 * and then write them. 2721 * and then write them.
2722 */ 2722 */
2723 static int write_cache_pages_da(struct address_space *mapping, 2723 static int write_cache_pages_da(struct address_space *mapping,
2724 struct writeback_control *wbc, 2724 struct writeback_control *wbc,
2725 struct mpage_da_data *mpd, 2725 struct mpage_da_data *mpd,
2726 pgoff_t *done_index) 2726 pgoff_t *done_index)
2727 { 2727 {
2728 struct buffer_head *bh, *head; 2728 struct buffer_head *bh, *head;
2729 struct inode *inode = mapping->host; 2729 struct inode *inode = mapping->host;
2730 struct pagevec pvec; 2730 struct pagevec pvec;
2731 unsigned int nr_pages; 2731 unsigned int nr_pages;
2732 sector_t logical; 2732 sector_t logical;
2733 pgoff_t index, end; 2733 pgoff_t index, end;
2734 long nr_to_write = wbc->nr_to_write; 2734 long nr_to_write = wbc->nr_to_write;
2735 int i, tag, ret = 0; 2735 int i, tag, ret = 0;
2736 2736
2737 memset(mpd, 0, sizeof(struct mpage_da_data)); 2737 memset(mpd, 0, sizeof(struct mpage_da_data));
2738 mpd->wbc = wbc; 2738 mpd->wbc = wbc;
2739 mpd->inode = inode; 2739 mpd->inode = inode;
2740 pagevec_init(&pvec, 0); 2740 pagevec_init(&pvec, 0);
2741 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2742 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2743 2743
2744 if (wbc->sync_mode == WB_SYNC_ALL) 2744 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2745 tag = PAGECACHE_TAG_TOWRITE; 2745 tag = PAGECACHE_TAG_TOWRITE;
2746 else 2746 else
2747 tag = PAGECACHE_TAG_DIRTY; 2747 tag = PAGECACHE_TAG_DIRTY;
2748 2748
2749 *done_index = index; 2749 *done_index = index;
2750 while (index <= end) { 2750 while (index <= end) {
2751 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2751 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2752 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2752 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2753 if (nr_pages == 0) 2753 if (nr_pages == 0)
2754 return 0; 2754 return 0;
2755 2755
2756 for (i = 0; i < nr_pages; i++) { 2756 for (i = 0; i < nr_pages; i++) {
2757 struct page *page = pvec.pages[i]; 2757 struct page *page = pvec.pages[i];
2758 2758
2759 /* 2759 /*
2760 * At this point, the page may be truncated or 2760 * At this point, the page may be truncated or
2761 * invalidated (changing page->mapping to NULL), or 2761 * invalidated (changing page->mapping to NULL), or
2762 * even swizzled back from swapper_space to tmpfs file 2762 * even swizzled back from swapper_space to tmpfs file
2763 * mapping. However, page->index will not change 2763 * mapping. However, page->index will not change
2764 * because we have a reference on the page. 2764 * because we have a reference on the page.
2765 */ 2765 */
2766 if (page->index > end) 2766 if (page->index > end)
2767 goto out; 2767 goto out;
2768 2768
2769 *done_index = page->index + 1; 2769 *done_index = page->index + 1;
2770 2770
2771 /* 2771 /*
2772 * If we can't merge this page, and we have 2772 * If we can't merge this page, and we have
2773 * accumulated a contiguous region, write it 2773 * accumulated a contiguous region, write it
2774 */ 2774 */
2775 if ((mpd->next_page != page->index) && 2775 if ((mpd->next_page != page->index) &&
2776 (mpd->next_page != mpd->first_page)) { 2776 (mpd->next_page != mpd->first_page)) {
2777 mpage_da_map_and_submit(mpd); 2777 mpage_da_map_and_submit(mpd);
2778 goto ret_extent_tail; 2778 goto ret_extent_tail;
2779 } 2779 }
2780 2780
2781 lock_page(page); 2781 lock_page(page);
2782 2782
2783 /* 2783 /*
2784 * If the page is no longer dirty, or its 2784 * If the page is no longer dirty, or its
2785 * mapping no longer corresponds to inode we 2785 * mapping no longer corresponds to inode we
2786 * are writing (which means it has been 2786 * are writing (which means it has been
2787 * truncated or invalidated), or the page is 2787 * truncated or invalidated), or the page is
2788 * already under writeback and we are not 2788 * already under writeback and we are not
2789 * doing a data integrity writeback, skip the page 2789 * doing a data integrity writeback, skip the page
2790 */ 2790 */
2791 if (!PageDirty(page) || 2791 if (!PageDirty(page) ||
2792 (PageWriteback(page) && 2792 (PageWriteback(page) &&
2793 (wbc->sync_mode == WB_SYNC_NONE)) || 2793 (wbc->sync_mode == WB_SYNC_NONE)) ||
2794 unlikely(page->mapping != mapping)) { 2794 unlikely(page->mapping != mapping)) {
2795 unlock_page(page); 2795 unlock_page(page);
2796 continue; 2796 continue;
2797 } 2797 }
2798 2798
2799 wait_on_page_writeback(page); 2799 wait_on_page_writeback(page);
2800 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2801 2801
2802 if (mpd->next_page != page->index) 2802 if (mpd->next_page != page->index)
2803 mpd->first_page = page->index; 2803 mpd->first_page = page->index;
2804 mpd->next_page = page->index + 1; 2804 mpd->next_page = page->index + 1;
2805 logical = (sector_t) page->index << 2805 logical = (sector_t) page->index <<
2806 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2806 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2807 2807
2808 if (!page_has_buffers(page)) { 2808 if (!page_has_buffers(page)) {
2809 mpage_add_bh_to_extent(mpd, logical, 2809 mpage_add_bh_to_extent(mpd, logical,
2810 PAGE_CACHE_SIZE, 2810 PAGE_CACHE_SIZE,
2811 (1 << BH_Dirty) | (1 << BH_Uptodate)); 2811 (1 << BH_Dirty) | (1 << BH_Uptodate));
2812 if (mpd->io_done) 2812 if (mpd->io_done)
2813 goto ret_extent_tail; 2813 goto ret_extent_tail;
2814 } else { 2814 } else {
2815 /* 2815 /*
2816 * Page with regular buffer heads, 2816 * Page with regular buffer heads,
2817 * just add all dirty ones 2817 * just add all dirty ones
2818 */ 2818 */
2819 head = page_buffers(page); 2819 head = page_buffers(page);
2820 bh = head; 2820 bh = head;
2821 do { 2821 do {
2822 BUG_ON(buffer_locked(bh)); 2822 BUG_ON(buffer_locked(bh));
2823 /* 2823 /*
2824 * We need to try to allocate 2824 * We need to try to allocate
2825 * unmapped blocks in the same page. 2825 * unmapped blocks in the same page.
2826 * Otherwise we won't make progress 2826 * Otherwise we won't make progress
2827 * with the page in ext4_writepage 2827 * with the page in ext4_writepage
2828 */ 2828 */
2829 if (ext4_bh_delay_or_unwritten(NULL, bh)) { 2829 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2830 mpage_add_bh_to_extent(mpd, logical, 2830 mpage_add_bh_to_extent(mpd, logical,
2831 bh->b_size, 2831 bh->b_size,
2832 bh->b_state); 2832 bh->b_state);
2833 if (mpd->io_done) 2833 if (mpd->io_done)
2834 goto ret_extent_tail; 2834 goto ret_extent_tail;
2835 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2835 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2836 /* 2836 /*
2837 * mapped dirty buffer. We need 2837 * mapped dirty buffer. We need
2838 * to update the b_state 2838 * to update the b_state
2839 * because we look at b_state 2839 * because we look at b_state
2840 * in mpage_da_map_blocks. We 2840 * in mpage_da_map_blocks. We
2841 * don't update b_size because 2841 * don't update b_size because
2842 * if we find an unmapped 2842 * if we find an unmapped
2843 * buffer_head later we need to 2843 * buffer_head later we need to
2844 * use the b_state flag of that 2844 * use the b_state flag of that
2845 * buffer_head. 2845 * buffer_head.
2846 */ 2846 */
2847 if (mpd->b_size == 0) 2847 if (mpd->b_size == 0)
2848 mpd->b_state = bh->b_state & BH_FLAGS; 2848 mpd->b_state = bh->b_state & BH_FLAGS;
2849 } 2849 }
2850 logical++; 2850 logical++;
2851 } while ((bh = bh->b_this_page) != head); 2851 } while ((bh = bh->b_this_page) != head);
2852 } 2852 }
2853 2853
2854 if (nr_to_write > 0) { 2854 if (nr_to_write > 0) {
2855 nr_to_write--; 2855 nr_to_write--;
2856 if (nr_to_write == 0 && 2856 if (nr_to_write == 0 &&
2857 wbc->sync_mode == WB_SYNC_NONE) 2857 wbc->sync_mode == WB_SYNC_NONE)
2858 /* 2858 /*
2859 * We stop writing back only if we are 2859 * We stop writing back only if we are
2860 * not doing integrity sync. In case of 2860 * not doing integrity sync. In case of
2861 * integrity sync we have to keep going 2861 * integrity sync we have to keep going
2862 * because someone may be concurrently 2862 * because someone may be concurrently
2863 * dirtying pages, and we might have 2863 * dirtying pages, and we might have
2864 * synced a lot of newly appeared dirty 2864 * synced a lot of newly appeared dirty
2865 * pages, but have not synced all of the 2865 * pages, but have not synced all of the
2866 * old dirty pages. 2866 * old dirty pages.
2867 */ 2867 */
2868 goto out; 2868 goto out;
2869 } 2869 }
2870 } 2870 }
2871 pagevec_release(&pvec); 2871 pagevec_release(&pvec);
2872 cond_resched(); 2872 cond_resched();
2873 } 2873 }
2874 return 0; 2874 return 0;
2875 ret_extent_tail: 2875 ret_extent_tail:
2876 ret = MPAGE_DA_EXTENT_TAIL; 2876 ret = MPAGE_DA_EXTENT_TAIL;
2877 out: 2877 out:
2878 pagevec_release(&pvec); 2878 pagevec_release(&pvec);
2879 cond_resched(); 2879 cond_resched();
2880 return ret; 2880 return ret;
2881 } 2881 }
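The pagevec lookup size in write_cache_pages_da() is written as min(end - index, PAGEVEC_SIZE - 1) + 1 rather than min(end - index + 1, PAGEVEC_SIZE), apparently mirroring the generic write_cache_pages(), where the -1/+1 form keeps the arithmetic from wrapping when end is the largest possible page index. The user-space check below demonstrates the difference; the PAGEVEC_SIZE value is only an illustrative stand-in.

#include <stdio.h>

#define PAGEVEC_SIZE 14UL   /* illustrative stand-in for the kernel's value */

static unsigned long chunk_safe(unsigned long index, unsigned long end)
{
	unsigned long diff = end - index;
	return (diff < PAGEVEC_SIZE - 1 ? diff : PAGEVEC_SIZE - 1) + 1;
}

int main(void)
{
	unsigned long end = (unsigned long)-1;   /* "whole file" sweep */

	/* end - index + 1 wraps to 0 here; the -1/+1 form does not. */
	printf("naive: %lu\n", end - 0 + 1);
	printf("safe : %lu\n", chunk_safe(0, end));
	printf("tail : %lu\n", chunk_safe(100, 105));  /* 6 pages remain */
	return 0;
}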
2882 2882
2883 2883
2884 static int ext4_da_writepages(struct address_space *mapping, 2884 static int ext4_da_writepages(struct address_space *mapping,
2885 struct writeback_control *wbc) 2885 struct writeback_control *wbc)
2886 { 2886 {
2887 pgoff_t index; 2887 pgoff_t index;
2888 int range_whole = 0; 2888 int range_whole = 0;
2889 handle_t *handle = NULL; 2889 handle_t *handle = NULL;
2890 struct mpage_da_data mpd; 2890 struct mpage_da_data mpd;
2891 struct inode *inode = mapping->host; 2891 struct inode *inode = mapping->host;
2892 int pages_written = 0; 2892 int pages_written = 0;
2893 unsigned int max_pages; 2893 unsigned int max_pages;
2894 int range_cyclic, cycled = 1, io_done = 0; 2894 int range_cyclic, cycled = 1, io_done = 0;
2895 int needed_blocks, ret = 0; 2895 int needed_blocks, ret = 0;
2896 long desired_nr_to_write, nr_to_writebump = 0; 2896 long desired_nr_to_write, nr_to_writebump = 0;
2897 loff_t range_start = wbc->range_start; 2897 loff_t range_start = wbc->range_start;
2898 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2898 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2899 pgoff_t done_index = 0; 2899 pgoff_t done_index = 0;
2900 pgoff_t end; 2900 pgoff_t end;
2901 2901
2902 trace_ext4_da_writepages(inode, wbc); 2902 trace_ext4_da_writepages(inode, wbc);
2903 2903
2904 /* 2904 /*
2905 * No pages to write? This is mainly a kludge to avoid starting 2905 * No pages to write? This is mainly a kludge to avoid starting
2906 * a transaction for special inodes like journal inode on last iput() 2906 * a transaction for special inodes like journal inode on last iput()
2907 * because that could violate lock ordering on umount 2907 * because that could violate lock ordering on umount
2908 */ 2908 */
2909 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2909 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2910 return 0; 2910 return 0;
2911 2911
2912 /* 2912 /*
2913 * If the filesystem has aborted, it is read-only, so return 2913 * If the filesystem has aborted, it is read-only, so return
2914 * right away instead of dumping stack traces later on that 2914 * right away instead of dumping stack traces later on that
2915 * will obscure the real source of the problem. We test 2915 * will obscure the real source of the problem. We test
2916 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2916 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2917 * the latter could be true if the filesystem is mounted 2917 * the latter could be true if the filesystem is mounted
2918 * read-only, and in that case, ext4_da_writepages should 2918 * read-only, and in that case, ext4_da_writepages should
2919 * *never* be called, so if that ever happens, we would want 2919 * *never* be called, so if that ever happens, we would want
2920 * the stack trace. 2920 * the stack trace.
2921 */ 2921 */
2922 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2922 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2923 return -EROFS; 2923 return -EROFS;
2924 2924
2925 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2925 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2926 range_whole = 1; 2926 range_whole = 1;
2927 2927
2928 range_cyclic = wbc->range_cyclic; 2928 range_cyclic = wbc->range_cyclic;
2929 if (wbc->range_cyclic) { 2929 if (wbc->range_cyclic) {
2930 index = mapping->writeback_index; 2930 index = mapping->writeback_index;
2931 if (index) 2931 if (index)
2932 cycled = 0; 2932 cycled = 0;
2933 wbc->range_start = index << PAGE_CACHE_SHIFT; 2933 wbc->range_start = index << PAGE_CACHE_SHIFT;
2934 wbc->range_end = LLONG_MAX; 2934 wbc->range_end = LLONG_MAX;
2935 wbc->range_cyclic = 0; 2935 wbc->range_cyclic = 0;
2936 end = -1; 2936 end = -1;
2937 } else { 2937 } else {
2938 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2938 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2939 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2939 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2940 } 2940 }
2941 2941
2942 /* 2942 /*
2943 * This works around two forms of stupidity. The first is in 2943 * This works around two forms of stupidity. The first is in
2944 * the writeback code, which caps the maximum number of pages 2944 * the writeback code, which caps the maximum number of pages
2945 * written to be 1024 pages. This is wrong on multiple 2945 * written to be 1024 pages. This is wrong on multiple
2946 * levels; different architectures have a different page size, 2946 * levels; different architectures have a different page size,
2947 * which changes the maximum amount of data which gets 2947 * which changes the maximum amount of data which gets
2948 * written. Secondly, 4 megabytes is way too small. XFS 2948 * written. Secondly, 4 megabytes is way too small. XFS
2949 * forces this value to be 16 megabytes by multiplying 2949 * forces this value to be 16 megabytes by multiplying
2950 * nr_to_write parameter by four, and then relies on its 2950 * nr_to_write parameter by four, and then relies on its
2951 * allocator to allocate larger extents to make them 2951 * allocator to allocate larger extents to make them
2952 * contiguous. Unfortunately this brings us to the second 2952 * contiguous. Unfortunately this brings us to the second
2953 * stupidity, which is that ext4's mballoc code only allocates 2953 * stupidity, which is that ext4's mballoc code only allocates
2954 * at most 2048 blocks. So we force contiguous writes up to 2954 * at most 2048 blocks. So we force contiguous writes up to
2955 * the number of dirty blocks in the inode, or 2955 * the number of dirty blocks in the inode, or
2956 * sbi->s_max_writeback_mb_bump, whichever is smaller. 2956 * sbi->s_max_writeback_mb_bump, whichever is smaller.
2957 */ 2957 */
2958 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 2958 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2959 if (!range_cyclic && range_whole) { 2959 if (!range_cyclic && range_whole) {
2960 if (wbc->nr_to_write == LONG_MAX) 2960 if (wbc->nr_to_write == LONG_MAX)
2961 desired_nr_to_write = wbc->nr_to_write; 2961 desired_nr_to_write = wbc->nr_to_write;
2962 else 2962 else
2963 desired_nr_to_write = wbc->nr_to_write * 8; 2963 desired_nr_to_write = wbc->nr_to_write * 8;
2964 } else 2964 } else
2965 desired_nr_to_write = ext4_num_dirty_pages(inode, index, 2965 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2966 max_pages); 2966 max_pages);
2967 if (desired_nr_to_write > max_pages) 2967 if (desired_nr_to_write > max_pages)
2968 desired_nr_to_write = max_pages; 2968 desired_nr_to_write = max_pages;
2969 2969
2970 if (wbc->nr_to_write < desired_nr_to_write) { 2970 if (wbc->nr_to_write < desired_nr_to_write) {
2971 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; 2971 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2972 wbc->nr_to_write = desired_nr_to_write; 2972 wbc->nr_to_write = desired_nr_to_write;
2973 } 2973 }
2974 2974
2975 retry: 2975 retry:
2976 if (wbc->sync_mode == WB_SYNC_ALL) 2976 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2977 tag_pages_for_writeback(mapping, index, end); 2977 tag_pages_for_writeback(mapping, index, end);
2978 2978
2979 while (!ret && wbc->nr_to_write > 0) { 2979 while (!ret && wbc->nr_to_write > 0) {
2980 2980
2981 /* 2981 /*
2982 * we insert one extent at a time, so we need the 2982 * we insert one extent at a time, so we need the
2983 * credits required for a single extent allocation. 2983 * credits required for a single extent allocation.
2984 * Journalled mode is currently not supported 2984 * Journalled mode is currently not supported
2985 * by delalloc 2985 * by delalloc
2986 */ 2986 */
2987 BUG_ON(ext4_should_journal_data(inode)); 2987 BUG_ON(ext4_should_journal_data(inode));
2988 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2988 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2989 2989
2990 /* start a new transaction*/ 2990 /* start a new transaction*/
2991 handle = ext4_journal_start(inode, needed_blocks); 2991 handle = ext4_journal_start(inode, needed_blocks);
2992 if (IS_ERR(handle)) { 2992 if (IS_ERR(handle)) {
2993 ret = PTR_ERR(handle); 2993 ret = PTR_ERR(handle);
2994 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2994 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2995 "%ld pages, ino %lu; err %d", __func__, 2995 "%ld pages, ino %lu; err %d", __func__,
2996 wbc->nr_to_write, inode->i_ino, ret); 2996 wbc->nr_to_write, inode->i_ino, ret);
2997 goto out_writepages; 2997 goto out_writepages;
2998 } 2998 }
2999 2999
3000 /* 3000 /*
3001 * Now call write_cache_pages_da() to find the next 3001 * Now call write_cache_pages_da() to find the next
3002 * contiguous region of logical blocks that need 3002 * contiguous region of logical blocks that need
3003 * blocks to be allocated by ext4 and submit them. 3003 * blocks to be allocated by ext4 and submit them.
3004 */ 3004 */
3005 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); 3005 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3006 /* 3006 /*
3007 * If we have a contiguous extent of pages and we 3007 * If we have a contiguous extent of pages and we
3008 * haven't done the I/O yet, map the blocks and submit 3008 * haven't done the I/O yet, map the blocks and submit
3009 * them for I/O. 3009 * them for I/O.
3010 */ 3010 */
3011 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 3011 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3012 mpage_da_map_and_submit(&mpd); 3012 mpage_da_map_and_submit(&mpd);
3013 ret = MPAGE_DA_EXTENT_TAIL; 3013 ret = MPAGE_DA_EXTENT_TAIL;
3014 } 3014 }
3015 trace_ext4_da_write_pages(inode, &mpd); 3015 trace_ext4_da_write_pages(inode, &mpd);
3016 wbc->nr_to_write -= mpd.pages_written; 3016 wbc->nr_to_write -= mpd.pages_written;
3017 3017
3018 ext4_journal_stop(handle); 3018 ext4_journal_stop(handle);
3019 3019
3020 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 3020 if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
3021 /* commit the transaction which would 3021 /* commit the transaction which would
3022 * free blocks released in the transaction 3022 * free blocks released in the transaction
3023 * and try again 3023 * and try again
3024 */ 3024 */
3025 jbd2_journal_force_commit_nested(sbi->s_journal); 3025 jbd2_journal_force_commit_nested(sbi->s_journal);
3026 ret = 0; 3026 ret = 0;
3027 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 3027 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
3028 /* 3028 /*
3029 * got one extent now try with 3029 * got one extent now try with
3030 * rest of the pages 3030 * rest of the pages
3031 */ 3031 */
3032 pages_written += mpd.pages_written; 3032 pages_written += mpd.pages_written;
3033 ret = 0; 3033 ret = 0;
3034 io_done = 1; 3034 io_done = 1;
3035 } else if (wbc->nr_to_write) 3035 } else if (wbc->nr_to_write)
3036 /* 3036 /*
3037 * There is no more writeout needed 3037 * There is no more writeout needed
3038 * or we requested a nonblocking writeout 3038 * or we requested a nonblocking writeout
3039 * and we found the device congested 3039 * and we found the device congested
3040 */ 3040 */
3041 break; 3041 break;
3042 } 3042 }
3043 if (!io_done && !cycled) { 3043 if (!io_done && !cycled) {
3044 cycled = 1; 3044 cycled = 1;
3045 index = 0; 3045 index = 0;
3046 wbc->range_start = index << PAGE_CACHE_SHIFT; 3046 wbc->range_start = index << PAGE_CACHE_SHIFT;
3047 wbc->range_end = mapping->writeback_index - 1; 3047 wbc->range_end = mapping->writeback_index - 1;
3048 goto retry; 3048 goto retry;
3049 } 3049 }
3050 3050
3051 /* Update index */ 3051 /* Update index */
3052 wbc->range_cyclic = range_cyclic; 3052 wbc->range_cyclic = range_cyclic;
3053 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 3053 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3054 /* 3054 /*
3055 * set the writeback_index so that range_cyclic 3055 * set the writeback_index so that range_cyclic
3056 * mode will write it back later 3056 * mode will write it back later
3057 */ 3057 */
3058 mapping->writeback_index = done_index; 3058 mapping->writeback_index = done_index;
3059 3059
3060 out_writepages: 3060 out_writepages:
3061 wbc->nr_to_write -= nr_to_writebump; 3061 wbc->nr_to_write -= nr_to_writebump;
3062 wbc->range_start = range_start; 3062 wbc->range_start = range_start;
3063 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3063 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
3064 return ret; 3064 return ret;
3065 } 3065 }
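The nr_to_write adjustment in ext4_da_writepages() works entirely in pages: s_max_writeback_mb_bump (megabytes) is converted to a page cap, the caller's nr_to_write is scaled up (times eight for a whole-file, non-cyclic sweep, otherwise sized from the inode's dirty pages), clamped to the cap, and the bump is subtracted again at out_writepages so the caller's accounting is unchanged. A worked user-space example with illustrative numbers (4K pages and an assumed 128 MB bump; the pages-written accounting is omitted):

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;     /* 4K pages (illustrative)   */
	const unsigned int mb_bump    = 128;    /* assumed default bump, MB  */
	long max_pages = (long)mb_bump << (20 - page_shift);
	long nr_to_write = 1024;                /* typical writeback chunk   */
	long desired = nr_to_write * 8;         /* whole file, not LONG_MAX  */
	long bump = 0;

	if (desired > max_pages)
		desired = max_pages;
	if (nr_to_write < desired) {
		bump = desired - nr_to_write;
		nr_to_write = desired;
	}
	printf("max_pages=%ld desired=%ld bump=%ld\n",
	       max_pages, desired, bump);
	/* At out_writepages the bump is subtracted again. */
	printf("returned nr_to_write=%ld\n", nr_to_write - bump);
	return 0;
}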
3066 3066
3067 #define FALL_BACK_TO_NONDELALLOC 1 3067 #define FALL_BACK_TO_NONDELALLOC 1
3068 static int ext4_nonda_switch(struct super_block *sb) 3068 static int ext4_nonda_switch(struct super_block *sb)
3069 { 3069 {
3070 s64 free_blocks, dirty_blocks; 3070 s64 free_blocks, dirty_blocks;
3071 struct ext4_sb_info *sbi = EXT4_SB(sb); 3071 struct ext4_sb_info *sbi = EXT4_SB(sb);
3072 3072
3073 /* 3073 /*
3074 * Switch to non-delalloc mode if we are running low 3074 * Switch to non-delalloc mode if we are running low
3075 * on free blocks. The free block accounting via percpu 3075 * on free blocks. The free block accounting via percpu
3076 * counters can get slightly wrong with percpu_counter_batch getting 3076 * counters can get slightly wrong with percpu_counter_batch getting
3077 * accumulated on each CPU without updating global counters. 3077 * accumulated on each CPU without updating global counters.
3078 * Delalloc needs accurate free block accounting. So switch 3078 * Delalloc needs accurate free block accounting. So switch
3079 * to non-delalloc when we are near the error range. 3079 * to non-delalloc when we are near the error range.
3080 */ 3080 */
3081 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 3081 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
3082 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); 3082 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
3083 if (2 * free_blocks < 3 * dirty_blocks || 3083 if (2 * free_blocks < 3 * dirty_blocks ||
3084 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 3084 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
3085 /* 3085 /*
3086 * free block count is less than 150% of dirty blocks 3086 * free block count is less than 150% of dirty blocks
3087 * or free blocks are less than the watermark 3087 * or free blocks are less than the watermark
3088 */ 3088 */
3089 return 1; 3089 return 1;
3090 } 3090 }
3091 /* 3091 /*
3092 * Even if we don't switch but are nearing capacity, 3092 * Even if we don't switch but are nearing capacity,
3093 * start pushing delalloc when 1/2 of free blocks are dirty. 3093 * start pushing delalloc when 1/2 of free blocks are dirty.
3094 */ 3094 */
3095 if (free_blocks < 2 * dirty_blocks) 3095 if (free_blocks < 2 * dirty_blocks)
3096 writeback_inodes_sb_if_idle(sb); 3096 writeback_inodes_sb_if_idle(sb);
3097 3097
3098 return 0; 3098 return 0;
3099 } 3099 }
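The two thresholds in ext4_nonda_switch() read more naturally as percentages: 2*free < 3*dirty means free space has dropped below 150% of the delalloc-reserved (dirty) blocks, and free < 2*dirty is the earlier "start pushing writeback" point. A stand-alone check with made-up numbers; the watermark value here is illustrative, not EXT4_FREEBLOCKS_WATERMARK:

#include <stdio.h>

/* Illustrative stand-in for EXT4_FREEBLOCKS_WATERMARK. */
#define FREEBLOCKS_WATERMARK 1024LL

/* 1 = fall back to non-delalloc, 0 = stay in delalloc mode. */
static int nonda_switch(long long free_blocks, long long dirty_blocks)
{
	if (2 * free_blocks < 3 * dirty_blocks ||        /* free < 150% dirty */
	    free_blocks < dirty_blocks + FREEBLOCKS_WATERMARK)
		return 1;
	if (free_blocks < 2 * dirty_blocks)              /* nearing capacity  */
		printf("  (would kick background writeback)\n");
	return 0;
}

int main(void)
{
	printf("free=100000 dirty=10000 -> switch=%d\n", nonda_switch(100000, 10000));
	printf("free=14000  dirty=10000 -> switch=%d\n", nonda_switch(14000, 10000));
	printf("free=17000  dirty=10000 -> switch=%d\n", nonda_switch(17000, 10000));
	return 0;
}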
3100 3100
3101 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 3101 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3102 loff_t pos, unsigned len, unsigned flags, 3102 loff_t pos, unsigned len, unsigned flags,
3103 struct page **pagep, void **fsdata) 3103 struct page **pagep, void **fsdata)
3104 { 3104 {
3105 int ret, retries = 0; 3105 int ret, retries = 0;
3106 struct page *page; 3106 struct page *page;
3107 pgoff_t index; 3107 pgoff_t index;
3108 struct inode *inode = mapping->host; 3108 struct inode *inode = mapping->host;
3109 handle_t *handle; 3109 handle_t *handle;
3110 3110
3111 index = pos >> PAGE_CACHE_SHIFT; 3111 index = pos >> PAGE_CACHE_SHIFT;
3112 3112
3113 if (ext4_nonda_switch(inode->i_sb)) { 3113 if (ext4_nonda_switch(inode->i_sb)) {
3114 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 3114 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
3115 return ext4_write_begin(file, mapping, pos, 3115 return ext4_write_begin(file, mapping, pos,
3116 len, flags, pagep, fsdata); 3116 len, flags, pagep, fsdata);
3117 } 3117 }
3118 *fsdata = (void *)0; 3118 *fsdata = (void *)0;
3119 trace_ext4_da_write_begin(inode, pos, len, flags); 3119 trace_ext4_da_write_begin(inode, pos, len, flags);
3120 retry: 3120 retry:
3121 /* 3121 /*
3122 * With delayed allocation, we don't log the i_disksize update 3122 * With delayed allocation, we don't log the i_disksize update
3123 * if there is delayed block allocation. But we still need 3123 * if there is delayed block allocation. But we still need
3124 * to journal the i_disksize update if we write to the end 3124 * to journal the i_disksize update if we write to the end
3125 * of the file over an already mapped buffer. 3125 * of the file over an already mapped buffer.
3126 */ 3126 */
3127 handle = ext4_journal_start(inode, 1); 3127 handle = ext4_journal_start(inode, 1);
3128 if (IS_ERR(handle)) { 3128 if (IS_ERR(handle)) {
3129 ret = PTR_ERR(handle); 3129 ret = PTR_ERR(handle);
3130 goto out; 3130 goto out;
3131 } 3131 }
3132 /* We cannot recurse into the filesystem as the transaction is already 3132 /* We cannot recurse into the filesystem as the transaction is already
3133 * started */ 3133 * started */
3134 flags |= AOP_FLAG_NOFS; 3134 flags |= AOP_FLAG_NOFS;
3135 3135
3136 page = grab_cache_page_write_begin(mapping, index, flags); 3136 page = grab_cache_page_write_begin(mapping, index, flags);
3137 if (!page) { 3137 if (!page) {
3138 ext4_journal_stop(handle); 3138 ext4_journal_stop(handle);
3139 ret = -ENOMEM; 3139 ret = -ENOMEM;
3140 goto out; 3140 goto out;
3141 } 3141 }
3142 *pagep = page; 3142 *pagep = page;
3143 3143
3144 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); 3144 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
3145 if (ret < 0) { 3145 if (ret < 0) {
3146 unlock_page(page); 3146 unlock_page(page);
3147 ext4_journal_stop(handle); 3147 ext4_journal_stop(handle);
3148 page_cache_release(page); 3148 page_cache_release(page);
3149 /* 3149 /*
3150 * block_write_begin may have instantiated a few blocks 3150 * block_write_begin may have instantiated a few blocks
3151 * outside i_size. Trim these off again. Don't need 3151 * outside i_size. Trim these off again. Don't need
3152 * i_size_read because we hold i_mutex. 3152 * i_size_read because we hold i_mutex.
3153 */ 3153 */
3154 if (pos + len > inode->i_size) 3154 if (pos + len > inode->i_size)
3155 ext4_truncate_failed_write(inode); 3155 ext4_truncate_failed_write(inode);
3156 } 3156 }
3157 3157
3158 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3158 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3159 goto retry; 3159 goto retry;
3160 out: 3160 out:
3161 return ret; 3161 return ret;
3162 } 3162 }
3163 3163
3164 /* 3164 /*
3165 * Check if we should update i_disksize 3165 * Check if we should update i_disksize
3166 * when a write to the end of the file does not require block allocation 3166 * when a write to the end of the file does not require block allocation
3167 */ 3167 */
3168 static int ext4_da_should_update_i_disksize(struct page *page, 3168 static int ext4_da_should_update_i_disksize(struct page *page,
3169 unsigned long offset) 3169 unsigned long offset)
3170 { 3170 {
3171 struct buffer_head *bh; 3171 struct buffer_head *bh;
3172 struct inode *inode = page->mapping->host; 3172 struct inode *inode = page->mapping->host;
3173 unsigned int idx; 3173 unsigned int idx;
3174 int i; 3174 int i;
3175 3175
3176 bh = page_buffers(page); 3176 bh = page_buffers(page);
3177 idx = offset >> inode->i_blkbits; 3177 idx = offset >> inode->i_blkbits;
3178 3178
3179 for (i = 0; i < idx; i++) 3179 for (i = 0; i < idx; i++)
3180 bh = bh->b_this_page; 3180 bh = bh->b_this_page;
3181 3181
3182 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) 3182 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
3183 return 0; 3183 return 0;
3184 return 1; 3184 return 1;
3185 } 3185 }
3186 3186
3187 static int ext4_da_write_end(struct file *file, 3187 static int ext4_da_write_end(struct file *file,
3188 struct address_space *mapping, 3188 struct address_space *mapping,
3189 loff_t pos, unsigned len, unsigned copied, 3189 loff_t pos, unsigned len, unsigned copied,
3190 struct page *page, void *fsdata) 3190 struct page *page, void *fsdata)
3191 { 3191 {
3192 struct inode *inode = mapping->host; 3192 struct inode *inode = mapping->host;
3193 int ret = 0, ret2; 3193 int ret = 0, ret2;
3194 handle_t *handle = ext4_journal_current_handle(); 3194 handle_t *handle = ext4_journal_current_handle();
3195 loff_t new_i_size; 3195 loff_t new_i_size;
3196 unsigned long start, end; 3196 unsigned long start, end;
3197 int write_mode = (int)(unsigned long)fsdata; 3197 int write_mode = (int)(unsigned long)fsdata;
3198 3198
3199 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 3199 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
3200 if (ext4_should_order_data(inode)) { 3200 if (ext4_should_order_data(inode)) {
3201 return ext4_ordered_write_end(file, mapping, pos, 3201 return ext4_ordered_write_end(file, mapping, pos,
3202 len, copied, page, fsdata); 3202 len, copied, page, fsdata);
3203 } else if (ext4_should_writeback_data(inode)) { 3203 } else if (ext4_should_writeback_data(inode)) {
3204 return ext4_writeback_write_end(file, mapping, pos, 3204 return ext4_writeback_write_end(file, mapping, pos,
3205 len, copied, page, fsdata); 3205 len, copied, page, fsdata);
3206 } else { 3206 } else {
3207 BUG(); 3207 BUG();
3208 } 3208 }
3209 } 3209 }
3210 3210
3211 trace_ext4_da_write_end(inode, pos, len, copied); 3211 trace_ext4_da_write_end(inode, pos, len, copied);
3212 start = pos & (PAGE_CACHE_SIZE - 1); 3212 start = pos & (PAGE_CACHE_SIZE - 1);
3213 end = start + copied - 1; 3213 end = start + copied - 1;
3214 3214
3215 /* 3215 /*
3216 * generic_write_end() will run mark_inode_dirty() if i_size 3216 * generic_write_end() will run mark_inode_dirty() if i_size
3217 * changes. So let's piggyback the i_disksize mark_inode_dirty 3217 * changes. So let's piggyback the i_disksize mark_inode_dirty
3218 * into that. 3218 * into that.
3219 */ 3219 */
3220 3220
3221 new_i_size = pos + copied; 3221 new_i_size = pos + copied;
3222 if (new_i_size > EXT4_I(inode)->i_disksize) { 3222 if (new_i_size > EXT4_I(inode)->i_disksize) {
3223 if (ext4_da_should_update_i_disksize(page, end)) { 3223 if (ext4_da_should_update_i_disksize(page, end)) {
3224 down_write(&EXT4_I(inode)->i_data_sem); 3224 down_write(&EXT4_I(inode)->i_data_sem);
3225 if (new_i_size > EXT4_I(inode)->i_disksize) { 3225 if (new_i_size > EXT4_I(inode)->i_disksize) {
3226 /* 3226 /*
3227 * Updating i_disksize when extending file 3227 * Updating i_disksize when extending file
3228 * without needing block allocation 3228 * without needing block allocation
3229 */ 3229 */
3230 if (ext4_should_order_data(inode)) 3230 if (ext4_should_order_data(inode))
3231 ret = ext4_jbd2_file_inode(handle, 3231 ret = ext4_jbd2_file_inode(handle,
3232 inode); 3232 inode);
3233 3233
3234 EXT4_I(inode)->i_disksize = new_i_size; 3234 EXT4_I(inode)->i_disksize = new_i_size;
3235 } 3235 }
3236 up_write(&EXT4_I(inode)->i_data_sem); 3236 up_write(&EXT4_I(inode)->i_data_sem);
3237 /* We need to mark inode dirty even if 3237 /* We need to mark inode dirty even if
3238 * new_i_size is less than inode->i_size 3238 * new_i_size is less than inode->i_size
3239 * but greater than i_disksize. (hint: delalloc) 3239 * but greater than i_disksize. (hint: delalloc)
3240 */ 3240 */
3241 ext4_mark_inode_dirty(handle, inode); 3241 ext4_mark_inode_dirty(handle, inode);
3242 } 3242 }
3243 } 3243 }
3244 ret2 = generic_write_end(file, mapping, pos, len, copied, 3244 ret2 = generic_write_end(file, mapping, pos, len, copied,
3245 page, fsdata); 3245 page, fsdata);
3246 copied = ret2; 3246 copied = ret2;
3247 if (ret2 < 0) 3247 if (ret2 < 0)
3248 ret = ret2; 3248 ret = ret2;
3249 ret2 = ext4_journal_stop(handle); 3249 ret2 = ext4_journal_stop(handle);
3250 if (!ret) 3250 if (!ret)
3251 ret = ret2; 3251 ret = ret2;
3252 3252
3253 return ret ? ret : copied; 3253 return ret ? ret : copied;
3254 } 3254 }
3255 3255
3256 static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 3256 static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
3257 { 3257 {
3258 /* 3258 /*
3259 * Drop reserved blocks 3259 * Drop reserved blocks
3260 */ 3260 */
3261 BUG_ON(!PageLocked(page)); 3261 BUG_ON(!PageLocked(page));
3262 if (!page_has_buffers(page)) 3262 if (!page_has_buffers(page))
3263 goto out; 3263 goto out;
3264 3264
3265 ext4_da_page_release_reservation(page, offset); 3265 ext4_da_page_release_reservation(page, offset);
3266 3266
3267 out: 3267 out:
3268 ext4_invalidatepage(page, offset); 3268 ext4_invalidatepage(page, offset);
3269 3269
3270 return; 3270 return;
3271 } 3271 }
3272 3272
3273 /* 3273 /*
3274 * Force all delayed allocation blocks to be allocated for a given inode. 3274 * Force all delayed allocation blocks to be allocated for a given inode.
3275 */ 3275 */
3276 int ext4_alloc_da_blocks(struct inode *inode) 3276 int ext4_alloc_da_blocks(struct inode *inode)
3277 { 3277 {
3278 trace_ext4_alloc_da_blocks(inode); 3278 trace_ext4_alloc_da_blocks(inode);
3279 3279
3280 if (!EXT4_I(inode)->i_reserved_data_blocks && 3280 if (!EXT4_I(inode)->i_reserved_data_blocks &&
3281 !EXT4_I(inode)->i_reserved_meta_blocks) 3281 !EXT4_I(inode)->i_reserved_meta_blocks)
3282 return 0; 3282 return 0;
3283 3283
3284 /* 3284 /*
3285 * We do something simple for now. The filemap_flush() will 3285 * We do something simple for now. The filemap_flush() will
3286 * also start triggering a write of the data blocks, which is 3286 * also start triggering a write of the data blocks, which is
3287 * not strictly speaking necessary (and for users of 3287 * not strictly speaking necessary (and for users of
3288 * laptop_mode, not even desirable). However, to do otherwise 3288 * laptop_mode, not even desirable). However, to do otherwise
3289 * would require replicating code paths in: 3289 * would require replicating code paths in:
3290 * 3290 *
3291 * ext4_da_writepages() -> 3291 * ext4_da_writepages() ->
3292 * write_cache_pages() ---> (via passed in callback function) 3292 * write_cache_pages() ---> (via passed in callback function)
3293 * __mpage_da_writepage() --> 3293 * __mpage_da_writepage() -->
3294 * mpage_add_bh_to_extent() 3294 * mpage_add_bh_to_extent()
3295 * mpage_da_map_blocks() 3295 * mpage_da_map_blocks()
3296 * 3296 *
3297 * The problem is that write_cache_pages(), located in 3297 * The problem is that write_cache_pages(), located in
3298 * mm/page-writeback.c, marks pages clean in preparation for 3298 * mm/page-writeback.c, marks pages clean in preparation for
3299 * doing I/O, which is not desirable if we're not planning on 3299 * doing I/O, which is not desirable if we're not planning on
3300 * doing I/O at all. 3300 * doing I/O at all.
3301 * 3301 *
3302 * We could call write_cache_pages(), and then redirty all of 3302 * We could call write_cache_pages(), and then redirty all of
3303 * the pages by calling redirty_page_for_writepage() but that 3303 * the pages by calling redirty_page_for_writepage() but that
3304 * would be ugly in the extreme. So instead we would need to 3304 * would be ugly in the extreme. So instead we would need to
3305 * replicate parts of the code in the above functions, 3305 * replicate parts of the code in the above functions,
3306 * simplifying them because we wouldn't actually intend to 3306 * simplifying them because we wouldn't actually intend to
3307 * write out the pages, but rather only collect contiguous 3307 * write out the pages, but rather only collect contiguous
3308 * logical block extents, call the multi-block allocator, and 3308 * logical block extents, call the multi-block allocator, and
3309 * then update the buffer heads with the block allocations. 3309 * then update the buffer heads with the block allocations.
3310 * 3310 *
3311 * For now, though, we'll cheat by calling filemap_flush(), 3311 * For now, though, we'll cheat by calling filemap_flush(),
3312 * which will map the blocks, and start the I/O, but not 3312 * which will map the blocks, and start the I/O, but not
3313 * actually wait for the I/O to complete. 3313 * actually wait for the I/O to complete.
3314 */ 3314 */
3315 return filemap_flush(inode->i_mapping); 3315 return filemap_flush(inode->i_mapping);
3316 } 3316 }
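One common way into ext4_alloc_da_blocks() is ext4's auto_da_alloc heuristic, which forces the delayed-allocation flush when an application performs the classic replace-via-rename update without an fsync. A minimal userspace sketch of that pattern follows; it is not part of this diff, the file names are hypothetical, and error handling is abbreviated:

/* Replace-via-rename: ext4's auto_da_alloc forces allocation of the
 * delayed blocks of "config.tmp" when it is renamed over an existing
 * "config", so a crash right after the rename does not leave an
 * empty file behind.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char buf[] = "new contents\n";
	int fd = open("config.tmp", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0 || write(fd, buf, sizeof(buf) - 1) < 0)
		return 1;
	close(fd);
	return rename("config.tmp", "config") ? 1 : 0;
}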
3317 3317
3318 /* 3318 /*
3319 * bmap() is special. It gets used by applications such as lilo and by 3319 * bmap() is special. It gets used by applications such as lilo and by
3320 * the swapper to find the on-disk block of a specific piece of data. 3320 * the swapper to find the on-disk block of a specific piece of data.
3321 * 3321 *
3322 * Naturally, this is dangerous if the block concerned is still in the 3322 * Naturally, this is dangerous if the block concerned is still in the
3323 * journal. If somebody makes a swapfile on an ext4 data-journaling 3323 * journal. If somebody makes a swapfile on an ext4 data-journaling
3324 * filesystem and enables swap, then they may get a nasty shock when the 3324 * filesystem and enables swap, then they may get a nasty shock when the
3325 * data getting swapped to that swapfile suddenly gets overwritten by 3325 * data getting swapped to that swapfile suddenly gets overwritten by
3326 * the original zeros written out previously to the journal and 3326 * the original zeros written out previously to the journal and
3327 * awaiting writeback in the kernel's buffer cache. 3327 * awaiting writeback in the kernel's buffer cache.
3328 * 3328 *
3329 * So, if we see any bmap calls here on a modified, data-journaled file, 3329 * So, if we see any bmap calls here on a modified, data-journaled file,
3330 * take extra steps to flush any blocks which might be in the cache. 3330 * take extra steps to flush any blocks which might be in the cache.
3331 */ 3331 */
3332 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 3332 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3333 { 3333 {
3334 struct inode *inode = mapping->host; 3334 struct inode *inode = mapping->host;
3335 journal_t *journal; 3335 journal_t *journal;
3336 int err; 3336 int err;
3337 3337
3338 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 3338 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3339 test_opt(inode->i_sb, DELALLOC)) { 3339 test_opt(inode->i_sb, DELALLOC)) {
3340 /* 3340 /*
3341 * With delalloc we want to sync the file 3341 * With delalloc we want to sync the file
3342 * so that we can make sure we allocate 3342 * so that we can make sure we allocate
3343 * blocks for the file 3343 * blocks for the file
3344 */ 3344 */
3345 filemap_write_and_wait(mapping); 3345 filemap_write_and_wait(mapping);
3346 } 3346 }
3347 3347
3348 if (EXT4_JOURNAL(inode) && 3348 if (EXT4_JOURNAL(inode) &&
3349 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { 3349 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
3350 /* 3350 /*
3351 * This is a REALLY heavyweight approach, but the use of 3351 * This is a REALLY heavyweight approach, but the use of
3352 * bmap on dirty files is expected to be extremely rare: 3352 * bmap on dirty files is expected to be extremely rare:
3353 * only if we run lilo or swapon on a freshly made file 3353 * only if we run lilo or swapon on a freshly made file
3354 * do we expect this to happen. 3354 * do we expect this to happen.
3355 * 3355 *
3356 * (bmap requires CAP_SYS_RAWIO so this does not 3356 * (bmap requires CAP_SYS_RAWIO so this does not
3357 * represent an unprivileged user DOS attack --- we'd be 3357 * represent an unprivileged user DOS attack --- we'd be
3358 * in trouble if mortal users could trigger this path at 3358 * in trouble if mortal users could trigger this path at
3359 * will.) 3359 * will.)
3360 * 3360 *
3361 * NB. EXT4_STATE_JDATA is not set on files other than 3361 * NB. EXT4_STATE_JDATA is not set on files other than
3362 * regular files. If somebody wants to bmap a directory 3362 * regular files. If somebody wants to bmap a directory
3363 * or symlink and gets confused because the buffer 3363 * or symlink and gets confused because the buffer
3364 * hasn't yet been flushed to disk, they deserve 3364 * hasn't yet been flushed to disk, they deserve
3365 * everything they get. 3365 * everything they get.
3366 */ 3366 */
3367 3367
3368 ext4_clear_inode_state(inode, EXT4_STATE_JDATA); 3368 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
3369 journal = EXT4_JOURNAL(inode); 3369 journal = EXT4_JOURNAL(inode);
3370 jbd2_journal_lock_updates(journal); 3370 jbd2_journal_lock_updates(journal);
3371 err = jbd2_journal_flush(journal); 3371 err = jbd2_journal_flush(journal);
3372 jbd2_journal_unlock_updates(journal); 3372 jbd2_journal_unlock_updates(journal);
3373 3373
3374 if (err) 3374 if (err)
3375 return 0; 3375 return 0;
3376 } 3376 }
3377 3377
3378 return generic_block_bmap(mapping, block, ext4_get_block); 3378 return generic_block_bmap(mapping, block, ext4_get_block);
3379 } 3379 }
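The bmap interface handled above is what the FIBMAP ioctl exposes to userspace, and is how lilo-style boot loaders and swapon learn on-disk block numbers. A minimal sketch, assuming a file path in argv[1] and CAP_SYS_RAWIO; this is an illustration, not part of the diff:

/* Query the physical block backing logical block 0 of a file. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIBMAP */

int main(int argc, char **argv)
{
	int fd, blk = 0;	/* logical block in, physical block out */

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (ioctl(fd, FIBMAP, &blk) < 0) {
		perror("FIBMAP");
		return 1;
	}
	printf("logical block 0 -> physical block %d\n", blk);
	return 0;
}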
3380 3380
3381 static int ext4_readpage(struct file *file, struct page *page) 3381 static int ext4_readpage(struct file *file, struct page *page)
3382 { 3382 {
3383 trace_ext4_readpage(page); 3383 trace_ext4_readpage(page);
3384 return mpage_readpage(page, ext4_get_block); 3384 return mpage_readpage(page, ext4_get_block);
3385 } 3385 }
3386 3386
3387 static int 3387 static int
3388 ext4_readpages(struct file *file, struct address_space *mapping, 3388 ext4_readpages(struct file *file, struct address_space *mapping,
3389 struct list_head *pages, unsigned nr_pages) 3389 struct list_head *pages, unsigned nr_pages)
3390 { 3390 {
3391 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3391 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3392 } 3392 }
3393 3393
3394 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) 3394 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3395 { 3395 {
3396 struct buffer_head *head, *bh; 3396 struct buffer_head *head, *bh;
3397 unsigned int curr_off = 0; 3397 unsigned int curr_off = 0;
3398 3398
3399 if (!page_has_buffers(page)) 3399 if (!page_has_buffers(page))
3400 return; 3400 return;
3401 head = bh = page_buffers(page); 3401 head = bh = page_buffers(page);
3402 do { 3402 do {
3403 if (offset <= curr_off && test_clear_buffer_uninit(bh) 3403 if (offset <= curr_off && test_clear_buffer_uninit(bh)
3404 && bh->b_private) { 3404 && bh->b_private) {
3405 ext4_free_io_end(bh->b_private); 3405 ext4_free_io_end(bh->b_private);
3406 bh->b_private = NULL; 3406 bh->b_private = NULL;
3407 bh->b_end_io = NULL; 3407 bh->b_end_io = NULL;
3408 } 3408 }
3409 curr_off = curr_off + bh->b_size; 3409 curr_off = curr_off + bh->b_size;
3410 bh = bh->b_this_page; 3410 bh = bh->b_this_page;
3411 } while (bh != head); 3411 } while (bh != head);
3412 } 3412 }
3413 3413
3414 static void ext4_invalidatepage(struct page *page, unsigned long offset) 3414 static void ext4_invalidatepage(struct page *page, unsigned long offset)
3415 { 3415 {
3416 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3416 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3417 3417
3418 trace_ext4_invalidatepage(page, offset); 3418 trace_ext4_invalidatepage(page, offset);
3419 3419
3420 /* 3420 /*
3421 * free any io_end structure allocated for buffers to be discarded 3421 * free any io_end structure allocated for buffers to be discarded
3422 */ 3422 */
3423 if (ext4_should_dioread_nolock(page->mapping->host)) 3423 if (ext4_should_dioread_nolock(page->mapping->host))
3424 ext4_invalidatepage_free_endio(page, offset); 3424 ext4_invalidatepage_free_endio(page, offset);
3425 /* 3425 /*
3426 * If it's a full truncate we just forget about the pending dirtying 3426 * If it's a full truncate we just forget about the pending dirtying
3427 */ 3427 */
3428 if (offset == 0) 3428 if (offset == 0)
3429 ClearPageChecked(page); 3429 ClearPageChecked(page);
3430 3430
3431 if (journal) 3431 if (journal)
3432 jbd2_journal_invalidatepage(journal, page, offset); 3432 jbd2_journal_invalidatepage(journal, page, offset);
3433 else 3433 else
3434 block_invalidatepage(page, offset); 3434 block_invalidatepage(page, offset);
3435 } 3435 }
3436 3436
3437 static int ext4_releasepage(struct page *page, gfp_t wait) 3437 static int ext4_releasepage(struct page *page, gfp_t wait)
3438 { 3438 {
3439 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3439 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3440 3440
3441 trace_ext4_releasepage(page); 3441 trace_ext4_releasepage(page);
3442 3442
3443 WARN_ON(PageChecked(page)); 3443 WARN_ON(PageChecked(page));
3444 if (!page_has_buffers(page)) 3444 if (!page_has_buffers(page))
3445 return 0; 3445 return 0;
3446 if (journal) 3446 if (journal)
3447 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3447 return jbd2_journal_try_to_free_buffers(journal, page, wait);
3448 else 3448 else
3449 return try_to_free_buffers(page); 3449 return try_to_free_buffers(page);
3450 } 3450 }
3451 3451
3452 /* 3452 /*
3453 * O_DIRECT for ext3 (or indirect map) based files 3453 * O_DIRECT for ext3 (or indirect map) based files
3454 * 3454 *
3455 * If the O_DIRECT write will extend the file then add this inode to the 3455 * If the O_DIRECT write will extend the file then add this inode to the
3456 * orphan list. So recovery will truncate it back to the original size 3456 * orphan list. So recovery will truncate it back to the original size
3457 * if the machine crashes during the write. 3457 * if the machine crashes during the write.
3458 * 3458 *
3459 * If the O_DIRECT write is instantiating holes inside i_size and the machine 3459 * If the O_DIRECT write is instantiating holes inside i_size and the machine
3460 * crashes then stale disk data _may_ be exposed inside the file. But current 3460 * crashes then stale disk data _may_ be exposed inside the file. But current
3461 * VFS code falls back to the buffered path in that case, so we are safe. 3461 * VFS code falls back to the buffered path in that case, so we are safe.
3462 */ 3462 */
3463 static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, 3463 static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3464 const struct iovec *iov, loff_t offset, 3464 const struct iovec *iov, loff_t offset,
3465 unsigned long nr_segs) 3465 unsigned long nr_segs)
3466 { 3466 {
3467 struct file *file = iocb->ki_filp; 3467 struct file *file = iocb->ki_filp;
3468 struct inode *inode = file->f_mapping->host; 3468 struct inode *inode = file->f_mapping->host;
3469 struct ext4_inode_info *ei = EXT4_I(inode); 3469 struct ext4_inode_info *ei = EXT4_I(inode);
3470 handle_t *handle; 3470 handle_t *handle;
3471 ssize_t ret; 3471 ssize_t ret;
3472 int orphan = 0; 3472 int orphan = 0;
3473 size_t count = iov_length(iov, nr_segs); 3473 size_t count = iov_length(iov, nr_segs);
3474 int retries = 0; 3474 int retries = 0;
3475 3475
3476 if (rw == WRITE) { 3476 if (rw == WRITE) {
3477 loff_t final_size = offset + count; 3477 loff_t final_size = offset + count;
3478 3478
3479 if (final_size > inode->i_size) { 3479 if (final_size > inode->i_size) {
3480 /* Credits for sb + inode write */ 3480 /* Credits for sb + inode write */
3481 handle = ext4_journal_start(inode, 2); 3481 handle = ext4_journal_start(inode, 2);
3482 if (IS_ERR(handle)) { 3482 if (IS_ERR(handle)) {
3483 ret = PTR_ERR(handle); 3483 ret = PTR_ERR(handle);
3484 goto out; 3484 goto out;
3485 } 3485 }
3486 ret = ext4_orphan_add(handle, inode); 3486 ret = ext4_orphan_add(handle, inode);
3487 if (ret) { 3487 if (ret) {
3488 ext4_journal_stop(handle); 3488 ext4_journal_stop(handle);
3489 goto out; 3489 goto out;
3490 } 3490 }
3491 orphan = 1; 3491 orphan = 1;
3492 ei->i_disksize = inode->i_size; 3492 ei->i_disksize = inode->i_size;
3493 ext4_journal_stop(handle); 3493 ext4_journal_stop(handle);
3494 } 3494 }
3495 } 3495 }
3496 3496
3497 retry: 3497 retry:
3498 if (rw == READ && ext4_should_dioread_nolock(inode)) 3498 if (rw == READ && ext4_should_dioread_nolock(inode))
3499 ret = __blockdev_direct_IO(rw, iocb, inode, 3499 ret = __blockdev_direct_IO(rw, iocb, inode,
3500 inode->i_sb->s_bdev, iov, 3500 inode->i_sb->s_bdev, iov,
3501 offset, nr_segs, 3501 offset, nr_segs,
3502 ext4_get_block, NULL, NULL, 0); 3502 ext4_get_block, NULL, NULL, 0);
3503 else { 3503 else {
3504 ret = blockdev_direct_IO(rw, iocb, inode, iov, 3504 ret = blockdev_direct_IO(rw, iocb, inode, iov,
3505 offset, nr_segs, ext4_get_block); 3505 offset, nr_segs, ext4_get_block);
3506 3506
3507 if (unlikely((rw & WRITE) && ret < 0)) { 3507 if (unlikely((rw & WRITE) && ret < 0)) {
3508 loff_t isize = i_size_read(inode); 3508 loff_t isize = i_size_read(inode);
3509 loff_t end = offset + iov_length(iov, nr_segs); 3509 loff_t end = offset + iov_length(iov, nr_segs);
3510 3510
3511 if (end > isize) 3511 if (end > isize)
3512 ext4_truncate_failed_write(inode); 3512 ext4_truncate_failed_write(inode);
3513 } 3513 }
3514 } 3514 }
3515 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3515 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3516 goto retry; 3516 goto retry;
3517 3517
3518 if (orphan) { 3518 if (orphan) {
3519 int err; 3519 int err;
3520 3520
3521 /* Credits for sb + inode write */ 3521 /* Credits for sb + inode write */
3522 handle = ext4_journal_start(inode, 2); 3522 handle = ext4_journal_start(inode, 2);
3523 if (IS_ERR(handle)) { 3523 if (IS_ERR(handle)) {
3524 /* This is really bad luck. We've written the data 3524 /* This is really bad luck. We've written the data
3525 * but cannot extend i_size. Bail out and pretend 3525 * but cannot extend i_size. Bail out and pretend
3526 * the write failed... */ 3526 * the write failed... */
3527 ret = PTR_ERR(handle); 3527 ret = PTR_ERR(handle);
3528 if (inode->i_nlink) 3528 if (inode->i_nlink)
3529 ext4_orphan_del(NULL, inode); 3529 ext4_orphan_del(NULL, inode);
3530 3530
3531 goto out; 3531 goto out;
3532 } 3532 }
3533 if (inode->i_nlink) 3533 if (inode->i_nlink)
3534 ext4_orphan_del(handle, inode); 3534 ext4_orphan_del(handle, inode);
3535 if (ret > 0) { 3535 if (ret > 0) {
3536 loff_t end = offset + ret; 3536 loff_t end = offset + ret;
3537 if (end > inode->i_size) { 3537 if (end > inode->i_size) {
3538 ei->i_disksize = end; 3538 ei->i_disksize = end;
3539 i_size_write(inode, end); 3539 i_size_write(inode, end);
3540 /* 3540 /*
3541 * We're going to return a positive `ret' 3541 * We're going to return a positive `ret'
3542 * here due to non-zero-length I/O, so there's 3542 * here due to non-zero-length I/O, so there's
3543 * no way of reporting error returns from 3543 * no way of reporting error returns from
3544 * ext4_mark_inode_dirty() to userspace. So 3544 * ext4_mark_inode_dirty() to userspace. So
3545 * ignore it. 3545 * ignore it.
3546 */ 3546 */
3547 ext4_mark_inode_dirty(handle, inode); 3547 ext4_mark_inode_dirty(handle, inode);
3548 } 3548 }
3549 } 3549 }
3550 err = ext4_journal_stop(handle); 3550 err = ext4_journal_stop(handle);
3551 if (ret == 0) 3551 if (ret == 0)
3552 ret = err; 3552 ret = err;
3553 } 3553 }
3554 out: 3554 out:
3555 return ret; 3555 return ret;
3556 } 3556 }
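For reference, an extending O_DIRECT write of the kind the orphan-list handling above protects can be produced from userspace as follows. This is only a sketch: the file name and the 4 KiB alignment are assumptions, and error handling is abbreviated.

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd = open("dio.dat", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0xab, 4096);
	/*
	 * The write extends the (empty) file, so the inode sits on the
	 * orphan list until i_size is safely updated; a crash in between
	 * is recovered by truncating back to the original size.
	 */
	if (pwrite(fd, buf, 4096, 0) != 4096)
		return 1;
	free(buf);
	return close(fd);
}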
3557 3557
3558 /* 3558 /*
3559 * ext4_get_block used when preparing for a DIO write or buffer write. 3559 * ext4_get_block used when preparing for a DIO write or buffer write.
3560 * We allocate an uninitialized extent if blocks haven't been allocated. 3560 * We allocate an uninitialized extent if blocks haven't been allocated.
3561 * The extent will be converted to initialized after the IO is complete. 3561 * The extent will be converted to initialized after the IO is complete.
3562 */ 3562 */
3563 static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3563 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3564 struct buffer_head *bh_result, int create) 3564 struct buffer_head *bh_result, int create)
3565 { 3565 {
3566 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3566 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3567 inode->i_ino, create); 3567 inode->i_ino, create);
3568 return _ext4_get_block(inode, iblock, bh_result, 3568 return _ext4_get_block(inode, iblock, bh_result,
3569 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3569 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3570 } 3570 }
3571 3571
3572 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3572 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3573 ssize_t size, void *private, int ret, 3573 ssize_t size, void *private, int ret,
3574 bool is_async) 3574 bool is_async)
3575 { 3575 {
3576 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 3576 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
3577 ext4_io_end_t *io_end = iocb->private; 3577 ext4_io_end_t *io_end = iocb->private;
3578 struct workqueue_struct *wq; 3578 struct workqueue_struct *wq;
3579 unsigned long flags; 3579 unsigned long flags;
3580 struct ext4_inode_info *ei; 3580 struct ext4_inode_info *ei;
3581 3581
3582 /* if not async direct IO or dio with 0 bytes write, just return */ 3582 /* if not async direct IO or dio with 0 bytes write, just return */
3583 if (!io_end || !size) 3583 if (!io_end || !size)
3584 goto out; 3584 goto out;
3585 3585
3586 ext_debug("ext4_end_io_dio(): io_end 0x%p" 3586 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3587 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 3587 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3588 iocb->private, io_end->inode->i_ino, iocb, offset, 3588 iocb->private, io_end->inode->i_ino, iocb, offset,
3589 size); 3589 size);
3590 3590
3591 /* if not aio dio with unwritten extents, just free io and return */ 3591 /* if not aio dio with unwritten extents, just free io and return */
3592 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 3592 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3593 ext4_free_io_end(io_end); 3593 ext4_free_io_end(io_end);
3594 iocb->private = NULL; 3594 iocb->private = NULL;
3595 out: 3595 out:
3596 if (is_async) 3596 if (is_async)
3597 aio_complete(iocb, ret, 0); 3597 aio_complete(iocb, ret, 0);
3598 inode_dio_done(inode); 3598 inode_dio_done(inode);
3599 return; 3599 return;
3600 } 3600 }
3601 3601
3602 io_end->offset = offset; 3602 io_end->offset = offset;
3603 io_end->size = size; 3603 io_end->size = size;
3604 if (is_async) { 3604 if (is_async) {
3605 io_end->iocb = iocb; 3605 io_end->iocb = iocb;
3606 io_end->result = ret; 3606 io_end->result = ret;
3607 } 3607 }
3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3609 3609
3610 /* Add the io_end to per-inode completed aio dio list*/ 3610 /* Add the io_end to per-inode completed aio dio list*/
3611 ei = EXT4_I(io_end->inode); 3611 ei = EXT4_I(io_end->inode);
3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3613 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3613 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3615 3615
3616 /* queue the work to convert unwritten extents to written */ 3616 /* queue the work to convert unwritten extents to written */
3617 queue_work(wq, &io_end->work); 3617 queue_work(wq, &io_end->work);
3618 iocb->private = NULL; 3618 iocb->private = NULL;
3619 3619
3620 /* XXX: probably should move into the real I/O completion handler */ 3620 /* XXX: probably should move into the real I/O completion handler */
3621 inode_dio_done(inode); 3621 inode_dio_done(inode);
3622 } 3622 }
3623 3623
3624 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 3624 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3625 { 3625 {
3626 ext4_io_end_t *io_end = bh->b_private; 3626 ext4_io_end_t *io_end = bh->b_private;
3627 struct workqueue_struct *wq; 3627 struct workqueue_struct *wq;
3628 struct inode *inode; 3628 struct inode *inode;
3629 unsigned long flags; 3629 unsigned long flags;
3630 3630
3631 if (!test_clear_buffer_uninit(bh) || !io_end) 3631 if (!test_clear_buffer_uninit(bh) || !io_end)
3632 goto out; 3632 goto out;
3633 3633
3634 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { 3634 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3635 printk("sb umounted, discard end_io request for inode %lu\n", 3635 printk("sb umounted, discard end_io request for inode %lu\n",
3636 io_end->inode->i_ino); 3636 io_end->inode->i_ino);
3637 ext4_free_io_end(io_end); 3637 ext4_free_io_end(io_end);
3638 goto out; 3638 goto out;
3639 } 3639 }
3640 3640
3641 io_end->flag = EXT4_IO_END_UNWRITTEN; 3641 io_end->flag = EXT4_IO_END_UNWRITTEN;
3642 inode = io_end->inode; 3642 inode = io_end->inode;
3643 3643
3644 /* Add the io_end to per-inode completed io list*/ 3644 /* Add the io_end to per-inode completed io list*/
3645 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 3645 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3646 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); 3646 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3647 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); 3647 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3648 3648
3649 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; 3649 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3650 /* queue the work to convert unwritten extents to written */ 3650 /* queue the work to convert unwritten extents to written */
3651 queue_work(wq, &io_end->work); 3651 queue_work(wq, &io_end->work);
3652 out: 3652 out:
3653 bh->b_private = NULL; 3653 bh->b_private = NULL;
3654 bh->b_end_io = NULL; 3654 bh->b_end_io = NULL;
3655 clear_buffer_uninit(bh); 3655 clear_buffer_uninit(bh);
3656 end_buffer_async_write(bh, uptodate); 3656 end_buffer_async_write(bh, uptodate);
3657 } 3657 }
3658 3658
3659 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) 3659 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3660 { 3660 {
3661 ext4_io_end_t *io_end; 3661 ext4_io_end_t *io_end;
3662 struct page *page = bh->b_page; 3662 struct page *page = bh->b_page;
3663 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; 3663 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3664 size_t size = bh->b_size; 3664 size_t size = bh->b_size;
3665 3665
3666 retry: 3666 retry:
3667 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3667 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3668 if (!io_end) { 3668 if (!io_end) {
3669 pr_warn_ratelimited("%s: allocation fail\n", __func__); 3669 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3670 schedule(); 3670 schedule();
3671 goto retry; 3671 goto retry;
3672 } 3672 }
3673 io_end->offset = offset; 3673 io_end->offset = offset;
3674 io_end->size = size; 3674 io_end->size = size;
3675 /* 3675 /*
3676 * We need to hold a reference to the page to make sure it 3676 * We need to hold a reference to the page to make sure it
3677 * doesn't get evicted before ext4_end_io_work() has a chance 3677 * doesn't get evicted before ext4_end_io_work() has a chance
3678 * to convert the extent from unwritten to written. 3678 * to convert the extent from unwritten to written.
3679 */ 3679 */
3680 io_end->page = page; 3680 io_end->page = page;
3681 get_page(io_end->page); 3681 get_page(io_end->page);
3682 3682
3683 bh->b_private = io_end; 3683 bh->b_private = io_end;
3684 bh->b_end_io = ext4_end_io_buffer_write; 3684 bh->b_end_io = ext4_end_io_buffer_write;
3685 return 0; 3685 return 0;
3686 } 3686 }
3687 3687
3688 /* 3688 /*
3689 * For ext4 extent files, ext4 will do direct-io write to holes, 3689 * For ext4 extent files, ext4 will do direct-io write to holes,
3690 * preallocated extents, and writes that extend the file, with no need to 3690 * preallocated extents, and writes that extend the file, with no need to
3691 * fall back to buffered IO. 3691 * fall back to buffered IO.
3692 * 3692 *
3693 * For holes, we fallocate those blocks and mark them as uninitialized. 3693 * For holes, we fallocate those blocks and mark them as uninitialized.
3694 * If those blocks were preallocated, we make sure they are split, but 3694 * If those blocks were preallocated, we make sure they are split, but
3695 * still keep the range to write as uninitialized. 3695 * still keep the range to write as uninitialized.
3696 * 3696 *
3697 * The unwritten extents will be converted to written when DIO is completed. 3697 * The unwritten extents will be converted to written when DIO is completed.
3698 * For async direct IO, since the IO may still be pending on return, we 3698 * For async direct IO, since the IO may still be pending on return, we
3699 * set up an end_io callback function, which will do the conversion 3699 * set up an end_io callback function, which will do the conversion
3700 * when the async direct IO completes. 3700 * when the async direct IO completes.
3701 * 3701 *
3702 * If the O_DIRECT write will extend the file then add this inode to the 3702 * If the O_DIRECT write will extend the file then add this inode to the
3703 * orphan list. So recovery will truncate it back to the original size 3703 * orphan list. So recovery will truncate it back to the original size
3704 * if the machine crashes during the write. 3704 * if the machine crashes during the write.
3705 * 3705 *
3706 */ 3706 */
3707 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, 3707 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3708 const struct iovec *iov, loff_t offset, 3708 const struct iovec *iov, loff_t offset,
3709 unsigned long nr_segs) 3709 unsigned long nr_segs)
3710 { 3710 {
3711 struct file *file = iocb->ki_filp; 3711 struct file *file = iocb->ki_filp;
3712 struct inode *inode = file->f_mapping->host; 3712 struct inode *inode = file->f_mapping->host;
3713 ssize_t ret; 3713 ssize_t ret;
3714 size_t count = iov_length(iov, nr_segs); 3714 size_t count = iov_length(iov, nr_segs);
3715 3715
3716 loff_t final_size = offset + count; 3716 loff_t final_size = offset + count;
3717 if (rw == WRITE && final_size <= inode->i_size) { 3717 if (rw == WRITE && final_size <= inode->i_size) {
3718 /* 3718 /*
3719 * We could direct write to holes and fallocate. 3719 * We could direct write to holes and fallocate.
3720 * 3720 *
3721 * Allocated blocks to fill the hole are marked as uninitialized 3721 * Allocated blocks to fill the hole are marked as uninitialized
3722 * to prevent a parallel buffered read from exposing the stale data 3722 * to prevent a parallel buffered read from exposing the stale data
3723 * before DIO completes the data IO. 3723 * before DIO completes the data IO.
3724 * 3724 *
3725 * As for previously fallocated extents, ext4 get_block 3725 * As for previously fallocated extents, ext4 get_block
3726 * will simply mark the buffer mapped but still 3726 * will simply mark the buffer mapped but still
3727 * keep the extents uninitialized. 3727 * keep the extents uninitialized.
3728 * 3728 *
3729 * For the non-AIO case, we will convert those unwritten extents 3729 * For the non-AIO case, we will convert those unwritten extents
3730 * to written after returning from blockdev_direct_IO. 3730 * to written after returning from blockdev_direct_IO.
3731 * 3731 *
3732 * For async DIO, the conversion needs to be deferred until 3732 * For async DIO, the conversion needs to be deferred until
3733 * the IO is completed. The ext4 end_io callback function 3733 * the IO is completed. The ext4 end_io callback function
3734 * will be called to take care of the conversion work. 3734 * will be called to take care of the conversion work.
3735 * Here for async case, we allocate an io_end structure to 3735 * Here for async case, we allocate an io_end structure to
3736 * hook to the iocb. 3736 * hook to the iocb.
3737 */ 3737 */
3738 iocb->private = NULL; 3738 iocb->private = NULL;
3739 EXT4_I(inode)->cur_aio_dio = NULL; 3739 EXT4_I(inode)->cur_aio_dio = NULL;
3740 if (!is_sync_kiocb(iocb)) { 3740 if (!is_sync_kiocb(iocb)) {
3741 iocb->private = ext4_init_io_end(inode, GFP_NOFS); 3741 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3742 if (!iocb->private) 3742 if (!iocb->private)
3743 return -ENOMEM; 3743 return -ENOMEM;
3744 /* 3744 /*
3745 * we save the io structure for the current async 3745 * we save the io structure for the current async
3746 * direct IO, so that later ext4_map_blocks() 3746 * direct IO, so that later ext4_map_blocks()
3747 * can flag in the io structure whether there 3747 * can flag in the io structure whether there
3748 * are unwritten extents that need to be converted 3748 * are unwritten extents that need to be converted
3749 * when the IO is completed. 3749 * when the IO is completed.
3750 */ 3750 */
3751 EXT4_I(inode)->cur_aio_dio = iocb->private; 3751 EXT4_I(inode)->cur_aio_dio = iocb->private;
3752 } 3752 }
3753 3753
3754 ret = __blockdev_direct_IO(rw, iocb, inode, 3754 ret = __blockdev_direct_IO(rw, iocb, inode,
3755 inode->i_sb->s_bdev, iov, 3755 inode->i_sb->s_bdev, iov,
3756 offset, nr_segs, 3756 offset, nr_segs,
3757 ext4_get_block_write, 3757 ext4_get_block_write,
3758 ext4_end_io_dio, 3758 ext4_end_io_dio,
3759 NULL, 3759 NULL,
3760 DIO_LOCKING | DIO_SKIP_HOLES); 3760 DIO_LOCKING | DIO_SKIP_HOLES);
3761 if (iocb->private) 3761 if (iocb->private)
3762 EXT4_I(inode)->cur_aio_dio = NULL; 3762 EXT4_I(inode)->cur_aio_dio = NULL;
3763 /* 3763 /*
3764 * The io_end structure takes a reference to the inode; 3764 * The io_end structure takes a reference to the inode;
3765 * that structure needs to be destroyed and the 3765 * that structure needs to be destroyed and the
3766 * reference to the inode needs to be dropped when IO is 3766 * reference to the inode needs to be dropped when IO is
3767 * complete, even for a 0 byte write or a failed one. 3767 * complete, even for a 0 byte write or a failed one.
3768 * 3768 *
3769 * In the successful AIO DIO case, the io_end structure will be 3769 * In the successful AIO DIO case, the io_end structure will be
3770 * destroyed and the reference to the inode will be dropped 3770 * destroyed and the reference to the inode will be dropped
3771 * after the end_io call back function is called. 3771 * after the end_io call back function is called.
3772 * 3772 *
3773 * In the case of a 0 byte write or an error, since 3773 * In the case of a 0 byte write or an error, since
3774 * VFS direct IO won't invoke the end_io callback function, 3774 * VFS direct IO won't invoke the end_io callback function,
3775 * we need to free the end_io structure here. 3775 * we need to free the end_io structure here.
3776 */ 3776 */
3777 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3777 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3778 ext4_free_io_end(iocb->private); 3778 ext4_free_io_end(iocb->private);
3779 iocb->private = NULL; 3779 iocb->private = NULL;
3780 } else if (ret > 0 && ext4_test_inode_state(inode, 3780 } else if (ret > 0 && ext4_test_inode_state(inode,
3781 EXT4_STATE_DIO_UNWRITTEN)) { 3781 EXT4_STATE_DIO_UNWRITTEN)) {
3782 int err; 3782 int err;
3783 /* 3783 /*
3784 * for non AIO case, since the IO is already 3784 * for non AIO case, since the IO is already
3785 * completed, we could do the conversion right here 3785 * completed, we could do the conversion right here
3786 */ 3786 */
3787 err = ext4_convert_unwritten_extents(inode, 3787 err = ext4_convert_unwritten_extents(inode,
3788 offset, ret); 3788 offset, ret);
3789 if (err < 0) 3789 if (err < 0)
3790 ret = err; 3790 ret = err;
3791 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3791 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3792 } 3792 }
3793 return ret; 3793 return ret;
3794 } 3794 }
3795 3795
3796 /* for the write to the end of file case, we fall back to the old way */ 3796 /* for the write to the end of file case, we fall back to the old way */
3797 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3797 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3798 } 3798 }
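The async branch above (attaching an io_end to the iocb so the end_io callback can defer the unwritten-to-written conversion) is what kernel AIO drives, for example through libaio from userspace. A minimal sketch, assuming libaio is available (link with -laio); the file name, sizes, and the ftruncate trick to keep the write inside i_size are illustrative assumptions, not part of this diff:

#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	void *buf;
	int fd = open("aio.dat", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096) || io_setup(1, &ctx))
		return 1;
	memset(buf, 0, 4096);

	/* Set i_size first so the DIO write lands inside the file and
	 * takes the extent/async path above instead of the fallback. */
	if (ftruncate(fd, 4096))
		return 1;

	/* Queue one async O_DIRECT write; ext4 hooks an io_end to the
	 * iocb and converts the unwritten extent once the IO completes. */
	io_prep_pwrite(&cb, fd, buf, 4096, 0);
	if (io_submit(ctx, 1, cbs) != 1)
		return 1;
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		return 1;

	io_destroy(ctx);
	free(buf);
	return close(fd);
}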
3799 3799
3800 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3800 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3801 const struct iovec *iov, loff_t offset, 3801 const struct iovec *iov, loff_t offset,
3802 unsigned long nr_segs) 3802 unsigned long nr_segs)
3803 { 3803 {
3804 struct file *file = iocb->ki_filp; 3804 struct file *file = iocb->ki_filp;
3805 struct inode *inode = file->f_mapping->host; 3805 struct inode *inode = file->f_mapping->host;
3806 ssize_t ret; 3806 ssize_t ret;
3807 3807
3808 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3808 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3809 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3809 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3810 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3810 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3811 else 3811 else
3812 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3812 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3813 trace_ext4_direct_IO_exit(inode, offset, 3813 trace_ext4_direct_IO_exit(inode, offset,
3814 iov_length(iov, nr_segs), rw, ret); 3814 iov_length(iov, nr_segs), rw, ret);
3815 return ret; 3815 return ret;
3816 } 3816 }
3817 3817
3818 /* 3818 /*
3819 * Pages can be marked dirty completely asynchronously from ext4's journalling 3819 * Pages can be marked dirty completely asynchronously from ext4's journalling
3820 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3820 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
3821 * much here because ->set_page_dirty is called under VFS locks. The page is 3821 * much here because ->set_page_dirty is called under VFS locks. The page is
3822 * not necessarily locked. 3822 * not necessarily locked.
3823 * 3823 *
3824 * We cannot just dirty the page and leave attached buffers clean, because the 3824 * We cannot just dirty the page and leave attached buffers clean, because the
3825 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3825 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
3826 * or jbddirty because all the journalling code will explode. 3826 * or jbddirty because all the journalling code will explode.
3827 * 3827 *
3828 * So what we do is to mark the page "pending dirty" and next time writepage 3828 * So what we do is to mark the page "pending dirty" and next time writepage
3829 * is called, propagate that into the buffers appropriately. 3829 * is called, propagate that into the buffers appropriately.
3830 */ 3830 */
3831 static int ext4_journalled_set_page_dirty(struct page *page) 3831 static int ext4_journalled_set_page_dirty(struct page *page)
3832 { 3832 {
3833 SetPageChecked(page); 3833 SetPageChecked(page);
3834 return __set_page_dirty_nobuffers(page); 3834 return __set_page_dirty_nobuffers(page);
3835 } 3835 }
3836 3836
3837 static const struct address_space_operations ext4_ordered_aops = { 3837 static const struct address_space_operations ext4_ordered_aops = {
3838 .readpage = ext4_readpage, 3838 .readpage = ext4_readpage,
3839 .readpages = ext4_readpages, 3839 .readpages = ext4_readpages,
3840 .writepage = ext4_writepage, 3840 .writepage = ext4_writepage,
3841 .write_begin = ext4_write_begin, 3841 .write_begin = ext4_write_begin,
3842 .write_end = ext4_ordered_write_end, 3842 .write_end = ext4_ordered_write_end,
3843 .bmap = ext4_bmap, 3843 .bmap = ext4_bmap,
3844 .invalidatepage = ext4_invalidatepage, 3844 .invalidatepage = ext4_invalidatepage,
3845 .releasepage = ext4_releasepage, 3845 .releasepage = ext4_releasepage,
3846 .direct_IO = ext4_direct_IO, 3846 .direct_IO = ext4_direct_IO,
3847 .migratepage = buffer_migrate_page, 3847 .migratepage = buffer_migrate_page,
3848 .is_partially_uptodate = block_is_partially_uptodate, 3848 .is_partially_uptodate = block_is_partially_uptodate,
3849 .error_remove_page = generic_error_remove_page, 3849 .error_remove_page = generic_error_remove_page,
3850 }; 3850 };
3851 3851
3852 static const struct address_space_operations ext4_writeback_aops = { 3852 static const struct address_space_operations ext4_writeback_aops = {
3853 .readpage = ext4_readpage, 3853 .readpage = ext4_readpage,
3854 .readpages = ext4_readpages, 3854 .readpages = ext4_readpages,
3855 .writepage = ext4_writepage, 3855 .writepage = ext4_writepage,
3856 .write_begin = ext4_write_begin, 3856 .write_begin = ext4_write_begin,
3857 .write_end = ext4_writeback_write_end, 3857 .write_end = ext4_writeback_write_end,
3858 .bmap = ext4_bmap, 3858 .bmap = ext4_bmap,
3859 .invalidatepage = ext4_invalidatepage, 3859 .invalidatepage = ext4_invalidatepage,
3860 .releasepage = ext4_releasepage, 3860 .releasepage = ext4_releasepage,
3861 .direct_IO = ext4_direct_IO, 3861 .direct_IO = ext4_direct_IO,
3862 .migratepage = buffer_migrate_page, 3862 .migratepage = buffer_migrate_page,
3863 .is_partially_uptodate = block_is_partially_uptodate, 3863 .is_partially_uptodate = block_is_partially_uptodate,
3864 .error_remove_page = generic_error_remove_page, 3864 .error_remove_page = generic_error_remove_page,
3865 }; 3865 };
3866 3866
3867 static const struct address_space_operations ext4_journalled_aops = { 3867 static const struct address_space_operations ext4_journalled_aops = {
3868 .readpage = ext4_readpage, 3868 .readpage = ext4_readpage,
3869 .readpages = ext4_readpages, 3869 .readpages = ext4_readpages,
3870 .writepage = ext4_writepage, 3870 .writepage = ext4_writepage,
3871 .write_begin = ext4_write_begin, 3871 .write_begin = ext4_write_begin,
3872 .write_end = ext4_journalled_write_end, 3872 .write_end = ext4_journalled_write_end,
3873 .set_page_dirty = ext4_journalled_set_page_dirty, 3873 .set_page_dirty = ext4_journalled_set_page_dirty,
3874 .bmap = ext4_bmap, 3874 .bmap = ext4_bmap,
3875 .invalidatepage = ext4_invalidatepage, 3875 .invalidatepage = ext4_invalidatepage,
3876 .releasepage = ext4_releasepage, 3876 .releasepage = ext4_releasepage,
3877 .is_partially_uptodate = block_is_partially_uptodate, 3877 .is_partially_uptodate = block_is_partially_uptodate,
3878 .error_remove_page = generic_error_remove_page, 3878 .error_remove_page = generic_error_remove_page,
3879 }; 3879 };
3880 3880
3881 static const struct address_space_operations ext4_da_aops = { 3881 static const struct address_space_operations ext4_da_aops = {
3882 .readpage = ext4_readpage, 3882 .readpage = ext4_readpage,
3883 .readpages = ext4_readpages, 3883 .readpages = ext4_readpages,
3884 .writepage = ext4_writepage, 3884 .writepage = ext4_writepage,
3885 .writepages = ext4_da_writepages, 3885 .writepages = ext4_da_writepages,
3886 .write_begin = ext4_da_write_begin, 3886 .write_begin = ext4_da_write_begin,
3887 .write_end = ext4_da_write_end, 3887 .write_end = ext4_da_write_end,
3888 .bmap = ext4_bmap, 3888 .bmap = ext4_bmap,
3889 .invalidatepage = ext4_da_invalidatepage, 3889 .invalidatepage = ext4_da_invalidatepage,
3890 .releasepage = ext4_releasepage, 3890 .releasepage = ext4_releasepage,
3891 .direct_IO = ext4_direct_IO, 3891 .direct_IO = ext4_direct_IO,
3892 .migratepage = buffer_migrate_page, 3892 .migratepage = buffer_migrate_page,
3893 .is_partially_uptodate = block_is_partially_uptodate, 3893 .is_partially_uptodate = block_is_partially_uptodate,
3894 .error_remove_page = generic_error_remove_page, 3894 .error_remove_page = generic_error_remove_page,
3895 }; 3895 };
3896 3896
3897 void ext4_set_aops(struct inode *inode) 3897 void ext4_set_aops(struct inode *inode)
3898 { 3898 {
3899 if (ext4_should_order_data(inode) && 3899 if (ext4_should_order_data(inode) &&
3900 test_opt(inode->i_sb, DELALLOC)) 3900 test_opt(inode->i_sb, DELALLOC))
3901 inode->i_mapping->a_ops = &ext4_da_aops; 3901 inode->i_mapping->a_ops = &ext4_da_aops;
3902 else if (ext4_should_order_data(inode)) 3902 else if (ext4_should_order_data(inode))
3903 inode->i_mapping->a_ops = &ext4_ordered_aops; 3903 inode->i_mapping->a_ops = &ext4_ordered_aops;
3904 else if (ext4_should_writeback_data(inode) && 3904 else if (ext4_should_writeback_data(inode) &&
3905 test_opt(inode->i_sb, DELALLOC)) 3905 test_opt(inode->i_sb, DELALLOC))
3906 inode->i_mapping->a_ops = &ext4_da_aops; 3906 inode->i_mapping->a_ops = &ext4_da_aops;
3907 else if (ext4_should_writeback_data(inode)) 3907 else if (ext4_should_writeback_data(inode))
3908 inode->i_mapping->a_ops = &ext4_writeback_aops; 3908 inode->i_mapping->a_ops = &ext4_writeback_aops;
3909 else 3909 else
3910 inode->i_mapping->a_ops = &ext4_journalled_aops; 3910 inode->i_mapping->a_ops = &ext4_journalled_aops;
3911 } 3911 }
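Which of the address_space_operations tables above gets installed follows from the data journaling mode and the delalloc mount option. Purely as an illustration (device and mount point are hypothetical, and this is not part of the diff), the mode can be selected via the data argument of mount(2); data=journal implies nodelalloc, so ext4_set_aops() then picks ext4_journalled_aops for regular files:

#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
	/* Mount with full data journaling; delalloc is disabled in this
	 * mode, so regular files use the journalled aops table. */
	if (mount("/dev/sdb1", "/mnt/test", "ext4", 0, "data=journal")) {
		perror("mount");
		return 1;
	}
	return 0;
}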
3912 3912
3913 /* 3913 /*
3914 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3914 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3915 * up to the end of the block which corresponds to `from'. 3915 * up to the end of the block which corresponds to `from'.
3916 * This is required during truncate. We need to physically zero the tail end 3916 * This is required during truncate. We need to physically zero the tail end
3917 * of that block so it doesn't yield old data if the file is later grown. 3917 * of that block so it doesn't yield old data if the file is later grown.
3918 */ 3918 */
3919 int ext4_block_truncate_page(handle_t *handle, 3919 int ext4_block_truncate_page(handle_t *handle,
3920 struct address_space *mapping, loff_t from) 3920 struct address_space *mapping, loff_t from)
3921 { 3921 {
3922 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3922 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3923 unsigned length; 3923 unsigned length;
3924 unsigned blocksize; 3924 unsigned blocksize;
3925 struct inode *inode = mapping->host; 3925 struct inode *inode = mapping->host;
3926 3926
3927 blocksize = inode->i_sb->s_blocksize; 3927 blocksize = inode->i_sb->s_blocksize;
3928 length = blocksize - (offset & (blocksize - 1)); 3928 length = blocksize - (offset & (blocksize - 1));
3929 3929
3930 return ext4_block_zero_page_range(handle, mapping, from, length); 3930 return ext4_block_zero_page_range(handle, mapping, from, length);
3931 } 3931 }
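As a worked example of the arithmetic above (the values are hypothetical): with a 4096-byte block size and page size, truncating to from = 10000 zeroes the tail of the partial block, i.e. 2288 bytes starting at page offset 1808:

#include <stdio.h>

int main(void)
{
	unsigned long long from = 10000;	/* hypothetical truncate point */
	unsigned blocksize = 4096, pagesize = 4096;

	unsigned offset = from & (pagesize - 1);		  /* 1808 */
	unsigned length = blocksize - (offset & (blocksize - 1)); /* 2288 */

	printf("zero %u bytes at page offset %u\n", length, offset);
	return 0;
}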
3932 3932
3933 /* 3933 /*
3934 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3934 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3935 * starting from file offset 'from'. The range to be zero'd must 3935 * starting from file offset 'from'. The range to be zero'd must
3936 * be contained within one block. If the specified range exceeds 3936 * be contained within one block. If the specified range exceeds
3937 * the end of the block, it will be shortened to the end of the block 3937 * the end of the block, it will be shortened to the end of the block
3938 * that corresponds to 'from'. 3938 * that corresponds to 'from'.
3939 */ 3939 */
3940 int ext4_block_zero_page_range(handle_t *handle, 3940 int ext4_block_zero_page_range(handle_t *handle,
3941 struct address_space *mapping, loff_t from, loff_t length) 3941 struct address_space *mapping, loff_t from, loff_t length)
3942 { 3942 {
3943 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3943 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3944 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3944 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3945 unsigned blocksize, max, pos; 3945 unsigned blocksize, max, pos;
3946 ext4_lblk_t iblock; 3946 ext4_lblk_t iblock;
3947 struct inode *inode = mapping->host; 3947 struct inode *inode = mapping->host;
3948 struct buffer_head *bh; 3948 struct buffer_head *bh;
3949 struct page *page; 3949 struct page *page;
3950 int err = 0; 3950 int err = 0;
3951 3951
3952 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3952 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3953 mapping_gfp_mask(mapping) & ~__GFP_FS); 3953 mapping_gfp_mask(mapping) & ~__GFP_FS);
3954 if (!page) 3954 if (!page)
3955 return -EINVAL; 3955 return -EINVAL;
3956 3956
3957 blocksize = inode->i_sb->s_blocksize; 3957 blocksize = inode->i_sb->s_blocksize;
3958 max = blocksize - (offset & (blocksize - 1)); 3958 max = blocksize - (offset & (blocksize - 1));
3959 3959
3960 /* 3960 /*
3961 * correct length if it does not fall between 3961 * correct length if it does not fall between
3962 * 'from' and the end of the block 3962 * 'from' and the end of the block
3963 */ 3963 */
3964 if (length > max || length < 0) 3964 if (length > max || length < 0)
3965 length = max; 3965 length = max;
3966 3966
3967 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3967 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3968 3968
3969 if (!page_has_buffers(page)) 3969 if (!page_has_buffers(page))
3970 create_empty_buffers(page, blocksize, 0); 3970 create_empty_buffers(page, blocksize, 0);
3971 3971
3972 /* Find the buffer that contains "offset" */ 3972 /* Find the buffer that contains "offset" */
3973 bh = page_buffers(page); 3973 bh = page_buffers(page);
3974 pos = blocksize; 3974 pos = blocksize;
3975 while (offset >= pos) { 3975 while (offset >= pos) {
3976 bh = bh->b_this_page; 3976 bh = bh->b_this_page;
3977 iblock++; 3977 iblock++;
3978 pos += blocksize; 3978 pos += blocksize;
3979 } 3979 }
3980 3980
3981 err = 0; 3981 err = 0;
3982 if (buffer_freed(bh)) { 3982 if (buffer_freed(bh)) {
3983 BUFFER_TRACE(bh, "freed: skip"); 3983 BUFFER_TRACE(bh, "freed: skip");
3984 goto unlock; 3984 goto unlock;
3985 } 3985 }
3986 3986
3987 if (!buffer_mapped(bh)) { 3987 if (!buffer_mapped(bh)) {
3988 BUFFER_TRACE(bh, "unmapped"); 3988 BUFFER_TRACE(bh, "unmapped");
3989 ext4_get_block(inode, iblock, bh, 0); 3989 ext4_get_block(inode, iblock, bh, 0);
3990 /* unmapped? It's a hole - nothing to do */ 3990 /* unmapped? It's a hole - nothing to do */
3991 if (!buffer_mapped(bh)) { 3991 if (!buffer_mapped(bh)) {
3992 BUFFER_TRACE(bh, "still unmapped"); 3992 BUFFER_TRACE(bh, "still unmapped");
3993 goto unlock; 3993 goto unlock;
3994 } 3994 }
3995 } 3995 }
3996 3996
3997 /* Ok, it's mapped. Make sure it's up-to-date */ 3997 /* Ok, it's mapped. Make sure it's up-to-date */
3998 if (PageUptodate(page)) 3998 if (PageUptodate(page))
3999 set_buffer_uptodate(bh); 3999 set_buffer_uptodate(bh);
4000 4000
4001 if (!buffer_uptodate(bh)) { 4001 if (!buffer_uptodate(bh)) {
4002 err = -EIO; 4002 err = -EIO;
4003 ll_rw_block(READ, 1, &bh); 4003 ll_rw_block(READ, 1, &bh);
4004 wait_on_buffer(bh); 4004 wait_on_buffer(bh);
4005 /* Uhhuh. Read error. Complain and punt. */ 4005 /* Uhhuh. Read error. Complain and punt. */
4006 if (!buffer_uptodate(bh)) 4006 if (!buffer_uptodate(bh))
4007 goto unlock; 4007 goto unlock;
4008 } 4008 }
4009 4009
4010 if (ext4_should_journal_data(inode)) { 4010 if (ext4_should_journal_data(inode)) {
4011 BUFFER_TRACE(bh, "get write access"); 4011 BUFFER_TRACE(bh, "get write access");
4012 err = ext4_journal_get_write_access(handle, bh); 4012 err = ext4_journal_get_write_access(handle, bh);
4013 if (err) 4013 if (err)
4014 goto unlock; 4014 goto unlock;
4015 } 4015 }
4016 4016
4017 zero_user(page, offset, length); 4017 zero_user(page, offset, length);
4018 4018
4019 BUFFER_TRACE(bh, "zeroed end of block"); 4019 BUFFER_TRACE(bh, "zeroed end of block");
4020 4020
4021 err = 0; 4021 err = 0;
4022 if (ext4_should_journal_data(inode)) { 4022 if (ext4_should_journal_data(inode)) {
4023 err = ext4_handle_dirty_metadata(handle, inode, bh); 4023 err = ext4_handle_dirty_metadata(handle, inode, bh);
4024 } else { 4024 } else {
4025 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) 4025 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4026 err = ext4_jbd2_file_inode(handle, inode); 4026 err = ext4_jbd2_file_inode(handle, inode);
4027 mark_buffer_dirty(bh); 4027 mark_buffer_dirty(bh);
4028 } 4028 }
4029 4029
4030 unlock: 4030 unlock:
4031 unlock_page(page); 4031 unlock_page(page);
4032 page_cache_release(page); 4032 page_cache_release(page);
4033 return err; 4033 return err;
4034 } 4034 }
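As a quick illustration of the offset/length clamping above, here is a minimal stand-alone sketch using hypothetical numbers (4096-byte blocks, a new i_size of 10000); it mirrors only the arithmetic, not the buffer-head or journalling work.

#include <stdio.h>

/* Illustrative only: clamp the byte range zeroed in the last partial block. */
int main(void)
{
        unsigned blocksize = 4096;                             /* hypothetical fs block size */
        unsigned long long from = 10000;                       /* hypothetical new i_size */
        unsigned offset = from & (blocksize - 1);              /* 1808 */
        unsigned max = blocksize - (offset & (blocksize - 1)); /* 2288 */
        unsigned length = max;                                 /* clamp, as the function does */

        printf("zero bytes [%u, %u) of the final block\n", offset, offset + length);
        return 0;
}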
4035 4035
4036 /* 4036 /*
4037 * Probably it should be a library function... search for first non-zero word 4037 * Probably it should be a library function... search for first non-zero word
4038 * or memcmp with zero_page, whatever is better for particular architecture. 4038 * or memcmp with zero_page, whatever is better for particular architecture.
4039 * Linus? 4039 * Linus?
4040 */ 4040 */
4041 static inline int all_zeroes(__le32 *p, __le32 *q) 4041 static inline int all_zeroes(__le32 *p, __le32 *q)
4042 { 4042 {
4043 while (p < q) 4043 while (p < q)
4044 if (*p++) 4044 if (*p++)
4045 return 0; 4045 return 0;
4046 return 1; 4046 return 1;
4047 } 4047 }
4048 4048
4049 /** 4049 /**
4050 * ext4_find_shared - find the indirect blocks for partial truncation. 4050 * ext4_find_shared - find the indirect blocks for partial truncation.
4051 * @inode: inode in question 4051 * @inode: inode in question
4052 * @depth: depth of the affected branch 4052 * @depth: depth of the affected branch
4053 * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 4053 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
4054 * @chain: place to store the pointers to partial indirect blocks 4054 * @chain: place to store the pointers to partial indirect blocks
4055 * @top: place to the (detached) top of branch 4055 * @top: place to the (detached) top of branch
4056 * 4056 *
4057 * This is a helper function used by ext4_truncate(). 4057 * This is a helper function used by ext4_truncate().
4058 * 4058 *
4059 * When we do truncate() we may have to clean the ends of several 4059 * When we do truncate() we may have to clean the ends of several
4060 * indirect blocks but leave the blocks themselves alive. Block is 4060 * indirect blocks but leave the blocks themselves alive. Block is
4061 * partially truncated if some data below the new i_size is referred 4061 * partially truncated if some data below the new i_size is referred
4062 * from it (and it is on the path to the first completely truncated 4062 * from it (and it is on the path to the first completely truncated
4063 * data block, indeed). We have to free the top of that path along 4063 * data block, indeed). We have to free the top of that path along
4064 * with everything to the right of the path. Since no allocation 4064 * with everything to the right of the path. Since no allocation
4065 * past the truncation point is possible until ext4_truncate() 4065 * past the truncation point is possible until ext4_truncate()
4066 * finishes, we may safely do the latter, but top of branch may 4066 * finishes, we may safely do the latter, but top of branch may
4067 * require special attention - pageout below the truncation point 4067 * require special attention - pageout below the truncation point
4068 * might try to populate it. 4068 * might try to populate it.
4069 * 4069 *
4070 * We atomically detach the top of branch from the tree, store the 4070 * We atomically detach the top of branch from the tree, store the
4071 * block number of its root in *@top, pointers to buffer_heads of 4071 * block number of its root in *@top, pointers to buffer_heads of
4072 * partially truncated blocks - in @chain[].bh and pointers to 4072 * partially truncated blocks - in @chain[].bh and pointers to
4073 * their last elements that should not be removed - in 4073 * their last elements that should not be removed - in
4074 * @chain[].p. Return value is the pointer to last filled element 4074 * @chain[].p. Return value is the pointer to last filled element
4075 * of @chain. 4075 * of @chain.
4076 * 4076 *
4077 * The work left to caller to do the actual freeing of subtrees: 4077 * The work left to caller to do the actual freeing of subtrees:
4078 * a) free the subtree starting from *@top 4078 * a) free the subtree starting from *@top
4079 * b) free the subtrees whose roots are stored in 4079 * b) free the subtrees whose roots are stored in
4080 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 4080 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
4081 * c) free the subtrees growing from the inode past the @chain[0]. 4081 * c) free the subtrees growing from the inode past the @chain[0].
4082 * (no partially truncated stuff there). */ 4082 * (no partially truncated stuff there). */
4083 4083
4084 static Indirect *ext4_find_shared(struct inode *inode, int depth, 4084 static Indirect *ext4_find_shared(struct inode *inode, int depth,
4085 ext4_lblk_t offsets[4], Indirect chain[4], 4085 ext4_lblk_t offsets[4], Indirect chain[4],
4086 __le32 *top) 4086 __le32 *top)
4087 { 4087 {
4088 Indirect *partial, *p; 4088 Indirect *partial, *p;
4089 int k, err; 4089 int k, err;
4090 4090
4091 *top = 0; 4091 *top = 0;
4092 /* Make k index the deepest non-null offset + 1 */ 4092 /* Make k index the deepest non-null offset + 1 */
4093 for (k = depth; k > 1 && !offsets[k-1]; k--) 4093 for (k = depth; k > 1 && !offsets[k-1]; k--)
4094 ; 4094 ;
4095 partial = ext4_get_branch(inode, k, offsets, chain, &err); 4095 partial = ext4_get_branch(inode, k, offsets, chain, &err);
4096 /* Writer: pointers */ 4096 /* Writer: pointers */
4097 if (!partial) 4097 if (!partial)
4098 partial = chain + k-1; 4098 partial = chain + k-1;
4099 /* 4099 /*
4100 * If the branch has acquired a continuation since we looked at it - 4100 * If the branch has acquired a continuation since we looked at it -
4101 * fine, it should all survive and the (new) top doesn't belong to us. 4101 * fine, it should all survive and the (new) top doesn't belong to us.
4102 */ 4102 */
4103 if (!partial->key && *partial->p) 4103 if (!partial->key && *partial->p)
4104 /* Writer: end */ 4104 /* Writer: end */
4105 goto no_top; 4105 goto no_top;
4106 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 4106 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
4107 ; 4107 ;
4108 /* 4108 /*
4109 * OK, we've found the last block that must survive. The rest of our 4109 * OK, we've found the last block that must survive. The rest of our
4110 * branch should be detached before unlocking. However, if that rest 4110 * branch should be detached before unlocking. However, if that rest
4111 * of branch is all ours and does not grow immediately from the inode 4111 * of branch is all ours and does not grow immediately from the inode
4112 * it's easier to cheat and just decrement partial->p. 4112 * it's easier to cheat and just decrement partial->p.
4113 */ 4113 */
4114 if (p == chain + k - 1 && p > chain) { 4114 if (p == chain + k - 1 && p > chain) {
4115 p->p--; 4115 p->p--;
4116 } else { 4116 } else {
4117 *top = *p->p; 4117 *top = *p->p;
4118 /* Nope, don't do this in ext4. Must leave the tree intact */ 4118 /* Nope, don't do this in ext4. Must leave the tree intact */
4119 #if 0 4119 #if 0
4120 *p->p = 0; 4120 *p->p = 0;
4121 #endif 4121 #endif
4122 } 4122 }
4123 /* Writer: end */ 4123 /* Writer: end */
4124 4124
4125 while (partial > p) { 4125 while (partial > p) {
4126 brelse(partial->bh); 4126 brelse(partial->bh);
4127 partial--; 4127 partial--;
4128 } 4128 }
4129 no_top: 4129 no_top:
4130 return partial; 4130 return partial;
4131 } 4131 }
4132 4132
4133 /* 4133 /*
4134 * Zero a number of block pointers in either an inode or an indirect block. 4134 * Zero a number of block pointers in either an inode or an indirect block.
4135 * If we restart the transaction we must again get write access to the 4135 * If we restart the transaction we must again get write access to the
4136 * indirect block for further modification. 4136 * indirect block for further modification.
4137 * 4137 *
4138 * We release `count' blocks on disk, but (last - first) may be greater 4138 * We release `count' blocks on disk, but (last - first) may be greater
4139 * than `count' because there can be holes in there. 4139 * than `count' because there can be holes in there.
4140 * 4140 *
4141 * Return 0 on success, 1 on invalid block range 4141 * Return 0 on success, 1 on invalid block range
4142 * and < 0 on fatal error. 4142 * and < 0 on fatal error.
4143 */ 4143 */
4144 static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4144 static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4145 struct buffer_head *bh, 4145 struct buffer_head *bh,
4146 ext4_fsblk_t block_to_free, 4146 ext4_fsblk_t block_to_free,
4147 unsigned long count, __le32 *first, 4147 unsigned long count, __le32 *first,
4148 __le32 *last) 4148 __le32 *last)
4149 { 4149 {
4150 __le32 *p; 4150 __le32 *p;
4151 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4151 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4152 int err; 4152 int err;
4153 4153
4154 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4154 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4155 flags |= EXT4_FREE_BLOCKS_METADATA; 4155 flags |= EXT4_FREE_BLOCKS_METADATA;
4156 4156
4157 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4157 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4158 count)) { 4158 count)) {
4159 EXT4_ERROR_INODE(inode, "attempt to clear invalid " 4159 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4160 "blocks %llu len %lu", 4160 "blocks %llu len %lu",
4161 (unsigned long long) block_to_free, count); 4161 (unsigned long long) block_to_free, count);
4162 return 1; 4162 return 1;
4163 } 4163 }
4164 4164
4165 if (try_to_extend_transaction(handle, inode)) { 4165 if (try_to_extend_transaction(handle, inode)) {
4166 if (bh) { 4166 if (bh) {
4167 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4167 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4168 err = ext4_handle_dirty_metadata(handle, inode, bh); 4168 err = ext4_handle_dirty_metadata(handle, inode, bh);
4169 if (unlikely(err)) 4169 if (unlikely(err))
4170 goto out_err; 4170 goto out_err;
4171 } 4171 }
4172 err = ext4_mark_inode_dirty(handle, inode); 4172 err = ext4_mark_inode_dirty(handle, inode);
4173 if (unlikely(err)) 4173 if (unlikely(err))
4174 goto out_err; 4174 goto out_err;
4175 err = ext4_truncate_restart_trans(handle, inode, 4175 err = ext4_truncate_restart_trans(handle, inode,
4176 blocks_for_truncate(inode)); 4176 blocks_for_truncate(inode));
4177 if (unlikely(err)) 4177 if (unlikely(err))
4178 goto out_err; 4178 goto out_err;
4179 if (bh) { 4179 if (bh) {
4180 BUFFER_TRACE(bh, "retaking write access"); 4180 BUFFER_TRACE(bh, "retaking write access");
4181 err = ext4_journal_get_write_access(handle, bh); 4181 err = ext4_journal_get_write_access(handle, bh);
4182 if (unlikely(err)) 4182 if (unlikely(err))
4183 goto out_err; 4183 goto out_err;
4184 } 4184 }
4185 } 4185 }
4186 4186
4187 for (p = first; p < last; p++) 4187 for (p = first; p < last; p++)
4188 *p = 0; 4188 *p = 0;
4189 4189
4190 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); 4190 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4191 return 0; 4191 return 0;
4192 out_err: 4192 out_err:
4193 ext4_std_error(inode->i_sb, err); 4193 ext4_std_error(inode->i_sb, err);
4194 return err; 4194 return err;
4195 } 4195 }
4196 4196
4197 /** 4197 /**
4198 * ext4_free_data - free a list of data blocks 4198 * ext4_free_data - free a list of data blocks
4199 * @handle: handle for this transaction 4199 * @handle: handle for this transaction
4200 * @inode: inode we are dealing with 4200 * @inode: inode we are dealing with
4201 * @this_bh: indirect buffer_head which contains *@first and *@last 4201 * @this_bh: indirect buffer_head which contains *@first and *@last
4202 * @first: array of block numbers 4202 * @first: array of block numbers
4203 * @last: points immediately past the end of array 4203 * @last: points immediately past the end of array
4204 * 4204 *
4205 * We are freeing all blocks referred from that array (numbers are stored as 4205 * We are freeing all blocks referred from that array (numbers are stored as
4206 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4206 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4207 * 4207 *
4208 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4208 * We accumulate contiguous runs of blocks to free. Conveniently, if these
4209 * blocks are contiguous then releasing them at one time will only affect one 4209 * blocks are contiguous then releasing them at one time will only affect one
4210 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 4210 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
4211 * actually use a lot of journal space. 4211 * actually use a lot of journal space.
4212 * 4212 *
4213 * @this_bh will be %NULL if @first and @last point into the inode's direct 4213 * @this_bh will be %NULL if @first and @last point into the inode's direct
4214 * block pointers. 4214 * block pointers.
4215 */ 4215 */
4216 static void ext4_free_data(handle_t *handle, struct inode *inode, 4216 static void ext4_free_data(handle_t *handle, struct inode *inode,
4217 struct buffer_head *this_bh, 4217 struct buffer_head *this_bh,
4218 __le32 *first, __le32 *last) 4218 __le32 *first, __le32 *last)
4219 { 4219 {
4220 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 4220 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
4221 unsigned long count = 0; /* Number of blocks in the run */ 4221 unsigned long count = 0; /* Number of blocks in the run */
4222 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 4222 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
4223 corresponding to 4223 corresponding to
4224 block_to_free */ 4224 block_to_free */
4225 ext4_fsblk_t nr; /* Current block # */ 4225 ext4_fsblk_t nr; /* Current block # */
4226 __le32 *p; /* Pointer into inode/ind 4226 __le32 *p; /* Pointer into inode/ind
4227 for current block */ 4227 for current block */
4228 int err = 0; 4228 int err = 0;
4229 4229
4230 if (this_bh) { /* For indirect block */ 4230 if (this_bh) { /* For indirect block */
4231 BUFFER_TRACE(this_bh, "get_write_access"); 4231 BUFFER_TRACE(this_bh, "get_write_access");
4232 err = ext4_journal_get_write_access(handle, this_bh); 4232 err = ext4_journal_get_write_access(handle, this_bh);
4233 /* Important: if we can't update the indirect pointers 4233 /* Important: if we can't update the indirect pointers
4234 * to the blocks, we can't free them. */ 4234 * to the blocks, we can't free them. */
4235 if (err) 4235 if (err)
4236 return; 4236 return;
4237 } 4237 }
4238 4238
4239 for (p = first; p < last; p++) { 4239 for (p = first; p < last; p++) {
4240 nr = le32_to_cpu(*p); 4240 nr = le32_to_cpu(*p);
4241 if (nr) { 4241 if (nr) {
4242 /* accumulate blocks to free if they're contiguous */ 4242 /* accumulate blocks to free if they're contiguous */
4243 if (count == 0) { 4243 if (count == 0) {
4244 block_to_free = nr; 4244 block_to_free = nr;
4245 block_to_free_p = p; 4245 block_to_free_p = p;
4246 count = 1; 4246 count = 1;
4247 } else if (nr == block_to_free + count) { 4247 } else if (nr == block_to_free + count) {
4248 count++; 4248 count++;
4249 } else { 4249 } else {
4250 err = ext4_clear_blocks(handle, inode, this_bh, 4250 err = ext4_clear_blocks(handle, inode, this_bh,
4251 block_to_free, count, 4251 block_to_free, count,
4252 block_to_free_p, p); 4252 block_to_free_p, p);
4253 if (err) 4253 if (err)
4254 break; 4254 break;
4255 block_to_free = nr; 4255 block_to_free = nr;
4256 block_to_free_p = p; 4256 block_to_free_p = p;
4257 count = 1; 4257 count = 1;
4258 } 4258 }
4259 } 4259 }
4260 } 4260 }
4261 4261
4262 if (!err && count > 0) 4262 if (!err && count > 0)
4263 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4263 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4264 count, block_to_free_p, p); 4264 count, block_to_free_p, p);
4265 if (err < 0) 4265 if (err < 0)
4266 /* fatal error */ 4266 /* fatal error */
4267 return; 4267 return;
4268 4268
4269 if (this_bh) { 4269 if (this_bh) {
4270 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4270 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
4271 4271
4272 /* 4272 /*
4273 * The buffer head should have an attached journal head at this 4273 * The buffer head should have an attached journal head at this
4274 * point. However, if the data is corrupted and an indirect 4274 * point. However, if the data is corrupted and an indirect
4275 * block pointed to itself, it would have been detached when 4275 * block pointed to itself, it would have been detached when
4276 * the block was cleared. Check for this instead of OOPSing. 4276 * the block was cleared. Check for this instead of OOPSing.
4277 */ 4277 */
4278 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4278 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4279 ext4_handle_dirty_metadata(handle, inode, this_bh); 4279 ext4_handle_dirty_metadata(handle, inode, this_bh);
4280 else 4280 else
4281 EXT4_ERROR_INODE(inode, 4281 EXT4_ERROR_INODE(inode,
4282 "circular indirect block detected at " 4282 "circular indirect block detected at "
4283 "block %llu", 4283 "block %llu",
4284 (unsigned long long) this_bh->b_blocknr); 4284 (unsigned long long) this_bh->b_blocknr);
4285 } 4285 }
4286 } 4286 }
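To make the run accumulation in ext4_free_data() concrete, here is a tiny stand-alone sketch over a hypothetical block-pointer array; note that zero entries (holes) are merely skipped and do not by themselves end a run.

#include <stdio.h>

/* Illustrative only: group a hypothetical block-pointer array into contiguous
 * runs, mirroring the accumulation loop in ext4_free_data(). */
int main(void)
{
        unsigned long blocks[] = { 100, 101, 102, 0, 200, 300, 301 };
        unsigned long start = 0, count = 0;
        unsigned i;

        for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
                unsigned long nr = blocks[i];

                if (!nr)
                        continue;               /* a hole */
                if (count == 0) {
                        start = nr;
                        count = 1;
                } else if (nr == start + count) {
                        count++;
                } else {
                        printf("free run %lu..%lu\n", start, start + count - 1);
                        start = nr;
                        count = 1;
                }
        }
        if (count)
                printf("free run %lu..%lu\n", start, start + count - 1);
        return 0;
}

Run against this array it reports the runs 100..102, 200..200 and 300..301, which is exactly how the real loop batches calls to ext4_clear_blocks().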
4287 4287
4288 /** 4288 /**
4289 * ext4_free_branches - free an array of branches 4289 * ext4_free_branches - free an array of branches
4290 * @handle: JBD handle for this transaction 4290 * @handle: JBD handle for this transaction
4291 * @inode: inode we are dealing with 4291 * @inode: inode we are dealing with
4292 * @parent_bh: the buffer_head which contains *@first and *@last 4292 * @parent_bh: the buffer_head which contains *@first and *@last
4293 * @first: array of block numbers 4293 * @first: array of block numbers
4294 * @last: pointer immediately past the end of array 4294 * @last: pointer immediately past the end of array
4295 * @depth: depth of the branches to free 4295 * @depth: depth of the branches to free
4296 * 4296 *
4297 * We are freeing all blocks referred from these branches (numbers are 4297 * We are freeing all blocks referred from these branches (numbers are
4298 * stored as little-endian 32-bit) and updating @inode->i_blocks 4298 * stored as little-endian 32-bit) and updating @inode->i_blocks
4299 * appropriately. 4299 * appropriately.
4300 */ 4300 */
4301 static void ext4_free_branches(handle_t *handle, struct inode *inode, 4301 static void ext4_free_branches(handle_t *handle, struct inode *inode,
4302 struct buffer_head *parent_bh, 4302 struct buffer_head *parent_bh,
4303 __le32 *first, __le32 *last, int depth) 4303 __le32 *first, __le32 *last, int depth)
4304 { 4304 {
4305 ext4_fsblk_t nr; 4305 ext4_fsblk_t nr;
4306 __le32 *p; 4306 __le32 *p;
4307 4307
4308 if (ext4_handle_is_aborted(handle)) 4308 if (ext4_handle_is_aborted(handle))
4309 return; 4309 return;
4310 4310
4311 if (depth--) { 4311 if (depth--) {
4312 struct buffer_head *bh; 4312 struct buffer_head *bh;
4313 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 4313 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4314 p = last; 4314 p = last;
4315 while (--p >= first) { 4315 while (--p >= first) {
4316 nr = le32_to_cpu(*p); 4316 nr = le32_to_cpu(*p);
4317 if (!nr) 4317 if (!nr)
4318 continue; /* A hole */ 4318 continue; /* A hole */
4319 4319
4320 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4320 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4321 nr, 1)) { 4321 nr, 1)) {
4322 EXT4_ERROR_INODE(inode, 4322 EXT4_ERROR_INODE(inode,
4323 "invalid indirect mapped " 4323 "invalid indirect mapped "
4324 "block %lu (level %d)", 4324 "block %lu (level %d)",
4325 (unsigned long) nr, depth); 4325 (unsigned long) nr, depth);
4326 break; 4326 break;
4327 } 4327 }
4328 4328
4329 /* Go read the buffer for the next level down */ 4329 /* Go read the buffer for the next level down */
4330 bh = sb_bread(inode->i_sb, nr); 4330 bh = sb_bread(inode->i_sb, nr);
4331 4331
4332 /* 4332 /*
4333 * A read failure? Report error and clear slot 4333 * A read failure? Report error and clear slot
4334 * (should be rare). 4334 * (should be rare).
4335 */ 4335 */
4336 if (!bh) { 4336 if (!bh) {
4337 EXT4_ERROR_INODE_BLOCK(inode, nr, 4337 EXT4_ERROR_INODE_BLOCK(inode, nr,
4338 "Read failure"); 4338 "Read failure");
4339 continue; 4339 continue;
4340 } 4340 }
4341 4341
4342 /* This zaps the entire block. Bottom up. */ 4342 /* This zaps the entire block. Bottom up. */
4343 BUFFER_TRACE(bh, "free child branches"); 4343 BUFFER_TRACE(bh, "free child branches");
4344 ext4_free_branches(handle, inode, bh, 4344 ext4_free_branches(handle, inode, bh,
4345 (__le32 *) bh->b_data, 4345 (__le32 *) bh->b_data,
4346 (__le32 *) bh->b_data + addr_per_block, 4346 (__le32 *) bh->b_data + addr_per_block,
4347 depth); 4347 depth);
4348 brelse(bh); 4348 brelse(bh);
4349 4349
4350 /* 4350 /*
4351 * Everything below this pointer has been 4351 * Everything below this pointer has been
4352 * released. Now let this top-of-subtree go. 4352 * released. Now let this top-of-subtree go.
4353 * 4353 *
4354 * We want the freeing of this indirect block to be 4354 * We want the freeing of this indirect block to be
4355 * atomic in the journal with the updating of the 4355 * atomic in the journal with the updating of the
4356 * bitmap block which owns it. So make some room in 4356 * bitmap block which owns it. So make some room in
4357 * the journal. 4357 * the journal.
4358 * 4358 *
4359 * We zero the parent pointer *after* freeing its 4359 * We zero the parent pointer *after* freeing its
4360 * pointee in the bitmaps, so if extend_transaction() 4360 * pointee in the bitmaps, so if extend_transaction()
4361 * for some reason fails to put the bitmap changes and 4361 * for some reason fails to put the bitmap changes and
4362 * the release into the same transaction, recovery 4362 * the release into the same transaction, recovery
4363 * will merely complain about releasing a free block, 4363 * will merely complain about releasing a free block,
4364 * rather than leaking blocks. 4364 * rather than leaking blocks.
4365 */ 4365 */
4366 if (ext4_handle_is_aborted(handle)) 4366 if (ext4_handle_is_aborted(handle))
4367 return; 4367 return;
4368 if (try_to_extend_transaction(handle, inode)) { 4368 if (try_to_extend_transaction(handle, inode)) {
4369 ext4_mark_inode_dirty(handle, inode); 4369 ext4_mark_inode_dirty(handle, inode);
4370 ext4_truncate_restart_trans(handle, inode, 4370 ext4_truncate_restart_trans(handle, inode,
4371 blocks_for_truncate(inode)); 4371 blocks_for_truncate(inode));
4372 } 4372 }
4373 4373
4374 /* 4374 /*
4375 * The forget flag here is critical because if 4375 * The forget flag here is critical because if
4376 * we are journaling (and not doing data 4376 * we are journaling (and not doing data
4377 * journaling), we have to make sure a revoke 4377 * journaling), we have to make sure a revoke
4378 * record is written to prevent the journal 4378 * record is written to prevent the journal
4379 * replay from overwriting the (former) 4379 * replay from overwriting the (former)
4380 * indirect block if it gets reallocated as a 4380 * indirect block if it gets reallocated as a
4381 * data block. This must happen in the same 4381 * data block. This must happen in the same
4382 * transaction where the data blocks are 4382 * transaction where the data blocks are
4383 * actually freed. 4383 * actually freed.
4384 */ 4384 */
4385 ext4_free_blocks(handle, inode, NULL, nr, 1, 4385 ext4_free_blocks(handle, inode, NULL, nr, 1,
4386 EXT4_FREE_BLOCKS_METADATA| 4386 EXT4_FREE_BLOCKS_METADATA|
4387 EXT4_FREE_BLOCKS_FORGET); 4387 EXT4_FREE_BLOCKS_FORGET);
4388 4388
4389 if (parent_bh) { 4389 if (parent_bh) {
4390 /* 4390 /*
4391 * The block which we have just freed is 4391 * The block which we have just freed is
4392 * pointed to by an indirect block: journal it 4392 * pointed to by an indirect block: journal it
4393 */ 4393 */
4394 BUFFER_TRACE(parent_bh, "get_write_access"); 4394 BUFFER_TRACE(parent_bh, "get_write_access");
4395 if (!ext4_journal_get_write_access(handle, 4395 if (!ext4_journal_get_write_access(handle,
4396 parent_bh)){ 4396 parent_bh)){
4397 *p = 0; 4397 *p = 0;
4398 BUFFER_TRACE(parent_bh, 4398 BUFFER_TRACE(parent_bh,
4399 "call ext4_handle_dirty_metadata"); 4399 "call ext4_handle_dirty_metadata");
4400 ext4_handle_dirty_metadata(handle, 4400 ext4_handle_dirty_metadata(handle,
4401 inode, 4401 inode,
4402 parent_bh); 4402 parent_bh);
4403 } 4403 }
4404 } 4404 }
4405 } 4405 }
4406 } else { 4406 } else {
4407 /* We have reached the bottom of the tree. */ 4407 /* We have reached the bottom of the tree. */
4408 BUFFER_TRACE(parent_bh, "free data blocks"); 4408 BUFFER_TRACE(parent_bh, "free data blocks");
4409 ext4_free_data(handle, inode, parent_bh, first, last); 4409 ext4_free_data(handle, inode, parent_bh, first, last);
4410 } 4410 }
4411 } 4411 }
4412 4412
4413 int ext4_can_truncate(struct inode *inode) 4413 int ext4_can_truncate(struct inode *inode)
4414 { 4414 {
4415 if (S_ISREG(inode->i_mode)) 4415 if (S_ISREG(inode->i_mode))
4416 return 1; 4416 return 1;
4417 if (S_ISDIR(inode->i_mode)) 4417 if (S_ISDIR(inode->i_mode))
4418 return 1; 4418 return 1;
4419 if (S_ISLNK(inode->i_mode)) 4419 if (S_ISLNK(inode->i_mode))
4420 return !ext4_inode_is_fast_symlink(inode); 4420 return !ext4_inode_is_fast_symlink(inode);
4421 return 0; 4421 return 0;
4422 } 4422 }
4423 4423
4424 /* 4424 /*
4425 * ext4_punch_hole: punches a hole in a file by releasing the blocks 4425 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4426 * associated with the given offset and length 4426 * associated with the given offset and length
4427 * 4427 *
4428 * @inode: File inode 4428 * @inode: File inode
4429 * @offset: The offset where the hole will begin 4429 * @offset: The offset where the hole will begin
4430 * @len: The length of the hole 4430 * @len: The length of the hole
4431 * 4431 *
4432 * Returns: 0 on success or negative on failure 4432 * Returns: 0 on success or negative on failure
4433 */ 4433 */
4434 4434
4435 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 4435 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4436 { 4436 {
4437 struct inode *inode = file->f_path.dentry->d_inode; 4437 struct inode *inode = file->f_path.dentry->d_inode;
4438 if (!S_ISREG(inode->i_mode)) 4438 if (!S_ISREG(inode->i_mode))
4439 return -ENOTSUPP; 4439 return -ENOTSUPP;
4440 4440
4441 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4441 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4442 /* TODO: Add support for non-extent hole punching */ 4442 /* TODO: Add support for non-extent hole punching */
4443 return -ENOTSUPP; 4443 return -ENOTSUPP;
4444 } 4444 }
4445 4445
4446 return ext4_ext_punch_hole(file, offset, length); 4446 return ext4_ext_punch_hole(file, offset, length);
4447 } 4447 }
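For reference, a hole-punch request normally reaches this path from user space via fallocate(2); a minimal sketch follows (the file path and offsets are hypothetical).

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

/* Illustrative only: punch a 1 MiB hole at offset 4 MiB in an existing file. */
int punch_example(const char *path)
{
        int fd = open(path, O_RDWR);
        int ret;

        if (fd < 0)
                return -1;
        /* FALLOC_FL_PUNCH_HOLE must be combined with FALLOC_FL_KEEP_SIZE. */
        ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                        4 << 20, 1 << 20);
        close(fd);
        return ret;
}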
4448 4448
4449 /* 4449 /*
4450 * ext4_truncate() 4450 * ext4_truncate()
4451 * 4451 *
4452 * We block out ext4_get_block() block instantiations across the entire 4452 * We block out ext4_get_block() block instantiations across the entire
4453 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 4453 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
4454 * simultaneously on behalf of the same inode. 4454 * simultaneously on behalf of the same inode.
4455 * 4455 *
4456 * As we work through the truncate and commit bits of it to the journal there 4456 * As we work through the truncate and commit bits of it to the journal there
4457 * is one core, guiding principle: the file's tree must always be consistent on 4457 * is one core, guiding principle: the file's tree must always be consistent on
4458 * disk. We must be able to restart the truncate after a crash. 4458 * disk. We must be able to restart the truncate after a crash.
4459 * 4459 *
4460 * The file's tree may be transiently inconsistent in memory (although it 4460 * The file's tree may be transiently inconsistent in memory (although it
4461 * probably isn't), but whenever we close off and commit a journal transaction, 4461 * probably isn't), but whenever we close off and commit a journal transaction,
4462 * the contents of (the filesystem + the journal) must be consistent and 4462 * the contents of (the filesystem + the journal) must be consistent and
4463 * restartable. It's pretty simple, really: bottom up, right to left (although 4463 * restartable. It's pretty simple, really: bottom up, right to left (although
4464 * left-to-right works OK too). 4464 * left-to-right works OK too).
4465 * 4465 *
4466 * Note that at recovery time, journal replay occurs *before* the restart of 4466 * Note that at recovery time, journal replay occurs *before* the restart of
4467 * truncate against the orphan inode list. 4467 * truncate against the orphan inode list.
4468 * 4468 *
4469 * The committed inode has the new, desired i_size (which is the same as 4469 * The committed inode has the new, desired i_size (which is the same as
4470 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 4470 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
4471 * that this inode's truncate did not complete and it will again call 4471 * that this inode's truncate did not complete and it will again call
4472 * ext4_truncate() to have another go. So there will be instantiated blocks 4472 * ext4_truncate() to have another go. So there will be instantiated blocks
4473 * to the right of the truncation point in a crashed ext4 filesystem. But 4473 * to the right of the truncation point in a crashed ext4 filesystem. But
4474 * that's fine - as long as they are linked from the inode, the post-crash 4474 * that's fine - as long as they are linked from the inode, the post-crash
4475 * ext4_truncate() run will find them and release them. 4475 * ext4_truncate() run will find them and release them.
4476 */ 4476 */
4477 void ext4_truncate(struct inode *inode) 4477 void ext4_truncate(struct inode *inode)
4478 { 4478 {
4479 handle_t *handle; 4479 handle_t *handle;
4480 struct ext4_inode_info *ei = EXT4_I(inode); 4480 struct ext4_inode_info *ei = EXT4_I(inode);
4481 __le32 *i_data = ei->i_data; 4481 __le32 *i_data = ei->i_data;
4482 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 4482 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4483 struct address_space *mapping = inode->i_mapping; 4483 struct address_space *mapping = inode->i_mapping;
4484 ext4_lblk_t offsets[4]; 4484 ext4_lblk_t offsets[4];
4485 Indirect chain[4]; 4485 Indirect chain[4];
4486 Indirect *partial; 4486 Indirect *partial;
4487 __le32 nr = 0; 4487 __le32 nr = 0;
4488 int n = 0; 4488 int n = 0;
4489 ext4_lblk_t last_block, max_block; 4489 ext4_lblk_t last_block, max_block;
4490 unsigned blocksize = inode->i_sb->s_blocksize; 4490 unsigned blocksize = inode->i_sb->s_blocksize;
4491 4491
4492 trace_ext4_truncate_enter(inode); 4492 trace_ext4_truncate_enter(inode);
4493 4493
4494 if (!ext4_can_truncate(inode)) 4494 if (!ext4_can_truncate(inode))
4495 return; 4495 return;
4496 4496
4497 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4497 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4498 4498
4499 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4499 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4500 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4500 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4501 4501
4502 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4502 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4503 ext4_ext_truncate(inode); 4503 ext4_ext_truncate(inode);
4504 trace_ext4_truncate_exit(inode); 4504 trace_ext4_truncate_exit(inode);
4505 return; 4505 return;
4506 } 4506 }
4507 4507
4508 handle = start_transaction(inode); 4508 handle = start_transaction(inode);
4509 if (IS_ERR(handle)) 4509 if (IS_ERR(handle))
4510 return; /* AKPM: return what? */ 4510 return; /* AKPM: return what? */
4511 4511
4512 last_block = (inode->i_size + blocksize-1) 4512 last_block = (inode->i_size + blocksize-1)
4513 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4513 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4514 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 4514 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4515 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4515 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4516 4516
4517 if (inode->i_size & (blocksize - 1)) 4517 if (inode->i_size & (blocksize - 1))
4518 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4518 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4519 goto out_stop; 4519 goto out_stop;
4520 4520
4521 if (last_block != max_block) { 4521 if (last_block != max_block) {
4522 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4522 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4523 if (n == 0) 4523 if (n == 0)
4524 goto out_stop; /* error */ 4524 goto out_stop; /* error */
4525 } 4525 }
4526 4526
4527 /* 4527 /*
4528 * OK. This truncate is going to happen. We add the inode to the 4528 * OK. This truncate is going to happen. We add the inode to the
4529 * orphan list, so that if this truncate spans multiple transactions, 4529 * orphan list, so that if this truncate spans multiple transactions,
4530 * and we crash, we will resume the truncate when the filesystem 4530 * and we crash, we will resume the truncate when the filesystem
4531 * recovers. It also marks the inode dirty, to catch the new size. 4531 * recovers. It also marks the inode dirty, to catch the new size.
4532 * 4532 *
4533 * Implication: the file must always be in a sane, consistent 4533 * Implication: the file must always be in a sane, consistent
4534 * truncatable state while each transaction commits. 4534 * truncatable state while each transaction commits.
4535 */ 4535 */
4536 if (ext4_orphan_add(handle, inode)) 4536 if (ext4_orphan_add(handle, inode))
4537 goto out_stop; 4537 goto out_stop;
4538 4538
4539 /* 4539 /*
4540 * From here we block out all ext4_get_block() callers who want to 4540 * From here we block out all ext4_get_block() callers who want to
4541 * modify the block allocation tree. 4541 * modify the block allocation tree.
4542 */ 4542 */
4543 down_write(&ei->i_data_sem); 4543 down_write(&ei->i_data_sem);
4544 4544
4545 ext4_discard_preallocations(inode); 4545 ext4_discard_preallocations(inode);
4546 4546
4547 /* 4547 /*
4548 * The orphan list entry will now protect us from any crash which 4548 * The orphan list entry will now protect us from any crash which
4549 * occurs before the truncate completes, so it is now safe to propagate 4549 * occurs before the truncate completes, so it is now safe to propagate
4550 * the new, shorter inode size (held for now in i_size) into the 4550 * the new, shorter inode size (held for now in i_size) into the
4551 * on-disk inode. We do this via i_disksize, which is the value which 4551 * on-disk inode. We do this via i_disksize, which is the value which
4552 * ext4 *really* writes onto the disk inode. 4552 * ext4 *really* writes onto the disk inode.
4553 */ 4553 */
4554 ei->i_disksize = inode->i_size; 4554 ei->i_disksize = inode->i_size;
4555 4555
4556 if (last_block == max_block) { 4556 if (last_block == max_block) {
4557 /* 4557 /*
4558 * It is unnecessary to free any data blocks if last_block is 4558 * It is unnecessary to free any data blocks if last_block is
4559 * equal to the indirect block limit. 4559 * equal to the indirect block limit.
4560 */ 4560 */
4561 goto out_unlock; 4561 goto out_unlock;
4562 } else if (n == 1) { /* direct blocks */ 4562 } else if (n == 1) { /* direct blocks */
4563 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4563 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4564 i_data + EXT4_NDIR_BLOCKS); 4564 i_data + EXT4_NDIR_BLOCKS);
4565 goto do_indirects; 4565 goto do_indirects;
4566 } 4566 }
4567 4567
4568 partial = ext4_find_shared(inode, n, offsets, chain, &nr); 4568 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
4569 /* Kill the top of shared branch (not detached) */ 4569 /* Kill the top of shared branch (not detached) */
4570 if (nr) { 4570 if (nr) {
4571 if (partial == chain) { 4571 if (partial == chain) {
4572 /* Shared branch grows from the inode */ 4572 /* Shared branch grows from the inode */
4573 ext4_free_branches(handle, inode, NULL, 4573 ext4_free_branches(handle, inode, NULL,
4574 &nr, &nr+1, (chain+n-1) - partial); 4574 &nr, &nr+1, (chain+n-1) - partial);
4575 *partial->p = 0; 4575 *partial->p = 0;
4576 /* 4576 /*
4577 * We mark the inode dirty prior to restart, 4577 * We mark the inode dirty prior to restart,
4578 * and prior to stop. No need for it here. 4578 * and prior to stop. No need for it here.
4579 */ 4579 */
4580 } else { 4580 } else {
4581 /* Shared branch grows from an indirect block */ 4581 /* Shared branch grows from an indirect block */
4582 BUFFER_TRACE(partial->bh, "get_write_access"); 4582 BUFFER_TRACE(partial->bh, "get_write_access");
4583 ext4_free_branches(handle, inode, partial->bh, 4583 ext4_free_branches(handle, inode, partial->bh,
4584 partial->p, 4584 partial->p,
4585 partial->p+1, (chain+n-1) - partial); 4585 partial->p+1, (chain+n-1) - partial);
4586 } 4586 }
4587 } 4587 }
4588 /* Clear the ends of indirect blocks on the shared branch */ 4588 /* Clear the ends of indirect blocks on the shared branch */
4589 while (partial > chain) { 4589 while (partial > chain) {
4590 ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 4590 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
4591 (__le32*)partial->bh->b_data+addr_per_block, 4591 (__le32*)partial->bh->b_data+addr_per_block,
4592 (chain+n-1) - partial); 4592 (chain+n-1) - partial);
4593 BUFFER_TRACE(partial->bh, "call brelse"); 4593 BUFFER_TRACE(partial->bh, "call brelse");
4594 brelse(partial->bh); 4594 brelse(partial->bh);
4595 partial--; 4595 partial--;
4596 } 4596 }
4597 do_indirects: 4597 do_indirects:
4598 /* Kill the remaining (whole) subtrees */ 4598 /* Kill the remaining (whole) subtrees */
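/*
 * Note: the case labels below fall through deliberately. Truncation that
 * starts in the direct-block range must also free the single, double and
 * triple indirect trees, while truncation that starts at a deeper level
 * frees only the trees at that depth and below.
 */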
4599 switch (offsets[0]) { 4599 switch (offsets[0]) {
4600 default: 4600 default:
4601 nr = i_data[EXT4_IND_BLOCK]; 4601 nr = i_data[EXT4_IND_BLOCK];
4602 if (nr) { 4602 if (nr) {
4603 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 4603 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
4604 i_data[EXT4_IND_BLOCK] = 0; 4604 i_data[EXT4_IND_BLOCK] = 0;
4605 } 4605 }
4606 case EXT4_IND_BLOCK: 4606 case EXT4_IND_BLOCK:
4607 nr = i_data[EXT4_DIND_BLOCK]; 4607 nr = i_data[EXT4_DIND_BLOCK];
4608 if (nr) { 4608 if (nr) {
4609 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 4609 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
4610 i_data[EXT4_DIND_BLOCK] = 0; 4610 i_data[EXT4_DIND_BLOCK] = 0;
4611 } 4611 }
4612 case EXT4_DIND_BLOCK: 4612 case EXT4_DIND_BLOCK:
4613 nr = i_data[EXT4_TIND_BLOCK]; 4613 nr = i_data[EXT4_TIND_BLOCK];
4614 if (nr) { 4614 if (nr) {
4615 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 4615 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
4616 i_data[EXT4_TIND_BLOCK] = 0; 4616 i_data[EXT4_TIND_BLOCK] = 0;
4617 } 4617 }
4618 case EXT4_TIND_BLOCK: 4618 case EXT4_TIND_BLOCK:
4619 ; 4619 ;
4620 } 4620 }
4621 4621
4622 out_unlock: 4622 out_unlock:
4623 up_write(&ei->i_data_sem); 4623 up_write(&ei->i_data_sem);
4624 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4624 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4625 ext4_mark_inode_dirty(handle, inode); 4625 ext4_mark_inode_dirty(handle, inode);
4626 4626
4627 /* 4627 /*
4628 * In a multi-transaction truncate, we only make the final transaction 4628 * In a multi-transaction truncate, we only make the final transaction
4629 * synchronous 4629 * synchronous
4630 */ 4630 */
4631 if (IS_SYNC(inode)) 4631 if (IS_SYNC(inode))
4632 ext4_handle_sync(handle); 4632 ext4_handle_sync(handle);
4633 out_stop: 4633 out_stop:
4634 /* 4634 /*
4635 * If this was a simple ftruncate(), and the file will remain alive 4635 * If this was a simple ftruncate(), and the file will remain alive
4636 * then we need to clear up the orphan record which we created above. 4636 * then we need to clear up the orphan record which we created above.
4637 * However, if this was a real unlink then we were called by 4637 * However, if this was a real unlink then we were called by
4638 * ext4_delete_inode(), and we allow that function to clean up the 4638 * ext4_delete_inode(), and we allow that function to clean up the
4639 * orphan info for us. 4639 * orphan info for us.
4640 */ 4640 */
4641 if (inode->i_nlink) 4641 if (inode->i_nlink)
4642 ext4_orphan_del(handle, inode); 4642 ext4_orphan_del(handle, inode);
4643 4643
4644 ext4_journal_stop(handle); 4644 ext4_journal_stop(handle);
4645 trace_ext4_truncate_exit(inode); 4645 trace_ext4_truncate_exit(inode);
4646 } 4646 }
4647 4647
4648 /* 4648 /*
4649 * ext4_get_inode_loc returns with an extra refcount against the inode's 4649 * ext4_get_inode_loc returns with an extra refcount against the inode's
4650 * underlying buffer_head on success. If 'in_mem' is true, we have all 4650 * underlying buffer_head on success. If 'in_mem' is true, we have all
4651 * data in memory that is needed to recreate the on-disk version of this 4651 * data in memory that is needed to recreate the on-disk version of this
4652 * inode. 4652 * inode.
4653 */ 4653 */
4654 static int __ext4_get_inode_loc(struct inode *inode, 4654 static int __ext4_get_inode_loc(struct inode *inode,
4655 struct ext4_iloc *iloc, int in_mem) 4655 struct ext4_iloc *iloc, int in_mem)
4656 { 4656 {
4657 struct ext4_group_desc *gdp; 4657 struct ext4_group_desc *gdp;
4658 struct buffer_head *bh; 4658 struct buffer_head *bh;
4659 struct super_block *sb = inode->i_sb; 4659 struct super_block *sb = inode->i_sb;
4660 ext4_fsblk_t block; 4660 ext4_fsblk_t block;
4661 int inodes_per_block, inode_offset; 4661 int inodes_per_block, inode_offset;
4662 4662
4663 iloc->bh = NULL; 4663 iloc->bh = NULL;
4664 if (!ext4_valid_inum(sb, inode->i_ino)) 4664 if (!ext4_valid_inum(sb, inode->i_ino))
4665 return -EIO; 4665 return -EIO;
4666 4666
4667 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 4667 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
4668 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 4668 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
4669 if (!gdp) 4669 if (!gdp)
4670 return -EIO; 4670 return -EIO;
4671 4671
4672 /* 4672 /*
4673 * Figure out the offset within the block group inode table 4673 * Figure out the offset within the block group inode table
4674 */ 4674 */
4675 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 4675 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4676 inode_offset = ((inode->i_ino - 1) % 4676 inode_offset = ((inode->i_ino - 1) %
4677 EXT4_INODES_PER_GROUP(sb)); 4677 EXT4_INODES_PER_GROUP(sb));
4678 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4678 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
4679 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 4679 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
4680 4680
4681 bh = sb_getblk(sb, block); 4681 bh = sb_getblk(sb, block);
4682 if (!bh) { 4682 if (!bh) {
4683 EXT4_ERROR_INODE_BLOCK(inode, block, 4683 EXT4_ERROR_INODE_BLOCK(inode, block,
4684 "unable to read itable block"); 4684 "unable to read itable block");
4685 return -EIO; 4685 return -EIO;
4686 } 4686 }
4687 if (!buffer_uptodate(bh)) { 4687 if (!buffer_uptodate(bh)) {
4688 lock_buffer(bh); 4688 lock_buffer(bh);
4689 4689
4690 /* 4690 /*
4691 * If the buffer has the write error flag, we have failed 4691 * If the buffer has the write error flag, we have failed
4692 * to write out another inode in the same block. In this 4692 * to write out another inode in the same block. In this
4693 * case, we don't have to read the block because we may 4693 * case, we don't have to read the block because we may
4694 * read the old inode data successfully. 4694 * read the old inode data successfully.
4695 */ 4695 */
4696 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 4696 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4697 set_buffer_uptodate(bh); 4697 set_buffer_uptodate(bh);
4698 4698
4699 if (buffer_uptodate(bh)) { 4699 if (buffer_uptodate(bh)) {
4700 /* someone brought it uptodate while we waited */ 4700 /* someone brought it uptodate while we waited */
4701 unlock_buffer(bh); 4701 unlock_buffer(bh);
4702 goto has_buffer; 4702 goto has_buffer;
4703 } 4703 }
4704 4704
4705 /* 4705 /*
4706 * If we have all information of the inode in memory and this 4706 * If we have all information of the inode in memory and this
4707 * is the only valid inode in the block, we need not read the 4707 * is the only valid inode in the block, we need not read the
4708 * block. 4708 * block.
4709 */ 4709 */
4710 if (in_mem) { 4710 if (in_mem) {
4711 struct buffer_head *bitmap_bh; 4711 struct buffer_head *bitmap_bh;
4712 int i, start; 4712 int i, start;
4713 4713
4714 start = inode_offset & ~(inodes_per_block - 1); 4714 start = inode_offset & ~(inodes_per_block - 1);
4715 4715
4716 /* Is the inode bitmap in cache? */ 4716 /* Is the inode bitmap in cache? */
4717 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4717 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
4718 if (!bitmap_bh) 4718 if (!bitmap_bh)
4719 goto make_io; 4719 goto make_io;
4720 4720
4721 /* 4721 /*
4722 * If the inode bitmap isn't in cache then the 4722 * If the inode bitmap isn't in cache then the
4723 * optimisation may end up performing two reads instead 4723 * optimisation may end up performing two reads instead
4724 * of one, so skip it. 4724 * of one, so skip it.
4725 */ 4725 */
4726 if (!buffer_uptodate(bitmap_bh)) { 4726 if (!buffer_uptodate(bitmap_bh)) {
4727 brelse(bitmap_bh); 4727 brelse(bitmap_bh);
4728 goto make_io; 4728 goto make_io;
4729 } 4729 }
4730 for (i = start; i < start + inodes_per_block; i++) { 4730 for (i = start; i < start + inodes_per_block; i++) {
4731 if (i == inode_offset) 4731 if (i == inode_offset)
4732 continue; 4732 continue;
4733 if (ext4_test_bit(i, bitmap_bh->b_data)) 4733 if (ext4_test_bit(i, bitmap_bh->b_data))
4734 break; 4734 break;
4735 } 4735 }
4736 brelse(bitmap_bh); 4736 brelse(bitmap_bh);
4737 if (i == start + inodes_per_block) { 4737 if (i == start + inodes_per_block) {
4738 /* all other inodes are free, so skip I/O */ 4738 /* all other inodes are free, so skip I/O */
4739 memset(bh->b_data, 0, bh->b_size); 4739 memset(bh->b_data, 0, bh->b_size);
4740 set_buffer_uptodate(bh); 4740 set_buffer_uptodate(bh);
4741 unlock_buffer(bh); 4741 unlock_buffer(bh);
4742 goto has_buffer; 4742 goto has_buffer;
4743 } 4743 }
4744 } 4744 }
4745 4745
4746 make_io: 4746 make_io:
4747 /* 4747 /*
4748 * If we need to do any I/O, try to pre-readahead extra 4748 * If we need to do any I/O, try to pre-readahead extra
4749 * blocks from the inode table. 4749 * blocks from the inode table.
4750 */ 4750 */
4751 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4751 if (EXT4_SB(sb)->s_inode_readahead_blks) {
4752 ext4_fsblk_t b, end, table; 4752 ext4_fsblk_t b, end, table;
4753 unsigned num; 4753 unsigned num;
4754 4754
4755 table = ext4_inode_table(sb, gdp); 4755 table = ext4_inode_table(sb, gdp);
4756 /* s_inode_readahead_blks is always a power of 2 */ 4756 /* s_inode_readahead_blks is always a power of 2 */
4757 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4757 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4758 if (table > b) 4758 if (table > b)
4759 b = table; 4759 b = table;
4760 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4760 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
4761 num = EXT4_INODES_PER_GROUP(sb); 4761 num = EXT4_INODES_PER_GROUP(sb);
4762 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4762 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4763 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4763 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
4764 num -= ext4_itable_unused_count(sb, gdp); 4764 num -= ext4_itable_unused_count(sb, gdp);
4765 table += num / inodes_per_block; 4765 table += num / inodes_per_block;
4766 if (end > table) 4766 if (end > table)
4767 end = table; 4767 end = table;
4768 while (b <= end) 4768 while (b <= end)
4769 sb_breadahead(sb, b++); 4769 sb_breadahead(sb, b++);
4770 } 4770 }
4771 4771
4772 /* 4772 /*
4773 * There are other valid inodes in the buffer, this inode 4773 * There are other valid inodes in the buffer, this inode
4774 * has in-inode xattrs, or we don't have this inode in memory. 4774 * has in-inode xattrs, or we don't have this inode in memory.
4775 * Read the block from disk. 4775 * Read the block from disk.
4776 */ 4776 */
4777 trace_ext4_load_inode(inode); 4777 trace_ext4_load_inode(inode);
4778 get_bh(bh); 4778 get_bh(bh);
4779 bh->b_end_io = end_buffer_read_sync; 4779 bh->b_end_io = end_buffer_read_sync;
4780 submit_bh(READ_META, bh); 4780 submit_bh(READ_META, bh);
4781 wait_on_buffer(bh); 4781 wait_on_buffer(bh);
4782 if (!buffer_uptodate(bh)) { 4782 if (!buffer_uptodate(bh)) {
4783 EXT4_ERROR_INODE_BLOCK(inode, block, 4783 EXT4_ERROR_INODE_BLOCK(inode, block,
4784 "unable to read itable block"); 4784 "unable to read itable block");
4785 brelse(bh); 4785 brelse(bh);
4786 return -EIO; 4786 return -EIO;
4787 } 4787 }
4788 } 4788 }
4789 has_buffer: 4789 has_buffer:
4790 iloc->bh = bh; 4790 iloc->bh = bh;
4791 return 0; 4791 return 0;
4792 } 4792 }
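As a sanity check of the inode-location arithmetic in __ext4_get_inode_loc(), here is a stand-alone sketch with hypothetical filesystem geometry (32768 inodes per group, 256-byte inodes, 4096-byte blocks; none of these numbers come from a real superblock).

#include <stdio.h>

/* Illustrative only: locate inode #40000 with the hypothetical geometry above. */
int main(void)
{
        unsigned long ino = 40000;
        unsigned inodes_per_group = 32768;
        unsigned inode_size = 256;
        unsigned blocksize = 4096;
        unsigned inodes_per_block = blocksize / inode_size;               /* 16 */

        unsigned long group        = (ino - 1) / inodes_per_group;        /* 1 */
        unsigned long inode_offset = (ino - 1) % inodes_per_group;        /* 7231 */
        unsigned long table_block  = inode_offset / inodes_per_block;     /* block 451 (0-based) of group 1's inode table */
        unsigned long byte_offset  = (inode_offset % inodes_per_block) * inode_size; /* 3840 */

        printf("group %lu, table block %lu, byte offset %lu\n",
               group, table_block, byte_offset);
        return 0;
}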
4793 4793
4794 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4794 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4795 { 4795 {
4796 /* We have all inode data except xattrs in memory here. */ 4796 /* We have all inode data except xattrs in memory here. */
4797 return __ext4_get_inode_loc(inode, iloc, 4797 return __ext4_get_inode_loc(inode, iloc,
4798 !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); 4798 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4799 } 4799 }
4800 4800
4801 void ext4_set_inode_flags(struct inode *inode) 4801 void ext4_set_inode_flags(struct inode *inode)
4802 { 4802 {
4803 unsigned int flags = EXT4_I(inode)->i_flags; 4803 unsigned int flags = EXT4_I(inode)->i_flags;
4804 4804
4805 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 4805 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
4806 if (flags & EXT4_SYNC_FL) 4806 if (flags & EXT4_SYNC_FL)
4807 inode->i_flags |= S_SYNC; 4807 inode->i_flags |= S_SYNC;
4808 if (flags & EXT4_APPEND_FL) 4808 if (flags & EXT4_APPEND_FL)
4809 inode->i_flags |= S_APPEND; 4809 inode->i_flags |= S_APPEND;
4810 if (flags & EXT4_IMMUTABLE_FL) 4810 if (flags & EXT4_IMMUTABLE_FL)
4811 inode->i_flags |= S_IMMUTABLE; 4811 inode->i_flags |= S_IMMUTABLE;
4812 if (flags & EXT4_NOATIME_FL) 4812 if (flags & EXT4_NOATIME_FL)
4813 inode->i_flags |= S_NOATIME; 4813 inode->i_flags |= S_NOATIME;
4814 if (flags & EXT4_DIRSYNC_FL) 4814 if (flags & EXT4_DIRSYNC_FL)
4815 inode->i_flags |= S_DIRSYNC; 4815 inode->i_flags |= S_DIRSYNC;
4816 } 4816 }
4817 4817
4818 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4818 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4819 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4819 void ext4_get_inode_flags(struct ext4_inode_info *ei)
4820 { 4820 {
4821 unsigned int vfs_fl; 4821 unsigned int vfs_fl;
4822 unsigned long old_fl, new_fl; 4822 unsigned long old_fl, new_fl;
4823 4823
4824 do { 4824 do {
4825 vfs_fl = ei->vfs_inode.i_flags; 4825 vfs_fl = ei->vfs_inode.i_flags;
4826 old_fl = ei->i_flags; 4826 old_fl = ei->i_flags;
4827 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4827 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4828 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| 4828 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4829 EXT4_DIRSYNC_FL); 4829 EXT4_DIRSYNC_FL);
4830 if (vfs_fl & S_SYNC) 4830 if (vfs_fl & S_SYNC)
4831 new_fl |= EXT4_SYNC_FL; 4831 new_fl |= EXT4_SYNC_FL;
4832 if (vfs_fl & S_APPEND) 4832 if (vfs_fl & S_APPEND)
4833 new_fl |= EXT4_APPEND_FL; 4833 new_fl |= EXT4_APPEND_FL;
4834 if (vfs_fl & S_IMMUTABLE) 4834 if (vfs_fl & S_IMMUTABLE)
4835 new_fl |= EXT4_IMMUTABLE_FL; 4835 new_fl |= EXT4_IMMUTABLE_FL;
4836 if (vfs_fl & S_NOATIME) 4836 if (vfs_fl & S_NOATIME)
4837 new_fl |= EXT4_NOATIME_FL; 4837 new_fl |= EXT4_NOATIME_FL;
4838 if (vfs_fl & S_DIRSYNC) 4838 if (vfs_fl & S_DIRSYNC)
4839 new_fl |= EXT4_DIRSYNC_FL; 4839 new_fl |= EXT4_DIRSYNC_FL;
4840 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); 4840 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4841 } 4841 }
4842 4842
4843 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4843 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4844 struct ext4_inode_info *ei) 4844 struct ext4_inode_info *ei)
4845 { 4845 {
4846 blkcnt_t i_blocks ; 4846 blkcnt_t i_blocks ;
4847 struct inode *inode = &(ei->vfs_inode); 4847 struct inode *inode = &(ei->vfs_inode);
4848 struct super_block *sb = inode->i_sb; 4848 struct super_block *sb = inode->i_sb;
4849 4849
4850 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4850 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4851 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 4851 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
4852 /* we are using combined 48 bit field */ 4852 /* we are using combined 48 bit field */
4853 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4853 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4854 le32_to_cpu(raw_inode->i_blocks_lo); 4854 le32_to_cpu(raw_inode->i_blocks_lo);
4855 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { 4855 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4856 /* i_blocks is counted in filesystem blocks */ 4856 /* i_blocks is counted in filesystem blocks */
4857 return i_blocks << (inode->i_blkbits - 9); 4857 return i_blocks << (inode->i_blkbits - 9);
4858 } else { 4858 } else {
4859 return i_blocks; 4859 return i_blocks;
4860 } 4860 }
4861 } else { 4861 } else {
4862 return le32_to_cpu(raw_inode->i_blocks_lo); 4862 return le32_to_cpu(raw_inode->i_blocks_lo);
4863 } 4863 }
4864 } 4864 }
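The 48-bit i_blocks handling above can be illustrated with hypothetical raw values: without EXT4_INODE_HUGE_FILE the combined value is already in 512-byte units, while with it the value counts filesystem blocks and must be scaled by i_blkbits - 9. A minimal sketch:

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: i_blocks_hi == 0x0001, i_blocks_lo == 0x10, 4 KiB blocks. */
int main(void)
{
        uint64_t hi = 0x0001, lo = 0x10;
        unsigned blkbits = 12;                              /* log2(4096) */

        uint64_t combined = (hi << 32) | lo;                /* 0x100000010 */
        uint64_t plain = combined;                          /* HUGE_FILE clear: already 512-byte units */
        uint64_t huge  = combined << (blkbits - 9);         /* HUGE_FILE set: fs blocks -> 512-byte units */

        printf("plain: %llu, huge-file: %llu (512-byte units)\n",
               (unsigned long long)plain, (unsigned long long)huge);
        return 0;
}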
4865 4865
4866 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4866 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4867 { 4867 {
4868 struct ext4_iloc iloc; 4868 struct ext4_iloc iloc;
4869 struct ext4_inode *raw_inode; 4869 struct ext4_inode *raw_inode;
4870 struct ext4_inode_info *ei; 4870 struct ext4_inode_info *ei;
4871 struct inode *inode; 4871 struct inode *inode;
4872 journal_t *journal = EXT4_SB(sb)->s_journal; 4872 journal_t *journal = EXT4_SB(sb)->s_journal;
4873 long ret; 4873 long ret;
4874 int block; 4874 int block;
4875 4875
4876 inode = iget_locked(sb, ino); 4876 inode = iget_locked(sb, ino);
4877 if (!inode) 4877 if (!inode)
4878 return ERR_PTR(-ENOMEM); 4878 return ERR_PTR(-ENOMEM);
4879 if (!(inode->i_state & I_NEW)) 4879 if (!(inode->i_state & I_NEW))
4880 return inode; 4880 return inode;
4881 4881
4882 ei = EXT4_I(inode); 4882 ei = EXT4_I(inode);
4883 iloc.bh = NULL; 4883 iloc.bh = NULL;
4884 4884
4885 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4885 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4886 if (ret < 0) 4886 if (ret < 0)
4887 goto bad_inode; 4887 goto bad_inode;
4888 raw_inode = ext4_raw_inode(&iloc); 4888 raw_inode = ext4_raw_inode(&iloc);
4889 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4889 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4890 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4890 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4891 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4891 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
4892 if (!(test_opt(inode->i_sb, NO_UID32))) { 4892 if (!(test_opt(inode->i_sb, NO_UID32))) {
4893 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4893 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4894 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4894 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
4895 } 4895 }
4896 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4896 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4897 4897
4898 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 4898 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
4899 ei->i_dir_start_lookup = 0; 4899 ei->i_dir_start_lookup = 0;
4900 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4900 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4901 /* We now have enough fields to check if the inode was active or not. 4901 /* We now have enough fields to check if the inode was active or not.
4902 * This is needed because nfsd might try to access dead inodes 4902 * This is needed because nfsd might try to access dead inodes
4903 * the test is the same one that e2fsck uses 4903 * the test is the same one that e2fsck uses
4904 * NeilBrown 1999oct15 4904 * NeilBrown 1999oct15
4905 */ 4905 */
4906 if (inode->i_nlink == 0) { 4906 if (inode->i_nlink == 0) {
4907 if (inode->i_mode == 0 || 4907 if (inode->i_mode == 0 ||
4908 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4908 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4909 /* this inode is deleted */ 4909 /* this inode is deleted */
4910 ret = -ESTALE; 4910 ret = -ESTALE;
4911 goto bad_inode; 4911 goto bad_inode;
4912 } 4912 }
4913 /* The only unlinked inodes we let through here have 4913 /* The only unlinked inodes we let through here have
4914 * valid i_mode and are being read by the orphan 4914 * valid i_mode and are being read by the orphan
4915 * recovery code: that's fine, we're about to complete 4915 * recovery code: that's fine, we're about to complete
4916 * the process of deleting those. */ 4916 * the process of deleting those. */
4917 } 4917 }
4918 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4918 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4919 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4919 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4920 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4920 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4921 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) 4921 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
4922 ei->i_file_acl |= 4922 ei->i_file_acl |=
4923 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4923 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4924 inode->i_size = ext4_isize(raw_inode); 4924 inode->i_size = ext4_isize(raw_inode);
4925 ei->i_disksize = inode->i_size; 4925 ei->i_disksize = inode->i_size;
4926 #ifdef CONFIG_QUOTA 4926 #ifdef CONFIG_QUOTA
4927 ei->i_reserved_quota = 0; 4927 ei->i_reserved_quota = 0;
4928 #endif 4928 #endif
4929 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4929 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4930 ei->i_block_group = iloc.block_group; 4930 ei->i_block_group = iloc.block_group;
4931 ei->i_last_alloc_group = ~0; 4931 ei->i_last_alloc_group = ~0;
4932 /* 4932 /*
4933 * NOTE! The in-memory inode i_data array is in little-endian order 4933 * NOTE! The in-memory inode i_data array is in little-endian order
4934 * even on big-endian machines: we do NOT byteswap the block numbers! 4934 * even on big-endian machines: we do NOT byteswap the block numbers!
4935 */ 4935 */
4936 for (block = 0; block < EXT4_N_BLOCKS; block++) 4936 for (block = 0; block < EXT4_N_BLOCKS; block++)
4937 ei->i_data[block] = raw_inode->i_block[block]; 4937 ei->i_data[block] = raw_inode->i_block[block];
4938 INIT_LIST_HEAD(&ei->i_orphan); 4938 INIT_LIST_HEAD(&ei->i_orphan);
4939 4939
4940 /* 4940 /*
4941 * Set transaction id's of transactions that have to be committed 4941 * Set transaction id's of transactions that have to be committed
4942 * to finish f[data]sync. We set them to currently running transaction 4942 * to finish f[data]sync. We set them to currently running transaction
4943 * as we cannot be sure that the inode or some of its metadata isn't 4943 * as we cannot be sure that the inode or some of its metadata isn't
4944 * part of the transaction - the inode could have been reclaimed and 4944 * part of the transaction - the inode could have been reclaimed and
4945 * now it is reread from disk. 4945 * now it is reread from disk.
4946 */ 4946 */
4947 if (journal) { 4947 if (journal) {
4948 transaction_t *transaction; 4948 transaction_t *transaction;
4949 tid_t tid; 4949 tid_t tid;
4950 4950
4951 read_lock(&journal->j_state_lock); 4951 read_lock(&journal->j_state_lock);
4952 if (journal->j_running_transaction) 4952 if (journal->j_running_transaction)
4953 transaction = journal->j_running_transaction; 4953 transaction = journal->j_running_transaction;
4954 else 4954 else
4955 transaction = journal->j_committing_transaction; 4955 transaction = journal->j_committing_transaction;
4956 if (transaction) 4956 if (transaction)
4957 tid = transaction->t_tid; 4957 tid = transaction->t_tid;
4958 else 4958 else
4959 tid = journal->j_commit_sequence; 4959 tid = journal->j_commit_sequence;
4960 read_unlock(&journal->j_state_lock); 4960 read_unlock(&journal->j_state_lock);
4961 ei->i_sync_tid = tid; 4961 ei->i_sync_tid = tid;
4962 ei->i_datasync_tid = tid; 4962 ei->i_datasync_tid = tid;
4963 } 4963 }
4964 4964
4965 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4965 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4966 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4966 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4967 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4967 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4968 EXT4_INODE_SIZE(inode->i_sb)) { 4968 EXT4_INODE_SIZE(inode->i_sb)) {
4969 ret = -EIO; 4969 ret = -EIO;
4970 goto bad_inode; 4970 goto bad_inode;
4971 } 4971 }
4972 if (ei->i_extra_isize == 0) { 4972 if (ei->i_extra_isize == 0) {
4973 /* The extra space is currently unused. Use it. */ 4973 /* The extra space is currently unused. Use it. */
4974 ei->i_extra_isize = sizeof(struct ext4_inode) - 4974 ei->i_extra_isize = sizeof(struct ext4_inode) -
4975 EXT4_GOOD_OLD_INODE_SIZE; 4975 EXT4_GOOD_OLD_INODE_SIZE;
4976 } else { 4976 } else {
4977 __le32 *magic = (void *)raw_inode + 4977 __le32 *magic = (void *)raw_inode +
4978 EXT4_GOOD_OLD_INODE_SIZE + 4978 EXT4_GOOD_OLD_INODE_SIZE +
4979 ei->i_extra_isize; 4979 ei->i_extra_isize;
4980 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4980 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4981 ext4_set_inode_state(inode, EXT4_STATE_XATTR); 4981 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4982 } 4982 }
4983 } else 4983 } else
4984 ei->i_extra_isize = 0; 4984 ei->i_extra_isize = 0;
4985 4985
4986 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4986 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4987 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4987 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4988 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4988 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4989 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4989 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4990 4990
4991 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4991 inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4992 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4992 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4993 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4993 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4994 inode->i_version |= 4994 inode->i_version |=
4995 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4995 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4996 } 4996 }
4997 4997
4998 ret = 0; 4998 ret = 0;
4999 if (ei->i_file_acl && 4999 if (ei->i_file_acl &&
5000 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5000 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5001 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", 5001 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5002 ei->i_file_acl); 5002 ei->i_file_acl);
5003 ret = -EIO; 5003 ret = -EIO;
5004 goto bad_inode; 5004 goto bad_inode;
5005 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5005 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5006 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5006 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5007 (S_ISLNK(inode->i_mode) && 5007 (S_ISLNK(inode->i_mode) &&
5008 !ext4_inode_is_fast_symlink(inode))) 5008 !ext4_inode_is_fast_symlink(inode)))
5009 /* Validate extent which is part of inode */ 5009 /* Validate extent which is part of inode */
5010 ret = ext4_ext_check_inode(inode); 5010 ret = ext4_ext_check_inode(inode);
5011 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5011 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5012 (S_ISLNK(inode->i_mode) && 5012 (S_ISLNK(inode->i_mode) &&
5013 !ext4_inode_is_fast_symlink(inode))) { 5013 !ext4_inode_is_fast_symlink(inode))) {
5014 /* Validate block references which are part of inode */ 5014 /* Validate block references which are part of inode */
5015 ret = ext4_check_inode_blockref(inode); 5015 ret = ext4_check_inode_blockref(inode);
5016 } 5016 }
5017 if (ret) 5017 if (ret)
5018 goto bad_inode; 5018 goto bad_inode;
5019 5019
5020 if (S_ISREG(inode->i_mode)) { 5020 if (S_ISREG(inode->i_mode)) {
5021 inode->i_op = &ext4_file_inode_operations; 5021 inode->i_op = &ext4_file_inode_operations;
5022 inode->i_fop = &ext4_file_operations; 5022 inode->i_fop = &ext4_file_operations;
5023 ext4_set_aops(inode); 5023 ext4_set_aops(inode);
5024 } else if (S_ISDIR(inode->i_mode)) { 5024 } else if (S_ISDIR(inode->i_mode)) {
5025 inode->i_op = &ext4_dir_inode_operations; 5025 inode->i_op = &ext4_dir_inode_operations;
5026 inode->i_fop = &ext4_dir_operations; 5026 inode->i_fop = &ext4_dir_operations;
5027 } else if (S_ISLNK(inode->i_mode)) { 5027 } else if (S_ISLNK(inode->i_mode)) {
5028 if (ext4_inode_is_fast_symlink(inode)) { 5028 if (ext4_inode_is_fast_symlink(inode)) {
5029 inode->i_op = &ext4_fast_symlink_inode_operations; 5029 inode->i_op = &ext4_fast_symlink_inode_operations;
5030 nd_terminate_link(ei->i_data, inode->i_size, 5030 nd_terminate_link(ei->i_data, inode->i_size,
5031 sizeof(ei->i_data) - 1); 5031 sizeof(ei->i_data) - 1);
5032 } else { 5032 } else {
5033 inode->i_op = &ext4_symlink_inode_operations; 5033 inode->i_op = &ext4_symlink_inode_operations;
5034 ext4_set_aops(inode); 5034 ext4_set_aops(inode);
5035 } 5035 }
5036 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 5036 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
5037 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 5037 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
5038 inode->i_op = &ext4_special_inode_operations; 5038 inode->i_op = &ext4_special_inode_operations;
5039 if (raw_inode->i_block[0]) 5039 if (raw_inode->i_block[0])
5040 init_special_inode(inode, inode->i_mode, 5040 init_special_inode(inode, inode->i_mode,
5041 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 5041 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
5042 else 5042 else
5043 init_special_inode(inode, inode->i_mode, 5043 init_special_inode(inode, inode->i_mode,
5044 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5044 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5045 } else { 5045 } else {
5046 ret = -EIO; 5046 ret = -EIO;
5047 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); 5047 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5048 goto bad_inode; 5048 goto bad_inode;
5049 } 5049 }
5050 brelse(iloc.bh); 5050 brelse(iloc.bh);
5051 ext4_set_inode_flags(inode); 5051 ext4_set_inode_flags(inode);
5052 unlock_new_inode(inode); 5052 unlock_new_inode(inode);
5053 return inode; 5053 return inode;
5054 5054
5055 bad_inode: 5055 bad_inode:
5056 brelse(iloc.bh); 5056 brelse(iloc.bh);
5057 iget_failed(inode); 5057 iget_failed(inode);
5058 return ERR_PTR(ret); 5058 return ERR_PTR(ret);
5059 } 5059 }
5060 5060
5061 static int ext4_inode_blocks_set(handle_t *handle, 5061 static int ext4_inode_blocks_set(handle_t *handle,
5062 struct ext4_inode *raw_inode, 5062 struct ext4_inode *raw_inode,
5063 struct ext4_inode_info *ei) 5063 struct ext4_inode_info *ei)
5064 { 5064 {
5065 struct inode *inode = &(ei->vfs_inode); 5065 struct inode *inode = &(ei->vfs_inode);
5066 u64 i_blocks = inode->i_blocks; 5066 u64 i_blocks = inode->i_blocks;
5067 struct super_block *sb = inode->i_sb; 5067 struct super_block *sb = inode->i_sb;
5068 5068
5069 if (i_blocks <= ~0U) { 5069 if (i_blocks <= ~0U) {
5070 /* 5070 /*
5071 * i_blocks can be represented in a 32 bit variable 5071 * i_blocks can be represented in a 32 bit variable
5072 * as a multiple of 512 bytes 5072 * as a multiple of 512 bytes
5073 */ 5073 */
5074 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5074 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5075 raw_inode->i_blocks_high = 0; 5075 raw_inode->i_blocks_high = 0;
5076 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5076 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5077 return 0; 5077 return 0;
5078 } 5078 }
5079 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 5079 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
5080 return -EFBIG; 5080 return -EFBIG;
5081 5081
5082 if (i_blocks <= 0xffffffffffffULL) { 5082 if (i_blocks <= 0xffffffffffffULL) {
5083 /* 5083 /*
5084 * i_blocks can be represented in a 48 bit variable 5084 * i_blocks can be represented in a 48 bit variable
5085 * as a multiple of 512 bytes 5085 * as a multiple of 512 bytes
5086 */ 5086 */
5087 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5087 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5088 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5088 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5089 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5089 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5090 } else { 5090 } else {
5091 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5091 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5092 /* i_blocks is stored in file system block size units */ 5092 /* i_blocks is stored in file system block size units */
5093 i_blocks = i_blocks >> (inode->i_blkbits - 9); 5093 i_blocks = i_blocks >> (inode->i_blkbits - 9);
5094 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5094 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5095 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5095 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5096 } 5096 }
5097 return 0; 5097 return 0;
5098 } 5098 }
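Conversely (again only a sketch, not kernel code): when writing the inode, the count is stored in whichever representation fits — 32 bits of 512-byte sectors, 48 bits of 512-byte sectors, or 48 bits of filesystem blocks with the huge-file inode flag set. The fields and function below are hypothetical stand-ins, the first two cases are collapsed into one store, and the huge_file feature check from the real code is omitted:

#include <stdint.h>
#include <stdio.h>

struct demo_raw_blocks {
	uint32_t i_blocks_lo;	/* stand-ins for the on-disk i_blocks fields */
	uint16_t i_blocks_high;
};

/* Store the count; returns 1 when the huge-file flag would have to be set
 * because the value is kept in filesystem-block units rather than sectors. */
static int demo_blocks_set(struct demo_raw_blocks *raw, uint64_t sectors,
			   unsigned int blkbits)
{
	int huge = 0;

	if (sectors > 0xffffffffffffULL) {	/* does not fit in 48 bits */
		sectors >>= (blkbits - 9);	/* keep it in fs-block units */
		huge = 1;
	}
	raw->i_blocks_lo = (uint32_t)sectors;
	raw->i_blocks_high = (uint16_t)(sectors >> 32);
	return huge;
}

int main(void)
{
	struct demo_raw_blocks raw;
	int huge = demo_blocks_set(&raw, 1ULL << 50, 12);	/* 4 KiB blocks */

	printf("lo=%lu high=%u huge=%d\n", (unsigned long)raw.i_blocks_lo,
	       (unsigned)raw.i_blocks_high, huge);
	return 0;
}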
5099 5099
5100 /* 5100 /*
5101 * Post the struct inode info into an on-disk inode location in the 5101 * Post the struct inode info into an on-disk inode location in the
5102 * buffer-cache. This gobbles the caller's reference to the 5102 * buffer-cache. This gobbles the caller's reference to the
5103 * buffer_head in the inode location struct. 5103 * buffer_head in the inode location struct.
5104 * 5104 *
5105 * The caller must have write access to iloc->bh. 5105 * The caller must have write access to iloc->bh.
5106 */ 5106 */
5107 static int ext4_do_update_inode(handle_t *handle, 5107 static int ext4_do_update_inode(handle_t *handle,
5108 struct inode *inode, 5108 struct inode *inode,
5109 struct ext4_iloc *iloc) 5109 struct ext4_iloc *iloc)
5110 { 5110 {
5111 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5111 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
5112 struct ext4_inode_info *ei = EXT4_I(inode); 5112 struct ext4_inode_info *ei = EXT4_I(inode);
5113 struct buffer_head *bh = iloc->bh; 5113 struct buffer_head *bh = iloc->bh;
5114 int err = 0, rc, block; 5114 int err = 0, rc, block;
5115 5115
5116 /* For fields not tracked in the in-memory inode, 5116 /* For fields not tracked in the in-memory inode,
5117 * initialise them to zero for new inodes. */ 5117 * initialise them to zero for new inodes. */
5118 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) 5118 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5119 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5119 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5120 5120
5121 ext4_get_inode_flags(ei); 5121 ext4_get_inode_flags(ei);
5122 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 5122 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
5123 if (!(test_opt(inode->i_sb, NO_UID32))) { 5123 if (!(test_opt(inode->i_sb, NO_UID32))) {
5124 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 5124 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
5125 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 5125 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
5126 /* 5126 /*
5127 * Fix up interoperability with old kernels. Otherwise, old inodes get 5127 * Fix up interoperability with old kernels. Otherwise, old inodes get
5128 * re-used with the upper 16 bits of the uid/gid intact 5128 * re-used with the upper 16 bits of the uid/gid intact
5129 */ 5129 */
5130 if (!ei->i_dtime) { 5130 if (!ei->i_dtime) {
5131 raw_inode->i_uid_high = 5131 raw_inode->i_uid_high =
5132 cpu_to_le16(high_16_bits(inode->i_uid)); 5132 cpu_to_le16(high_16_bits(inode->i_uid));
5133 raw_inode->i_gid_high = 5133 raw_inode->i_gid_high =
5134 cpu_to_le16(high_16_bits(inode->i_gid)); 5134 cpu_to_le16(high_16_bits(inode->i_gid));
5135 } else { 5135 } else {
5136 raw_inode->i_uid_high = 0; 5136 raw_inode->i_uid_high = 0;
5137 raw_inode->i_gid_high = 0; 5137 raw_inode->i_gid_high = 0;
5138 } 5138 }
5139 } else { 5139 } else {
5140 raw_inode->i_uid_low = 5140 raw_inode->i_uid_low =
5141 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 5141 cpu_to_le16(fs_high2lowuid(inode->i_uid));
5142 raw_inode->i_gid_low = 5142 raw_inode->i_gid_low =
5143 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 5143 cpu_to_le16(fs_high2lowgid(inode->i_gid));
5144 raw_inode->i_uid_high = 0; 5144 raw_inode->i_uid_high = 0;
5145 raw_inode->i_gid_high = 0; 5145 raw_inode->i_gid_high = 0;
5146 } 5146 }
5147 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 5147 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
5148 5148
5149 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 5149 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
5150 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 5150 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
5151 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 5151 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
5152 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 5152 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
5153 5153
5154 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5154 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5155 goto out_brelse; 5155 goto out_brelse;
5156 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5156 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5157 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 5157 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5158 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5158 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5159 cpu_to_le32(EXT4_OS_HURD)) 5159 cpu_to_le32(EXT4_OS_HURD))
5160 raw_inode->i_file_acl_high = 5160 raw_inode->i_file_acl_high =
5161 cpu_to_le16(ei->i_file_acl >> 32); 5161 cpu_to_le16(ei->i_file_acl >> 32);
5162 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 5162 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
5163 ext4_isize_set(raw_inode, ei->i_disksize); 5163 ext4_isize_set(raw_inode, ei->i_disksize);
5164 if (ei->i_disksize > 0x7fffffffULL) { 5164 if (ei->i_disksize > 0x7fffffffULL) {
5165 struct super_block *sb = inode->i_sb; 5165 struct super_block *sb = inode->i_sb;
5166 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 5166 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
5167 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 5167 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
5168 EXT4_SB(sb)->s_es->s_rev_level == 5168 EXT4_SB(sb)->s_es->s_rev_level ==
5169 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 5169 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
5170 /* If this is the first large file 5170 /* If this is the first large file
5171 * created, add a flag to the superblock. 5171 * created, add a flag to the superblock.
5172 */ 5172 */
5173 err = ext4_journal_get_write_access(handle, 5173 err = ext4_journal_get_write_access(handle,
5174 EXT4_SB(sb)->s_sbh); 5174 EXT4_SB(sb)->s_sbh);
5175 if (err) 5175 if (err)
5176 goto out_brelse; 5176 goto out_brelse;
5177 ext4_update_dynamic_rev(sb); 5177 ext4_update_dynamic_rev(sb);
5178 EXT4_SET_RO_COMPAT_FEATURE(sb, 5178 EXT4_SET_RO_COMPAT_FEATURE(sb,
5179 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5179 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5180 sb->s_dirt = 1; 5180 sb->s_dirt = 1;
5181 ext4_handle_sync(handle); 5181 ext4_handle_sync(handle);
5182 err = ext4_handle_dirty_metadata(handle, NULL, 5182 err = ext4_handle_dirty_metadata(handle, NULL,
5183 EXT4_SB(sb)->s_sbh); 5183 EXT4_SB(sb)->s_sbh);
5184 } 5184 }
5185 } 5185 }
5186 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 5186 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
5187 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 5187 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
5188 if (old_valid_dev(inode->i_rdev)) { 5188 if (old_valid_dev(inode->i_rdev)) {
5189 raw_inode->i_block[0] = 5189 raw_inode->i_block[0] =
5190 cpu_to_le32(old_encode_dev(inode->i_rdev)); 5190 cpu_to_le32(old_encode_dev(inode->i_rdev));
5191 raw_inode->i_block[1] = 0; 5191 raw_inode->i_block[1] = 0;
5192 } else { 5192 } else {
5193 raw_inode->i_block[0] = 0; 5193 raw_inode->i_block[0] = 0;
5194 raw_inode->i_block[1] = 5194 raw_inode->i_block[1] =
5195 cpu_to_le32(new_encode_dev(inode->i_rdev)); 5195 cpu_to_le32(new_encode_dev(inode->i_rdev));
5196 raw_inode->i_block[2] = 0; 5196 raw_inode->i_block[2] = 0;
5197 } 5197 }
5198 } else 5198 } else
5199 for (block = 0; block < EXT4_N_BLOCKS; block++) 5199 for (block = 0; block < EXT4_N_BLOCKS; block++)
5200 raw_inode->i_block[block] = ei->i_data[block]; 5200 raw_inode->i_block[block] = ei->i_data[block];
5201 5201
5202 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 5202 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
5203 if (ei->i_extra_isize) { 5203 if (ei->i_extra_isize) {
5204 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 5204 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
5205 raw_inode->i_version_hi = 5205 raw_inode->i_version_hi =
5206 cpu_to_le32(inode->i_version >> 32); 5206 cpu_to_le32(inode->i_version >> 32);
5207 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5207 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
5208 } 5208 }
5209 5209
5210 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5210 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5211 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 5211 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5212 if (!err) 5212 if (!err)
5213 err = rc; 5213 err = rc;
5214 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 5214 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5215 5215
5216 ext4_update_inode_fsync_trans(handle, inode, 0); 5216 ext4_update_inode_fsync_trans(handle, inode, 0);
5217 out_brelse: 5217 out_brelse:
5218 brelse(bh); 5218 brelse(bh);
5219 ext4_std_error(inode->i_sb, err); 5219 ext4_std_error(inode->i_sb, err);
5220 return err; 5220 return err;
5221 } 5221 }
5222 5222
5223 /* 5223 /*
5224 * ext4_write_inode() 5224 * ext4_write_inode()
5225 * 5225 *
5226 * We are called from a few places: 5226 * We are called from a few places:
5227 * 5227 *
5228 * - Within generic_file_write() for O_SYNC files. 5228 * - Within generic_file_write() for O_SYNC files.
5229 * Here, there will be no transaction running. We wait for any running 5229 * Here, there will be no transaction running. We wait for any running
5230 * transaction to commit. 5230 * transaction to commit.
5231 * 5231 *
5232 * - Within sys_sync(), kupdate and such. 5232 * - Within sys_sync(), kupdate and such.
5233 * We wait on commit, if told to. 5233 * We wait on commit, if told to.
5234 * 5234 *
5235 * - Within prune_icache() (PF_MEMALLOC == true) 5235 * - Within prune_icache() (PF_MEMALLOC == true)
5236 * Here we simply return. We can't afford to block kswapd on the 5236 * Here we simply return. We can't afford to block kswapd on the
5237 * journal commit. 5237 * journal commit.
5238 * 5238 *
5239 * In all cases it is actually safe for us to return without doing anything, 5239 * In all cases it is actually safe for us to return without doing anything,
5240 * because the inode has been copied into a raw inode buffer in 5240 * because the inode has been copied into a raw inode buffer in
5241 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 5241 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
5242 * knfsd. 5242 * knfsd.
5243 * 5243 *
5244 * Note that we are absolutely dependent upon all inode dirtiers doing the 5244 * Note that we are absolutely dependent upon all inode dirtiers doing the
5245 * right thing: they *must* call mark_inode_dirty() after dirtying info in 5245 * right thing: they *must* call mark_inode_dirty() after dirtying info in
5246 * which we are interested. 5246 * which we are interested.
5247 * 5247 *
5248 * It would be a bug for them to not do this. The code: 5248 * It would be a bug for them to not do this. The code:
5249 * 5249 *
5250 * mark_inode_dirty(inode) 5250 * mark_inode_dirty(inode)
5251 * stuff(); 5251 * stuff();
5252 * inode->i_size = expr; 5252 * inode->i_size = expr;
5253 * 5253 *
5254 * is in error because a kswapd-driven write_inode() could occur while 5254 * is in error because a kswapd-driven write_inode() could occur while
5255 * `stuff()' is running, and the new i_size will be lost. Plus the inode 5255 * `stuff()' is running, and the new i_size will be lost. Plus the inode
5256 * will no longer be on the superblock's dirty inode list. 5256 * will no longer be on the superblock's dirty inode list.
5257 */ 5257 */
5258 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) 5258 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5259 { 5259 {
5260 int err; 5260 int err;
5261 5261
5262 if (current->flags & PF_MEMALLOC) 5262 if (current->flags & PF_MEMALLOC)
5263 return 0; 5263 return 0;
5264 5264
5265 if (EXT4_SB(inode->i_sb)->s_journal) { 5265 if (EXT4_SB(inode->i_sb)->s_journal) {
5266 if (ext4_journal_current_handle()) { 5266 if (ext4_journal_current_handle()) {
5267 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 5267 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
5268 dump_stack(); 5268 dump_stack();
5269 return -EIO; 5269 return -EIO;
5270 } 5270 }
5271 5271
5272 if (wbc->sync_mode != WB_SYNC_ALL) 5272 if (wbc->sync_mode != WB_SYNC_ALL)
5273 return 0; 5273 return 0;
5274 5274
5275 err = ext4_force_commit(inode->i_sb); 5275 err = ext4_force_commit(inode->i_sb);
5276 } else { 5276 } else {
5277 struct ext4_iloc iloc; 5277 struct ext4_iloc iloc;
5278 5278
5279 err = __ext4_get_inode_loc(inode, &iloc, 0); 5279 err = __ext4_get_inode_loc(inode, &iloc, 0);
5280 if (err) 5280 if (err)
5281 return err; 5281 return err;
5282 if (wbc->sync_mode == WB_SYNC_ALL) 5282 if (wbc->sync_mode == WB_SYNC_ALL)
5283 sync_dirty_buffer(iloc.bh); 5283 sync_dirty_buffer(iloc.bh);
5284 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5284 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5285 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, 5285 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5286 "IO error syncing inode"); 5286 "IO error syncing inode");
5287 err = -EIO; 5287 err = -EIO;
5288 } 5288 }
5289 brelse(iloc.bh); 5289 brelse(iloc.bh);
5290 } 5290 }
5291 return err; 5291 return err;
5292 } 5292 }
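To restate the ordering rule from the comment above ext4_write_inode() as pseudocode (stuff() and expr are the placeholders from that comment, not real functions): every field of interest must be updated before the inode is marked dirty, otherwise a concurrent kswapd-driven write_inode() can write out a stale copy and the inode drops off the dirty list.

	/* buggy order, quoted from the comment above: */
	mark_inode_dirty(inode);
	stuff();
	inode->i_size = expr;

	/* correct order: dirty the inode only after updating it */
	stuff();
	inode->i_size = expr;
	mark_inode_dirty(inode);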
5293 5293
5294 /* 5294 /*
5295 * ext4_setattr() 5295 * ext4_setattr()
5296 * 5296 *
5297 * Called from notify_change. 5297 * Called from notify_change.
5298 * 5298 *
5299 * We want to trap VFS attempts to truncate the file as soon as 5299 * We want to trap VFS attempts to truncate the file as soon as
5300 * possible. In particular, we want to make sure that when the VFS 5300 * possible. In particular, we want to make sure that when the VFS
5301 * shrinks i_size, we put the inode on the orphan list and modify 5301 * shrinks i_size, we put the inode on the orphan list and modify
5302 * i_disksize immediately, so that during the subsequent flushing of 5302 * i_disksize immediately, so that during the subsequent flushing of
5303 * dirty pages and freeing of disk blocks, we can guarantee that any 5303 * dirty pages and freeing of disk blocks, we can guarantee that any
5304 * commit will leave the blocks being flushed in an unused state on 5304 * commit will leave the blocks being flushed in an unused state on
5305 * disk. (On recovery, the inode will get truncated and the blocks will 5305 * disk. (On recovery, the inode will get truncated and the blocks will
5306 * be freed, so we have a strong guarantee that no future commit will 5306 * be freed, so we have a strong guarantee that no future commit will
5307 * leave these blocks visible to the user.) 5307 * leave these blocks visible to the user.)
5308 * 5308 *
5309 * Another thing we have to assure is that if we are in ordered mode 5309 * Another thing we have to assure is that if we are in ordered mode
5310 * and the inode is still attached to the committing transaction, we must 5310 * and the inode is still attached to the committing transaction, we must
5311 * start writeout of all the dirty pages which are being truncated. 5311 * start writeout of all the dirty pages which are being truncated.
5312 * This way we are sure that all the data written in the previous 5312 * This way we are sure that all the data written in the previous
5313 * transaction are already on disk (truncate waits for pages under 5313 * transaction are already on disk (truncate waits for pages under
5314 * writeback). 5314 * writeback).
5315 * 5315 *
5316 * Called with inode->i_mutex down. 5316 * Called with inode->i_mutex down.
5317 */ 5317 */
5318 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 5318 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5319 { 5319 {
5320 struct inode *inode = dentry->d_inode; 5320 struct inode *inode = dentry->d_inode;
5321 int error, rc = 0; 5321 int error, rc = 0;
5322 int orphan = 0; 5322 int orphan = 0;
5323 const unsigned int ia_valid = attr->ia_valid; 5323 const unsigned int ia_valid = attr->ia_valid;
5324 5324
5325 error = inode_change_ok(inode, attr); 5325 error = inode_change_ok(inode, attr);
5326 if (error) 5326 if (error)
5327 return error; 5327 return error;
5328 5328
5329 if (is_quota_modification(inode, attr)) 5329 if (is_quota_modification(inode, attr))
5330 dquot_initialize(inode); 5330 dquot_initialize(inode);
5331 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5331 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5332 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5332 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5333 handle_t *handle; 5333 handle_t *handle;
5334 5334
5335 /* (user+group)*(old+new) structure, inode write (sb, 5335 /* (user+group)*(old+new) structure, inode write (sb,
5336 * inode block, ? - but truncate inode update has it) */ 5336 * inode block, ? - but truncate inode update has it) */
5337 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ 5337 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
5338 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); 5338 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
5339 if (IS_ERR(handle)) { 5339 if (IS_ERR(handle)) {
5340 error = PTR_ERR(handle); 5340 error = PTR_ERR(handle);
5341 goto err_out; 5341 goto err_out;
5342 } 5342 }
5343 error = dquot_transfer(inode, attr); 5343 error = dquot_transfer(inode, attr);
5344 if (error) { 5344 if (error) {
5345 ext4_journal_stop(handle); 5345 ext4_journal_stop(handle);
5346 return error; 5346 return error;
5347 } 5347 }
5348 /* Update corresponding info in inode so that everything is in 5348 /* Update corresponding info in inode so that everything is in
5349 * one transaction */ 5349 * one transaction */
5350 if (attr->ia_valid & ATTR_UID) 5350 if (attr->ia_valid & ATTR_UID)
5351 inode->i_uid = attr->ia_uid; 5351 inode->i_uid = attr->ia_uid;
5352 if (attr->ia_valid & ATTR_GID) 5352 if (attr->ia_valid & ATTR_GID)
5353 inode->i_gid = attr->ia_gid; 5353 inode->i_gid = attr->ia_gid;
5354 error = ext4_mark_inode_dirty(handle, inode); 5354 error = ext4_mark_inode_dirty(handle, inode);
5355 ext4_journal_stop(handle); 5355 ext4_journal_stop(handle);
5356 } 5356 }
5357 5357
5358 if (attr->ia_valid & ATTR_SIZE) { 5358 if (attr->ia_valid & ATTR_SIZE) {
5359 inode_dio_wait(inode); 5359 inode_dio_wait(inode);
5360 5360
5361 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 5361 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5362 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5362 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5363 5363
5364 if (attr->ia_size > sbi->s_bitmap_maxbytes) 5364 if (attr->ia_size > sbi->s_bitmap_maxbytes)
5365 return -EFBIG; 5365 return -EFBIG;
5366 } 5366 }
5367 } 5367 }
5368 5368
5369 if (S_ISREG(inode->i_mode) && 5369 if (S_ISREG(inode->i_mode) &&
5370 attr->ia_valid & ATTR_SIZE && 5370 attr->ia_valid & ATTR_SIZE &&
5371 (attr->ia_size < inode->i_size)) { 5371 (attr->ia_size < inode->i_size)) {
5372 handle_t *handle; 5372 handle_t *handle;
5373 5373
5374 handle = ext4_journal_start(inode, 3); 5374 handle = ext4_journal_start(inode, 3);
5375 if (IS_ERR(handle)) { 5375 if (IS_ERR(handle)) {
5376 error = PTR_ERR(handle); 5376 error = PTR_ERR(handle);
5377 goto err_out; 5377 goto err_out;
5378 } 5378 }
5379 if (ext4_handle_valid(handle)) { 5379 if (ext4_handle_valid(handle)) {
5380 error = ext4_orphan_add(handle, inode); 5380 error = ext4_orphan_add(handle, inode);
5381 orphan = 1; 5381 orphan = 1;
5382 } 5382 }
5383 EXT4_I(inode)->i_disksize = attr->ia_size; 5383 EXT4_I(inode)->i_disksize = attr->ia_size;
5384 rc = ext4_mark_inode_dirty(handle, inode); 5384 rc = ext4_mark_inode_dirty(handle, inode);
5385 if (!error) 5385 if (!error)
5386 error = rc; 5386 error = rc;
5387 ext4_journal_stop(handle); 5387 ext4_journal_stop(handle);
5388 5388
5389 if (ext4_should_order_data(inode)) { 5389 if (ext4_should_order_data(inode)) {
5390 error = ext4_begin_ordered_truncate(inode, 5390 error = ext4_begin_ordered_truncate(inode,
5391 attr->ia_size); 5391 attr->ia_size);
5392 if (error) { 5392 if (error) {
5393 /* Do as much error cleanup as possible */ 5393 /* Do as much error cleanup as possible */
5394 handle = ext4_journal_start(inode, 3); 5394 handle = ext4_journal_start(inode, 3);
5395 if (IS_ERR(handle)) { 5395 if (IS_ERR(handle)) {
5396 ext4_orphan_del(NULL, inode); 5396 ext4_orphan_del(NULL, inode);
5397 goto err_out; 5397 goto err_out;
5398 } 5398 }
5399 ext4_orphan_del(handle, inode); 5399 ext4_orphan_del(handle, inode);
5400 orphan = 0; 5400 orphan = 0;
5401 ext4_journal_stop(handle); 5401 ext4_journal_stop(handle);
5402 goto err_out; 5402 goto err_out;
5403 } 5403 }
5404 } 5404 }
5405 } 5405 }
5406 5406
5407 if (attr->ia_valid & ATTR_SIZE) { 5407 if (attr->ia_valid & ATTR_SIZE) {
5408 if (attr->ia_size != i_size_read(inode)) { 5408 if (attr->ia_size != i_size_read(inode)) {
5409 truncate_setsize(inode, attr->ia_size); 5409 truncate_setsize(inode, attr->ia_size);
5410 ext4_truncate(inode); 5410 ext4_truncate(inode);
5411 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 5411 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5412 ext4_truncate(inode); 5412 ext4_truncate(inode);
5413 } 5413 }
5414 5414
5415 if (!rc) { 5415 if (!rc) {
5416 setattr_copy(inode, attr); 5416 setattr_copy(inode, attr);
5417 mark_inode_dirty(inode); 5417 mark_inode_dirty(inode);
5418 } 5418 }
5419 5419
5420 /* 5420 /*
5421 * If the call to ext4_truncate failed to get a transaction handle at 5421 * If the call to ext4_truncate failed to get a transaction handle at
5422 * all, we need to clean up the in-core orphan list manually. 5422 * all, we need to clean up the in-core orphan list manually.
5423 */ 5423 */
5424 if (orphan && inode->i_nlink) 5424 if (orphan && inode->i_nlink)
5425 ext4_orphan_del(NULL, inode); 5425 ext4_orphan_del(NULL, inode);
5426 5426
5427 if (!rc && (ia_valid & ATTR_MODE)) 5427 if (!rc && (ia_valid & ATTR_MODE))
5428 rc = ext4_acl_chmod(inode); 5428 rc = ext4_acl_chmod(inode);
5429 5429
5430 err_out: 5430 err_out:
5431 ext4_std_error(inode->i_sb, error); 5431 ext4_std_error(inode->i_sb, error);
5432 if (!error) 5432 if (!error)
5433 error = rc; 5433 error = rc;
5434 return error; 5434 return error;
5435 } 5435 }
5436 5436
5437 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 5437 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5438 struct kstat *stat) 5438 struct kstat *stat)
5439 { 5439 {
5440 struct inode *inode; 5440 struct inode *inode;
5441 unsigned long delalloc_blocks; 5441 unsigned long delalloc_blocks;
5442 5442
5443 inode = dentry->d_inode; 5443 inode = dentry->d_inode;
5444 generic_fillattr(inode, stat); 5444 generic_fillattr(inode, stat);
5445 5445
5446 /* 5446 /*
5447 * We can't update i_blocks if the block allocation is delayed; 5447 * We can't update i_blocks if the block allocation is delayed;
5448 * otherwise, in the case of a system crash before the real block 5448 * otherwise, in the case of a system crash before the real block
5449 * allocation is done, we will have i_blocks inconsistent with 5449 * allocation is done, we will have i_blocks inconsistent with
5450 * on-disk file blocks. 5450 * on-disk file blocks.
5451 * We always keep i_blocks updated together with real 5451 * We always keep i_blocks updated together with real
5452 * allocation. But so as not to confuse the user, stat 5452 * allocation. But so as not to confuse the user, stat
5453 * will return the blocks that include the delayed allocation 5453 * will return the blocks that include the delayed allocation
5454 * blocks for this file. 5454 * blocks for this file.
5455 */ 5455 */
5456 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5456 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5457 5457
5458 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5458 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5459 return 0; 5459 return 0;
5460 } 5460 }
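Not part of the diff: the stat->blocks adjustment above converts the reserved delayed-allocation count from filesystem blocks to the 512-byte units stat reports, via (delalloc_blocks << s_blocksize_bits) >> 9. A tiny standalone check of that arithmetic, assuming a 4 KiB block size and a made-up reservation count:

#include <stdio.h>

int main(void)
{
	unsigned long delalloc_blocks = 3;	/* hypothetical reserved fs blocks */
	unsigned int blocksize_bits = 12;	/* 4 KiB filesystem blocks */

	/* fs blocks -> bytes -> 512-byte sectors */
	unsigned long long sectors =
		((unsigned long long)delalloc_blocks << blocksize_bits) >> 9;

	printf("%lu fs blocks = %llu 512-byte sectors\n",
	       delalloc_blocks, sectors);	/* 3 blocks -> 24 sectors */
	return 0;
}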
5461 5461
5462 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, 5462 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5463 int chunk) 5463 int chunk)
5464 { 5464 {
5465 int indirects; 5465 int indirects;
5466 5466
5467 /* if nrblocks are contiguous */ 5467 /* if nrblocks are contiguous */
5468 if (chunk) { 5468 if (chunk) {
5469 /* 5469 /*
5470 * With N contiguous data blocks, we need at most 5470 * With N contiguous data blocks, we need at most
5471 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, 5471 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5472 * 2 dindirect blocks, and 1 tindirect block 5472 * 2 dindirect blocks, and 1 tindirect block
5473 */ 5473 */
5474 return DIV_ROUND_UP(nrblocks, 5474 return DIV_ROUND_UP(nrblocks,
5475 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; 5475 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5476 } 5476 }
5477 /* 5477 /*
5478 * if nrblocks are not contiguous, worst case each block touches 5478 * if nrblocks are not contiguous, worst case each block touches
5479 * an indirect block, and each indirect block touches a double indirect 5479 * an indirect block, and each indirect block touches a double indirect
5480 * block, plus a triple indirect block 5480 * block, plus a triple indirect block
5481 */ 5481 */
5482 indirects = nrblocks * 2 + 1; 5482 indirects = nrblocks * 2 + 1;
5483 return indirects; 5483 return indirects;
5484 } 5484 }
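As a worked example (not from the source): for a contiguous chunk on a filesystem with 4 KiB blocks, the address count per block is 1024, so the contiguous case above charges DIV_ROUND_UP(nrblocks, 1024) + 4 metadata blocks, e.g. 5 for a 256-block chunk. A sketch of that calculation, where addr_per_block stands in for EXT4_ADDR_PER_BLOCK(inode->i_sb):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Mirror of the contiguous case above: one indirect block per group of
 * addr_per_block data blocks, plus 1 more indirect, 2 dindirect and 1
 * tindirect block in the worst case. */
static int demo_indirect_trans_blocks(int nrblocks, int addr_per_block)
{
	return DIV_ROUND_UP(nrblocks, addr_per_block) + 4;
}

int main(void)
{
	printf("%d\n", demo_indirect_trans_blocks(256, 1024));	/* prints 5 */
	return 0;
}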
5485 5485
5486 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5486 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5487 { 5487 {
5488 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5488 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5489 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5489 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5490 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5490 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5491 } 5491 }
5492 5492
5493 /* 5493 /*
5494 * Account for index blocks, block group bitmaps and block group 5494 * Account for index blocks, block group bitmaps and block group
5495 * descriptor blocks if we modify data blocks and index blocks; 5495 * descriptor blocks if we modify data blocks and index blocks;
5496 * worst case, the index blocks are spread over different block groups 5496 * worst case, the index blocks are spread over different block groups
5497 * 5497 *
5498 * If data blocks are discontiguous, they may spread over 5498 * If data blocks are discontiguous, they may spread over
5499 * different block groups too. If they are contiguous, with flexbg 5499 * different block groups too. If they are contiguous, with flexbg
5500 * they could still cross a block group boundary. 5500 * they could still cross a block group boundary.
5501 * 5501 *
5502 * Also account for superblock, inode, quota and xattr blocks 5502 * Also account for superblock, inode, quota and xattr blocks
5503 */ 5503 */
5504 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5504 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5505 { 5505 {
5506 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5506 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5507 int gdpblocks; 5507 int gdpblocks;
5508 int idxblocks; 5508 int idxblocks;
5509 int ret = 0; 5509 int ret = 0;
5510 5510
5511 /* 5511 /*
5512 * How many index blocks do we need to touch to modify nrblocks? 5512 * How many index blocks do we need to touch to modify nrblocks?
5513 * The "Chunk" flag indicates whether the nrblocks are 5513 * The "Chunk" flag indicates whether the nrblocks are
5514 * physically contiguous on disk 5514 * physically contiguous on disk
5515 * 5515 *
5516 * Direct IO and fallocate call get_block to allocate 5516 * Direct IO and fallocate call get_block to allocate
5517 * a single extent at a time, so they can set the "Chunk" flag 5517 * a single extent at a time, so they can set the "Chunk" flag
5518 */ 5518 */
5519 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 5519 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
5520 5520
5521 ret = idxblocks; 5521 ret = idxblocks;
5522 5522
5523 /* 5523 /*
5524 * Now let's see how many group bitmaps and group descriptors need 5524 * Now let's see how many group bitmaps and group descriptors need
5525 * to be accounted for 5525 * to be accounted for
5526 */ 5526 */
5527 groups = idxblocks; 5527 groups = idxblocks;
5528 if (chunk) 5528 if (chunk)
5529 groups += 1; 5529 groups += 1;
5530 else 5530 else
5531 groups += nrblocks; 5531 groups += nrblocks;
5532 5532
5533 gdpblocks = groups; 5533 gdpblocks = groups;
5534 if (groups > ngroups) 5534 if (groups > ngroups)
5535 groups = ngroups; 5535 groups = ngroups;
5536 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 5536 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5537 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 5537 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5538 5538
5539 /* bitmaps and block group descriptor blocks */ 5539 /* bitmaps and block group descriptor blocks */
5540 ret += groups + gdpblocks; 5540 ret += groups + gdpblocks;
5541 5541
5542 /* Blocks for super block, inode, quota and xattr blocks */ 5542 /* Blocks for super block, inode, quota and xattr blocks */
5543 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 5543 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5544 5544
5545 return ret; 5545 return ret;
5546 } 5546 }
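Illustrative only: the accounting above clamps the number of bitmap blocks to the total group count and the number of descriptor blocks to the group-descriptor block count. A standalone sketch with invented numbers, where ngroups, gdb_count and meta_blocks stand in for ext4_get_groups_count(), s_gdb_count and EXT4_META_TRANS_BLOCKS() respectively:

#include <stdio.h>

/* Rough mirror of the group/descriptor accounting above. */
static int demo_meta_trans_blocks(int idxblocks, int nrblocks, int chunk,
				  int ngroups, int gdb_count, int meta_blocks)
{
	int groups = idxblocks + (chunk ? 1 : nrblocks);
	int gdpblocks = groups;
	int ret = idxblocks;

	if (groups > ngroups)
		groups = ngroups;
	if (groups > gdb_count)
		gdpblocks = gdb_count;

	ret += groups + gdpblocks;	/* bitmaps + group descriptor blocks */
	ret += meta_blocks;		/* sb, inode, quota and xattr blocks */
	return ret;
}

int main(void)
{
	/* e.g. a contiguous chunk needing 5 index blocks on a small fs */
	printf("%d credits\n",
	       demo_meta_trans_blocks(5, 256, 1, 128, 2, 8));	/* prints 21 */
	return 0;
}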
5547 5547
5548 /* 5548 /*
5549 * Calculate the total number of credits to reserve to fit 5549 * Calculate the total number of credits to reserve to fit
5550 * the modification of a single page into a single transaction, 5550 * the modification of a single page into a single transaction,
5551 * which may include multiple chunks of block allocations. 5551 * which may include multiple chunks of block allocations.
5552 * 5552 *
5553 * This could be called via ext4_write_begin() 5553 * This could be called via ext4_write_begin()
5554 * 5554 *
5555 * We need to consider the worst case, when 5555 * We need to consider the worst case, when
5556 * there is one new block per extent. 5556 * there is one new block per extent.
5557 */ 5557 */
5558 int ext4_writepage_trans_blocks(struct inode *inode) 5558 int ext4_writepage_trans_blocks(struct inode *inode)
5559 { 5559 {
5560 int bpp = ext4_journal_blocks_per_page(inode); 5560 int bpp = ext4_journal_blocks_per_page(inode);
5561 int ret; 5561 int ret;
5562 5562
5563 ret = ext4_meta_trans_blocks(inode, bpp, 0); 5563 ret = ext4_meta_trans_blocks(inode, bpp, 0);
5564 5564
5565 /* Account for data blocks for journalled mode */ 5565 /* Account for data blocks for journalled mode */
5566 if (ext4_should_journal_data(inode)) 5566 if (ext4_should_journal_data(inode))
5567 ret += bpp; 5567 ret += bpp;
5568 return ret; 5568 return ret;
5569 } 5569 }
5570 5570
5571 /* 5571 /*
5572 * Calculate the journal credits for a chunk of data modification. 5572 * Calculate the journal credits for a chunk of data modification.
5573 * 5573 *
5574 * This is called from DIO, fallocate or whoever calls 5574 * This is called from DIO, fallocate or whoever calls
5575 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. 5575 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5576 * 5576 *
5577 * journal buffers for data blocks are not included here, as DIO 5577 * journal buffers for data blocks are not included here, as DIO
5578 * and fallocate do not need to journal data buffers. 5578 * and fallocate do not need to journal data buffers.
5579 */ 5579 */
5580 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 5580 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5581 { 5581 {
5582 return ext4_meta_trans_blocks(inode, nrblocks, 1); 5582 return ext4_meta_trans_blocks(inode, nrblocks, 1);
5583 } 5583 }
5584 5584
5585 /* 5585 /*
5586 * The caller must have previously called ext4_reserve_inode_write(). 5586 * The caller must have previously called ext4_reserve_inode_write().
5587 * Given this, we know that the caller already has write access to iloc->bh. 5587 * Given this, we know that the caller already has write access to iloc->bh.
5588 */ 5588 */
5589 int ext4_mark_iloc_dirty(handle_t *handle, 5589 int ext4_mark_iloc_dirty(handle_t *handle,
5590 struct inode *inode, struct ext4_iloc *iloc) 5590 struct inode *inode, struct ext4_iloc *iloc)
5591 { 5591 {
5592 int err = 0; 5592 int err = 0;
5593 5593
5594 if (test_opt(inode->i_sb, I_VERSION)) 5594 if (test_opt(inode->i_sb, I_VERSION))
5595 inode_inc_iversion(inode); 5595 inode_inc_iversion(inode);
5596 5596
5597 /* the do_update_inode consumes one bh->b_count */ 5597 /* the do_update_inode consumes one bh->b_count */
5598 get_bh(iloc->bh); 5598 get_bh(iloc->bh);
5599 5599
5600 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5600 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5601 err = ext4_do_update_inode(handle, inode, iloc); 5601 err = ext4_do_update_inode(handle, inode, iloc);
5602 put_bh(iloc->bh); 5602 put_bh(iloc->bh);
5603 return err; 5603 return err;
5604 } 5604 }
5605 5605
5606 /* 5606 /*
5607 * On success, we end up with an outstanding reference count against 5607 * On success, we end up with an outstanding reference count against
5608 * iloc->bh. This _must_ be cleaned up later. 5608 * iloc->bh. This _must_ be cleaned up later.
5609 */ 5609 */
5610 5610
5611 int 5611 int
5612 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 5612 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
5613 struct ext4_iloc *iloc) 5613 struct ext4_iloc *iloc)
5614 { 5614 {
5615 int err; 5615 int err;
5616 5616
5617 err = ext4_get_inode_loc(inode, iloc); 5617 err = ext4_get_inode_loc(inode, iloc);
5618 if (!err) { 5618 if (!err) {
5619 BUFFER_TRACE(iloc->bh, "get_write_access"); 5619 BUFFER_TRACE(iloc->bh, "get_write_access");
5620 err = ext4_journal_get_write_access(handle, iloc->bh); 5620 err = ext4_journal_get_write_access(handle, iloc->bh);
5621 if (err) { 5621 if (err) {
5622 brelse(iloc->bh); 5622 brelse(iloc->bh);
5623 iloc->bh = NULL; 5623 iloc->bh = NULL;
5624 } 5624 }
5625 } 5625 }
5626 ext4_std_error(inode->i_sb, err); 5626 ext4_std_error(inode->i_sb, err);
5627 return err; 5627 return err;
5628 } 5628 }
5629 5629
5630 /* 5630 /*
5631 * Expand an inode by new_extra_isize bytes. 5631 * Expand an inode by new_extra_isize bytes.
5632 * Returns 0 on success or negative error number on failure. 5632 * Returns 0 on success or negative error number on failure.
5633 */ 5633 */
5634 static int ext4_expand_extra_isize(struct inode *inode, 5634 static int ext4_expand_extra_isize(struct inode *inode,
5635 unsigned int new_extra_isize, 5635 unsigned int new_extra_isize,
5636 struct ext4_iloc iloc, 5636 struct ext4_iloc iloc,
5637 handle_t *handle) 5637 handle_t *handle)
5638 { 5638 {
5639 struct ext4_inode *raw_inode; 5639 struct ext4_inode *raw_inode;
5640 struct ext4_xattr_ibody_header *header; 5640 struct ext4_xattr_ibody_header *header;
5641 5641
5642 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 5642 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5643 return 0; 5643 return 0;
5644 5644
5645 raw_inode = ext4_raw_inode(&iloc); 5645 raw_inode = ext4_raw_inode(&iloc);
5646 5646
5647 header = IHDR(inode, raw_inode); 5647 header = IHDR(inode, raw_inode);
5648 5648
5649 /* No extended attributes present */ 5649 /* No extended attributes present */
5650 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 5650 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5651 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5651 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5652 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5652 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5653 new_extra_isize); 5653 new_extra_isize);
5654 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5654 EXT4_I(inode)->i_extra_isize = new_extra_isize;
5655 return 0; 5655 return 0;
5656 } 5656 }
5657 5657
5658 /* try to expand with EAs present */ 5658 /* try to expand with EAs present */
5659 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 5659 return ext4_expand_extra_isize_ea(inode, new_extra_isize,
5660 raw_inode, handle); 5660 raw_inode, handle);
5661 } 5661 }
5662 5662
5663 /* 5663 /*
5664 * What we do here is to mark the in-core inode as clean with respect to inode 5664 * What we do here is to mark the in-core inode as clean with respect to inode
5665 * dirtiness (it may still be data-dirty). 5665 * dirtiness (it may still be data-dirty).
5666 * This means that the in-core inode may be reaped by prune_icache 5666 * This means that the in-core inode may be reaped by prune_icache
5667 * without having to perform any I/O. This is a very good thing, 5667 * without having to perform any I/O. This is a very good thing,
5668 * because *any* task may call prune_icache - even ones which 5668 * because *any* task may call prune_icache - even ones which
5669 * have a transaction open against a different journal. 5669 * have a transaction open against a different journal.
5670 * 5670 *
5671 * Is this cheating? Not really. Sure, we haven't written the 5671 * Is this cheating? Not really. Sure, we haven't written the
5672 * inode out, but prune_icache isn't a user-visible syncing function. 5672 * inode out, but prune_icache isn't a user-visible syncing function.
5673 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 5673 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5674 * we start and wait on commits. 5674 * we start and wait on commits.
5675 * 5675 *
5676 * Is this efficient/effective? Well, we're being nice to the system 5676 * Is this efficient/effective? Well, we're being nice to the system
5677 * by cleaning up our inodes proactively so they can be reaped 5677 * by cleaning up our inodes proactively so they can be reaped
5678 * without I/O. But we are potentially leaving up to five seconds' 5678 * without I/O. But we are potentially leaving up to five seconds'
5679 * worth of inodes floating about which prune_icache wants us to 5679 * worth of inodes floating about which prune_icache wants us to
5680 * write out. One way to fix that would be to get prune_icache() 5680 * write out. One way to fix that would be to get prune_icache()
5681 * to do a write_super() to free up some memory. It has the desired 5681 * to do a write_super() to free up some memory. It has the desired
5682 * effect. 5682 * effect.
5683 */ 5683 */
5684 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 5684 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5685 { 5685 {
5686 struct ext4_iloc iloc; 5686 struct ext4_iloc iloc;
5687 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5687 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5688 static unsigned int mnt_count; 5688 static unsigned int mnt_count;
5689 int err, ret; 5689 int err, ret;
5690 5690
5691 might_sleep(); 5691 might_sleep();
5692 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 5692 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5693 err = ext4_reserve_inode_write(handle, inode, &iloc); 5693 err = ext4_reserve_inode_write(handle, inode, &iloc);
5694 if (ext4_handle_valid(handle) && 5694 if (ext4_handle_valid(handle) &&
5695 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5695 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5696 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 5696 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5697 /* 5697 /*
5698 * We need extra buffer credits since we may write into EA block 5698 * We need extra buffer credits since we may write into EA block
5699 * with this same handle. If journal_extend fails, then it will 5699 * with this same handle. If journal_extend fails, then it will
5700 * only result in a minor loss of functionality for that inode. 5700 * only result in a minor loss of functionality for that inode.
5701 * If this is felt to be critical, then e2fsck should be run to 5701 * If this is felt to be critical, then e2fsck should be run to
5702 * force a large enough s_min_extra_isize. 5702 * force a large enough s_min_extra_isize.
5703 */ 5703 */
5704 if ((jbd2_journal_extend(handle, 5704 if ((jbd2_journal_extend(handle,
5705 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 5705 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
5706 ret = ext4_expand_extra_isize(inode, 5706 ret = ext4_expand_extra_isize(inode,
5707 sbi->s_want_extra_isize, 5707 sbi->s_want_extra_isize,
5708 iloc, handle); 5708 iloc, handle);
5709 if (ret) { 5709 if (ret) {
5710 ext4_set_inode_state(inode, 5710 ext4_set_inode_state(inode,
5711 EXT4_STATE_NO_EXPAND); 5711 EXT4_STATE_NO_EXPAND);
5712 if (mnt_count != 5712 if (mnt_count !=
5713 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5713 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5714 ext4_warning(inode->i_sb, 5714 ext4_warning(inode->i_sb,
5715 "Unable to expand inode %lu. Delete" 5715 "Unable to expand inode %lu. Delete"
5716 " some EAs or run e2fsck.", 5716 " some EAs or run e2fsck.",
5717 inode->i_ino); 5717 inode->i_ino);
5718 mnt_count = 5718 mnt_count =
5719 le16_to_cpu(sbi->s_es->s_mnt_count); 5719 le16_to_cpu(sbi->s_es->s_mnt_count);
5720 } 5720 }
5721 } 5721 }
5722 } 5722 }
5723 } 5723 }
5724 if (!err) 5724 if (!err)
5725 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 5725 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5726 return err; 5726 return err;
5727 } 5727 }
5728 5728
5729 /* 5729 /*
5730 * ext4_dirty_inode() is called from __mark_inode_dirty() 5730 * ext4_dirty_inode() is called from __mark_inode_dirty()
5731 * 5731 *
5732 * We're really interested in the case where a file is being extended. 5732 * We're really interested in the case where a file is being extended.
5733 * i_size has been changed by generic_commit_write() and we thus need 5733 * i_size has been changed by generic_commit_write() and we thus need
5734 * to include the updated inode in the current transaction. 5734 * to include the updated inode in the current transaction.
5735 * 5735 *
5736 * Also, dquot_alloc_block() will always dirty the inode when blocks 5736 * Also, dquot_alloc_block() will always dirty the inode when blocks
5737 * are allocated to the file. 5737 * are allocated to the file.
5738 * 5738 *
5739 * If the inode is marked synchronous, we don't honour that here - doing 5739 * If the inode is marked synchronous, we don't honour that here - doing
5740 * so would cause a commit on atime updates, which we don't bother doing. 5740 * so would cause a commit on atime updates, which we don't bother doing.
5741 * We handle synchronous inodes at the highest possible level. 5741 * We handle synchronous inodes at the highest possible level.
5742 */ 5742 */
5743 void ext4_dirty_inode(struct inode *inode, int flags) 5743 void ext4_dirty_inode(struct inode *inode, int flags)
5744 { 5744 {
5745 handle_t *handle; 5745 handle_t *handle;
5746 5746
5747 handle = ext4_journal_start(inode, 2); 5747 handle = ext4_journal_start(inode, 2);
5748 if (IS_ERR(handle)) 5748 if (IS_ERR(handle))
5749 goto out; 5749 goto out;
5750 5750
5751 ext4_mark_inode_dirty(handle, inode); 5751 ext4_mark_inode_dirty(handle, inode);
5752 5752
5753 ext4_journal_stop(handle); 5753 ext4_journal_stop(handle);
5754 out: 5754 out:
5755 return; 5755 return;
5756 } 5756 }
5757 5757
5758 #if 0 5758 #if 0
5759 /* 5759 /*
5760 * Bind an inode's backing buffer_head into this transaction, to prevent 5760 * Bind an inode's backing buffer_head into this transaction, to prevent
5761 * it from being flushed to disk early. Unlike 5761 * it from being flushed to disk early. Unlike
5762 * ext4_reserve_inode_write, this leaves behind no bh reference and 5762 * ext4_reserve_inode_write, this leaves behind no bh reference and
5763 * returns no iloc structure, so the caller needs to repeat the iloc 5763 * returns no iloc structure, so the caller needs to repeat the iloc
5764 * lookup to mark the inode dirty later. 5764 * lookup to mark the inode dirty later.
5765 */ 5765 */
5766 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 5766 static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5767 { 5767 {
5768 struct ext4_iloc iloc; 5768 struct ext4_iloc iloc;
5769 5769
5770 int err = 0; 5770 int err = 0;
5771 if (handle) { 5771 if (handle) {
5772 err = ext4_get_inode_loc(inode, &iloc); 5772 err = ext4_get_inode_loc(inode, &iloc);
5773 if (!err) { 5773 if (!err) {
5774 BUFFER_TRACE(iloc.bh, "get_write_access"); 5774 BUFFER_TRACE(iloc.bh, "get_write_access");
5775 err = jbd2_journal_get_write_access(handle, iloc.bh); 5775 err = jbd2_journal_get_write_access(handle, iloc.bh);
5776 if (!err) 5776 if (!err)
5777 err = ext4_handle_dirty_metadata(handle, 5777 err = ext4_handle_dirty_metadata(handle,
5778 NULL, 5778 NULL,
5779 iloc.bh); 5779 iloc.bh);
5780 brelse(iloc.bh); 5780 brelse(iloc.bh);
5781 } 5781 }
5782 } 5782 }
5783 ext4_std_error(inode->i_sb, err); 5783 ext4_std_error(inode->i_sb, err);
5784 return err; 5784 return err;
5785 } 5785 }
5786 #endif 5786 #endif
5787 5787
5788 int ext4_change_inode_journal_flag(struct inode *inode, int val) 5788 int ext4_change_inode_journal_flag(struct inode *inode, int val)
5789 { 5789 {
5790 journal_t *journal; 5790 journal_t *journal;
5791 handle_t *handle; 5791 handle_t *handle;
5792 int err; 5792 int err;
5793 5793
5794 /* 5794 /*
5795 * We have to be very careful here: changing a data block's 5795 * We have to be very careful here: changing a data block's
5796 * journaling status dynamically is dangerous. If we write a 5796 * journaling status dynamically is dangerous. If we write a
5797 * data block to the journal, change the status and then delete 5797 * data block to the journal, change the status and then delete
5798 * that block, we risk forgetting to revoke the old log record 5798 * that block, we risk forgetting to revoke the old log record
5799 * from the journal and so a subsequent replay can corrupt data. 5799 * from the journal and so a subsequent replay can corrupt data.
5800 * So, first we make sure that the journal is empty and that 5800 * So, first we make sure that the journal is empty and that
5801 * nobody is changing anything. 5801 * nobody is changing anything.
5802 */ 5802 */
5803 5803
5804 journal = EXT4_JOURNAL(inode); 5804 journal = EXT4_JOURNAL(inode);
5805 if (!journal) 5805 if (!journal)
5806 return 0; 5806 return 0;
5807 if (is_journal_aborted(journal)) 5807 if (is_journal_aborted(journal))
5808 return -EROFS; 5808 return -EROFS;
5809 5809
5810 jbd2_journal_lock_updates(journal); 5810 jbd2_journal_lock_updates(journal);
5811 jbd2_journal_flush(journal); 5811 jbd2_journal_flush(journal);
5812 5812
5813 /* 5813 /*
5814 * OK, there are no updates running now, and all cached data is 5814 * OK, there are no updates running now, and all cached data is
5815 * synced to disk. We are now in a completely consistent state 5815 * synced to disk. We are now in a completely consistent state
5816 * which doesn't have anything in the journal, and we know that 5816 * which doesn't have anything in the journal, and we know that
5817 * no filesystem updates are running, so it is safe to modify 5817 * no filesystem updates are running, so it is safe to modify
5818 * the inode's in-core data-journaling state flag now. 5818 * the inode's in-core data-journaling state flag now.
5819 */ 5819 */
5820 5820
5821 if (val) 5821 if (val)
5822 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5822 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5823 else 5823 else
5824 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5824 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5825 ext4_set_aops(inode); 5825 ext4_set_aops(inode);
5826 5826
5827 jbd2_journal_unlock_updates(journal); 5827 jbd2_journal_unlock_updates(journal);
5828 5828
5829 /* Finally we can mark the inode as dirty. */ 5829 /* Finally we can mark the inode as dirty. */
5830 5830
5831 handle = ext4_journal_start(inode, 1); 5831 handle = ext4_journal_start(inode, 1);
5832 if (IS_ERR(handle)) 5832 if (IS_ERR(handle))
5833 return PTR_ERR(handle); 5833 return PTR_ERR(handle);
5834 5834
5835 err = ext4_mark_inode_dirty(handle, inode); 5835 err = ext4_mark_inode_dirty(handle, inode);
5836 ext4_handle_sync(handle); 5836 ext4_handle_sync(handle);
5837 ext4_journal_stop(handle); 5837 ext4_journal_stop(handle);
5838 ext4_std_error(inode->i_sb, err); 5838 ext4_std_error(inode->i_sb, err);
5839 5839
5840 return err; 5840 return err;
5841 } 5841 }
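
/*
 * A minimal sketch (not part of this diff; the helper name and flag
 * handling are illustrative, see fs/ext4/ioctl.c for the real code) of
 * the kind of caller the function above exists for: the EXT4_IOC_SETFLAGS
 * ioctl path, which flips per-inode data journaling when
 * EXT4_JOURNAL_DATA_FL changes.
 */
static int ext4_toggle_data_journal(struct inode *inode,
				    unsigned int oldflags,
				    unsigned int newflags)
{
	/* nothing to do unless the data-journaling flag actually changed */
	if (!((oldflags ^ newflags) & EXT4_JOURNAL_DATA_FL))
		return 0;

	/* flushes the journal, flips the in-core flag, dirties the inode */
	return ext4_change_inode_journal_flag(inode,
					      newflags & EXT4_JOURNAL_DATA_FL);
}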
5842 5842
5843 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5843 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5844 { 5844 {
5845 return !buffer_mapped(bh); 5845 return !buffer_mapped(bh);
5846 } 5846 }
5847 5847
5848 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 5848 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5849 { 5849 {
5850 struct page *page = vmf->page; 5850 struct page *page = vmf->page;
5851 loff_t size; 5851 loff_t size;
5852 unsigned long len; 5852 unsigned long len;
5853 int ret; 5853 int ret;
5854 struct file *file = vma->vm_file; 5854 struct file *file = vma->vm_file;
5855 struct inode *inode = file->f_path.dentry->d_inode; 5855 struct inode *inode = file->f_path.dentry->d_inode;
5856 struct address_space *mapping = inode->i_mapping; 5856 struct address_space *mapping = inode->i_mapping;
5857 handle_t *handle; 5857 handle_t *handle;
5858 get_block_t *get_block; 5858 get_block_t *get_block;
5859 int retries = 0; 5859 int retries = 0;
5860 5860
5861 /* 5861 /*
5862 * This check is racy but catches the common case. We rely on 5862 * This check is racy but catches the common case. We rely on
5863 * __block_page_mkwrite() to do a reliable check. 5863 * __block_page_mkwrite() to do a reliable check.
5864 */ 5864 */
5865 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 5865 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
5866 /* Delalloc case is easy... */ 5866 /* Delalloc case is easy... */
5867 if (test_opt(inode->i_sb, DELALLOC) && 5867 if (test_opt(inode->i_sb, DELALLOC) &&
5868 !ext4_should_journal_data(inode) && 5868 !ext4_should_journal_data(inode) &&
5869 !ext4_nonda_switch(inode->i_sb)) { 5869 !ext4_nonda_switch(inode->i_sb)) {
5870 do { 5870 do {
5871 ret = __block_page_mkwrite(vma, vmf, 5871 ret = __block_page_mkwrite(vma, vmf,
5872 ext4_da_get_block_prep); 5872 ext4_da_get_block_prep);
5873 } while (ret == -ENOSPC && 5873 } while (ret == -ENOSPC &&
5874 ext4_should_retry_alloc(inode->i_sb, &retries)); 5874 ext4_should_retry_alloc(inode->i_sb, &retries));
5875 goto out_ret; 5875 goto out_ret;
5876 } 5876 }
5877 5877
5878 lock_page(page); 5878 lock_page(page);
5879 size = i_size_read(inode); 5879 size = i_size_read(inode);
5880 /* Page got truncated from under us? */ 5880 /* Page got truncated from under us? */
5881 if (page->mapping != mapping || page_offset(page) > size) { 5881 if (page->mapping != mapping || page_offset(page) > size) {
5882 unlock_page(page); 5882 unlock_page(page);
5883 ret = VM_FAULT_NOPAGE; 5883 ret = VM_FAULT_NOPAGE;
5884 goto out; 5884 goto out;
5885 } 5885 }
5886 5886
5887 if (page->index == size >> PAGE_CACHE_SHIFT) 5887 if (page->index == size >> PAGE_CACHE_SHIFT)
5888 len = size & ~PAGE_CACHE_MASK; 5888 len = size & ~PAGE_CACHE_MASK;
5889 else 5889 else
5890 len = PAGE_CACHE_SIZE; 5890 len = PAGE_CACHE_SIZE;
5891 /* 5891 /*
5892 * Return if we have all the buffers mapped. This avoids the need to do 5892 * Return if we have all the buffers mapped. This avoids the need to do
5893 * journal_start/journal_stop which can block and take a long time 5893 * journal_start/journal_stop which can block and take a long time
5894 */ 5894 */
5895 if (page_has_buffers(page)) { 5895 if (page_has_buffers(page)) {
5896 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5896 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5897 ext4_bh_unmapped)) { 5897 ext4_bh_unmapped)) {
5898 /* Wait so that we don't change page under IO */ 5898 /* Wait so that we don't change page under IO */
5899 wait_on_page_writeback(page); 5899 wait_on_page_writeback(page);
5900 ret = VM_FAULT_LOCKED; 5900 ret = VM_FAULT_LOCKED;
5901 goto out; 5901 goto out;
5902 } 5902 }
5903 } 5903 }
5904 unlock_page(page); 5904 unlock_page(page);
5905 /* OK, we need to fill the hole... */ 5905 /* OK, we need to fill the hole... */
5906 if (ext4_should_dioread_nolock(inode)) 5906 if (ext4_should_dioread_nolock(inode))
5907 get_block = ext4_get_block_write; 5907 get_block = ext4_get_block_write;
5908 else 5908 else
5909 get_block = ext4_get_block; 5909 get_block = ext4_get_block;
5910 retry_alloc: 5910 retry_alloc:
5911 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 5911 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
5912 if (IS_ERR(handle)) { 5912 if (IS_ERR(handle)) {
5913 ret = VM_FAULT_SIGBUS; 5913 ret = VM_FAULT_SIGBUS;
5914 goto out; 5914 goto out;
5915 } 5915 }
5916 ret = __block_page_mkwrite(vma, vmf, get_block); 5916 ret = __block_page_mkwrite(vma, vmf, get_block);
5917 if (!ret && ext4_should_journal_data(inode)) { 5917 if (!ret && ext4_should_journal_data(inode)) {
5918 if (walk_page_buffers(handle, page_buffers(page), 0, 5918 if (walk_page_buffers(handle, page_buffers(page), 0,
5919 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 5919 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
5920 unlock_page(page); 5920 unlock_page(page);
5921 ret = VM_FAULT_SIGBUS; 5921 ret = VM_FAULT_SIGBUS;
5922 goto out; 5922 goto out;
5923 } 5923 }
5924 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 5924 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
5925 } 5925 }
5926 ext4_journal_stop(handle); 5926 ext4_journal_stop(handle);
5927 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 5927 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
5928 goto retry_alloc; 5928 goto retry_alloc;
5929 out_ret: 5929 out_ret:
5930 ret = block_page_mkwrite_return(ret); 5930 ret = block_page_mkwrite_return(ret);
5931 out: 5931 out:
5932 return ret; 5932 return ret;
5933 } 5933 }
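
/*
 * For context, a sketch of how a ->page_mkwrite handler like the one
 * above is typically wired up: mmap() installs a vm_operations_struct
 * whose .page_mkwrite points at it, so the fault path calls back into
 * the filesystem before a read-only page is made writable.  Paraphrased
 * from memory of fs/ext4/file.c in this era, not part of this diff.
 */
static const struct vm_operations_struct ext4_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= ext4_page_mkwrite,
};

static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* only mappings backed by a ->readpage method can be mmapped */
	if (!file->f_mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &ext4_file_vm_ops;
	return 0;
}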
5934 5934
1 /* 1 /*
2 * fs/fs-writeback.c 2 * fs/fs-writeback.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * Contains all the functions related to writing back and waiting 6 * Contains all the functions related to writing back and waiting
7 * upon dirty inodes against superblocks, and writing back dirty 7 * upon dirty inodes against superblocks, and writing back dirty
8 * pages against inodes. ie: data writeback. Writeout of the 8 * pages against inodes. ie: data writeback. Writeout of the
9 * inode itself is not handled here. 9 * inode itself is not handled here.
10 * 10 *
11 * 10Apr2002 Andrew Morton 11 * 10Apr2002 Andrew Morton
12 * Split out of fs/inode.c 12 * Split out of fs/inode.c
13 * Additions for address_space-based writeback 13 * Additions for address_space-based writeback
14 */ 14 */
15 15
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/spinlock.h> 18 #include <linux/spinlock.h>
19 #include <linux/slab.h> 19 #include <linux/slab.h>
20 #include <linux/sched.h> 20 #include <linux/sched.h>
21 #include <linux/fs.h> 21 #include <linux/fs.h>
22 #include <linux/mm.h> 22 #include <linux/mm.h>
23 #include <linux/kthread.h> 23 #include <linux/kthread.h>
24 #include <linux/freezer.h> 24 #include <linux/freezer.h>
25 #include <linux/writeback.h> 25 #include <linux/writeback.h>
26 #include <linux/blkdev.h> 26 #include <linux/blkdev.h>
27 #include <linux/backing-dev.h> 27 #include <linux/backing-dev.h>
28 #include <linux/buffer_head.h> 28 #include <linux/buffer_head.h>
29 #include <linux/tracepoint.h> 29 #include <linux/tracepoint.h>
30 #include "internal.h" 30 #include "internal.h"
31 31
32 /* 32 /*
33 * Passed into wb_writeback(), essentially a subset of writeback_control 33 * Passed into wb_writeback(), essentially a subset of writeback_control
34 */ 34 */
35 struct wb_writeback_work { 35 struct wb_writeback_work {
36 long nr_pages; 36 long nr_pages;
37 struct super_block *sb; 37 struct super_block *sb;
38 unsigned long *older_than_this;
38 enum writeback_sync_modes sync_mode; 39 enum writeback_sync_modes sync_mode;
40 unsigned int tagged_writepages:1;
39 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
40 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
41 unsigned int for_background:1; 43 unsigned int for_background:1;
42 44
43 struct list_head list; /* pending work list */ 45 struct list_head list; /* pending work list */
44 struct completion *done; /* set if the caller waits */ 46 struct completion *done; /* set if the caller waits */
45 }; 47 };
46 48
47 /* 49 /*
48 * Include the creation of the trace points after defining the 50 * Include the creation of the trace points after defining the
49 * wb_writeback_work structure so that the definition remains local to this 51 * wb_writeback_work structure so that the definition remains local to this
50 * file. 52 * file.
51 */ 53 */
52 #define CREATE_TRACE_POINTS 54 #define CREATE_TRACE_POINTS
53 #include <trace/events/writeback.h> 55 #include <trace/events/writeback.h>
54 56
55 /* 57 /*
56 * We don't actually have pdflush, but this one is exported through /proc... 58 * We don't actually have pdflush, but this one is exported through /proc...
57 */ 59 */
58 int nr_pdflush_threads; 60 int nr_pdflush_threads;
59 61
60 /** 62 /**
61 * writeback_in_progress - determine whether there is writeback in progress 63 * writeback_in_progress - determine whether there is writeback in progress
62 * @bdi: the device's backing_dev_info structure. 64 * @bdi: the device's backing_dev_info structure.
63 * 65 *
64 * Determine whether there is writeback waiting to be handled against a 66 * Determine whether there is writeback waiting to be handled against a
65 * backing device. 67 * backing device.
66 */ 68 */
67 int writeback_in_progress(struct backing_dev_info *bdi) 69 int writeback_in_progress(struct backing_dev_info *bdi)
68 { 70 {
69 return test_bit(BDI_writeback_running, &bdi->state); 71 return test_bit(BDI_writeback_running, &bdi->state);
70 } 72 }
71 73
72 static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) 74 static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
73 { 75 {
74 struct super_block *sb = inode->i_sb; 76 struct super_block *sb = inode->i_sb;
75 77
76 if (strcmp(sb->s_type->name, "bdev") == 0) 78 if (strcmp(sb->s_type->name, "bdev") == 0)
77 return inode->i_mapping->backing_dev_info; 79 return inode->i_mapping->backing_dev_info;
78 80
79 return sb->s_bdi; 81 return sb->s_bdi;
80 } 82 }
81 83
82 static inline struct inode *wb_inode(struct list_head *head) 84 static inline struct inode *wb_inode(struct list_head *head)
83 { 85 {
84 return list_entry(head, struct inode, i_wb_list); 86 return list_entry(head, struct inode, i_wb_list);
85 } 87 }
86 88
87 /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ 89 /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
88 static void bdi_wakeup_flusher(struct backing_dev_info *bdi) 90 static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
89 { 91 {
90 if (bdi->wb.task) { 92 if (bdi->wb.task) {
91 wake_up_process(bdi->wb.task); 93 wake_up_process(bdi->wb.task);
92 } else { 94 } else {
93 /* 95 /*
94 * The bdi thread isn't there, wake up the forker thread which 96 * The bdi thread isn't there, wake up the forker thread which
95 * will create and run it. 97 * will create and run it.
96 */ 98 */
97 wake_up_process(default_backing_dev_info.wb.task); 99 wake_up_process(default_backing_dev_info.wb.task);
98 } 100 }
99 } 101 }
100 102
101 static void bdi_queue_work(struct backing_dev_info *bdi, 103 static void bdi_queue_work(struct backing_dev_info *bdi,
102 struct wb_writeback_work *work) 104 struct wb_writeback_work *work)
103 { 105 {
104 trace_writeback_queue(bdi, work); 106 trace_writeback_queue(bdi, work);
105 107
106 spin_lock_bh(&bdi->wb_lock); 108 spin_lock_bh(&bdi->wb_lock);
107 list_add_tail(&work->list, &bdi->work_list); 109 list_add_tail(&work->list, &bdi->work_list);
108 if (!bdi->wb.task) 110 if (!bdi->wb.task)
109 trace_writeback_nothread(bdi, work); 111 trace_writeback_nothread(bdi, work);
110 bdi_wakeup_flusher(bdi); 112 bdi_wakeup_flusher(bdi);
111 spin_unlock_bh(&bdi->wb_lock); 113 spin_unlock_bh(&bdi->wb_lock);
112 } 114 }
113 115
114 static void 116 static void
115 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 117 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
116 bool range_cyclic) 118 bool range_cyclic)
117 { 119 {
118 struct wb_writeback_work *work; 120 struct wb_writeback_work *work;
119 121
120 /* 122 /*
121 * This is WB_SYNC_NONE writeback, so if allocation fails just 123 * This is WB_SYNC_NONE writeback, so if allocation fails just
122 * wakeup the thread for old dirty data writeback 124 * wakeup the thread for old dirty data writeback
123 */ 125 */
124 work = kzalloc(sizeof(*work), GFP_ATOMIC); 126 work = kzalloc(sizeof(*work), GFP_ATOMIC);
125 if (!work) { 127 if (!work) {
126 if (bdi->wb.task) { 128 if (bdi->wb.task) {
127 trace_writeback_nowork(bdi); 129 trace_writeback_nowork(bdi);
128 wake_up_process(bdi->wb.task); 130 wake_up_process(bdi->wb.task);
129 } 131 }
130 return; 132 return;
131 } 133 }
132 134
133 work->sync_mode = WB_SYNC_NONE; 135 work->sync_mode = WB_SYNC_NONE;
134 work->nr_pages = nr_pages; 136 work->nr_pages = nr_pages;
135 work->range_cyclic = range_cyclic; 137 work->range_cyclic = range_cyclic;
136 138
137 bdi_queue_work(bdi, work); 139 bdi_queue_work(bdi, work);
138 } 140 }
139 141
140 /** 142 /**
141 * bdi_start_writeback - start writeback 143 * bdi_start_writeback - start writeback
142 * @bdi: the backing device to write from 144 * @bdi: the backing device to write from
143 * @nr_pages: the number of pages to write 145 * @nr_pages: the number of pages to write
144 * 146 *
145 * Description: 147 * Description:
146 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 148 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
147 * started when this function returns, we make no guarantees on 149 * started when this function returns, we make no guarantees on
148 * completion. Caller need not hold sb s_umount semaphore. 150 * completion. Caller need not hold sb s_umount semaphore.
149 * 151 *
150 */ 152 */
151 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 153 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
152 { 154 {
153 __bdi_start_writeback(bdi, nr_pages, true); 155 __bdi_start_writeback(bdi, nr_pages, true);
154 } 156 }
155 157
156 /** 158 /**
157 * bdi_start_background_writeback - start background writeback 159 * bdi_start_background_writeback - start background writeback
158 * @bdi: the backing device to write from 160 * @bdi: the backing device to write from
159 * 161 *
160 * Description: 162 * Description:
161 * This makes sure WB_SYNC_NONE background writeback happens. When 163 * This makes sure WB_SYNC_NONE background writeback happens. When
162 * this function returns, it is only guaranteed that for given BDI 164 * this function returns, it is only guaranteed that for given BDI
163 * some IO is happening if we are over background dirty threshold. 165 * some IO is happening if we are over background dirty threshold.
164 * Caller need not hold sb s_umount semaphore. 166 * Caller need not hold sb s_umount semaphore.
165 */ 167 */
166 void bdi_start_background_writeback(struct backing_dev_info *bdi) 168 void bdi_start_background_writeback(struct backing_dev_info *bdi)
167 { 169 {
168 /* 170 /*
169 * We just wake up the flusher thread. It will perform background 171 * We just wake up the flusher thread. It will perform background
170 * writeback as soon as there is no other work to do. 172 * writeback as soon as there is no other work to do.
171 */ 173 */
172 trace_writeback_wake_background(bdi); 174 trace_writeback_wake_background(bdi);
173 spin_lock_bh(&bdi->wb_lock); 175 spin_lock_bh(&bdi->wb_lock);
174 bdi_wakeup_flusher(bdi); 176 bdi_wakeup_flusher(bdi);
175 spin_unlock_bh(&bdi->wb_lock); 177 spin_unlock_bh(&bdi->wb_lock);
176 } 178 }
177 179
178 /* 180 /*
179 * Remove the inode from the writeback list it is on. 181 * Remove the inode from the writeback list it is on.
180 */ 182 */
181 void inode_wb_list_del(struct inode *inode) 183 void inode_wb_list_del(struct inode *inode)
182 { 184 {
183 spin_lock(&inode_wb_list_lock); 185 struct backing_dev_info *bdi = inode_to_bdi(inode);
186
187 spin_lock(&bdi->wb.list_lock);
184 list_del_init(&inode->i_wb_list); 188 list_del_init(&inode->i_wb_list);
185 spin_unlock(&inode_wb_list_lock); 189 spin_unlock(&bdi->wb.list_lock);
186 } 190 }
187 191
188
189 /* 192 /*
190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 193 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
191 * furthest end of its superblock's dirty-inode list. 194 * furthest end of its superblock's dirty-inode list.
192 * 195 *
193 * Before stamping the inode's ->dirtied_when, we check to see whether it is 196 * Before stamping the inode's ->dirtied_when, we check to see whether it is
194 * already the most-recently-dirtied inode on the b_dirty list. If that is 197 * already the most-recently-dirtied inode on the b_dirty list. If that is
195 * the case then the inode must have been redirtied while it was being written 198 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 199 * out and we don't reset its dirtied_when.
197 */ 200 */
198 static void redirty_tail(struct inode *inode) 201 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
199 { 202 {
200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 203 assert_spin_locked(&wb->list_lock);
201
202 assert_spin_locked(&inode_wb_list_lock);
203 if (!list_empty(&wb->b_dirty)) { 204 if (!list_empty(&wb->b_dirty)) {
204 struct inode *tail; 205 struct inode *tail;
205 206
206 tail = wb_inode(wb->b_dirty.next); 207 tail = wb_inode(wb->b_dirty.next);
207 if (time_before(inode->dirtied_when, tail->dirtied_when)) 208 if (time_before(inode->dirtied_when, tail->dirtied_when))
208 inode->dirtied_when = jiffies; 209 inode->dirtied_when = jiffies;
209 } 210 }
210 list_move(&inode->i_wb_list, &wb->b_dirty); 211 list_move(&inode->i_wb_list, &wb->b_dirty);
211 } 212 }
212 213
213 /* 214 /*
214 * requeue inode for re-scanning after bdi->b_io list is exhausted. 215 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 216 */
216 static void requeue_io(struct inode *inode) 217 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
217 { 218 {
218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 219 assert_spin_locked(&wb->list_lock);
219
220 assert_spin_locked(&inode_wb_list_lock);
221 list_move(&inode->i_wb_list, &wb->b_more_io); 220 list_move(&inode->i_wb_list, &wb->b_more_io);
222 } 221 }
223 222
224 static void inode_sync_complete(struct inode *inode) 223 static void inode_sync_complete(struct inode *inode)
225 { 224 {
226 /* 225 /*
227 * Prevent speculative execution through 226 * Prevent speculative execution through
228 * spin_unlock(&inode_wb_list_lock); 227 * spin_unlock(&wb->list_lock);
229 */ 228 */
230 229
231 smp_mb(); 230 smp_mb();
232 wake_up_bit(&inode->i_state, __I_SYNC); 231 wake_up_bit(&inode->i_state, __I_SYNC);
233 } 232 }
234 233
235 static bool inode_dirtied_after(struct inode *inode, unsigned long t) 234 static bool inode_dirtied_after(struct inode *inode, unsigned long t)
236 { 235 {
237 bool ret = time_after(inode->dirtied_when, t); 236 bool ret = time_after(inode->dirtied_when, t);
238 #ifndef CONFIG_64BIT 237 #ifndef CONFIG_64BIT
239 /* 238 /*
240 * For inodes being constantly redirtied, dirtied_when can get stuck. 239 * For inodes being constantly redirtied, dirtied_when can get stuck.
241 * It _appears_ to be in the future, but is actually in the distant past. 240 * It _appears_ to be in the future, but is actually in the distant past.
242 * This test is necessary to prevent such wrapped-around relative times 241 * This test is necessary to prevent such wrapped-around relative times
243 * from permanently stopping the whole bdi writeback. 242 * from permanently stopping the whole bdi writeback.
244 */ 243 */
245 ret = ret && time_before_eq(inode->dirtied_when, jiffies); 244 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
246 #endif 245 #endif
247 return ret; 246 return ret;
248 } 247 }
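
/*
 * A standalone illustration (made-up numbers, not kernel code) of the
 * 32-bit wrap-around case the #ifndef CONFIG_64BIT branch above guards
 * against: an inode dirtied more than 2^31 jiffies before the cutoff
 * compares as "after" it, and only the extra time_before_eq() test lets
 * it still be treated as expired.
 */
#include <stdint.h>
#include <stdio.h>

/* same arithmetic as the kernel's time_after()/time_before_eq(), on 32 bits */
#define time_after(a, b)	((int32_t)((b) - (a)) < 0)
#define time_before_eq(a, b)	((int32_t)((b) - (a)) >= 0)

int main(void)
{
	uint32_t jiffies      = 0x90002000u;	/* "now" */
	uint32_t cutoff       = 0x90000000u;	/* *older_than_this */
	uint32_t dirtied_when = 0x00001000u;	/* stamped ages ago, pre-wrap */

	int looks_future = time_after(dirtied_when, cutoff);		/* 1 */
	int plausible    = time_before_eq(dirtied_when, jiffies);	/* 0 */

	/* inode_dirtied_after() would return (looks_future && plausible),
	 * i.e. 0 here, so the stale inode is still queued for writeback
	 * instead of blocking the whole bdi forever */
	printf("%d && %d -> %d\n", looks_future, plausible,
	       looks_future && plausible);
	return 0;
}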
249 248
250 /* 249 /*
251 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 250 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
252 */ 251 */
253 static void move_expired_inodes(struct list_head *delaying_queue, 252 static int move_expired_inodes(struct list_head *delaying_queue,
254 struct list_head *dispatch_queue, 253 struct list_head *dispatch_queue,
255 unsigned long *older_than_this) 254 unsigned long *older_than_this)
256 { 255 {
257 LIST_HEAD(tmp); 256 LIST_HEAD(tmp);
258 struct list_head *pos, *node; 257 struct list_head *pos, *node;
259 struct super_block *sb = NULL; 258 struct super_block *sb = NULL;
260 struct inode *inode; 259 struct inode *inode;
261 int do_sb_sort = 0; 260 int do_sb_sort = 0;
261 int moved = 0;
262 262
263 while (!list_empty(delaying_queue)) { 263 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 264 inode = wb_inode(delaying_queue->prev);
265 if (older_than_this && 265 if (older_than_this &&
266 inode_dirtied_after(inode, *older_than_this)) 266 inode_dirtied_after(inode, *older_than_this))
267 break; 267 break;
268 if (sb && sb != inode->i_sb) 268 if (sb && sb != inode->i_sb)
269 do_sb_sort = 1; 269 do_sb_sort = 1;
270 sb = inode->i_sb; 270 sb = inode->i_sb;
271 list_move(&inode->i_wb_list, &tmp); 271 list_move(&inode->i_wb_list, &tmp);
272 moved++;
272 } 273 }
273 274
274 /* just one sb in list, splice to dispatch_queue and we're done */ 275 /* just one sb in list, splice to dispatch_queue and we're done */
275 if (!do_sb_sort) { 276 if (!do_sb_sort) {
276 list_splice(&tmp, dispatch_queue); 277 list_splice(&tmp, dispatch_queue);
277 return; 278 goto out;
278 } 279 }
279 280
280 /* Move inodes from one superblock together */ 281 /* Move inodes from one superblock together */
281 while (!list_empty(&tmp)) { 282 while (!list_empty(&tmp)) {
282 sb = wb_inode(tmp.prev)->i_sb; 283 sb = wb_inode(tmp.prev)->i_sb;
283 list_for_each_prev_safe(pos, node, &tmp) { 284 list_for_each_prev_safe(pos, node, &tmp) {
284 inode = wb_inode(pos); 285 inode = wb_inode(pos);
285 if (inode->i_sb == sb) 286 if (inode->i_sb == sb)
286 list_move(&inode->i_wb_list, dispatch_queue); 287 list_move(&inode->i_wb_list, dispatch_queue);
287 } 288 }
288 } 289 }
290 out:
291 return moved;
289 } 292 }
290 293
291 /* 294 /*
292 * Queue all expired dirty inodes for io, eldest first. 295 * Queue all expired dirty inodes for io, eldest first.
293 * Before 296 * Before
294 * newly dirtied b_dirty b_io b_more_io 297 * newly dirtied b_dirty b_io b_more_io
295 * =============> gf edc BA 298 * =============> gf edc BA
296 * After 299 * After
297 * newly dirtied b_dirty b_io b_more_io 300 * newly dirtied b_dirty b_io b_more_io
298 * =============> g fBAedc 301 * =============> g fBAedc
299 * | 302 * |
300 * +--> dequeue for IO 303 * +--> dequeue for IO
301 */ 304 */
302 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 305 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
303 { 306 {
304 assert_spin_locked(&inode_wb_list_lock); 307 int moved;
308 assert_spin_locked(&wb->list_lock);
305 list_splice_init(&wb->b_more_io, &wb->b_io); 309 list_splice_init(&wb->b_more_io, &wb->b_io);
306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
311 trace_writeback_queue_io(wb, older_than_this, moved);
307 } 312 }
308 313
309 static int write_inode(struct inode *inode, struct writeback_control *wbc) 314 static int write_inode(struct inode *inode, struct writeback_control *wbc)
310 { 315 {
311 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 316 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
312 return inode->i_sb->s_op->write_inode(inode, wbc); 317 return inode->i_sb->s_op->write_inode(inode, wbc);
313 return 0; 318 return 0;
314 } 319 }
315 320
316 /* 321 /*
317 * Wait for writeback on an inode to complete. 322 * Wait for writeback on an inode to complete.
318 */ 323 */
319 static void inode_wait_for_writeback(struct inode *inode) 324 static void inode_wait_for_writeback(struct inode *inode,
325 struct bdi_writeback *wb)
320 { 326 {
321 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 327 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
322 wait_queue_head_t *wqh; 328 wait_queue_head_t *wqh;
323 329
324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 330 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
325 while (inode->i_state & I_SYNC) { 331 while (inode->i_state & I_SYNC) {
326 spin_unlock(&inode->i_lock); 332 spin_unlock(&inode->i_lock);
327 spin_unlock(&inode_wb_list_lock); 333 spin_unlock(&wb->list_lock);
328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 334 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
329 spin_lock(&inode_wb_list_lock); 335 spin_lock(&wb->list_lock);
330 spin_lock(&inode->i_lock); 336 spin_lock(&inode->i_lock);
331 } 337 }
332 } 338 }
333 339
334 /* 340 /*
335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and 341 * Write out an inode's dirty pages. Called under wb->list_lock and
336 * inode->i_lock. Either the caller has an active reference on the inode or 342 * inode->i_lock. Either the caller has an active reference on the inode or
337 * the inode has I_WILL_FREE set. 343 * the inode has I_WILL_FREE set.
338 * 344 *
339 * If `wait' is set, wait on the writeout. 345 * If `wait' is set, wait on the writeout.
340 * 346 *
341 * The whole writeout design is quite complex and fragile. We want to avoid 347 * The whole writeout design is quite complex and fragile. We want to avoid
342 * starvation of particular inodes when others are being redirtied, prevent 348 * starvation of particular inodes when others are being redirtied, prevent
343 * livelocks, etc. 349 * livelocks, etc.
344 */ 350 */
345 static int 351 static int
346 writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 352 writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
353 struct writeback_control *wbc)
347 { 354 {
348 struct address_space *mapping = inode->i_mapping; 355 struct address_space *mapping = inode->i_mapping;
356 long nr_to_write = wbc->nr_to_write;
349 unsigned dirty; 357 unsigned dirty;
350 int ret; 358 int ret;
351 359
352 assert_spin_locked(&inode_wb_list_lock); 360 assert_spin_locked(&wb->list_lock);
353 assert_spin_locked(&inode->i_lock); 361 assert_spin_locked(&inode->i_lock);
354 362
355 if (!atomic_read(&inode->i_count)) 363 if (!atomic_read(&inode->i_count))
356 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 364 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
357 else 365 else
358 WARN_ON(inode->i_state & I_WILL_FREE); 366 WARN_ON(inode->i_state & I_WILL_FREE);
359 367
360 if (inode->i_state & I_SYNC) { 368 if (inode->i_state & I_SYNC) {
361 /* 369 /*
362 * If this inode is locked for writeback and we are not doing 370 * If this inode is locked for writeback and we are not doing
363 * writeback-for-data-integrity, move it to b_more_io so that 371 * writeback-for-data-integrity, move it to b_more_io so that
364 * writeback can proceed with the other inodes on s_io. 372 * writeback can proceed with the other inodes on s_io.
365 * 373 *
366 * We'll have another go at writing back this inode when we 374 * We'll have another go at writing back this inode when we
367 * have completed a full scan of b_io. 375 * have completed a full scan of b_io.
368 */ 376 */
369 if (wbc->sync_mode != WB_SYNC_ALL) { 377 if (wbc->sync_mode != WB_SYNC_ALL) {
370 requeue_io(inode); 378 requeue_io(inode, wb);
379 trace_writeback_single_inode_requeue(inode, wbc,
380 nr_to_write);
371 return 0; 381 return 0;
372 } 382 }
373 383
374 /* 384 /*
375 * It's a data-integrity sync. We must wait. 385 * It's a data-integrity sync. We must wait.
376 */ 386 */
377 inode_wait_for_writeback(inode); 387 inode_wait_for_writeback(inode, wb);
378 } 388 }
379 389
380 BUG_ON(inode->i_state & I_SYNC); 390 BUG_ON(inode->i_state & I_SYNC);
381 391
382 /* Set I_SYNC, reset I_DIRTY_PAGES */ 392 /* Set I_SYNC, reset I_DIRTY_PAGES */
383 inode->i_state |= I_SYNC; 393 inode->i_state |= I_SYNC;
384 inode->i_state &= ~I_DIRTY_PAGES; 394 inode->i_state &= ~I_DIRTY_PAGES;
385 spin_unlock(&inode->i_lock); 395 spin_unlock(&inode->i_lock);
386 spin_unlock(&inode_wb_list_lock); 396 spin_unlock(&wb->list_lock);
387 397
388 ret = do_writepages(mapping, wbc); 398 ret = do_writepages(mapping, wbc);
389 399
390 /* 400 /*
391 * Make sure to wait on the data before writing out the metadata. 401 * Make sure to wait on the data before writing out the metadata.
392 * This is important for filesystems that modify metadata on data 402 * This is important for filesystems that modify metadata on data
393 * I/O completion. 403 * I/O completion.
394 */ 404 */
395 if (wbc->sync_mode == WB_SYNC_ALL) { 405 if (wbc->sync_mode == WB_SYNC_ALL) {
396 int err = filemap_fdatawait(mapping); 406 int err = filemap_fdatawait(mapping);
397 if (ret == 0) 407 if (ret == 0)
398 ret = err; 408 ret = err;
399 } 409 }
400 410
401 /* 411 /*
402 * Some filesystems may redirty the inode during the writeback 412 * Some filesystems may redirty the inode during the writeback
403 * due to delalloc, clear dirty metadata flags right before 413 * due to delalloc, clear dirty metadata flags right before
404 * write_inode() 414 * write_inode()
405 */ 415 */
406 spin_lock(&inode->i_lock); 416 spin_lock(&inode->i_lock);
407 dirty = inode->i_state & I_DIRTY; 417 dirty = inode->i_state & I_DIRTY;
408 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 418 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
409 spin_unlock(&inode->i_lock); 419 spin_unlock(&inode->i_lock);
410 /* Don't write the inode if only I_DIRTY_PAGES was set */ 420 /* Don't write the inode if only I_DIRTY_PAGES was set */
411 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 421 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
412 int err = write_inode(inode, wbc); 422 int err = write_inode(inode, wbc);
413 if (ret == 0) 423 if (ret == 0)
414 ret = err; 424 ret = err;
415 } 425 }
416 426
417 spin_lock(&inode_wb_list_lock); 427 spin_lock(&wb->list_lock);
418 spin_lock(&inode->i_lock); 428 spin_lock(&inode->i_lock);
419 inode->i_state &= ~I_SYNC; 429 inode->i_state &= ~I_SYNC;
420 if (!(inode->i_state & I_FREEING)) { 430 if (!(inode->i_state & I_FREEING)) {
431 /*
432 * Sync livelock prevention. Each inode is tagged and synced in
433 * one shot. If still dirty, it will be redirty_tail()'ed below.
434 * Update the dirty time to prevent it from being enqueued and synced again.
435 */
436 if ((inode->i_state & I_DIRTY) &&
437 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
438 inode->dirtied_when = jiffies;
439
421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 440 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
422 /* 441 /*
423 * We didn't write back all the pages. nfs_writepages() 442 * We didn't write back all the pages. nfs_writepages()
424 * sometimes bales out without doing anything. 443 * sometimes bales out without doing anything.
425 */ 444 */
426 inode->i_state |= I_DIRTY_PAGES; 445 inode->i_state |= I_DIRTY_PAGES;
427 if (wbc->nr_to_write <= 0) { 446 if (wbc->nr_to_write <= 0) {
428 /* 447 /*
429 * slice used up: queue for next turn 448 * slice used up: queue for next turn
430 */ 449 */
431 requeue_io(inode); 450 requeue_io(inode, wb);
432 } else { 451 } else {
433 /* 452 /*
434 * Writeback blocked by something other than 453 * Writeback blocked by something other than
435 * congestion. Delay the inode for some time to 454 * congestion. Delay the inode for some time to
436 * avoid spinning on the CPU (100% iowait) 455 * avoid spinning on the CPU (100% iowait)
437 * retrying writeback of the dirty page/inode 456 * retrying writeback of the dirty page/inode
438 * that cannot be performed immediately. 457 * that cannot be performed immediately.
439 */ 458 */
440 redirty_tail(inode); 459 redirty_tail(inode, wb);
441 } 460 }
442 } else if (inode->i_state & I_DIRTY) { 461 } else if (inode->i_state & I_DIRTY) {
443 /* 462 /*
444 * Filesystems can dirty the inode during writeback 463 * Filesystems can dirty the inode during writeback
445 * operations, such as delayed allocation during 464 * operations, such as delayed allocation during
446 * submission or metadata updates after data IO 465 * submission or metadata updates after data IO
447 * completion. 466 * completion.
448 */ 467 */
449 redirty_tail(inode); 468 redirty_tail(inode, wb);
450 } else { 469 } else {
451 /* 470 /*
452 * The inode is clean. At this point we either have 471 * The inode is clean. At this point we either have
453 * a reference to the inode or it's on its way out. 472 * a reference to the inode or it's on its way out.
454 * No need to add it back to the LRU. 473 * No need to add it back to the LRU.
455 */ 474 */
456 list_del_init(&inode->i_wb_list); 475 list_del_init(&inode->i_wb_list);
457 } 476 }
458 } 477 }
459 inode_sync_complete(inode); 478 inode_sync_complete(inode);
479 trace_writeback_single_inode(inode, wbc, nr_to_write);
460 return ret; 480 return ret;
461 } 481 }
462 482
483 static long writeback_chunk_size(struct backing_dev_info *bdi,
484 struct wb_writeback_work *work)
485 {
486 long pages;
487
488 /*
489 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
490 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
491 * here avoids calling into writeback_inodes_wb() more than once.
492 *
493 * The intended call sequence for WB_SYNC_ALL writeback is:
494 *
495 * wb_writeback()
496 * writeback_sb_inodes() <== called only once
497 * write_cache_pages() <== called once for each inode
498 * (quickly) tag currently dirty pages
499 * (maybe slowly) sync all tagged pages
500 */
501 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
502 pages = LONG_MAX;
503 else {
504 pages = min(bdi->avg_write_bandwidth / 2,
505 global_dirty_limit / DIRTY_SCOPE);
506 pages = min(pages, work->nr_pages);
507 pages = round_down(pages + MIN_WRITEBACK_PAGES,
508 MIN_WRITEBACK_PAGES);
509 }
510
511 return pages;
512 }
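
/*
 * A worked example of the WB_SYNC_NONE branch above, with illustrative
 * numbers (512 for MIN_WRITEBACK_PAGES and 8 for DIRTY_SCOPE are
 * assumptions made here for the arithmetic; the real values live in
 * include/linux/writeback.h):
 *
 *	bdi->avg_write_bandwidth = 25600 pages/s  (~100 MB/s at 4 KiB pages)
 *	global_dirty_limit       = 200000 pages
 *	work->nr_pages           = 30000 pages
 *
 *	pages = min(25600 / 2, 200000 / 8)   = min(12800, 25000) = 12800
 *	pages = min(12800, 30000)            = 12800
 *	pages = round_down(12800 + 512, 512) = 13312
 *
 * i.e. roughly half a second worth of IO is aimed at one inode before the
 * flusher re-checks its termination conditions, and the chunk never drops
 * below MIN_WRITEBACK_PAGES.
 */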
513
463 /* 514 /*
464 * Write a portion of b_io inodes which belong to @sb. 515 * Write a portion of b_io inodes which belong to @sb.
465 * 516 *
466 * If @only_this_sb is true, then find and write all such 517 * If @only_this_sb is true, then find and write all such
467 * inodes. Otherwise write only ones which go sequentially 518 * inodes. Otherwise write only ones which go sequentially
468 * in reverse order. 519 * in reverse order.
469 * 520 *
470 * Return 1, if the caller writeback routine should be 521 * Return the number of pages and/or inodes written.
471 * interrupted. Otherwise return 0.
472 */ 522 */
473 static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, 523 static long writeback_sb_inodes(struct super_block *sb,
474 struct writeback_control *wbc, bool only_this_sb) 524 struct bdi_writeback *wb,
525 struct wb_writeback_work *work)
475 { 526 {
527 struct writeback_control wbc = {
528 .sync_mode = work->sync_mode,
529 .tagged_writepages = work->tagged_writepages,
530 .for_kupdate = work->for_kupdate,
531 .for_background = work->for_background,
532 .range_cyclic = work->range_cyclic,
533 .range_start = 0,
534 .range_end = LLONG_MAX,
535 };
536 unsigned long start_time = jiffies;
537 long write_chunk;
538 long wrote = 0; /* count both pages and inodes */
539
476 while (!list_empty(&wb->b_io)) { 540 while (!list_empty(&wb->b_io)) {
477 long pages_skipped;
478 struct inode *inode = wb_inode(wb->b_io.prev); 541 struct inode *inode = wb_inode(wb->b_io.prev);
479 542
480 if (inode->i_sb != sb) { 543 if (inode->i_sb != sb) {
481 if (only_this_sb) { 544 if (work->sb) {
482 /* 545 /*
483 * We only want to write back data for this 546 * We only want to write back data for this
484 * superblock, move all inodes not belonging 547 * superblock, move all inodes not belonging
485 * to it back onto the dirty list. 548 * to it back onto the dirty list.
486 */ 549 */
487 redirty_tail(inode); 550 redirty_tail(inode, wb);
488 continue; 551 continue;
489 } 552 }
490 553
491 /* 554 /*
492 * The inode belongs to a different superblock. 555 * The inode belongs to a different superblock.
493 * Bounce back to the caller to unpin this and 556 * Bounce back to the caller to unpin this and
494 * pin the next superblock. 557 * pin the next superblock.
495 */ 558 */
496 return 0; 559 break;
497 } 560 }
498 561
499 /* 562 /*
500 * Don't bother with new inodes or inodes being freed, first 563 * Don't bother with new inodes or inodes being freed, first
501 * kind does not need periodic writeout yet, and for the latter 564 * kind does not need periodic writeout yet, and for the latter
502 * kind writeout is handled by the freer. 565 * kind writeout is handled by the freer.
503 */ 566 */
504 spin_lock(&inode->i_lock); 567 spin_lock(&inode->i_lock);
505 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 568 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
506 spin_unlock(&inode->i_lock); 569 spin_unlock(&inode->i_lock);
507 requeue_io(inode); 570 redirty_tail(inode, wb);
508 continue; 571 continue;
509 } 572 }
510
511 /*
512 * Was this inode dirtied after sync_sb_inodes was called?
513 * This keeps sync from extra jobs and livelock.
514 */
515 if (inode_dirtied_after(inode, wbc->wb_start)) {
516 spin_unlock(&inode->i_lock);
517 return 1;
518 }
519
520 __iget(inode); 573 __iget(inode);
574 write_chunk = writeback_chunk_size(wb->bdi, work);
575 wbc.nr_to_write = write_chunk;
576 wbc.pages_skipped = 0;
521 577
522 pages_skipped = wbc->pages_skipped; 578 writeback_single_inode(inode, wb, &wbc);
523 writeback_single_inode(inode, wbc); 579
524 if (wbc->pages_skipped != pages_skipped) { 580 work->nr_pages -= write_chunk - wbc.nr_to_write;
581 wrote += write_chunk - wbc.nr_to_write;
582 if (!(inode->i_state & I_DIRTY))
583 wrote++;
584 if (wbc.pages_skipped) {
525 /* 585 /*
526 * writeback is not making progress due to locked 586 * writeback is not making progress due to locked
527 * buffers. Skip this inode for now. 587 * buffers. Skip this inode for now.
528 */ 588 */
529 redirty_tail(inode); 589 redirty_tail(inode, wb);
530 } 590 }
531 spin_unlock(&inode->i_lock); 591 spin_unlock(&inode->i_lock);
532 spin_unlock(&inode_wb_list_lock); 592 spin_unlock(&wb->list_lock);
533 iput(inode); 593 iput(inode);
534 cond_resched(); 594 cond_resched();
535 spin_lock(&inode_wb_list_lock); 595 spin_lock(&wb->list_lock);
536 if (wbc->nr_to_write <= 0) { 596 /*
537 wbc->more_io = 1; 597 * bail out to wb_writeback() often enough to check
538 return 1; 598 * background threshold and other termination conditions.
599 */
600 if (wrote) {
601 if (time_is_before_jiffies(start_time + HZ / 10UL))
602 break;
603 if (work->nr_pages <= 0)
604 break;
539 } 605 }
540 if (!list_empty(&wb->b_more_io))
541 wbc->more_io = 1;
542 } 606 }
543 /* b_io is empty */ 607 return wrote;
544 return 1;
545 } 608 }
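
/*
 * On the bail-out above: HZ jiffies make up one second, so
 * "start_time + HZ / 10UL" is 100ms after the loop was entered.  Once
 * some progress has been made, control therefore returns to
 * wb_writeback() at least every 100ms (or as soon as work->nr_pages is
 * exhausted) so the background threshold and the other termination
 * conditions are re-checked reasonably often.
 */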
546 609
547 void writeback_inodes_wb(struct bdi_writeback *wb, 610 static long __writeback_inodes_wb(struct bdi_writeback *wb,
548 struct writeback_control *wbc) 611 struct wb_writeback_work *work)
549 { 612 {
550 int ret = 0; 613 unsigned long start_time = jiffies;
614 long wrote = 0;
551 615
552 if (!wbc->wb_start)
553 wbc->wb_start = jiffies; /* livelock avoidance */
554 spin_lock(&inode_wb_list_lock);
555 if (!wbc->for_kupdate || list_empty(&wb->b_io))
556 queue_io(wb, wbc->older_than_this);
557
558 while (!list_empty(&wb->b_io)) { 616 while (!list_empty(&wb->b_io)) {
559 struct inode *inode = wb_inode(wb->b_io.prev); 617 struct inode *inode = wb_inode(wb->b_io.prev);
560 struct super_block *sb = inode->i_sb; 618 struct super_block *sb = inode->i_sb;
561 619
562 if (!grab_super_passive(sb)) { 620 if (!grab_super_passive(sb)) {
563 requeue_io(inode); 621 requeue_io(inode, wb);
564 continue; 622 continue;
565 } 623 }
566 ret = writeback_sb_inodes(sb, wb, wbc, false); 624 wrote += writeback_sb_inodes(sb, wb, work);
567 drop_super(sb); 625 drop_super(sb);
568 626
569 if (ret) 627 /* refer to the same tests at the end of writeback_sb_inodes */
570 break; 628 if (wrote) {
629 if (time_is_before_jiffies(start_time + HZ / 10UL))
630 break;
631 if (work->nr_pages <= 0)
632 break;
633 }
571 } 634 }
572 spin_unlock(&inode_wb_list_lock);
573 /* Leave any unwritten inodes on b_io */ 635 /* Leave any unwritten inodes on b_io */
636 return wrote;
574 } 637 }
575 638
576 static void __writeback_inodes_sb(struct super_block *sb, 639 long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
577 struct bdi_writeback *wb, struct writeback_control *wbc)
578 { 640 {
579 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 641 struct wb_writeback_work work = {
642 .nr_pages = nr_pages,
643 .sync_mode = WB_SYNC_NONE,
644 .range_cyclic = 1,
645 };
580 646
581 spin_lock(&inode_wb_list_lock); 647 spin_lock(&wb->list_lock);
582 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 648 if (list_empty(&wb->b_io))
583 queue_io(wb, wbc->older_than_this); 649 queue_io(wb, NULL);
584 writeback_sb_inodes(sb, wb, wbc, true); 650 __writeback_inodes_wb(wb, &work);
585 spin_unlock(&inode_wb_list_lock); 651 spin_unlock(&wb->list_lock);
652
653 return nr_pages - work.nr_pages;
586 } 654 }
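
/*
 * A minimal sketch (helper name and shape are illustrative, paraphrased
 * from memory of mm/page-writeback.c around this series, not part of
 * this diff) of the throttling-path caller of writeback_inodes_wb(): a
 * task that has exceeded its bdi's dirty threshold cleans a chunk
 * directly and uses the returned page count to decide whether it has
 * done its duty.
 */
static bool dirty_throttle_writeout(struct backing_dev_info *bdi,
				    long write_chunk)
{
	long pages_written = writeback_inodes_wb(&bdi->wb, write_chunk);

	/* enough progress: the caller can stop throttling this task,
	 * otherwise balance_dirty_pages() would sleep and re-check limits */
	return pages_written >= write_chunk;
}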
587 655
588 /*
589 * The maximum number of pages to writeout in a single bdi flush/kupdate
590 * operation. We do this so we don't hold I_SYNC against an inode for
591 * enormous amounts of time, which would block a userspace task which has
592 * been forced to throttle against that inode. Also, the code reevaluates
593 * the dirty limits each time it has written this many pages.
594 */
595 #define MAX_WRITEBACK_PAGES 1024
596
597 static inline bool over_bground_thresh(void) 656 static inline bool over_bground_thresh(void)
598 { 657 {
599 unsigned long background_thresh, dirty_thresh; 658 unsigned long background_thresh, dirty_thresh;
600 659
601 global_dirty_limits(&background_thresh, &dirty_thresh); 660 global_dirty_limits(&background_thresh, &dirty_thresh);
602 661
603 return (global_page_state(NR_FILE_DIRTY) + 662 return (global_page_state(NR_FILE_DIRTY) +
604 global_page_state(NR_UNSTABLE_NFS) > background_thresh); 663 global_page_state(NR_UNSTABLE_NFS) > background_thresh);
605 } 664 }
606 665
607 /* 666 /*
667 * Called under wb->list_lock. If there are multiple wb per bdi,
668 * only the flusher working on the first wb should do it.
669 */
670 static void wb_update_bandwidth(struct bdi_writeback *wb,
671 unsigned long start_time)
672 {
673 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
674 }
675
676 /*
608 * Explicit flushing or periodic writeback of "old" data. 677 * Explicit flushing or periodic writeback of "old" data.
609 * 678 *
610 * Define "old": the first time one of an inode's pages is dirtied, we mark the 679 * Define "old": the first time one of an inode's pages is dirtied, we mark the
611 * dirtying-time in the inode's address_space. So this periodic writeback code 680 * dirtying-time in the inode's address_space. So this periodic writeback code
612 * just walks the superblock inode list, writing back any inodes which are 681 * just walks the superblock inode list, writing back any inodes which are
613 * older than a specific point in time. 682 * older than a specific point in time.
614 * 683 *
615 * Try to run once per dirty_writeback_interval. But if a writeback event 684 * Try to run once per dirty_writeback_interval. But if a writeback event
616 * takes longer than a dirty_writeback_interval interval, then leave a 685 * takes longer than a dirty_writeback_interval interval, then leave a
617 * one-second gap. 686 * one-second gap.
618 * 687 *
619 * older_than_this takes precedence over nr_to_write. So we'll only write back 688 * older_than_this takes precedence over nr_to_write. So we'll only write back
620 * all dirty pages if they are all attached to "old" mappings. 689 * all dirty pages if they are all attached to "old" mappings.
621 */ 690 */
622 static long wb_writeback(struct bdi_writeback *wb, 691 static long wb_writeback(struct bdi_writeback *wb,
623 struct wb_writeback_work *work) 692 struct wb_writeback_work *work)
624 { 693 {
625 struct writeback_control wbc = { 694 unsigned long wb_start = jiffies;
626 .sync_mode = work->sync_mode, 695 long nr_pages = work->nr_pages;
627 .older_than_this = NULL,
628 .for_kupdate = work->for_kupdate,
629 .for_background = work->for_background,
630 .range_cyclic = work->range_cyclic,
631 };
632 unsigned long oldest_jif; 696 unsigned long oldest_jif;
633 long wrote = 0;
634 long write_chunk;
635 struct inode *inode; 697 struct inode *inode;
698 long progress;
636 699
637 if (wbc.for_kupdate) { 700 oldest_jif = jiffies;
638 wbc.older_than_this = &oldest_jif; 701 work->older_than_this = &oldest_jif;
639 oldest_jif = jiffies -
640 msecs_to_jiffies(dirty_expire_interval * 10);
641 }
642 if (!wbc.range_cyclic) {
643 wbc.range_start = 0;
644 wbc.range_end = LLONG_MAX;
645 }
646 702
647 /* 703 spin_lock(&wb->list_lock);
648 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
649 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
650 * here avoids calling into writeback_inodes_wb() more than once.
651 *
652 * The intended call sequence for WB_SYNC_ALL writeback is:
653 *
654 * wb_writeback()
655 * __writeback_inodes_sb() <== called only once
656 * write_cache_pages() <== called once for each inode
657 * (quickly) tag currently dirty pages
658 * (maybe slowly) sync all tagged pages
659 */
660 if (wbc.sync_mode == WB_SYNC_NONE)
661 write_chunk = MAX_WRITEBACK_PAGES;
662 else
663 write_chunk = LONG_MAX;
664
665 wbc.wb_start = jiffies; /* livelock avoidance */
666 for (;;) { 704 for (;;) {
667 /* 705 /*
668 * Stop writeback when nr_pages has been consumed 706 * Stop writeback when nr_pages has been consumed
669 */ 707 */
670 if (work->nr_pages <= 0) 708 if (work->nr_pages <= 0)
671 break; 709 break;
672 710
673 /* 711 /*
674 * Background writeout and kupdate-style writeback may 712 * Background writeout and kupdate-style writeback may
675 * run forever. Stop them if there is other work to do 713 * run forever. Stop them if there is other work to do
676 * so that e.g. sync can proceed. They'll be restarted 714 * so that e.g. sync can proceed. They'll be restarted
677 * after the other work items are all done. 715 * after the other work items are all done.
678 */ 716 */
679 if ((work->for_background || work->for_kupdate) && 717 if ((work->for_background || work->for_kupdate) &&
680 !list_empty(&wb->bdi->work_list)) 718 !list_empty(&wb->bdi->work_list))
681 break; 719 break;
682 720
683 /* 721 /*
684 * For background writeout, stop when we are below the 722 * For background writeout, stop when we are below the
685 * background dirty threshold 723 * background dirty threshold
686 */ 724 */
687 if (work->for_background && !over_bground_thresh()) 725 if (work->for_background && !over_bground_thresh())
688 break; 726 break;
689 727
690 wbc.more_io = 0; 728 if (work->for_kupdate) {
691 wbc.nr_to_write = write_chunk; 729 oldest_jif = jiffies -
692 wbc.pages_skipped = 0; 730 msecs_to_jiffies(dirty_expire_interval * 10);
731 work->older_than_this = &oldest_jif;
732 }
693 733
694 trace_wbc_writeback_start(&wbc, wb->bdi); 734 trace_writeback_start(wb->bdi, work);
735 if (list_empty(&wb->b_io))
736 queue_io(wb, work->older_than_this);
695 if (work->sb) 737 if (work->sb)
696 __writeback_inodes_sb(work->sb, wb, &wbc); 738 progress = writeback_sb_inodes(work->sb, wb, work);
697 else 739 else
698 writeback_inodes_wb(wb, &wbc); 740 progress = __writeback_inodes_wb(wb, work);
699 trace_wbc_writeback_written(&wbc, wb->bdi); 741 trace_writeback_written(wb->bdi, work);
700 742
701 work->nr_pages -= write_chunk - wbc.nr_to_write; 743 wb_update_bandwidth(wb, wb_start);
702 wrote += write_chunk - wbc.nr_to_write;
703 744
704 /* 745 /*
705 * If we consumed everything, see if we have more 746 * Did we write something? Try for more
747 *
748 * Dirty inodes are moved to b_io for writeback in batches.
749 * The completion of the current batch does not necessarily
750 * mean the overall work is done. So we keep looping as long
751 * as we made some progress on cleaning pages or inodes.
706 */ 752 */
707 if (wbc.nr_to_write <= 0) 753 if (progress)
708 continue; 754 continue;
709 /* 755 /*
710 * Didn't write everything and we don't have more IO, bail 756 * No more inodes for IO, bail
711 */ 757 */
712 if (!wbc.more_io) 758 if (list_empty(&wb->b_more_io))
713 break; 759 break;
714 /* 760 /*
715 * Did we write something? Try for more
716 */
717 if (wbc.nr_to_write < write_chunk)
718 continue;
719 /*
720 * Nothing written. Wait for some inode to 761 * Nothing written. Wait for some inode to
721 * become available for writeback. Otherwise 762 * become available for writeback. Otherwise
722 * we'll just busyloop. 763 * we'll just busyloop.
723 */ 764 */
724 spin_lock(&inode_wb_list_lock);
725 if (!list_empty(&wb->b_more_io)) { 765 if (!list_empty(&wb->b_more_io)) {
766 trace_writeback_wait(wb->bdi, work);
726 inode = wb_inode(wb->b_more_io.prev); 767 inode = wb_inode(wb->b_more_io.prev);
727 trace_wbc_writeback_wait(&wbc, wb->bdi);
728 spin_lock(&inode->i_lock); 768 spin_lock(&inode->i_lock);
729 inode_wait_for_writeback(inode); 769 inode_wait_for_writeback(inode, wb);
730 spin_unlock(&inode->i_lock); 770 spin_unlock(&inode->i_lock);
731 } 771 }
732 spin_unlock(&inode_wb_list_lock);
733 } 772 }
773 spin_unlock(&wb->list_lock);
734 774
735 return wrote; 775 return nr_pages - work->nr_pages;
736 } 776 }
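
/*
 * A worked example of the for_kupdate cutoff above, assuming the default
 * vm.dirty_expire_centisecs value of 3000 (see mm/page-writeback.c):
 * dirty_expire_interval is in centiseconds, so
 *
 *	oldest_jif = jiffies - msecs_to_jiffies(3000 * 10)
 *		   = jiffies - 30 seconds worth of jiffies
 *
 * and queue_io() only pulls in inodes that have been dirty for at least
 * 30 seconds, while background and explicit writeback leave oldest_jif
 * at the current jiffies and take everything.
 */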
737 777
738 /* 778 /*
739 * Return the next wb_writeback_work struct that hasn't been processed yet. 779 * Return the next wb_writeback_work struct that hasn't been processed yet.
740 */ 780 */
741 static struct wb_writeback_work * 781 static struct wb_writeback_work *
742 get_next_work_item(struct backing_dev_info *bdi) 782 get_next_work_item(struct backing_dev_info *bdi)
743 { 783 {
744 struct wb_writeback_work *work = NULL; 784 struct wb_writeback_work *work = NULL;
745 785
746 spin_lock_bh(&bdi->wb_lock); 786 spin_lock_bh(&bdi->wb_lock);
747 if (!list_empty(&bdi->work_list)) { 787 if (!list_empty(&bdi->work_list)) {
748 work = list_entry(bdi->work_list.next, 788 work = list_entry(bdi->work_list.next,
749 struct wb_writeback_work, list); 789 struct wb_writeback_work, list);
750 list_del_init(&work->list); 790 list_del_init(&work->list);
751 } 791 }
752 spin_unlock_bh(&bdi->wb_lock); 792 spin_unlock_bh(&bdi->wb_lock);
753 return work; 793 return work;
754 } 794 }
755 795
756 /* 796 /*
757 * Add in the number of potentially dirty inodes, because each inode 797 * Add in the number of potentially dirty inodes, because each inode
758 * write can dirty pagecache in the underlying blockdev. 798 * write can dirty pagecache in the underlying blockdev.
759 */ 799 */
760 static unsigned long get_nr_dirty_pages(void) 800 static unsigned long get_nr_dirty_pages(void)
761 { 801 {
762 return global_page_state(NR_FILE_DIRTY) + 802 return global_page_state(NR_FILE_DIRTY) +
763 global_page_state(NR_UNSTABLE_NFS) + 803 global_page_state(NR_UNSTABLE_NFS) +
764 get_nr_dirty_inodes(); 804 get_nr_dirty_inodes();
765 } 805 }
766 806
767 static long wb_check_background_flush(struct bdi_writeback *wb) 807 static long wb_check_background_flush(struct bdi_writeback *wb)
768 { 808 {
769 if (over_bground_thresh()) { 809 if (over_bground_thresh()) {
770 810
771 struct wb_writeback_work work = { 811 struct wb_writeback_work work = {
772 .nr_pages = LONG_MAX, 812 .nr_pages = LONG_MAX,
773 .sync_mode = WB_SYNC_NONE, 813 .sync_mode = WB_SYNC_NONE,
774 .for_background = 1, 814 .for_background = 1,
775 .range_cyclic = 1, 815 .range_cyclic = 1,
776 }; 816 };
777 817
778 return wb_writeback(wb, &work); 818 return wb_writeback(wb, &work);
779 } 819 }
780 820
781 return 0; 821 return 0;
782 } 822 }
783 823
784 static long wb_check_old_data_flush(struct bdi_writeback *wb) 824 static long wb_check_old_data_flush(struct bdi_writeback *wb)
785 { 825 {
786 unsigned long expired; 826 unsigned long expired;
787 long nr_pages; 827 long nr_pages;
788 828
789 /* 829 /*
790 * When set to zero, disable periodic writeback 830 * When set to zero, disable periodic writeback
791 */ 831 */
792 if (!dirty_writeback_interval) 832 if (!dirty_writeback_interval)
793 return 0; 833 return 0;
794 834
795 expired = wb->last_old_flush + 835 expired = wb->last_old_flush +
796 msecs_to_jiffies(dirty_writeback_interval * 10); 836 msecs_to_jiffies(dirty_writeback_interval * 10);
797 if (time_before(jiffies, expired)) 837 if (time_before(jiffies, expired))
798 return 0; 838 return 0;
799 839
800 wb->last_old_flush = jiffies; 840 wb->last_old_flush = jiffies;
801 nr_pages = get_nr_dirty_pages(); 841 nr_pages = get_nr_dirty_pages();
802 842
803 if (nr_pages) { 843 if (nr_pages) {
804 struct wb_writeback_work work = { 844 struct wb_writeback_work work = {
805 .nr_pages = nr_pages, 845 .nr_pages = nr_pages,
806 .sync_mode = WB_SYNC_NONE, 846 .sync_mode = WB_SYNC_NONE,
807 .for_kupdate = 1, 847 .for_kupdate = 1,
808 .range_cyclic = 1, 848 .range_cyclic = 1,
809 }; 849 };
810 850
811 return wb_writeback(wb, &work); 851 return wb_writeback(wb, &work);
812 } 852 }
813 853
814 return 0; 854 return 0;
815 } 855 }
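
The expiry test above is a generic "has the interval elapsed" check: dirty_writeback_interval is in centiseconds, hence the `* 10` to reach milliseconds before converting to jiffies. A userspace sketch of the same check, assuming a monotonic millisecond clock and hypothetical names (jiffies wraparound handling is not modelled):

/*
 * Interval-expiry check, modelled in userspace. The interval is given in
 * centiseconds (like the sysctl), so multiply by 10 for milliseconds.
 */
#include <stdbool.h>
#include <time.h>

static unsigned long now_ms(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long)ts.tv_sec * 1000UL + ts.tv_nsec / 1000000UL;
}

/* true (and the flush time is recorded) when the interval has elapsed */
static bool old_data_flush_due(unsigned long *last_flush_ms,
                               unsigned int interval_centisecs)
{
        unsigned long expired;

        if (!interval_centisecs)        /* zero disables periodic writeback */
                return false;

        expired = *last_flush_ms + interval_centisecs * 10UL;
        if (now_ms() < expired)
                return false;

        *last_flush_ms = now_ms();
        return true;
}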
816 856
817 /* 857 /*
818 * Retrieve work items and do the writeback they describe 858 * Retrieve work items and do the writeback they describe
819 */ 859 */
820 long wb_do_writeback(struct bdi_writeback *wb, int force_wait) 860 long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
821 { 861 {
822 struct backing_dev_info *bdi = wb->bdi; 862 struct backing_dev_info *bdi = wb->bdi;
823 struct wb_writeback_work *work; 863 struct wb_writeback_work *work;
824 long wrote = 0; 864 long wrote = 0;
825 865
826 set_bit(BDI_writeback_running, &wb->bdi->state); 866 set_bit(BDI_writeback_running, &wb->bdi->state);
827 while ((work = get_next_work_item(bdi)) != NULL) { 867 while ((work = get_next_work_item(bdi)) != NULL) {
828 /* 868 /*
829 * Override sync mode, in case we must wait for completion 869 * Override sync mode, in case we must wait for completion
830 * because this thread is exiting now. 870 * because this thread is exiting now.
831 */ 871 */
832 if (force_wait) 872 if (force_wait)
833 work->sync_mode = WB_SYNC_ALL; 873 work->sync_mode = WB_SYNC_ALL;
834 874
835 trace_writeback_exec(bdi, work); 875 trace_writeback_exec(bdi, work);
836 876
837 wrote += wb_writeback(wb, work); 877 wrote += wb_writeback(wb, work);
838 878
839 /* 879 /*
840 * Notify the caller of completion if this is a synchronous 880 * Notify the caller of completion if this is a synchronous
841 * work item, otherwise just free it. 881 * work item, otherwise just free it.
842 */ 882 */
843 if (work->done) 883 if (work->done)
844 complete(work->done); 884 complete(work->done);
845 else 885 else
846 kfree(work); 886 kfree(work);
847 } 887 }
848 888
849 /* 889 /*
850 * Check for periodic writeback, kupdated() style 890 * Check for periodic writeback, kupdated() style
851 */ 891 */
852 wrote += wb_check_old_data_flush(wb); 892 wrote += wb_check_old_data_flush(wb);
853 wrote += wb_check_background_flush(wb); 893 wrote += wb_check_background_flush(wb);
854 clear_bit(BDI_writeback_running, &wb->bdi->state); 894 clear_bit(BDI_writeback_running, &wb->bdi->state);
855 895
856 return wrote; 896 return wrote;
857 } 897 }
858 898
859 /* 899 /*
860 * Handle writeback of dirty data for the device backed by this bdi. Also 900 * Handle writeback of dirty data for the device backed by this bdi. Also
861 * wakes up periodically and does kupdated style flushing. 901 * wakes up periodically and does kupdated style flushing.
862 */ 902 */
863 int bdi_writeback_thread(void *data) 903 int bdi_writeback_thread(void *data)
864 { 904 {
865 struct bdi_writeback *wb = data; 905 struct bdi_writeback *wb = data;
866 struct backing_dev_info *bdi = wb->bdi; 906 struct backing_dev_info *bdi = wb->bdi;
867 long pages_written; 907 long pages_written;
868 908
869 current->flags |= PF_SWAPWRITE; 909 current->flags |= PF_SWAPWRITE;
870 set_freezable(); 910 set_freezable();
871 wb->last_active = jiffies; 911 wb->last_active = jiffies;
872 912
873 /* 913 /*
874 * Our parent may run at a different priority; just set us to normal 914 * Our parent may run at a different priority; just set us to normal
875 */ 915 */
876 set_user_nice(current, 0); 916 set_user_nice(current, 0);
877 917
878 trace_writeback_thread_start(bdi); 918 trace_writeback_thread_start(bdi);
879 919
880 while (!kthread_should_stop()) { 920 while (!kthread_should_stop()) {
881 /* 921 /*
882 * Remove own delayed wake-up timer, since we are already awake 922 * Remove own delayed wake-up timer, since we are already awake
883 * and we'll take care of the periodic write-back. 923 * and we'll take care of the periodic write-back.
884 */ 924 */
885 del_timer(&wb->wakeup_timer); 925 del_timer(&wb->wakeup_timer);
886 926
887 pages_written = wb_do_writeback(wb, 0); 927 pages_written = wb_do_writeback(wb, 0);
888 928
889 trace_writeback_pages_written(pages_written); 929 trace_writeback_pages_written(pages_written);
890 930
891 if (pages_written) 931 if (pages_written)
892 wb->last_active = jiffies; 932 wb->last_active = jiffies;
893 933
894 set_current_state(TASK_INTERRUPTIBLE); 934 set_current_state(TASK_INTERRUPTIBLE);
895 if (!list_empty(&bdi->work_list) || kthread_should_stop()) { 935 if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
896 __set_current_state(TASK_RUNNING); 936 __set_current_state(TASK_RUNNING);
897 continue; 937 continue;
898 } 938 }
899 939
900 if (wb_has_dirty_io(wb) && dirty_writeback_interval) 940 if (wb_has_dirty_io(wb) && dirty_writeback_interval)
901 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 941 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
902 else { 942 else {
903 /* 943 /*
904 * We have nothing to do, so we can go to sleep without any 944 * We have nothing to do, so we can go to sleep without any
905 * timeout and save power. When work is queued or 945 * timeout and save power. When work is queued or
906 * something is made dirty, we will be woken up. 946 * something is made dirty, we will be woken up.
907 */ 947 */
908 schedule(); 948 schedule();
909 } 949 }
910 950
911 try_to_freeze(); 951 try_to_freeze();
912 } 952 }
913 953
914 /* Flush any work that raced with us exiting */ 954 /* Flush any work that raced with us exiting */
915 if (!list_empty(&bdi->work_list)) 955 if (!list_empty(&bdi->work_list))
916 wb_do_writeback(wb, 1); 956 wb_do_writeback(wb, 1);
917 957
918 trace_writeback_thread_stop(bdi); 958 trace_writeback_thread_stop(bdi);
919 return 0; 959 return 0;
920 } 960 }
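
The sleep policy at the bottom of the thread loop (a timed sleep while there is dirty IO and periodic writeback is enabled, an open-ended sleep otherwise, woken early when work is queued) can be modelled with a condition variable. A userspace sketch under those assumptions; the structure and field names are hypothetical, not kernel API:

/*
 * Sleep policy of a flusher-like thread, modelled with a condition variable.
 * Hypothetical structure; the producer signals 'wake' after queueing work.
 */
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

struct flusher {
        pthread_mutex_t lock;
        pthread_cond_t  wake;           /* signalled when work is queued */
        bool            has_dirty_io;
        unsigned int    interval_centisecs;
};

static void flusher_sleep(struct flusher *f)
{
        pthread_mutex_lock(&f->lock);
        if (f->has_dirty_io && f->interval_centisecs) {
                struct timespec until;

                /* timed sleep: wake up in time for the next periodic flush */
                clock_gettime(CLOCK_REALTIME, &until);
                until.tv_sec  += f->interval_centisecs / 100;
                until.tv_nsec += (long)(f->interval_centisecs % 100) * 10000000L;
                if (until.tv_nsec >= 1000000000L) {
                        until.tv_sec++;
                        until.tv_nsec -= 1000000000L;
                }
                pthread_cond_timedwait(&f->wake, &f->lock, &until);
        } else {
                /* nothing to do: sleep without a timeout and save power */
                pthread_cond_wait(&f->wake, &f->lock);
        }
        pthread_mutex_unlock(&f->lock);
}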
921 961
922 962
923 /* 963 /*
924 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 964 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
925 * the whole world. 965 * the whole world.
926 */ 966 */
927 void wakeup_flusher_threads(long nr_pages) 967 void wakeup_flusher_threads(long nr_pages)
928 { 968 {
929 struct backing_dev_info *bdi; 969 struct backing_dev_info *bdi;
930 970
931 if (!nr_pages) { 971 if (!nr_pages) {
932 nr_pages = global_page_state(NR_FILE_DIRTY) + 972 nr_pages = global_page_state(NR_FILE_DIRTY) +
933 global_page_state(NR_UNSTABLE_NFS); 973 global_page_state(NR_UNSTABLE_NFS);
934 } 974 }
935 975
936 rcu_read_lock(); 976 rcu_read_lock();
937 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 977 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
938 if (!bdi_has_dirty_io(bdi)) 978 if (!bdi_has_dirty_io(bdi))
939 continue; 979 continue;
940 __bdi_start_writeback(bdi, nr_pages, false); 980 __bdi_start_writeback(bdi, nr_pages, false);
941 } 981 }
942 rcu_read_unlock(); 982 rcu_read_unlock();
943 } 983 }
944 984
945 static noinline void block_dump___mark_inode_dirty(struct inode *inode) 985 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
946 { 986 {
947 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 987 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
948 struct dentry *dentry; 988 struct dentry *dentry;
949 const char *name = "?"; 989 const char *name = "?";
950 990
951 dentry = d_find_alias(inode); 991 dentry = d_find_alias(inode);
952 if (dentry) { 992 if (dentry) {
953 spin_lock(&dentry->d_lock); 993 spin_lock(&dentry->d_lock);
954 name = (const char *) dentry->d_name.name; 994 name = (const char *) dentry->d_name.name;
955 } 995 }
956 printk(KERN_DEBUG 996 printk(KERN_DEBUG
957 "%s(%d): dirtied inode %lu (%s) on %s\n", 997 "%s(%d): dirtied inode %lu (%s) on %s\n",
958 current->comm, task_pid_nr(current), inode->i_ino, 998 current->comm, task_pid_nr(current), inode->i_ino,
959 name, inode->i_sb->s_id); 999 name, inode->i_sb->s_id);
960 if (dentry) { 1000 if (dentry) {
961 spin_unlock(&dentry->d_lock); 1001 spin_unlock(&dentry->d_lock);
962 dput(dentry); 1002 dput(dentry);
963 } 1003 }
964 } 1004 }
965 } 1005 }
966 1006
967 /** 1007 /**
968 * __mark_inode_dirty - internal function 1008 * __mark_inode_dirty - internal function
969 * @inode: inode to mark 1009 * @inode: inode to mark
970 * @flags: what kind of dirty (e.g. I_DIRTY_SYNC) 1010 * @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
971 * Mark an inode as dirty. Callers should use mark_inode_dirty or 1011 * Mark an inode as dirty. Callers should use mark_inode_dirty or
972 * mark_inode_dirty_sync. 1012 * mark_inode_dirty_sync.
973 * 1013 *
974 * Put the inode on the super block's dirty list. 1014 * Put the inode on the super block's dirty list.
975 * 1015 *
976 * CAREFUL! We mark it dirty unconditionally, but move it onto the 1016 * CAREFUL! We mark it dirty unconditionally, but move it onto the
977 * dirty list only if it is hashed or if it refers to a blockdev. 1017 * dirty list only if it is hashed or if it refers to a blockdev.
978 * If it was not hashed, it will never be added to the dirty list 1018 * If it was not hashed, it will never be added to the dirty list
979 * even if it is later hashed, as it will have been marked dirty already. 1019 * even if it is later hashed, as it will have been marked dirty already.
980 * 1020 *
981 * In short, make sure you hash any inodes _before_ you start marking 1021 * In short, make sure you hash any inodes _before_ you start marking
982 * them dirty. 1022 * them dirty.
983 * 1023 *
984 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of 1024 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
985 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of 1025 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
986 * the kernel-internal blockdev inode represents the dirtying time of the 1026 * the kernel-internal blockdev inode represents the dirtying time of the
987 * blockdev's pages. This is why for I_DIRTY_PAGES we always use 1027 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
988 * page->mapping->host, so the page-dirtying time is recorded in the internal 1028 * page->mapping->host, so the page-dirtying time is recorded in the internal
989 * blockdev inode. 1029 * blockdev inode.
990 */ 1030 */
991 void __mark_inode_dirty(struct inode *inode, int flags) 1031 void __mark_inode_dirty(struct inode *inode, int flags)
992 { 1032 {
993 struct super_block *sb = inode->i_sb; 1033 struct super_block *sb = inode->i_sb;
994 struct backing_dev_info *bdi = NULL; 1034 struct backing_dev_info *bdi = NULL;
995 1035
996 /* 1036 /*
997 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1037 * Don't do this for I_DIRTY_PAGES - that doesn't actually
998 * dirty the inode itself 1038 * dirty the inode itself
999 */ 1039 */
1000 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 1040 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
1001 if (sb->s_op->dirty_inode) 1041 if (sb->s_op->dirty_inode)
1002 sb->s_op->dirty_inode(inode, flags); 1042 sb->s_op->dirty_inode(inode, flags);
1003 } 1043 }
1004 1044
1005 /* 1045 /*
1006 * make sure that changes are seen by all cpus before we test i_state 1046 * make sure that changes are seen by all cpus before we test i_state
1007 * -- mikulas 1047 * -- mikulas
1008 */ 1048 */
1009 smp_mb(); 1049 smp_mb();
1010 1050
1011 /* avoid the locking if we can */ 1051 /* avoid the locking if we can */
1012 if ((inode->i_state & flags) == flags) 1052 if ((inode->i_state & flags) == flags)
1013 return; 1053 return;
1014 1054
1015 if (unlikely(block_dump)) 1055 if (unlikely(block_dump))
1016 block_dump___mark_inode_dirty(inode); 1056 block_dump___mark_inode_dirty(inode);
1017 1057
1018 spin_lock(&inode->i_lock); 1058 spin_lock(&inode->i_lock);
1019 if ((inode->i_state & flags) != flags) { 1059 if ((inode->i_state & flags) != flags) {
1020 const int was_dirty = inode->i_state & I_DIRTY; 1060 const int was_dirty = inode->i_state & I_DIRTY;
1021 1061
1022 inode->i_state |= flags; 1062 inode->i_state |= flags;
1023 1063
1024 /* 1064 /*
1025 * If the inode is being synced, just update its dirty state. 1065 * If the inode is being synced, just update its dirty state.
1026 * The unlocker will place the inode on the appropriate 1066 * The unlocker will place the inode on the appropriate
1027 * superblock list, based upon its state. 1067 * superblock list, based upon its state.
1028 */ 1068 */
1029 if (inode->i_state & I_SYNC) 1069 if (inode->i_state & I_SYNC)
1030 goto out_unlock_inode; 1070 goto out_unlock_inode;
1031 1071
1032 /* 1072 /*
1033 * Only add valid (hashed) inodes to the superblock's 1073 * Only add valid (hashed) inodes to the superblock's
1034 * dirty list. Add blockdev inodes as well. 1074 * dirty list. Add blockdev inodes as well.
1035 */ 1075 */
1036 if (!S_ISBLK(inode->i_mode)) { 1076 if (!S_ISBLK(inode->i_mode)) {
1037 if (inode_unhashed(inode)) 1077 if (inode_unhashed(inode))
1038 goto out_unlock_inode; 1078 goto out_unlock_inode;
1039 } 1079 }
1040 if (inode->i_state & I_FREEING) 1080 if (inode->i_state & I_FREEING)
1041 goto out_unlock_inode; 1081 goto out_unlock_inode;
1042 1082
1043 /* 1083 /*
1044 * If the inode was already on b_dirty/b_io/b_more_io, don't 1084 * If the inode was already on b_dirty/b_io/b_more_io, don't
1045 * reposition it (that would break b_dirty time-ordering). 1085 * reposition it (that would break b_dirty time-ordering).
1046 */ 1086 */
1047 if (!was_dirty) { 1087 if (!was_dirty) {
1048 bool wakeup_bdi = false; 1088 bool wakeup_bdi = false;
1049 bdi = inode_to_bdi(inode); 1089 bdi = inode_to_bdi(inode);
1050 1090
1051 if (bdi_cap_writeback_dirty(bdi)) { 1091 if (bdi_cap_writeback_dirty(bdi)) {
1052 WARN(!test_bit(BDI_registered, &bdi->state), 1092 WARN(!test_bit(BDI_registered, &bdi->state),
1053 "bdi-%s not registered\n", bdi->name); 1093 "bdi-%s not registered\n", bdi->name);
1054 1094
1055 /* 1095 /*
1056 * If this is the first dirty inode for this 1096 * If this is the first dirty inode for this
1057 * bdi, we have to wake-up the corresponding 1097 * bdi, we have to wake-up the corresponding
1058 * bdi thread to make sure background 1098 * bdi thread to make sure background
1059 * write-back happens later. 1099 * write-back happens later.
1060 */ 1100 */
1061 if (!wb_has_dirty_io(&bdi->wb)) 1101 if (!wb_has_dirty_io(&bdi->wb))
1062 wakeup_bdi = true; 1102 wakeup_bdi = true;
1063 } 1103 }
1064 1104
1065 spin_unlock(&inode->i_lock); 1105 spin_unlock(&inode->i_lock);
1066 spin_lock(&inode_wb_list_lock); 1106 spin_lock(&bdi->wb.list_lock);
1067 inode->dirtied_when = jiffies; 1107 inode->dirtied_when = jiffies;
1068 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1108 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1069 spin_unlock(&inode_wb_list_lock); 1109 spin_unlock(&bdi->wb.list_lock);
1070 1110
1071 if (wakeup_bdi) 1111 if (wakeup_bdi)
1072 bdi_wakeup_thread_delayed(bdi); 1112 bdi_wakeup_thread_delayed(bdi);
1073 return; 1113 return;
1074 } 1114 }
1075 } 1115 }
1076 out_unlock_inode: 1116 out_unlock_inode:
1077 spin_unlock(&inode->i_lock); 1117 spin_unlock(&inode->i_lock);
1078 1118
1079 } 1119 }
1080 EXPORT_SYMBOL(__mark_inode_dirty); 1120 EXPORT_SYMBOL(__mark_inode_dirty);
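
The "avoid the locking if we can" fast path above is a check of the state bits without the lock, repeated under inode->i_lock before the expensive list manipulation. A userspace model of that pattern, with hypothetical names; C11 atomics stand in for the explicit memory barrier used by the real code:

/*
 * "Avoid the locking if we can": unlocked check of the state bits, then a
 * re-check under the lock. Hypothetical names; not kernel code.
 */
#include <pthread.h>
#include <stdatomic.h>

struct object {
        pthread_mutex_t      lock;      /* plays the role of inode->i_lock */
        _Atomic unsigned int state;     /* dirty-state bits */
};

static void mark_dirty(struct object *o, unsigned int flags)
{
        /* fast path: every requested bit is already set, nothing to do */
        if ((o->state & flags) == flags)
                return;

        pthread_mutex_lock(&o->lock);
        if ((o->state & flags) != flags) {
                int was_dirty = o->state != 0;

                o->state |= flags;
                if (!was_dirty) {
                        /* first transition to dirty: move onto the dirty
                         * list, wake the flusher, etc. (omitted) */
                }
        }
        pthread_mutex_unlock(&o->lock);
}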
1081 1121
1082 /* 1122 /*
1083 * Write out a superblock's list of dirty inodes. A wait will be performed 1123 * Write out a superblock's list of dirty inodes. A wait will be performed
1084 * upon no inodes, all inodes or the final one, depending upon sync_mode. 1124 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1085 * 1125 *
1086 * If older_than_this is non-NULL, then only write out inodes which 1126 * If older_than_this is non-NULL, then only write out inodes which
1087 * had their first dirtying at a time earlier than *older_than_this. 1127 * had their first dirtying at a time earlier than *older_than_this.
1088 * 1128 *
1089 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 1129 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
1090 * This function assumes that the blockdev superblock's inodes are backed by 1130 * This function assumes that the blockdev superblock's inodes are backed by
1091 * a variety of queues, so all inodes are searched. For other superblocks, 1131 * a variety of queues, so all inodes are searched. For other superblocks,
1092 * assume that all inodes are backed by the same queue. 1132 * assume that all inodes are backed by the same queue.
1093 * 1133 *
1094 * The inodes to be written are parked on bdi->b_io. They are moved back onto 1134 * The inodes to be written are parked on bdi->b_io. They are moved back onto
1095 * bdi->b_dirty as they are selected for writing. This way, none can be missed 1135 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1096 * on the writer throttling path, and we get decent balancing between many 1136 * on the writer throttling path, and we get decent balancing between many
1097 * throttled threads: we don't want them all piling up on inode_sync_wait. 1137 * throttled threads: we don't want them all piling up on inode_sync_wait.
1098 */ 1138 */
1099 static void wait_sb_inodes(struct super_block *sb) 1139 static void wait_sb_inodes(struct super_block *sb)
1100 { 1140 {
1101 struct inode *inode, *old_inode = NULL; 1141 struct inode *inode, *old_inode = NULL;
1102 1142
1103 /* 1143 /*
1104 * We need to be protected against the filesystem going from 1144 * We need to be protected against the filesystem going from
1105 * r/o to r/w or vice versa. 1145 * r/o to r/w or vice versa.
1106 */ 1146 */
1107 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1147 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1108 1148
1109 spin_lock(&inode_sb_list_lock); 1149 spin_lock(&inode_sb_list_lock);
1110 1150
1111 /* 1151 /*
1112 * Data integrity sync. Must wait for all pages under writeback, 1152 * Data integrity sync. Must wait for all pages under writeback,
1113 * because there may have been pages dirtied before our sync 1153 * because there may have been pages dirtied before our sync
1114 * call, but whose writeout was started before we could write them out. 1154 * call, but whose writeout was started before we could write them out.
1115 * In that case, the inode may not be on the dirty list, but 1155 * In that case, the inode may not be on the dirty list, but
1116 * we still have to wait for that writeout. 1156 * we still have to wait for that writeout.
1117 */ 1157 */
1118 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1158 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1119 struct address_space *mapping = inode->i_mapping; 1159 struct address_space *mapping = inode->i_mapping;
1120 1160
1121 spin_lock(&inode->i_lock); 1161 spin_lock(&inode->i_lock);
1122 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || 1162 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
1123 (mapping->nrpages == 0)) { 1163 (mapping->nrpages == 0)) {
1124 spin_unlock(&inode->i_lock); 1164 spin_unlock(&inode->i_lock);
1125 continue; 1165 continue;
1126 } 1166 }
1127 __iget(inode); 1167 __iget(inode);
1128 spin_unlock(&inode->i_lock); 1168 spin_unlock(&inode->i_lock);
1129 spin_unlock(&inode_sb_list_lock); 1169 spin_unlock(&inode_sb_list_lock);
1130 1170
1131 /* 1171 /*
1132 * We hold a reference to 'inode' so it couldn't have been 1172 * We hold a reference to 'inode' so it couldn't have been
1133 * removed from s_inodes list while we dropped the 1173 * removed from s_inodes list while we dropped the
1134 * inode_sb_list_lock. We cannot iput the inode now as we can 1174 * inode_sb_list_lock. We cannot iput the inode now as we can
1135 * be holding the last reference and we cannot iput it under 1175 * be holding the last reference and we cannot iput it under
1136 * inode_sb_list_lock. So we keep the reference and iput it 1176 * inode_sb_list_lock. So we keep the reference and iput it
1137 * later. 1177 * later.
1138 */ 1178 */
1139 iput(old_inode); 1179 iput(old_inode);
1140 old_inode = inode; 1180 old_inode = inode;
1141 1181
1142 filemap_fdatawait(mapping); 1182 filemap_fdatawait(mapping);
1143 1183
1144 cond_resched(); 1184 cond_resched();
1145 1185
1146 spin_lock(&inode_sb_list_lock); 1186 spin_lock(&inode_sb_list_lock);
1147 } 1187 }
1148 spin_unlock(&inode_sb_list_lock); 1188 spin_unlock(&inode_sb_list_lock);
1149 iput(old_inode); 1189 iput(old_inode);
1150 } 1190 }
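
wait_sb_inodes() iterates a lock-protected list while repeatedly dropping the lock: it pins the current inode with __iget(), drops inode_sb_list_lock before the blocking filemap_fdatawait(), and defers the iput() of the previous inode until the lock is no longer held. A userspace sketch of that reference-counted iteration pattern, with hypothetical names:

/*
 * Reference-counted list walk that drops the lock around blocking work.
 * Hypothetical names; not kernel code.
 */
#include <pthread.h>

struct node {
        struct node *next;
        int          refcount;          /* protected by list_lock */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void node_get(struct node *n)    /* caller holds list_lock */
{
        n->refcount++;
}

static void node_put(struct node *n)    /* caller must NOT hold list_lock */
{
        pthread_mutex_lock(&list_lock);
        if (--n->refcount == 0) {
                /* unlink and free -- omitted for brevity */
        }
        pthread_mutex_unlock(&list_lock);
}

static void wait_for_io(struct node *n) /* stands in for filemap_fdatawait() */
{
        (void)n;                        /* may block for a long time */
}

static void wait_all(void)
{
        struct node *n, *old = NULL;

        pthread_mutex_lock(&list_lock);
        for (n = head; n; n = n->next) {
                node_get(n);                    /* keep 'n' on the list */
                pthread_mutex_unlock(&list_lock);

                if (old)
                        node_put(old);          /* safe: lock not held here */
                old = n;

                wait_for_io(n);                 /* blocking work, lock dropped */

                pthread_mutex_lock(&list_lock); /* 'n' still valid: we hold a ref */
        }
        pthread_mutex_unlock(&list_lock);
        if (old)
                node_put(old);
}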
1151 1191
1152 /** 1192 /**
1153 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block 1193 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
1154 * @sb: the superblock 1194 * @sb: the superblock
1155 * @nr: the number of pages to write 1195 * @nr: the number of pages to write
1156 * 1196 *
1157 * Start writeback on some inodes on this super_block. No guarantees are made 1197 * Start writeback on some inodes on this super_block. No guarantees are made
1158 * on how many (if any) will be written, and this function does not wait 1198 * on how many (if any) will be written, and this function does not wait
1159 * for IO completion of submitted IO. 1199 * for IO completion of submitted IO.
1160 */ 1200 */
1161 void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1201 void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1162 { 1202 {
1163 DECLARE_COMPLETION_ONSTACK(done); 1203 DECLARE_COMPLETION_ONSTACK(done);
1164 struct wb_writeback_work work = { 1204 struct wb_writeback_work work = {
1165 .sb = sb, 1205 .sb = sb,
1166 .sync_mode = WB_SYNC_NONE, 1206 .sync_mode = WB_SYNC_NONE,
1167 .done = &done, 1207 .tagged_writepages = 1,
1168 .nr_pages = nr, 1208 .done = &done,
1209 .nr_pages = nr,
1169 }; 1210 };
1170 1211
1171 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1212 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1172 bdi_queue_work(sb->s_bdi, &work); 1213 bdi_queue_work(sb->s_bdi, &work);
1173 wait_for_completion(&done); 1214 wait_for_completion(&done);
1174 } 1215 }
1175 EXPORT_SYMBOL(writeback_inodes_sb_nr); 1216 EXPORT_SYMBOL(writeback_inodes_sb_nr);
1176 1217
1177 /** 1218 /**
1178 * writeback_inodes_sb - writeback dirty inodes from given super_block 1219 * writeback_inodes_sb - writeback dirty inodes from given super_block
1179 * @sb: the superblock 1220 * @sb: the superblock
1180 * 1221 *
1181 * Start writeback on some inodes on this super_block. No guarantees are made 1222 * Start writeback on some inodes on this super_block. No guarantees are made
1182 * on how many (if any) will be written, and this function does not wait 1223 * on how many (if any) will be written, and this function does not wait
1183 * for IO completion of submitted IO. 1224 * for IO completion of submitted IO.
1184 */ 1225 */
1185 void writeback_inodes_sb(struct super_block *sb) 1226 void writeback_inodes_sb(struct super_block *sb)
1186 { 1227 {
1187 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); 1228 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
1188 } 1229 }
1189 EXPORT_SYMBOL(writeback_inodes_sb); 1230 EXPORT_SYMBOL(writeback_inodes_sb);
1190 1231
1191 /** 1232 /**
1192 * writeback_inodes_sb_if_idle - start writeback if none underway 1233 * writeback_inodes_sb_if_idle - start writeback if none underway
1193 * @sb: the superblock 1234 * @sb: the superblock
1194 * 1235 *
1195 * Invoke writeback_inodes_sb if no writeback is currently underway. 1236 * Invoke writeback_inodes_sb if no writeback is currently underway.
1196 * Returns 1 if writeback was started, 0 if not. 1237 * Returns 1 if writeback was started, 0 if not.
1197 */ 1238 */
1198 int writeback_inodes_sb_if_idle(struct super_block *sb) 1239 int writeback_inodes_sb_if_idle(struct super_block *sb)
1199 { 1240 {
1200 if (!writeback_in_progress(sb->s_bdi)) { 1241 if (!writeback_in_progress(sb->s_bdi)) {
1201 down_read(&sb->s_umount); 1242 down_read(&sb->s_umount);
1202 writeback_inodes_sb(sb); 1243 writeback_inodes_sb(sb);
1203 up_read(&sb->s_umount); 1244 up_read(&sb->s_umount);
1204 return 1; 1245 return 1;
1205 } else 1246 } else
1206 return 0; 1247 return 0;
1207 } 1248 }
1208 EXPORT_SYMBOL(writeback_inodes_sb_if_idle); 1249 EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
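
A hypothetical caller of writeback_inodes_sb_if_idle(), sketched for illustration only (it is not part of this patch): a filesystem that fails an allocation for lack of reclaimable space can opportunistically kick writeback on its own superblock and retry, without piling up behind a flush that is already running. Only writeback_inodes_sb_if_idle() is real here; example_try_alloc() is invented for the example.

/*
 * Hypothetical filesystem code -- NOT part of this patch. Only
 * writeback_inodes_sb_if_idle() is a real interface; example_try_alloc()
 * is made up.
 */
static int example_alloc_with_retry(struct super_block *sb)
{
        int ret = example_try_alloc(sb);

        if (ret == -ENOSPC && writeback_inodes_sb_if_idle(sb))
                ret = example_try_alloc(sb);    /* retry once after kicking writeback */

        return ret;
}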
1209 1250
1210 /** 1251 /**
1211 * writeback_inodes_sb_nr_if_idle - start writeback if none underway 1252 * writeback_inodes_sb_nr_if_idle - start writeback if none underway
1212 * @sb: the superblock 1253 * @sb: the superblock
1213 * @nr: the number of pages to write 1254 * @nr: the number of pages to write
1214 * 1255 *
1215 * Invoke writeback_inodes_sb if no writeback is currently underway. 1256 * Invoke writeback_inodes_sb if no writeback is currently underway.
1216 * Returns 1 if writeback was started, 0 if not. 1257 * Returns 1 if writeback was started, 0 if not.
1217 */ 1258 */
1218 int writeback_inodes_sb_nr_if_idle(struct super_block *sb, 1259 int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1219 unsigned long nr) 1260 unsigned long nr)
1220 { 1261 {
1221 if (!writeback_in_progress(sb->s_bdi)) { 1262 if (!writeback_in_progress(sb->s_bdi)) {
1222 down_read(&sb->s_umount); 1263 down_read(&sb->s_umount);
1223 writeback_inodes_sb_nr(sb, nr); 1264 writeback_inodes_sb_nr(sb, nr);
1224 up_read(&sb->s_umount); 1265 up_read(&sb->s_umount);
1225 return 1; 1266 return 1;
1226 } else 1267 } else
1227 return 0; 1268 return 0;
1228 } 1269 }
1229 EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); 1270 EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1230 1271
1231 /** 1272 /**
1232 * sync_inodes_sb - sync sb inode pages 1273 * sync_inodes_sb - sync sb inode pages
1233 * @sb: the superblock 1274 * @sb: the superblock
1234 * 1275 *
1235 * This function writes and waits on any dirty inode belonging to this 1276 * This function writes and waits on any dirty inode belonging to this
1236 * super_block. 1277 * super_block.
1237 */ 1278 */
1238 void sync_inodes_sb(struct super_block *sb) 1279 void sync_inodes_sb(struct super_block *sb)
1239 { 1280 {
1240 DECLARE_COMPLETION_ONSTACK(done); 1281 DECLARE_COMPLETION_ONSTACK(done);
1241 struct wb_writeback_work work = { 1282 struct wb_writeback_work work = {
1242 .sb = sb, 1283 .sb = sb,
1243 .sync_mode = WB_SYNC_ALL, 1284 .sync_mode = WB_SYNC_ALL,
1244 .nr_pages = LONG_MAX, 1285 .nr_pages = LONG_MAX,
1245 .range_cyclic = 0, 1286 .range_cyclic = 0,
1246 .done = &done, 1287 .done = &done,
1247 }; 1288 };
1248 1289
1249 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1290 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1250 1291
1251 bdi_queue_work(sb->s_bdi, &work); 1292 bdi_queue_work(sb->s_bdi, &work);
1252 wait_for_completion(&done); 1293 wait_for_completion(&done);
1253 1294
1254 wait_sb_inodes(sb); 1295 wait_sb_inodes(sb);
1255 } 1296 }
1256 EXPORT_SYMBOL(sync_inodes_sb); 1297 EXPORT_SYMBOL(sync_inodes_sb);
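
sync_inodes_sb() and writeback_inodes_sb_nr() hand the flusher a work item that points at a completion on the submitter's stack and then block in wait_for_completion(); wb_do_writeback() signals that completion when the item is done, and frees the item instead when no completion is attached. A userspace sketch of that handoff, modelled loosely on the kernel's completion API; everything below is illustrative, not kernel code:

/*
 * On-stack work item + completion handoff, modelled loosely on the kernel's
 * completion API. Illustrative only; not kernel code.
 */
#include <pthread.h>
#include <stdbool.h>

struct completion {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        bool            done;
};

static void init_completion(struct completion *c)
{
        pthread_mutex_init(&c->lock, NULL);
        pthread_cond_init(&c->cond, NULL);
        c->done = false;
}

static void complete(struct completion *c)              /* called by the worker */
{
        pthread_mutex_lock(&c->lock);
        c->done = true;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)   /* called by the submitter */
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

The submitter initialises a completion on its own stack, attaches it to the queued work item and blocks in wait_for_completion(); the worker's "if (work->done) complete(work->done); else kfree(work);" step above is the other half of the handshake.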
1257 1298
1258 /** 1299 /**
1259 * write_inode_now - write an inode to disk 1300 * write_inode_now - write an inode to disk
1260 * @inode: inode to write to disk 1301 * @inode: inode to write to disk
1 /* 1 /*
2 * (C) 1997 Linus Torvalds 2 * (C) 1997 Linus Torvalds
3 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) 3 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
4 */ 4 */
5 #include <linux/fs.h> 5 #include <linux/fs.h>
6 #include <linux/mm.h> 6 #include <linux/mm.h>
7 #include <linux/dcache.h> 7 #include <linux/dcache.h>
8 #include <linux/init.h> 8 #include <linux/init.h>
9 #include <linux/slab.h> 9 #include <linux/slab.h>
10 #include <linux/writeback.h> 10 #include <linux/writeback.h>
11 #include <linux/module.h> 11 #include <linux/module.h>
12 #include <linux/backing-dev.h> 12 #include <linux/backing-dev.h>
13 #include <linux/wait.h> 13 #include <linux/wait.h>
14 #include <linux/rwsem.h> 14 #include <linux/rwsem.h>
15 #include <linux/hash.h> 15 #include <linux/hash.h>
16 #include <linux/swap.h> 16 #include <linux/swap.h>
17 #include <linux/security.h> 17 #include <linux/security.h>
18 #include <linux/pagemap.h> 18 #include <linux/pagemap.h>
19 #include <linux/cdev.h> 19 #include <linux/cdev.h>
20 #include <linux/bootmem.h> 20 #include <linux/bootmem.h>
21 #include <linux/fsnotify.h> 21 #include <linux/fsnotify.h>
22 #include <linux/mount.h> 22 #include <linux/mount.h>
23 #include <linux/async.h> 23 #include <linux/async.h>
24 #include <linux/posix_acl.h> 24 #include <linux/posix_acl.h>
25 #include <linux/prefetch.h> 25 #include <linux/prefetch.h>
26 #include <linux/ima.h> 26 #include <linux/ima.h>
27 #include <linux/cred.h> 27 #include <linux/cred.h>
28 #include <linux/buffer_head.h> /* for inode_has_buffers */ 28 #include <linux/buffer_head.h> /* for inode_has_buffers */
29 #include "internal.h" 29 #include "internal.h"
30 30
31 /* 31 /*
32 * Inode locking rules: 32 * Inode locking rules:
33 * 33 *
34 * inode->i_lock protects: 34 * inode->i_lock protects:
35 * inode->i_state, inode->i_hash, __iget() 35 * inode->i_state, inode->i_hash, __iget()
36 * inode->i_sb->s_inode_lru_lock protects: 36 * inode->i_sb->s_inode_lru_lock protects:
37 * inode->i_sb->s_inode_lru, inode->i_lru 37 * inode->i_sb->s_inode_lru, inode->i_lru
38 * inode_sb_list_lock protects: 38 * inode_sb_list_lock protects:
39 * sb->s_inodes, inode->i_sb_list 39 * sb->s_inodes, inode->i_sb_list
40 * inode_wb_list_lock protects: 40 * bdi->wb.list_lock protects:
41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 * inode_hash_lock protects: 42 * inode_hash_lock protects:
43 * inode_hashtable, inode->i_hash 43 * inode_hashtable, inode->i_hash
44 * 44 *
45 * Lock ordering: 45 * Lock ordering:
46 * 46 *
47 * inode_sb_list_lock 47 * inode_sb_list_lock
48 * inode->i_lock 48 * inode->i_lock
49 * inode->i_sb->s_inode_lru_lock 49 * inode->i_sb->s_inode_lru_lock
50 * 50 *
51 * inode_wb_list_lock 51 * bdi->wb.list_lock
52 * inode->i_lock 52 * inode->i_lock
53 * 53 *
54 * inode_hash_lock 54 * inode_hash_lock
55 * inode_sb_list_lock 55 * inode_sb_list_lock
56 * inode->i_lock 56 * inode->i_lock
57 * 57 *
58 * iunique_lock 58 * iunique_lock
59 * inode_hash_lock 59 * inode_hash_lock
60 */ 60 */
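
The ordering rules above matter because two threads that take the same pair of locks in opposite orders can deadlock; the convention is that the list lock is always taken before the per-object lock. A small userspace illustration of honouring such an ordering, with hypothetical names:

/*
 * Honouring a fixed lock order: the list lock is always taken before the
 * per-object lock. Hypothetical names; not kernel code.
 */
#include <pthread.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;   /* outer lock */

struct object {
        pthread_mutex_t lock;                                    /* inner lock */
        int             on_list;
};

static void add_to_list(struct object *o)
{
        pthread_mutex_lock(&list_lock);         /* outer lock first ... */
        pthread_mutex_lock(&o->lock);           /* ... then the per-object lock */
        o->on_list = 1;                         /* e.g. link it and mark it listed */
        pthread_mutex_unlock(&o->lock);
        pthread_mutex_unlock(&list_lock);
}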
61 61
62 static unsigned int i_hash_mask __read_mostly; 62 static unsigned int i_hash_mask __read_mostly;
63 static unsigned int i_hash_shift __read_mostly; 63 static unsigned int i_hash_shift __read_mostly;
64 static struct hlist_head *inode_hashtable __read_mostly; 64 static struct hlist_head *inode_hashtable __read_mostly;
65 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 65 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
66 66
67 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); 67 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
68 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
69 68
70 /* 69 /*
71 * Empty aops. Can be used for the cases where the user does not 70 * Empty aops. Can be used for the cases where the user does not
72 * define any of the address_space operations. 71 * define any of the address_space operations.
73 */ 72 */
74 const struct address_space_operations empty_aops = { 73 const struct address_space_operations empty_aops = {
75 }; 74 };
76 EXPORT_SYMBOL(empty_aops); 75 EXPORT_SYMBOL(empty_aops);
77 76
78 /* 77 /*
79 * Statistics gathering.. 78 * Statistics gathering..
80 */ 79 */
81 struct inodes_stat_t inodes_stat; 80 struct inodes_stat_t inodes_stat;
82 81
83 static DEFINE_PER_CPU(unsigned int, nr_inodes); 82 static DEFINE_PER_CPU(unsigned int, nr_inodes);
84 static DEFINE_PER_CPU(unsigned int, nr_unused); 83 static DEFINE_PER_CPU(unsigned int, nr_unused);
85 84
86 static struct kmem_cache *inode_cachep __read_mostly; 85 static struct kmem_cache *inode_cachep __read_mostly;
87 86
88 static int get_nr_inodes(void) 87 static int get_nr_inodes(void)
89 { 88 {
90 int i; 89 int i;
91 int sum = 0; 90 int sum = 0;
92 for_each_possible_cpu(i) 91 for_each_possible_cpu(i)
93 sum += per_cpu(nr_inodes, i); 92 sum += per_cpu(nr_inodes, i);
94 return sum < 0 ? 0 : sum; 93 return sum < 0 ? 0 : sum;
95 } 94 }
96 95
97 static inline int get_nr_inodes_unused(void) 96 static inline int get_nr_inodes_unused(void)
98 { 97 {
99 int i; 98 int i;
100 int sum = 0; 99 int sum = 0;
101 for_each_possible_cpu(i) 100 for_each_possible_cpu(i)
102 sum += per_cpu(nr_unused, i); 101 sum += per_cpu(nr_unused, i);
103 return sum < 0 ? 0 : sum; 102 return sum < 0 ? 0 : sum;
104 } 103 }
105 104
106 int get_nr_dirty_inodes(void) 105 int get_nr_dirty_inodes(void)
107 { 106 {
108 /* not actually dirty inodes, but a wild approximation */ 107 /* not actually dirty inodes, but a wild approximation */
109 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); 108 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
110 return nr_dirty > 0 ? nr_dirty : 0; 109 return nr_dirty > 0 ? nr_dirty : 0;
111 } 110 }
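
get_nr_inodes()/get_nr_dirty_inodes() sum unsynchronised per-CPU counters, so an object created on one CPU and destroyed on another can make the sum transiently negative; that is why the results are clamped to zero and treated as approximations. A self-contained userspace model of the scheme, with hypothetical names:

/*
 * Per-CPU counters summed by a reader and clamped at zero. Hypothetical
 * names; not kernel code.
 */
#include <stdio.h>

#define NR_CPUS 4

static int nr_objects[NR_CPUS];                 /* one counter per CPU */

static void count_inc(int cpu) { nr_objects[cpu]++; }
static void count_dec(int cpu) { nr_objects[cpu]--; }

static int count_sum(void)
{
        int cpu, sum = 0;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                sum += nr_objects[cpu];
        /* increments and decrements can land on different CPUs, so the
         * instantaneous sum may dip below zero -- clamp it */
        return sum < 0 ? 0 : sum;
}

int main(void)
{
        count_dec(3);                           /* object freed on CPU 3 ... */
        printf("approximate count: %d\n", count_sum());
        count_inc(0);                           /* ... that was created earlier on CPU 0 */
        printf("approximate count: %d\n", count_sum());
        return 0;
}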
112 111
113 /* 112 /*
114 * Handle nr_inode sysctl 113 * Handle nr_inode sysctl
115 */ 114 */
116 #ifdef CONFIG_SYSCTL 115 #ifdef CONFIG_SYSCTL
117 int proc_nr_inodes(ctl_table *table, int write, 116 int proc_nr_inodes(ctl_table *table, int write,
118 void __user *buffer, size_t *lenp, loff_t *ppos) 117 void __user *buffer, size_t *lenp, loff_t *ppos)
119 { 118 {
120 inodes_stat.nr_inodes = get_nr_inodes(); 119 inodes_stat.nr_inodes = get_nr_inodes();
121 inodes_stat.nr_unused = get_nr_inodes_unused(); 120 inodes_stat.nr_unused = get_nr_inodes_unused();
122 return proc_dointvec(table, write, buffer, lenp, ppos); 121 return proc_dointvec(table, write, buffer, lenp, ppos);
123 } 122 }
124 #endif 123 #endif
125 124
126 /** 125 /**
127 * inode_init_always - perform inode structure initialisation 126 * inode_init_always - perform inode structure initialisation
128 * @sb: superblock inode belongs to 127 * @sb: superblock inode belongs to
129 * @inode: inode to initialise 128 * @inode: inode to initialise
130 * 129 *
131 * These are initializations that need to be done on every inode 130 * These are initializations that need to be done on every inode
132 * allocation as the fields are not initialised by slab allocation. 131 * allocation as the fields are not initialised by slab allocation.
133 */ 132 */
134 int inode_init_always(struct super_block *sb, struct inode *inode) 133 int inode_init_always(struct super_block *sb, struct inode *inode)
135 { 134 {
136 static const struct inode_operations empty_iops; 135 static const struct inode_operations empty_iops;
137 static const struct file_operations empty_fops; 136 static const struct file_operations empty_fops;
138 struct address_space *const mapping = &inode->i_data; 137 struct address_space *const mapping = &inode->i_data;
139 138
140 inode->i_sb = sb; 139 inode->i_sb = sb;
141 inode->i_blkbits = sb->s_blocksize_bits; 140 inode->i_blkbits = sb->s_blocksize_bits;
142 inode->i_flags = 0; 141 inode->i_flags = 0;
143 atomic_set(&inode->i_count, 1); 142 atomic_set(&inode->i_count, 1);
144 inode->i_op = &empty_iops; 143 inode->i_op = &empty_iops;
145 inode->i_fop = &empty_fops; 144 inode->i_fop = &empty_fops;
146 inode->i_nlink = 1; 145 inode->i_nlink = 1;
147 inode->i_uid = 0; 146 inode->i_uid = 0;
148 inode->i_gid = 0; 147 inode->i_gid = 0;
149 atomic_set(&inode->i_writecount, 0); 148 atomic_set(&inode->i_writecount, 0);
150 inode->i_size = 0; 149 inode->i_size = 0;
151 inode->i_blocks = 0; 150 inode->i_blocks = 0;
152 inode->i_bytes = 0; 151 inode->i_bytes = 0;
153 inode->i_generation = 0; 152 inode->i_generation = 0;
154 #ifdef CONFIG_QUOTA 153 #ifdef CONFIG_QUOTA
155 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); 154 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
156 #endif 155 #endif
157 inode->i_pipe = NULL; 156 inode->i_pipe = NULL;
158 inode->i_bdev = NULL; 157 inode->i_bdev = NULL;
159 inode->i_cdev = NULL; 158 inode->i_cdev = NULL;
160 inode->i_rdev = 0; 159 inode->i_rdev = 0;
161 inode->dirtied_when = 0; 160 inode->dirtied_when = 0;
162 161
163 if (security_inode_alloc(inode)) 162 if (security_inode_alloc(inode))
164 goto out; 163 goto out;
165 spin_lock_init(&inode->i_lock); 164 spin_lock_init(&inode->i_lock);
166 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); 165 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
167 166
168 mutex_init(&inode->i_mutex); 167 mutex_init(&inode->i_mutex);
169 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); 168 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
170 169
171 atomic_set(&inode->i_dio_count, 0); 170 atomic_set(&inode->i_dio_count, 0);
172 171
173 mapping->a_ops = &empty_aops; 172 mapping->a_ops = &empty_aops;
174 mapping->host = inode; 173 mapping->host = inode;
175 mapping->flags = 0; 174 mapping->flags = 0;
176 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 175 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
177 mapping->assoc_mapping = NULL; 176 mapping->assoc_mapping = NULL;
178 mapping->backing_dev_info = &default_backing_dev_info; 177 mapping->backing_dev_info = &default_backing_dev_info;
179 mapping->writeback_index = 0; 178 mapping->writeback_index = 0;
180 179
181 /* 180 /*
182 * If the block_device provides a backing_dev_info for client 181 * If the block_device provides a backing_dev_info for client
183 * inodes then use that. Otherwise the inode shares the bdev's 182 * inodes then use that. Otherwise the inode shares the bdev's
184 * backing_dev_info. 183 * backing_dev_info.
185 */ 184 */
186 if (sb->s_bdev) { 185 if (sb->s_bdev) {
187 struct backing_dev_info *bdi; 186 struct backing_dev_info *bdi;
188 187
189 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 188 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
190 mapping->backing_dev_info = bdi; 189 mapping->backing_dev_info = bdi;
191 } 190 }
192 inode->i_private = NULL; 191 inode->i_private = NULL;
193 inode->i_mapping = mapping; 192 inode->i_mapping = mapping;
194 #ifdef CONFIG_FS_POSIX_ACL 193 #ifdef CONFIG_FS_POSIX_ACL
195 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; 194 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
196 #endif 195 #endif
197 196
198 #ifdef CONFIG_FSNOTIFY 197 #ifdef CONFIG_FSNOTIFY
199 inode->i_fsnotify_mask = 0; 198 inode->i_fsnotify_mask = 0;
200 #endif 199 #endif
201 200
202 this_cpu_inc(nr_inodes); 201 this_cpu_inc(nr_inodes);
203 202
204 return 0; 203 return 0;
205 out: 204 out:
206 return -ENOMEM; 205 return -ENOMEM;
207 } 206 }
208 EXPORT_SYMBOL(inode_init_always); 207 EXPORT_SYMBOL(inode_init_always);
209 208
210 static struct inode *alloc_inode(struct super_block *sb) 209 static struct inode *alloc_inode(struct super_block *sb)
211 { 210 {
212 struct inode *inode; 211 struct inode *inode;
213 212
214 if (sb->s_op->alloc_inode) 213 if (sb->s_op->alloc_inode)
215 inode = sb->s_op->alloc_inode(sb); 214 inode = sb->s_op->alloc_inode(sb);
216 else 215 else
217 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); 216 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
218 217
219 if (!inode) 218 if (!inode)
220 return NULL; 219 return NULL;
221 220
222 if (unlikely(inode_init_always(sb, inode))) { 221 if (unlikely(inode_init_always(sb, inode))) {
223 if (inode->i_sb->s_op->destroy_inode) 222 if (inode->i_sb->s_op->destroy_inode)
224 inode->i_sb->s_op->destroy_inode(inode); 223 inode->i_sb->s_op->destroy_inode(inode);
225 else 224 else
226 kmem_cache_free(inode_cachep, inode); 225 kmem_cache_free(inode_cachep, inode);
227 return NULL; 226 return NULL;
228 } 227 }
229 228
230 return inode; 229 return inode;
231 } 230 }
232 231
233 void free_inode_nonrcu(struct inode *inode) 232 void free_inode_nonrcu(struct inode *inode)
234 { 233 {
235 kmem_cache_free(inode_cachep, inode); 234 kmem_cache_free(inode_cachep, inode);
236 } 235 }
237 EXPORT_SYMBOL(free_inode_nonrcu); 236 EXPORT_SYMBOL(free_inode_nonrcu);
238 237
239 void __destroy_inode(struct inode *inode) 238 void __destroy_inode(struct inode *inode)
240 { 239 {
241 BUG_ON(inode_has_buffers(inode)); 240 BUG_ON(inode_has_buffers(inode));
242 security_inode_free(inode); 241 security_inode_free(inode);
243 fsnotify_inode_delete(inode); 242 fsnotify_inode_delete(inode);
244 #ifdef CONFIG_FS_POSIX_ACL 243 #ifdef CONFIG_FS_POSIX_ACL
245 if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) 244 if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
246 posix_acl_release(inode->i_acl); 245 posix_acl_release(inode->i_acl);
247 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) 246 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
248 posix_acl_release(inode->i_default_acl); 247 posix_acl_release(inode->i_default_acl);
249 #endif 248 #endif
250 this_cpu_dec(nr_inodes); 249 this_cpu_dec(nr_inodes);
251 } 250 }
252 EXPORT_SYMBOL(__destroy_inode); 251 EXPORT_SYMBOL(__destroy_inode);
253 252
254 static void i_callback(struct rcu_head *head) 253 static void i_callback(struct rcu_head *head)
255 { 254 {
256 struct inode *inode = container_of(head, struct inode, i_rcu); 255 struct inode *inode = container_of(head, struct inode, i_rcu);
257 INIT_LIST_HEAD(&inode->i_dentry); 256 INIT_LIST_HEAD(&inode->i_dentry);
258 kmem_cache_free(inode_cachep, inode); 257 kmem_cache_free(inode_cachep, inode);
259 } 258 }
260 259
261 static void destroy_inode(struct inode *inode) 260 static void destroy_inode(struct inode *inode)
262 { 261 {
263 BUG_ON(!list_empty(&inode->i_lru)); 262 BUG_ON(!list_empty(&inode->i_lru));
264 __destroy_inode(inode); 263 __destroy_inode(inode);
265 if (inode->i_sb->s_op->destroy_inode) 264 if (inode->i_sb->s_op->destroy_inode)
266 inode->i_sb->s_op->destroy_inode(inode); 265 inode->i_sb->s_op->destroy_inode(inode);
267 else 266 else
268 call_rcu(&inode->i_rcu, i_callback); 267 call_rcu(&inode->i_rcu, i_callback);
269 } 268 }
270 269
271 void address_space_init_once(struct address_space *mapping) 270 void address_space_init_once(struct address_space *mapping)
272 { 271 {
273 memset(mapping, 0, sizeof(*mapping)); 272 memset(mapping, 0, sizeof(*mapping));
274 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); 273 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
275 spin_lock_init(&mapping->tree_lock); 274 spin_lock_init(&mapping->tree_lock);
276 mutex_init(&mapping->i_mmap_mutex); 275 mutex_init(&mapping->i_mmap_mutex);
277 INIT_LIST_HEAD(&mapping->private_list); 276 INIT_LIST_HEAD(&mapping->private_list);
278 spin_lock_init(&mapping->private_lock); 277 spin_lock_init(&mapping->private_lock);
279 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); 278 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
280 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); 279 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
281 } 280 }
282 EXPORT_SYMBOL(address_space_init_once); 281 EXPORT_SYMBOL(address_space_init_once);
283 282
284 /* 283 /*
285 * These are initializations that only need to be done 284 * These are initializations that only need to be done
286 * once, because the fields are idempotent across use 285 * once, because the fields are idempotent across use
287 * of the inode, so let the slab be aware of that. 286 * of the inode, so let the slab be aware of that.
288 */ 287 */
289 void inode_init_once(struct inode *inode) 288 void inode_init_once(struct inode *inode)
290 { 289 {
291 memset(inode, 0, sizeof(*inode)); 290 memset(inode, 0, sizeof(*inode));
292 INIT_HLIST_NODE(&inode->i_hash); 291 INIT_HLIST_NODE(&inode->i_hash);
293 INIT_LIST_HEAD(&inode->i_dentry); 292 INIT_LIST_HEAD(&inode->i_dentry);
294 INIT_LIST_HEAD(&inode->i_devices); 293 INIT_LIST_HEAD(&inode->i_devices);
295 INIT_LIST_HEAD(&inode->i_wb_list); 294 INIT_LIST_HEAD(&inode->i_wb_list);
296 INIT_LIST_HEAD(&inode->i_lru); 295 INIT_LIST_HEAD(&inode->i_lru);
297 address_space_init_once(&inode->i_data); 296 address_space_init_once(&inode->i_data);
298 i_size_ordered_init(inode); 297 i_size_ordered_init(inode);
299 #ifdef CONFIG_FSNOTIFY 298 #ifdef CONFIG_FSNOTIFY
300 INIT_HLIST_HEAD(&inode->i_fsnotify_marks); 299 INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
301 #endif 300 #endif
302 } 301 }
303 EXPORT_SYMBOL(inode_init_once); 302 EXPORT_SYMBOL(inode_init_once);
304 303
305 static void init_once(void *foo) 304 static void init_once(void *foo)
306 { 305 {
307 struct inode *inode = (struct inode *) foo; 306 struct inode *inode = (struct inode *) foo;
308 307
309 inode_init_once(inode); 308 inode_init_once(inode);
310 } 309 }
311 310
312 /* 311 /*
313 * inode->i_lock must be held 312 * inode->i_lock must be held
314 */ 313 */
315 void __iget(struct inode *inode) 314 void __iget(struct inode *inode)
316 { 315 {
317 atomic_inc(&inode->i_count); 316 atomic_inc(&inode->i_count);
318 } 317 }
319 318
320 /* 319 /*
321 * get additional reference to inode; caller must already hold one. 320 * get additional reference to inode; caller must already hold one.
322 */ 321 */
323 void ihold(struct inode *inode) 322 void ihold(struct inode *inode)
324 { 323 {
325 WARN_ON(atomic_inc_return(&inode->i_count) < 2); 324 WARN_ON(atomic_inc_return(&inode->i_count) < 2);
326 } 325 }
327 EXPORT_SYMBOL(ihold); 326 EXPORT_SYMBOL(ihold);
328 327
329 static void inode_lru_list_add(struct inode *inode) 328 static void inode_lru_list_add(struct inode *inode)
330 { 329 {
331 spin_lock(&inode->i_sb->s_inode_lru_lock); 330 spin_lock(&inode->i_sb->s_inode_lru_lock);
332 if (list_empty(&inode->i_lru)) { 331 if (list_empty(&inode->i_lru)) {
333 list_add(&inode->i_lru, &inode->i_sb->s_inode_lru); 332 list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
334 inode->i_sb->s_nr_inodes_unused++; 333 inode->i_sb->s_nr_inodes_unused++;
335 this_cpu_inc(nr_unused); 334 this_cpu_inc(nr_unused);
336 } 335 }
337 spin_unlock(&inode->i_sb->s_inode_lru_lock); 336 spin_unlock(&inode->i_sb->s_inode_lru_lock);
338 } 337 }
339 338
340 static void inode_lru_list_del(struct inode *inode) 339 static void inode_lru_list_del(struct inode *inode)
341 { 340 {
342 spin_lock(&inode->i_sb->s_inode_lru_lock); 341 spin_lock(&inode->i_sb->s_inode_lru_lock);
343 if (!list_empty(&inode->i_lru)) { 342 if (!list_empty(&inode->i_lru)) {
344 list_del_init(&inode->i_lru); 343 list_del_init(&inode->i_lru);
345 inode->i_sb->s_nr_inodes_unused--; 344 inode->i_sb->s_nr_inodes_unused--;
346 this_cpu_dec(nr_unused); 345 this_cpu_dec(nr_unused);
347 } 346 }
348 spin_unlock(&inode->i_sb->s_inode_lru_lock); 347 spin_unlock(&inode->i_sb->s_inode_lru_lock);
349 } 348 }
350 349
351 /** 350 /**
352 * inode_sb_list_add - add inode to the superblock list of inodes 351 * inode_sb_list_add - add inode to the superblock list of inodes
353 * @inode: inode to add 352 * @inode: inode to add
354 */ 353 */
355 void inode_sb_list_add(struct inode *inode) 354 void inode_sb_list_add(struct inode *inode)
356 { 355 {
357 spin_lock(&inode_sb_list_lock); 356 spin_lock(&inode_sb_list_lock);
358 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); 357 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
359 spin_unlock(&inode_sb_list_lock); 358 spin_unlock(&inode_sb_list_lock);
360 } 359 }
361 EXPORT_SYMBOL_GPL(inode_sb_list_add); 360 EXPORT_SYMBOL_GPL(inode_sb_list_add);
362 361
363 static inline void inode_sb_list_del(struct inode *inode) 362 static inline void inode_sb_list_del(struct inode *inode)
364 { 363 {
365 spin_lock(&inode_sb_list_lock); 364 spin_lock(&inode_sb_list_lock);
366 list_del_init(&inode->i_sb_list); 365 list_del_init(&inode->i_sb_list);
367 spin_unlock(&inode_sb_list_lock); 366 spin_unlock(&inode_sb_list_lock);
368 } 367 }
369 368
370 static unsigned long hash(struct super_block *sb, unsigned long hashval) 369 static unsigned long hash(struct super_block *sb, unsigned long hashval)
371 { 370 {
372 unsigned long tmp; 371 unsigned long tmp;
373 372
374 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / 373 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
375 L1_CACHE_BYTES; 374 L1_CACHE_BYTES;
376 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); 375 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
377 return tmp & i_hash_mask; 376 return tmp & i_hash_mask;
378 } 377 }
379 378
380 /** 379 /**
381 * __insert_inode_hash - hash an inode 380 * __insert_inode_hash - hash an inode
382 * @inode: unhashed inode 381 * @inode: unhashed inode
383 * @hashval: unsigned long value used to locate this object in the 382 * @hashval: unsigned long value used to locate this object in the
384 * inode_hashtable. 383 * inode_hashtable.
385 * 384 *
386 * Add an inode to the inode hash for this superblock. 385 * Add an inode to the inode hash for this superblock.
387 */ 386 */
388 void __insert_inode_hash(struct inode *inode, unsigned long hashval) 387 void __insert_inode_hash(struct inode *inode, unsigned long hashval)
389 { 388 {
390 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 389 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
391 390
392 spin_lock(&inode_hash_lock); 391 spin_lock(&inode_hash_lock);
393 spin_lock(&inode->i_lock); 392 spin_lock(&inode->i_lock);
394 hlist_add_head(&inode->i_hash, b); 393 hlist_add_head(&inode->i_hash, b);
395 spin_unlock(&inode->i_lock); 394 spin_unlock(&inode->i_lock);
396 spin_unlock(&inode_hash_lock); 395 spin_unlock(&inode_hash_lock);
397 } 396 }
398 EXPORT_SYMBOL(__insert_inode_hash); 397 EXPORT_SYMBOL(__insert_inode_hash);
399 398
400 /** 399 /**
401 * remove_inode_hash - remove an inode from the hash 400 * remove_inode_hash - remove an inode from the hash
402 * @inode: inode to unhash 401 * @inode: inode to unhash
403 * 402 *
404 * Remove an inode from the inode hash for this superblock. 403 * Remove an inode from the inode hash for this superblock.
405 */ 404 */
406 void remove_inode_hash(struct inode *inode) 405 void remove_inode_hash(struct inode *inode)
407 { 406 {
408 spin_lock(&inode_hash_lock); 407 spin_lock(&inode_hash_lock);
409 spin_lock(&inode->i_lock); 408 spin_lock(&inode->i_lock);
410 hlist_del_init(&inode->i_hash); 409 hlist_del_init(&inode->i_hash);
411 spin_unlock(&inode->i_lock); 410 spin_unlock(&inode->i_lock);
412 spin_unlock(&inode_hash_lock); 411 spin_unlock(&inode_hash_lock);
413 } 412 }
414 EXPORT_SYMBOL(remove_inode_hash); 413 EXPORT_SYMBOL(remove_inode_hash);
415 414
416 void end_writeback(struct inode *inode) 415 void end_writeback(struct inode *inode)
417 { 416 {
418 might_sleep(); 417 might_sleep();
419 /* 418 /*
420 * We have to cycle tree_lock here because reclaim can still be in the 419 * We have to cycle tree_lock here because reclaim can still be in the
421 * process of removing the last page (in __delete_from_page_cache()) 420 * process of removing the last page (in __delete_from_page_cache())
422 * and we must not free mapping under it. 421 * and we must not free mapping under it.
423 */ 422 */
424 spin_lock_irq(&inode->i_data.tree_lock); 423 spin_lock_irq(&inode->i_data.tree_lock);
425 BUG_ON(inode->i_data.nrpages); 424 BUG_ON(inode->i_data.nrpages);
426 spin_unlock_irq(&inode->i_data.tree_lock); 425 spin_unlock_irq(&inode->i_data.tree_lock);
427 BUG_ON(!list_empty(&inode->i_data.private_list)); 426 BUG_ON(!list_empty(&inode->i_data.private_list));
428 BUG_ON(!(inode->i_state & I_FREEING)); 427 BUG_ON(!(inode->i_state & I_FREEING));
429 BUG_ON(inode->i_state & I_CLEAR); 428 BUG_ON(inode->i_state & I_CLEAR);
430 inode_sync_wait(inode); 429 inode_sync_wait(inode);
431 /* don't need i_lock here, no concurrent mods to i_state */ 430 /* don't need i_lock here, no concurrent mods to i_state */
432 inode->i_state = I_FREEING | I_CLEAR; 431 inode->i_state = I_FREEING | I_CLEAR;
433 } 432 }
434 EXPORT_SYMBOL(end_writeback); 433 EXPORT_SYMBOL(end_writeback);
435 434
436 /* 435 /*
437 * Free the inode passed in, removing it from the lists it is still connected 436 * Free the inode passed in, removing it from the lists it is still connected
438 * to. We remove any pages still attached to the inode and wait for any IO that 437 * to. We remove any pages still attached to the inode and wait for any IO that
439 * is still in progress before finally destroying the inode. 438 * is still in progress before finally destroying the inode.
440 * 439 *
441 * An inode must already be marked I_FREEING so that we avoid the inode being 440 * An inode must already be marked I_FREEING so that we avoid the inode being
442 * moved back onto lists if we race with other code that manipulates the lists 441 * moved back onto lists if we race with other code that manipulates the lists
443 * (e.g. writeback_single_inode). The caller is responsible for setting this. 442 * (e.g. writeback_single_inode). The caller is responsible for setting this.
444 * 443 *
445 * An inode must already be removed from the LRU list before being evicted from 444 * An inode must already be removed from the LRU list before being evicted from
446 * the cache. This should occur atomically with setting the I_FREEING state 445 * the cache. This should occur atomically with setting the I_FREEING state
447 * flag, so no inodes here should ever be on the LRU when being evicted. 446 * flag, so no inodes here should ever be on the LRU when being evicted.
448 */ 447 */
449 static void evict(struct inode *inode) 448 static void evict(struct inode *inode)
450 { 449 {
451 const struct super_operations *op = inode->i_sb->s_op; 450 const struct super_operations *op = inode->i_sb->s_op;
452 451
453 BUG_ON(!(inode->i_state & I_FREEING)); 452 BUG_ON(!(inode->i_state & I_FREEING));
454 BUG_ON(!list_empty(&inode->i_lru)); 453 BUG_ON(!list_empty(&inode->i_lru));
455 454
456 inode_wb_list_del(inode); 455 inode_wb_list_del(inode);
457 inode_sb_list_del(inode); 456 inode_sb_list_del(inode);
458 457
459 if (op->evict_inode) { 458 if (op->evict_inode) {
460 op->evict_inode(inode); 459 op->evict_inode(inode);
461 } else { 460 } else {
462 if (inode->i_data.nrpages) 461 if (inode->i_data.nrpages)
463 truncate_inode_pages(&inode->i_data, 0); 462 truncate_inode_pages(&inode->i_data, 0);
464 end_writeback(inode); 463 end_writeback(inode);
465 } 464 }
466 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 465 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
467 bd_forget(inode); 466 bd_forget(inode);
468 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 467 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
469 cd_forget(inode); 468 cd_forget(inode);
470 469
471 remove_inode_hash(inode); 470 remove_inode_hash(inode);
472 471
473 spin_lock(&inode->i_lock); 472 spin_lock(&inode->i_lock);
474 wake_up_bit(&inode->i_state, __I_NEW); 473 wake_up_bit(&inode->i_state, __I_NEW);
475 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 474 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
476 spin_unlock(&inode->i_lock); 475 spin_unlock(&inode->i_lock);
477 476
478 destroy_inode(inode); 477 destroy_inode(inode);
479 } 478 }
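
/*
 * Sketch: a minimal ->evict_inode() for a simple filesystem, mirroring the
 * default path taken above when the operation is not provided: drop the page
 * cache, then end writeback. The foofs_* name and the use of i_private for
 * per-inode state are hypothetical.
 */
static void foofs_evict_inode(struct inode *inode)
{
	truncate_inode_pages(&inode->i_data, 0);	/* drop cached pages */
	end_writeback(inode);		/* wait for IO, mark I_FREEING | I_CLEAR */
	kfree(inode->i_private);	/* hypothetical per-inode private data */
	inode->i_private = NULL;
}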
480 479
481 /* 480 /*
482 * dispose_list - dispose of the contents of a local list 481 * dispose_list - dispose of the contents of a local list
483 * @head: the head of the list to free 482 * @head: the head of the list to free
484 * 483 *
485 * Dispose-list gets a local list with local inodes in it, so it doesn't 484 * Dispose-list gets a local list with local inodes in it, so it doesn't
486 * need to worry about list corruption and SMP locks. 485 * need to worry about list corruption and SMP locks.
487 */ 486 */
488 static void dispose_list(struct list_head *head) 487 static void dispose_list(struct list_head *head)
489 { 488 {
490 while (!list_empty(head)) { 489 while (!list_empty(head)) {
491 struct inode *inode; 490 struct inode *inode;
492 491
493 inode = list_first_entry(head, struct inode, i_lru); 492 inode = list_first_entry(head, struct inode, i_lru);
494 list_del_init(&inode->i_lru); 493 list_del_init(&inode->i_lru);
495 494
496 evict(inode); 495 evict(inode);
497 } 496 }
498 } 497 }
499 498
500 /** 499 /**
501 * evict_inodes - evict all evictable inodes for a superblock 500 * evict_inodes - evict all evictable inodes for a superblock
502 * @sb: superblock to operate on 501 * @sb: superblock to operate on
503 * 502 *
504 * Make sure that no inodes with zero refcount are retained. This is 503 * Make sure that no inodes with zero refcount are retained. This is
505 * called by superblock shutdown after the MS_ACTIVE flag has been removed, 504 * called by superblock shutdown after the MS_ACTIVE flag has been removed,
506 * so any inode reaching zero refcount during or after that call will 505 * so any inode reaching zero refcount during or after that call will
507 * be immediately evicted. 506 * be immediately evicted.
508 */ 507 */
509 void evict_inodes(struct super_block *sb) 508 void evict_inodes(struct super_block *sb)
510 { 509 {
511 struct inode *inode, *next; 510 struct inode *inode, *next;
512 LIST_HEAD(dispose); 511 LIST_HEAD(dispose);
513 512
514 spin_lock(&inode_sb_list_lock); 513 spin_lock(&inode_sb_list_lock);
515 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 514 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
516 if (atomic_read(&inode->i_count)) 515 if (atomic_read(&inode->i_count))
517 continue; 516 continue;
518 517
519 spin_lock(&inode->i_lock); 518 spin_lock(&inode->i_lock);
520 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 519 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
521 spin_unlock(&inode->i_lock); 520 spin_unlock(&inode->i_lock);
522 continue; 521 continue;
523 } 522 }
524 523
525 inode->i_state |= I_FREEING; 524 inode->i_state |= I_FREEING;
526 inode_lru_list_del(inode); 525 inode_lru_list_del(inode);
527 spin_unlock(&inode->i_lock); 526 spin_unlock(&inode->i_lock);
528 list_add(&inode->i_lru, &dispose); 527 list_add(&inode->i_lru, &dispose);
529 } 528 }
530 spin_unlock(&inode_sb_list_lock); 529 spin_unlock(&inode_sb_list_lock);
531 530
532 dispose_list(&dispose); 531 dispose_list(&dispose);
533 } 532 }
534 533
535 /** 534 /**
536 * invalidate_inodes - attempt to free all inodes on a superblock 535 * invalidate_inodes - attempt to free all inodes on a superblock
537 * @sb: superblock to operate on 536 * @sb: superblock to operate on
538 * @kill_dirty: flag to guide handling of dirty inodes 537 * @kill_dirty: flag to guide handling of dirty inodes
539 * 538 *
540 * Attempts to free all inodes for a given superblock. If there were any 539 * Attempts to free all inodes for a given superblock. If there were any
541 * busy inodes, a non-zero value is returned, else zero. 540 * busy inodes, a non-zero value is returned, else zero.
542 * If @kill_dirty is set, discard dirty inodes too; otherwise treat 541 * If @kill_dirty is set, discard dirty inodes too; otherwise treat
543 * them as busy. 542 * them as busy.
544 */ 543 */
545 int invalidate_inodes(struct super_block *sb, bool kill_dirty) 544 int invalidate_inodes(struct super_block *sb, bool kill_dirty)
546 { 545 {
547 int busy = 0; 546 int busy = 0;
548 struct inode *inode, *next; 547 struct inode *inode, *next;
549 LIST_HEAD(dispose); 548 LIST_HEAD(dispose);
550 549
551 spin_lock(&inode_sb_list_lock); 550 spin_lock(&inode_sb_list_lock);
552 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 551 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
553 spin_lock(&inode->i_lock); 552 spin_lock(&inode->i_lock);
554 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 553 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
555 spin_unlock(&inode->i_lock); 554 spin_unlock(&inode->i_lock);
556 continue; 555 continue;
557 } 556 }
558 if (inode->i_state & I_DIRTY && !kill_dirty) { 557 if (inode->i_state & I_DIRTY && !kill_dirty) {
559 spin_unlock(&inode->i_lock); 558 spin_unlock(&inode->i_lock);
560 busy = 1; 559 busy = 1;
561 continue; 560 continue;
562 } 561 }
563 if (atomic_read(&inode->i_count)) { 562 if (atomic_read(&inode->i_count)) {
564 spin_unlock(&inode->i_lock); 563 spin_unlock(&inode->i_lock);
565 busy = 1; 564 busy = 1;
566 continue; 565 continue;
567 } 566 }
568 567
569 inode->i_state |= I_FREEING; 568 inode->i_state |= I_FREEING;
570 inode_lru_list_del(inode); 569 inode_lru_list_del(inode);
571 spin_unlock(&inode->i_lock); 570 spin_unlock(&inode->i_lock);
572 list_add(&inode->i_lru, &dispose); 571 list_add(&inode->i_lru, &dispose);
573 } 572 }
574 spin_unlock(&inode_sb_list_lock); 573 spin_unlock(&inode_sb_list_lock);
575 574
576 dispose_list(&dispose); 575 dispose_list(&dispose);
577 576
578 return busy; 577 return busy;
579 } 578 }
580 579
581 static int can_unuse(struct inode *inode) 580 static int can_unuse(struct inode *inode)
582 { 581 {
583 if (inode->i_state & ~I_REFERENCED) 582 if (inode->i_state & ~I_REFERENCED)
584 return 0; 583 return 0;
585 if (inode_has_buffers(inode)) 584 if (inode_has_buffers(inode))
586 return 0; 585 return 0;
587 if (atomic_read(&inode->i_count)) 586 if (atomic_read(&inode->i_count))
588 return 0; 587 return 0;
589 if (inode->i_data.nrpages) 588 if (inode->i_data.nrpages)
590 return 0; 589 return 0;
591 return 1; 590 return 1;
592 } 591 }
593 592
594 /* 593 /*
595 * Walk the superblock inode LRU for freeable inodes and attempt to free them. 594 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
596 * This is called from the superblock shrinker function with a number of inodes 595 * This is called from the superblock shrinker function with a number of inodes
597 * to trim from the LRU. Inodes to be freed are moved to a temporary list and 596 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
598 * then are freed outside inode_lock by dispose_list(). 597 * then are freed outside inode_lock by dispose_list().
599 * 598 *
600 * Any inodes which are pinned purely because of attached pagecache have their 599 * Any inodes which are pinned purely because of attached pagecache have their
601 * pagecache removed. If the inode has metadata buffers attached to 600 * pagecache removed. If the inode has metadata buffers attached to
602 * mapping->private_list then try to remove them. 601 * mapping->private_list then try to remove them.
603 * 602 *
604 * If the inode has the I_REFERENCED flag set, then it means that it has been 603 * If the inode has the I_REFERENCED flag set, then it means that it has been
605 * used recently - the flag is set in iput_final(). When we encounter such an 604 * used recently - the flag is set in iput_final(). When we encounter such an
606 * inode, clear the flag and move it to the back of the LRU so it gets another 605 * inode, clear the flag and move it to the back of the LRU so it gets another
607 * pass through the LRU before it gets reclaimed. This is necessary because 606 * pass through the LRU before it gets reclaimed. This is necessary because
608 * we are doing lazy LRU updates to minimise lock contention, so the 607 * we are doing lazy LRU updates to minimise lock contention, so the
609 * LRU does not have strict ordering. Hence we don't want to reclaim inodes 608 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
610 * with this flag set because they are the inodes that are out of order. 609 * with this flag set because they are the inodes that are out of order.
611 */ 610 */
612 void prune_icache_sb(struct super_block *sb, int nr_to_scan) 611 void prune_icache_sb(struct super_block *sb, int nr_to_scan)
613 { 612 {
614 LIST_HEAD(freeable); 613 LIST_HEAD(freeable);
615 int nr_scanned; 614 int nr_scanned;
616 unsigned long reap = 0; 615 unsigned long reap = 0;
617 616
618 spin_lock(&sb->s_inode_lru_lock); 617 spin_lock(&sb->s_inode_lru_lock);
619 for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { 618 for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
620 struct inode *inode; 619 struct inode *inode;
621 620
622 if (list_empty(&sb->s_inode_lru)) 621 if (list_empty(&sb->s_inode_lru))
623 break; 622 break;
624 623
625 inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); 624 inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
626 625
627 /* 626 /*
628 * we are inverting the sb->s_inode_lru_lock/inode->i_lock here, 627 * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
629 * so use a trylock. If we fail to get the lock, just move the 628 * so use a trylock. If we fail to get the lock, just move the
630 * inode to the back of the list so we don't spin on it. 629 * inode to the back of the list so we don't spin on it.
631 */ 630 */
632 if (!spin_trylock(&inode->i_lock)) { 631 if (!spin_trylock(&inode->i_lock)) {
633 list_move(&inode->i_lru, &sb->s_inode_lru); 632 list_move(&inode->i_lru, &sb->s_inode_lru);
634 continue; 633 continue;
635 } 634 }
636 635
637 /* 636 /*
638 * Referenced or dirty inodes are still in use. Give them 637 * Referenced or dirty inodes are still in use. Give them
639 * another pass through the LRU as we cannot reclaim them now. 638 * another pass through the LRU as we cannot reclaim them now.
640 */ 639 */
641 if (atomic_read(&inode->i_count) || 640 if (atomic_read(&inode->i_count) ||
642 (inode->i_state & ~I_REFERENCED)) { 641 (inode->i_state & ~I_REFERENCED)) {
643 list_del_init(&inode->i_lru); 642 list_del_init(&inode->i_lru);
644 spin_unlock(&inode->i_lock); 643 spin_unlock(&inode->i_lock);
645 sb->s_nr_inodes_unused--; 644 sb->s_nr_inodes_unused--;
646 this_cpu_dec(nr_unused); 645 this_cpu_dec(nr_unused);
647 continue; 646 continue;
648 } 647 }
649 648
650 /* recently referenced inodes get one more pass */ 649 /* recently referenced inodes get one more pass */
651 if (inode->i_state & I_REFERENCED) { 650 if (inode->i_state & I_REFERENCED) {
652 inode->i_state &= ~I_REFERENCED; 651 inode->i_state &= ~I_REFERENCED;
653 list_move(&inode->i_lru, &sb->s_inode_lru); 652 list_move(&inode->i_lru, &sb->s_inode_lru);
654 spin_unlock(&inode->i_lock); 653 spin_unlock(&inode->i_lock);
655 continue; 654 continue;
656 } 655 }
657 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 656 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
658 __iget(inode); 657 __iget(inode);
659 spin_unlock(&inode->i_lock); 658 spin_unlock(&inode->i_lock);
660 spin_unlock(&sb->s_inode_lru_lock); 659 spin_unlock(&sb->s_inode_lru_lock);
661 if (remove_inode_buffers(inode)) 660 if (remove_inode_buffers(inode))
662 reap += invalidate_mapping_pages(&inode->i_data, 661 reap += invalidate_mapping_pages(&inode->i_data,
663 0, -1); 662 0, -1);
664 iput(inode); 663 iput(inode);
665 spin_lock(&sb->s_inode_lru_lock); 664 spin_lock(&sb->s_inode_lru_lock);
666 665
667 if (inode != list_entry(sb->s_inode_lru.next, 666 if (inode != list_entry(sb->s_inode_lru.next,
668 struct inode, i_lru)) 667 struct inode, i_lru))
669 continue; /* wrong inode or list_empty */ 668 continue; /* wrong inode or list_empty */
670 /* avoid lock inversions with trylock */ 669 /* avoid lock inversions with trylock */
671 if (!spin_trylock(&inode->i_lock)) 670 if (!spin_trylock(&inode->i_lock))
672 continue; 671 continue;
673 if (!can_unuse(inode)) { 672 if (!can_unuse(inode)) {
674 spin_unlock(&inode->i_lock); 673 spin_unlock(&inode->i_lock);
675 continue; 674 continue;
676 } 675 }
677 } 676 }
678 WARN_ON(inode->i_state & I_NEW); 677 WARN_ON(inode->i_state & I_NEW);
679 inode->i_state |= I_FREEING; 678 inode->i_state |= I_FREEING;
680 spin_unlock(&inode->i_lock); 679 spin_unlock(&inode->i_lock);
681 680
682 list_move(&inode->i_lru, &freeable); 681 list_move(&inode->i_lru, &freeable);
683 sb->s_nr_inodes_unused--; 682 sb->s_nr_inodes_unused--;
684 this_cpu_dec(nr_unused); 683 this_cpu_dec(nr_unused);
685 } 684 }
686 if (current_is_kswapd()) 685 if (current_is_kswapd())
687 __count_vm_events(KSWAPD_INODESTEAL, reap); 686 __count_vm_events(KSWAPD_INODESTEAL, reap);
688 else 687 else
689 __count_vm_events(PGINODESTEAL, reap); 688 __count_vm_events(PGINODESTEAL, reap);
690 spin_unlock(&sb->s_inode_lru_lock); 689 spin_unlock(&sb->s_inode_lru_lock);
691 690
692 dispose_list(&freeable); 691 dispose_list(&freeable);
693 } 692 }
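
/*
 * Sketch (standalone C, illustration only): the second-chance policy
 * described above, reduced to a single array walk. A recently referenced
 * entry loses its flag and survives this pass; an unreferenced, unpinned
 * entry is reclaimed. The names here are illustrative, not kernel API.
 */
#include <stdbool.h>
#include <stddef.h>

struct lru_entry {
	bool referenced;	/* touched since the last scan */
	bool pinned;		/* still has users or cached pages */
	bool present;		/* not yet reclaimed */
};

static size_t prune(struct lru_entry *lru, size_t nr, size_t nr_to_scan)
{
	size_t i, reclaimed = 0;

	for (i = 0; i < nr && i < nr_to_scan; i++) {
		struct lru_entry *e = &lru[i];

		if (!e->present || e->pinned)
			continue;
		if (e->referenced) {		/* give it a second chance */
			e->referenced = false;
			continue;
		}
		e->present = false;		/* reclaim */
		reclaimed++;
	}
	return reclaimed;
}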
694 693
695 static void __wait_on_freeing_inode(struct inode *inode); 694 static void __wait_on_freeing_inode(struct inode *inode);
696 /* 695 /*
697 * Called with the inode lock held. 696 * Called with the inode lock held.
698 */ 697 */
699 static struct inode *find_inode(struct super_block *sb, 698 static struct inode *find_inode(struct super_block *sb,
700 struct hlist_head *head, 699 struct hlist_head *head,
701 int (*test)(struct inode *, void *), 700 int (*test)(struct inode *, void *),
702 void *data) 701 void *data)
703 { 702 {
704 struct hlist_node *node; 703 struct hlist_node *node;
705 struct inode *inode = NULL; 704 struct inode *inode = NULL;
706 705
707 repeat: 706 repeat:
708 hlist_for_each_entry(inode, node, head, i_hash) { 707 hlist_for_each_entry(inode, node, head, i_hash) {
709 spin_lock(&inode->i_lock); 708 spin_lock(&inode->i_lock);
710 if (inode->i_sb != sb) { 709 if (inode->i_sb != sb) {
711 spin_unlock(&inode->i_lock); 710 spin_unlock(&inode->i_lock);
712 continue; 711 continue;
713 } 712 }
714 if (!test(inode, data)) { 713 if (!test(inode, data)) {
715 spin_unlock(&inode->i_lock); 714 spin_unlock(&inode->i_lock);
716 continue; 715 continue;
717 } 716 }
718 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 717 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
719 __wait_on_freeing_inode(inode); 718 __wait_on_freeing_inode(inode);
720 goto repeat; 719 goto repeat;
721 } 720 }
722 __iget(inode); 721 __iget(inode);
723 spin_unlock(&inode->i_lock); 722 spin_unlock(&inode->i_lock);
724 return inode; 723 return inode;
725 } 724 }
726 return NULL; 725 return NULL;
727 } 726 }
728 727
729 /* 728 /*
730 * find_inode_fast is the fast path version of find_inode, see the comment at 729 * find_inode_fast is the fast path version of find_inode, see the comment at
731 * iget_locked for details. 730 * iget_locked for details.
732 */ 731 */
733 static struct inode *find_inode_fast(struct super_block *sb, 732 static struct inode *find_inode_fast(struct super_block *sb,
734 struct hlist_head *head, unsigned long ino) 733 struct hlist_head *head, unsigned long ino)
735 { 734 {
736 struct hlist_node *node; 735 struct hlist_node *node;
737 struct inode *inode = NULL; 736 struct inode *inode = NULL;
738 737
739 repeat: 738 repeat:
740 hlist_for_each_entry(inode, node, head, i_hash) { 739 hlist_for_each_entry(inode, node, head, i_hash) {
741 spin_lock(&inode->i_lock); 740 spin_lock(&inode->i_lock);
742 if (inode->i_ino != ino) { 741 if (inode->i_ino != ino) {
743 spin_unlock(&inode->i_lock); 742 spin_unlock(&inode->i_lock);
744 continue; 743 continue;
745 } 744 }
746 if (inode->i_sb != sb) { 745 if (inode->i_sb != sb) {
747 spin_unlock(&inode->i_lock); 746 spin_unlock(&inode->i_lock);
748 continue; 747 continue;
749 } 748 }
750 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 749 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
751 __wait_on_freeing_inode(inode); 750 __wait_on_freeing_inode(inode);
752 goto repeat; 751 goto repeat;
753 } 752 }
754 __iget(inode); 753 __iget(inode);
755 spin_unlock(&inode->i_lock); 754 spin_unlock(&inode->i_lock);
756 return inode; 755 return inode;
757 } 756 }
758 return NULL; 757 return NULL;
759 } 758 }
760 759
761 /* 760 /*
762 * Each cpu owns a range of LAST_INO_BATCH numbers. 761 * Each cpu owns a range of LAST_INO_BATCH numbers.
763 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, 762 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
764 * to renew the exhausted range. 763 * to renew the exhausted range.
765 * 764 *
766 * This does not significantly increase overflow rate because every CPU can 765 * This does not significantly increase overflow rate because every CPU can
767 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is 766 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
768 * NR_CPUS*(LAST_INO_BATCH-1) wastage. With NR_CPUS=4096 and LAST_INO_BATCH=1024, this is ~0.1% of the 767 * NR_CPUS*(LAST_INO_BATCH-1) wastage. With NR_CPUS=4096 and LAST_INO_BATCH=1024, this is ~0.1% of the
769 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase 768 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
770 * overflow rate by 2x, which does not seem too significant. 769 * overflow rate by 2x, which does not seem too significant.
771 * 770 *
772 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW 771 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
773 * error if st_ino won't fit in target struct field. Use 32bit counter 772 * error if st_ino won't fit in target struct field. Use 32bit counter
774 * here to attempt to avoid that. 773 * here to attempt to avoid that.
775 */ 774 */
776 #define LAST_INO_BATCH 1024 775 #define LAST_INO_BATCH 1024
777 static DEFINE_PER_CPU(unsigned int, last_ino); 776 static DEFINE_PER_CPU(unsigned int, last_ino);
778 777
779 unsigned int get_next_ino(void) 778 unsigned int get_next_ino(void)
780 { 779 {
781 unsigned int *p = &get_cpu_var(last_ino); 780 unsigned int *p = &get_cpu_var(last_ino);
782 unsigned int res = *p; 781 unsigned int res = *p;
783 782
784 #ifdef CONFIG_SMP 783 #ifdef CONFIG_SMP
785 if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { 784 if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
786 static atomic_t shared_last_ino; 785 static atomic_t shared_last_ino;
787 int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); 786 int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
788 787
789 res = next - LAST_INO_BATCH; 788 res = next - LAST_INO_BATCH;
790 } 789 }
791 #endif 790 #endif
792 791
793 *p = ++res; 792 *p = ++res;
794 put_cpu_var(last_ino); 793 put_cpu_var(last_ino);
795 return res; 794 return res;
796 } 795 }
797 EXPORT_SYMBOL(get_next_ino); 796 EXPORT_SYMBOL(get_next_ino);
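
/*
 * Sketch: how an in-memory filesystem with no stable on-disk inode numbers
 * might consume the batched counter above when creating an inode.
 * foofs_get_inode() is hypothetical; new_inode() and get_next_ino() are the
 * real interfaces, and CURRENT_TIME is the usual in-core timestamp macro.
 */
static struct inode *foofs_get_inode(struct super_block *sb, int mode)
{
	struct inode *inode = new_inode(sb);

	if (inode) {
		inode->i_ino = get_next_ino();
		inode->i_mode = mode;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	}
	return inode;
}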
798 797
799 /** 798 /**
800 * new_inode - obtain an inode 799 * new_inode - obtain an inode
801 * @sb: superblock 800 * @sb: superblock
802 * 801 *
803 * Allocates a new inode for given superblock. The default gfp_mask 802 * Allocates a new inode for given superblock. The default gfp_mask
804 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. 803 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
805 * If HIGHMEM pages are unsuitable or it is known that pages allocated 804 * If HIGHMEM pages are unsuitable or it is known that pages allocated
806 * for the page cache are not reclaimable or migratable, 805 * for the page cache are not reclaimable or migratable,
807 * mapping_set_gfp_mask() must be called with suitable flags on the 806 * mapping_set_gfp_mask() must be called with suitable flags on the
808 * newly created inode's mapping. 807 * newly created inode's mapping.
809 * 808 *
810 */ 809 */
811 struct inode *new_inode(struct super_block *sb) 810 struct inode *new_inode(struct super_block *sb)
812 { 811 {
813 struct inode *inode; 812 struct inode *inode;
814 813
815 spin_lock_prefetch(&inode_sb_list_lock); 814 spin_lock_prefetch(&inode_sb_list_lock);
816 815
817 inode = alloc_inode(sb); 816 inode = alloc_inode(sb);
818 if (inode) { 817 if (inode) {
819 spin_lock(&inode->i_lock); 818 spin_lock(&inode->i_lock);
820 inode->i_state = 0; 819 inode->i_state = 0;
821 spin_unlock(&inode->i_lock); 820 spin_unlock(&inode->i_lock);
822 inode_sb_list_add(inode); 821 inode_sb_list_add(inode);
823 } 822 }
824 return inode; 823 return inode;
825 } 824 }
826 EXPORT_SYMBOL(new_inode); 825 EXPORT_SYMBOL(new_inode);
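
/*
 * Sketch: as the kernel-doc above suggests, a filesystem whose page cache
 * pages must not come from highmem can narrow the mapping's gfp mask right
 * after allocation. foofs_new_inode() is hypothetical; GFP_USER simply lacks
 * __GFP_HIGHMEM and __GFP_MOVABLE.
 */
static struct inode *foofs_new_inode(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (inode)
		mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
	return inode;
}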
827 826
828 /** 827 /**
829 * unlock_new_inode - clear the I_NEW state and wake up any waiters 828 * unlock_new_inode - clear the I_NEW state and wake up any waiters
830 * @inode: new inode to unlock 829 * @inode: new inode to unlock
831 * 830 *
832 * Called when the inode is fully initialised to clear the new state of the 831 * Called when the inode is fully initialised to clear the new state of the
833 * inode and wake up anyone waiting for the inode to finish initialisation. 832 * inode and wake up anyone waiting for the inode to finish initialisation.
834 */ 833 */
835 void unlock_new_inode(struct inode *inode) 834 void unlock_new_inode(struct inode *inode)
836 { 835 {
837 #ifdef CONFIG_DEBUG_LOCK_ALLOC 836 #ifdef CONFIG_DEBUG_LOCK_ALLOC
838 if (S_ISDIR(inode->i_mode)) { 837 if (S_ISDIR(inode->i_mode)) {
839 struct file_system_type *type = inode->i_sb->s_type; 838 struct file_system_type *type = inode->i_sb->s_type;
840 839
841 /* Set new key only if filesystem hasn't already changed it */ 840 /* Set new key only if filesystem hasn't already changed it */
842 if (!lockdep_match_class(&inode->i_mutex, 841 if (!lockdep_match_class(&inode->i_mutex,
843 &type->i_mutex_key)) { 842 &type->i_mutex_key)) {
844 /* 843 /*
845 * ensure nobody is actually holding i_mutex 844 * ensure nobody is actually holding i_mutex
846 */ 845 */
847 mutex_destroy(&inode->i_mutex); 846 mutex_destroy(&inode->i_mutex);
848 mutex_init(&inode->i_mutex); 847 mutex_init(&inode->i_mutex);
849 lockdep_set_class(&inode->i_mutex, 848 lockdep_set_class(&inode->i_mutex,
850 &type->i_mutex_dir_key); 849 &type->i_mutex_dir_key);
851 } 850 }
852 } 851 }
853 #endif 852 #endif
854 spin_lock(&inode->i_lock); 853 spin_lock(&inode->i_lock);
855 WARN_ON(!(inode->i_state & I_NEW)); 854 WARN_ON(!(inode->i_state & I_NEW));
856 inode->i_state &= ~I_NEW; 855 inode->i_state &= ~I_NEW;
857 wake_up_bit(&inode->i_state, __I_NEW); 856 wake_up_bit(&inode->i_state, __I_NEW);
858 spin_unlock(&inode->i_lock); 857 spin_unlock(&inode->i_lock);
859 } 858 }
860 EXPORT_SYMBOL(unlock_new_inode); 859 EXPORT_SYMBOL(unlock_new_inode);
861 860
862 /** 861 /**
863 * iget5_locked - obtain an inode from a mounted file system 862 * iget5_locked - obtain an inode from a mounted file system
864 * @sb: super block of file system 863 * @sb: super block of file system
865 * @hashval: hash value (usually inode number) to get 864 * @hashval: hash value (usually inode number) to get
866 * @test: callback used for comparisons between inodes 865 * @test: callback used for comparisons between inodes
867 * @set: callback used to initialize a new struct inode 866 * @set: callback used to initialize a new struct inode
868 * @data: opaque data pointer to pass to @test and @set 867 * @data: opaque data pointer to pass to @test and @set
869 * 868 *
870 * Search for the inode specified by @hashval and @data in the inode cache, 869 * Search for the inode specified by @hashval and @data in the inode cache,
871 * and if present it is returned with an increased reference count. This is 870 * and if present it is returned with an increased reference count. This is
872 * a generalized version of iget_locked() for file systems where the inode 871 * a generalized version of iget_locked() for file systems where the inode
873 * number is not sufficient for unique identification of an inode. 872 * number is not sufficient for unique identification of an inode.
874 * 873 *
875 * If the inode is not in cache, allocate a new inode and return it locked, 874 * If the inode is not in cache, allocate a new inode and return it locked,
876 * hashed, and with the I_NEW flag set. The file system gets to fill it in 875 * hashed, and with the I_NEW flag set. The file system gets to fill it in
877 * before unlocking it via unlock_new_inode(). 876 * before unlocking it via unlock_new_inode().
878 * 877 *
879 * Note both @test and @set are called with the inode_hash_lock held, so can't 878 * Note both @test and @set are called with the inode_hash_lock held, so can't
880 * sleep. 879 * sleep.
881 */ 880 */
882 struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, 881 struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
883 int (*test)(struct inode *, void *), 882 int (*test)(struct inode *, void *),
884 int (*set)(struct inode *, void *), void *data) 883 int (*set)(struct inode *, void *), void *data)
885 { 884 {
886 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 885 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
887 struct inode *inode; 886 struct inode *inode;
888 887
889 spin_lock(&inode_hash_lock); 888 spin_lock(&inode_hash_lock);
890 inode = find_inode(sb, head, test, data); 889 inode = find_inode(sb, head, test, data);
891 spin_unlock(&inode_hash_lock); 890 spin_unlock(&inode_hash_lock);
892 891
893 if (inode) { 892 if (inode) {
894 wait_on_inode(inode); 893 wait_on_inode(inode);
895 return inode; 894 return inode;
896 } 895 }
897 896
898 inode = alloc_inode(sb); 897 inode = alloc_inode(sb);
899 if (inode) { 898 if (inode) {
900 struct inode *old; 899 struct inode *old;
901 900
902 spin_lock(&inode_hash_lock); 901 spin_lock(&inode_hash_lock);
903 /* We released the lock, so.. */ 902 /* We released the lock, so.. */
904 old = find_inode(sb, head, test, data); 903 old = find_inode(sb, head, test, data);
905 if (!old) { 904 if (!old) {
906 if (set(inode, data)) 905 if (set(inode, data))
907 goto set_failed; 906 goto set_failed;
908 907
909 spin_lock(&inode->i_lock); 908 spin_lock(&inode->i_lock);
910 inode->i_state = I_NEW; 909 inode->i_state = I_NEW;
911 hlist_add_head(&inode->i_hash, head); 910 hlist_add_head(&inode->i_hash, head);
912 spin_unlock(&inode->i_lock); 911 spin_unlock(&inode->i_lock);
913 inode_sb_list_add(inode); 912 inode_sb_list_add(inode);
914 spin_unlock(&inode_hash_lock); 913 spin_unlock(&inode_hash_lock);
915 914
916 /* Return the locked inode with I_NEW set; the 915 /* Return the locked inode with I_NEW set; the
917 * caller is responsible for filling in the contents 916 * caller is responsible for filling in the contents
918 */ 917 */
919 return inode; 918 return inode;
920 } 919 }
921 920
922 /* 921 /*
923 * Uhhuh, somebody else created the same inode under 922 * Uhhuh, somebody else created the same inode under
924 * us. Use the old inode instead of the one we just 923 * us. Use the old inode instead of the one we just
925 * allocated. 924 * allocated.
926 */ 925 */
927 spin_unlock(&inode_hash_lock); 926 spin_unlock(&inode_hash_lock);
928 destroy_inode(inode); 927 destroy_inode(inode);
929 inode = old; 928 inode = old;
930 wait_on_inode(inode); 929 wait_on_inode(inode);
931 } 930 }
932 return inode; 931 return inode;
933 932
934 set_failed: 933 set_failed:
935 spin_unlock(&inode_hash_lock); 934 spin_unlock(&inode_hash_lock);
936 destroy_inode(inode); 935 destroy_inode(inode);
937 return NULL; 936 return NULL;
938 } 937 }
939 EXPORT_SYMBOL(iget5_locked); 938 EXPORT_SYMBOL(iget5_locked);
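
/*
 * Sketch: a lookup keyed by something wider than i_ino, in the style the
 * kernel-doc above describes. All foofs_* names, FOOFS_I() and the on-disk
 * read step are hypothetical; iget5_locked() and unlock_new_inode() are the
 * interfaces defined here. The test/set callbacks run under inode_hash_lock
 * and therefore must not sleep.
 */
struct foofs_inode_info {
	u64 objid;
	struct inode vfs_inode;
};

#define FOOFS_I(inode) container_of(inode, struct foofs_inode_info, vfs_inode)

static int foofs_test(struct inode *inode, void *data)
{
	return FOOFS_I(inode)->objid == *(u64 *)data;
}

static int foofs_set(struct inode *inode, void *data)
{
	FOOFS_I(inode)->objid = *(u64 *)data;
	return 0;
}

static struct inode *foofs_iget(struct super_block *sb, u64 objid)
{
	struct inode *inode;

	inode = iget5_locked(sb, (unsigned long)objid, foofs_test, foofs_set,
			     &objid);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW) {
		/* ... read the object from disk and fill the inode here ... */
		unlock_new_inode(inode);
	}
	return inode;
}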
940 939
941 /** 940 /**
942 * iget_locked - obtain an inode from a mounted file system 941 * iget_locked - obtain an inode from a mounted file system
943 * @sb: super block of file system 942 * @sb: super block of file system
944 * @ino: inode number to get 943 * @ino: inode number to get
945 * 944 *
946 * Search for the inode specified by @ino in the inode cache and if present 945 * Search for the inode specified by @ino in the inode cache and if present
947 * return it with an increased reference count. This is for file systems 946 * return it with an increased reference count. This is for file systems
948 * where the inode number is sufficient for unique identification of an inode. 947 * where the inode number is sufficient for unique identification of an inode.
949 * 948 *
950 * If the inode is not in cache, allocate a new inode and return it locked, 949 * If the inode is not in cache, allocate a new inode and return it locked,
951 * hashed, and with the I_NEW flag set. The file system gets to fill it in 950 * hashed, and with the I_NEW flag set. The file system gets to fill it in
952 * before unlocking it via unlock_new_inode(). 951 * before unlocking it via unlock_new_inode().
953 */ 952 */
954 struct inode *iget_locked(struct super_block *sb, unsigned long ino) 953 struct inode *iget_locked(struct super_block *sb, unsigned long ino)
955 { 954 {
956 struct hlist_head *head = inode_hashtable + hash(sb, ino); 955 struct hlist_head *head = inode_hashtable + hash(sb, ino);
957 struct inode *inode; 956 struct inode *inode;
958 957
959 spin_lock(&inode_hash_lock); 958 spin_lock(&inode_hash_lock);
960 inode = find_inode_fast(sb, head, ino); 959 inode = find_inode_fast(sb, head, ino);
961 spin_unlock(&inode_hash_lock); 960 spin_unlock(&inode_hash_lock);
962 if (inode) { 961 if (inode) {
963 wait_on_inode(inode); 962 wait_on_inode(inode);
964 return inode; 963 return inode;
965 } 964 }
966 965
967 inode = alloc_inode(sb); 966 inode = alloc_inode(sb);
968 if (inode) { 967 if (inode) {
969 struct inode *old; 968 struct inode *old;
970 969
971 spin_lock(&inode_hash_lock); 970 spin_lock(&inode_hash_lock);
972 /* We released the lock, so.. */ 971 /* We released the lock, so.. */
973 old = find_inode_fast(sb, head, ino); 972 old = find_inode_fast(sb, head, ino);
974 if (!old) { 973 if (!old) {
975 inode->i_ino = ino; 974 inode->i_ino = ino;
976 spin_lock(&inode->i_lock); 975 spin_lock(&inode->i_lock);
977 inode->i_state = I_NEW; 976 inode->i_state = I_NEW;
978 hlist_add_head(&inode->i_hash, head); 977 hlist_add_head(&inode->i_hash, head);
979 spin_unlock(&inode->i_lock); 978 spin_unlock(&inode->i_lock);
980 inode_sb_list_add(inode); 979 inode_sb_list_add(inode);
981 spin_unlock(&inode_hash_lock); 980 spin_unlock(&inode_hash_lock);
982 981
983 /* Return the locked inode with I_NEW set; the 982 /* Return the locked inode with I_NEW set; the
984 * caller is responsible for filling in the contents 983 * caller is responsible for filling in the contents
985 */ 984 */
986 return inode; 985 return inode;
987 } 986 }
988 987
989 /* 988 /*
990 * Uhhuh, somebody else created the same inode under 989 * Uhhuh, somebody else created the same inode under
991 * us. Use the old inode instead of the one we just 990 * us. Use the old inode instead of the one we just
992 * allocated. 991 * allocated.
993 */ 992 */
994 spin_unlock(&inode_hash_lock); 993 spin_unlock(&inode_hash_lock);
995 destroy_inode(inode); 994 destroy_inode(inode);
996 inode = old; 995 inode = old;
997 wait_on_inode(inode); 996 wait_on_inode(inode);
998 } 997 }
999 return inode; 998 return inode;
1000 } 999 }
1001 EXPORT_SYMBOL(iget_locked); 1000 EXPORT_SYMBOL(iget_locked);
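
/*
 * Sketch: the usual lookup-or-create pattern for a filesystem where the
 * inode number alone is the key. barfs_read_inode() is a hypothetical helper
 * that fills in a freshly allocated inode from disk.
 */
static struct inode *barfs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;			/* cache hit, already set up */

	barfs_read_inode(inode);		/* hypothetical: fill from disk */
	unlock_new_inode(inode);		/* clear I_NEW and wake waiters */
	return inode;
}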
1002 1001
1003 /* 1002 /*
1004 * search the inode cache for a matching inode number. 1003 * search the inode cache for a matching inode number.
1005 * If we find one, then the inode number we are trying to 1004 * If we find one, then the inode number we are trying to
1006 * allocate is not unique and so we should not use it. 1005 * allocate is not unique and so we should not use it.
1007 * 1006 *
1008 * Returns 1 if the inode number is unique, 0 if it is not. 1007 * Returns 1 if the inode number is unique, 0 if it is not.
1009 */ 1008 */
1010 static int test_inode_iunique(struct super_block *sb, unsigned long ino) 1009 static int test_inode_iunique(struct super_block *sb, unsigned long ino)
1011 { 1010 {
1012 struct hlist_head *b = inode_hashtable + hash(sb, ino); 1011 struct hlist_head *b = inode_hashtable + hash(sb, ino);
1013 struct hlist_node *node; 1012 struct hlist_node *node;
1014 struct inode *inode; 1013 struct inode *inode;
1015 1014
1016 spin_lock(&inode_hash_lock); 1015 spin_lock(&inode_hash_lock);
1017 hlist_for_each_entry(inode, node, b, i_hash) { 1016 hlist_for_each_entry(inode, node, b, i_hash) {
1018 if (inode->i_ino == ino && inode->i_sb == sb) { 1017 if (inode->i_ino == ino && inode->i_sb == sb) {
1019 spin_unlock(&inode_hash_lock); 1018 spin_unlock(&inode_hash_lock);
1020 return 0; 1019 return 0;
1021 } 1020 }
1022 } 1021 }
1023 spin_unlock(&inode_hash_lock); 1022 spin_unlock(&inode_hash_lock);
1024 1023
1025 return 1; 1024 return 1;
1026 } 1025 }
1027 1026
1028 /** 1027 /**
1029 * iunique - get a unique inode number 1028 * iunique - get a unique inode number
1030 * @sb: superblock 1029 * @sb: superblock
1031 * @max_reserved: highest reserved inode number 1030 * @max_reserved: highest reserved inode number
1032 * 1031 *
1033 * Obtain an inode number that is unique on the system for a given 1032 * Obtain an inode number that is unique on the system for a given
1034 * superblock. This is used by file systems that have no natural 1033 * superblock. This is used by file systems that have no natural
1035 * permanent inode numbering system. An inode number is returned that 1034 * permanent inode numbering system. An inode number is returned that
1036 * is higher than the reserved limit but unique. 1035 * is higher than the reserved limit but unique.
1037 * 1036 *
1038 * BUGS: 1037 * BUGS:
1039 * With a large number of inodes live on the file system this function 1038 * With a large number of inodes live on the file system this function
1040 * currently becomes quite slow. 1039 * currently becomes quite slow.
1041 */ 1040 */
1042 ino_t iunique(struct super_block *sb, ino_t max_reserved) 1041 ino_t iunique(struct super_block *sb, ino_t max_reserved)
1043 { 1042 {
1044 /* 1043 /*
1045 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW 1044 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
1046 * error if st_ino won't fit in target struct field. Use 32bit counter 1045 * error if st_ino won't fit in target struct field. Use 32bit counter
1047 * here to attempt to avoid that. 1046 * here to attempt to avoid that.
1048 */ 1047 */
1049 static DEFINE_SPINLOCK(iunique_lock); 1048 static DEFINE_SPINLOCK(iunique_lock);
1050 static unsigned int counter; 1049 static unsigned int counter;
1051 ino_t res; 1050 ino_t res;
1052 1051
1053 spin_lock(&iunique_lock); 1052 spin_lock(&iunique_lock);
1054 do { 1053 do {
1055 if (counter <= max_reserved) 1054 if (counter <= max_reserved)
1056 counter = max_reserved + 1; 1055 counter = max_reserved + 1;
1057 res = counter++; 1056 res = counter++;
1058 } while (!test_inode_iunique(sb, res)); 1057 } while (!test_inode_iunique(sb, res));
1059 spin_unlock(&iunique_lock); 1058 spin_unlock(&iunique_lock);
1060 1059
1061 return res; 1060 return res;
1062 } 1061 }
1063 EXPORT_SYMBOL(iunique); 1062 EXPORT_SYMBOL(iunique);
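
/*
 * Sketch: a filesystem with no permanent numbering scheme reserving its first
 * few inode numbers and letting iunique() hand out the rest.
 * FOOFS_LAST_RESERVED_INO and foofs_assign_ino() are hypothetical.
 */
#define FOOFS_LAST_RESERVED_INO 15

static void foofs_assign_ino(struct inode *inode)
{
	inode->i_ino = iunique(inode->i_sb, FOOFS_LAST_RESERVED_INO);
}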
1064 1063
1065 struct inode *igrab(struct inode *inode) 1064 struct inode *igrab(struct inode *inode)
1066 { 1065 {
1067 spin_lock(&inode->i_lock); 1066 spin_lock(&inode->i_lock);
1068 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { 1067 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1069 __iget(inode); 1068 __iget(inode);
1070 spin_unlock(&inode->i_lock); 1069 spin_unlock(&inode->i_lock);
1071 } else { 1070 } else {
1072 spin_unlock(&inode->i_lock); 1071 spin_unlock(&inode->i_lock);
1073 /* 1072 /*
1074 * Handle the case where s_op->clear_inode has not been 1073 * Handle the case where s_op->clear_inode has not been
1075 * called yet, and somebody is calling igrab 1074 * called yet, and somebody is calling igrab
1076 * while the inode is getting freed. 1075 * while the inode is getting freed.
1077 */ 1076 */
1078 inode = NULL; 1077 inode = NULL;
1079 } 1078 }
1080 return inode; 1079 return inode;
1081 } 1080 }
1082 EXPORT_SYMBOL(igrab); 1081 EXPORT_SYMBOL(igrab);
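
/*
 * Sketch: taking a reference via a pointer that does not itself pin the
 * inode (say, an internal tracking list). If the inode is already being
 * freed, igrab() returns NULL and the caller must back off.
 * foofs_poke_inode() is hypothetical.
 */
static int foofs_poke_inode(struct inode *inode)
{
	inode = igrab(inode);
	if (!inode)
		return -ESTALE;		/* inode is on its way out */
	/* ... the inode is now safe to use ... */
	iput(inode);
	return 0;
}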
1083 1082
1084 /** 1083 /**
1085 * ilookup5_nowait - search for an inode in the inode cache 1084 * ilookup5_nowait - search for an inode in the inode cache
1086 * @sb: super block of file system to search 1085 * @sb: super block of file system to search
1087 * @hashval: hash value (usually inode number) to search for 1086 * @hashval: hash value (usually inode number) to search for
1088 * @test: callback used for comparisons between inodes 1087 * @test: callback used for comparisons between inodes
1089 * @data: opaque data pointer to pass to @test 1088 * @data: opaque data pointer to pass to @test
1090 * 1089 *
1091 * Search for the inode specified by @hashval and @data in the inode cache. 1090 * Search for the inode specified by @hashval and @data in the inode cache.
1092 * If the inode is in the cache, the inode is returned with an incremented 1091 * If the inode is in the cache, the inode is returned with an incremented
1093 * reference count. 1092 * reference count.
1094 * 1093 *
1095 * Note: I_NEW is not waited upon so you have to be very careful what you do 1094 * Note: I_NEW is not waited upon so you have to be very careful what you do
1096 * with the returned inode. You probably should be using ilookup5() instead. 1095 * with the returned inode. You probably should be using ilookup5() instead.
1097 * 1096 *
1098 * Note2: @test is called with the inode_hash_lock held, so can't sleep. 1097 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
1099 */ 1098 */
1100 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, 1099 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
1101 int (*test)(struct inode *, void *), void *data) 1100 int (*test)(struct inode *, void *), void *data)
1102 { 1101 {
1103 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1102 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1104 struct inode *inode; 1103 struct inode *inode;
1105 1104
1106 spin_lock(&inode_hash_lock); 1105 spin_lock(&inode_hash_lock);
1107 inode = find_inode(sb, head, test, data); 1106 inode = find_inode(sb, head, test, data);
1108 spin_unlock(&inode_hash_lock); 1107 spin_unlock(&inode_hash_lock);
1109 1108
1110 return inode; 1109 return inode;
1111 } 1110 }
1112 EXPORT_SYMBOL(ilookup5_nowait); 1111 EXPORT_SYMBOL(ilookup5_nowait);
1113 1112
1114 /** 1113 /**
1115 * ilookup5 - search for an inode in the inode cache 1114 * ilookup5 - search for an inode in the inode cache
1116 * @sb: super block of file system to search 1115 * @sb: super block of file system to search
1117 * @hashval: hash value (usually inode number) to search for 1116 * @hashval: hash value (usually inode number) to search for
1118 * @test: callback used for comparisons between inodes 1117 * @test: callback used for comparisons between inodes
1119 * @data: opaque data pointer to pass to @test 1118 * @data: opaque data pointer to pass to @test
1120 * 1119 *
1121 * Search for the inode specified by @hashval and @data in the inode cache, 1120 * Search for the inode specified by @hashval and @data in the inode cache,
1122 * and if the inode is in the cache, return the inode with an incremented 1121 * and if the inode is in the cache, return the inode with an incremented
1123 * reference count. Waits on I_NEW before returning the inode. 1122 * reference count. Waits on I_NEW before returning the inode.
1125 * 1124 *
1126 * This is a generalized version of ilookup() for file systems where the 1125 * This is a generalized version of ilookup() for file systems where the
1127 * inode number is not sufficient for unique identification of an inode. 1126 * inode number is not sufficient for unique identification of an inode.
1128 * 1127 *
1129 * Note: @test is called with the inode_hash_lock held, so can't sleep. 1128 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1130 */ 1129 */
1131 struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 1130 struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1132 int (*test)(struct inode *, void *), void *data) 1131 int (*test)(struct inode *, void *), void *data)
1133 { 1132 {
1134 struct inode *inode = ilookup5_nowait(sb, hashval, test, data); 1133 struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
1135 1134
1136 if (inode) 1135 if (inode)
1137 wait_on_inode(inode); 1136 wait_on_inode(inode);
1138 return inode; 1137 return inode;
1139 } 1138 }
1140 EXPORT_SYMBOL(ilookup5); 1139 EXPORT_SYMBOL(ilookup5);
1141 1140
1142 /** 1141 /**
1143 * ilookup - search for an inode in the inode cache 1142 * ilookup - search for an inode in the inode cache
1144 * @sb: super block of file system to search 1143 * @sb: super block of file system to search
1145 * @ino: inode number to search for 1144 * @ino: inode number to search for
1146 * 1145 *
1147 * Search for the inode @ino in the inode cache, and if the inode is in the 1146 * Search for the inode @ino in the inode cache, and if the inode is in the
1148 * cache, the inode is returned with an incremented reference count. 1147 * cache, the inode is returned with an incremented reference count.
1149 */ 1148 */
1150 struct inode *ilookup(struct super_block *sb, unsigned long ino) 1149 struct inode *ilookup(struct super_block *sb, unsigned long ino)
1151 { 1150 {
1152 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1151 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1153 struct inode *inode; 1152 struct inode *inode;
1154 1153
1155 spin_lock(&inode_hash_lock); 1154 spin_lock(&inode_hash_lock);
1156 inode = find_inode_fast(sb, head, ino); 1155 inode = find_inode_fast(sb, head, ino);
1157 spin_unlock(&inode_hash_lock); 1156 spin_unlock(&inode_hash_lock);
1158 1157
1159 if (inode) 1158 if (inode)
1160 wait_on_inode(inode); 1159 wait_on_inode(inode);
1161 return inode; 1160 return inode;
1162 } 1161 }
1163 EXPORT_SYMBOL(ilookup); 1162 EXPORT_SYMBOL(ilookup);
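
/*
 * Sketch: probing the cache without instantiating anything, covering the
 * ilookup()/ilookup5() family above. For ilookup5() the test callback could
 * be one like the hypothetical foofs_test() sketched for iget5_locked().
 * foofs_is_cached() is hypothetical.
 */
static int foofs_is_cached(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = ilookup(sb, ino);	/* NULL if not in cache */

	if (!inode)
		return 0;
	/* ... inspect the cached inode here ... */
	iput(inode);
	return 1;
}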
1164 1163
1165 int insert_inode_locked(struct inode *inode) 1164 int insert_inode_locked(struct inode *inode)
1166 { 1165 {
1167 struct super_block *sb = inode->i_sb; 1166 struct super_block *sb = inode->i_sb;
1168 ino_t ino = inode->i_ino; 1167 ino_t ino = inode->i_ino;
1169 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1168 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1170 1169
1171 while (1) { 1170 while (1) {
1172 struct hlist_node *node; 1171 struct hlist_node *node;
1173 struct inode *old = NULL; 1172 struct inode *old = NULL;
1174 spin_lock(&inode_hash_lock); 1173 spin_lock(&inode_hash_lock);
1175 hlist_for_each_entry(old, node, head, i_hash) { 1174 hlist_for_each_entry(old, node, head, i_hash) {
1176 if (old->i_ino != ino) 1175 if (old->i_ino != ino)
1177 continue; 1176 continue;
1178 if (old->i_sb != sb) 1177 if (old->i_sb != sb)
1179 continue; 1178 continue;
1180 spin_lock(&old->i_lock); 1179 spin_lock(&old->i_lock);
1181 if (old->i_state & (I_FREEING|I_WILL_FREE)) { 1180 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1182 spin_unlock(&old->i_lock); 1181 spin_unlock(&old->i_lock);
1183 continue; 1182 continue;
1184 } 1183 }
1185 break; 1184 break;
1186 } 1185 }
1187 if (likely(!node)) { 1186 if (likely(!node)) {
1188 spin_lock(&inode->i_lock); 1187 spin_lock(&inode->i_lock);
1189 inode->i_state |= I_NEW; 1188 inode->i_state |= I_NEW;
1190 hlist_add_head(&inode->i_hash, head); 1189 hlist_add_head(&inode->i_hash, head);
1191 spin_unlock(&inode->i_lock); 1190 spin_unlock(&inode->i_lock);
1192 spin_unlock(&inode_hash_lock); 1191 spin_unlock(&inode_hash_lock);
1193 return 0; 1192 return 0;
1194 } 1193 }
1195 __iget(old); 1194 __iget(old);
1196 spin_unlock(&old->i_lock); 1195 spin_unlock(&old->i_lock);
1197 spin_unlock(&inode_hash_lock); 1196 spin_unlock(&inode_hash_lock);
1198 wait_on_inode(old); 1197 wait_on_inode(old);
1199 if (unlikely(!inode_unhashed(old))) { 1198 if (unlikely(!inode_unhashed(old))) {
1200 iput(old); 1199 iput(old);
1201 return -EBUSY; 1200 return -EBUSY;
1202 } 1201 }
1203 iput(old); 1202 iput(old);
1204 } 1203 }
1205 } 1204 }
1206 EXPORT_SYMBOL(insert_inode_locked); 1205 EXPORT_SYMBOL(insert_inode_locked);
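
/*
 * Sketch: the allocation side of the same hashing protocol, as used on an
 * inode-create path: allocate, pick a number, then try to hash the new
 * inode. A negative return means an inode with that number is already (or
 * still) hashed. foofs_new_inode_nr() is hypothetical.
 */
static struct inode *foofs_new_inode_nr(struct super_block *sb,
					unsigned long ino)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	inode->i_ino = ino;
	if (insert_inode_locked(inode) < 0) {	/* number already in use */
		iput(inode);
		return ERR_PTR(-EBUSY);
	}
	/* ... initialise the rest of the inode, write it out ... */
	unlock_new_inode(inode);
	return inode;
}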
1207 1206
1208 int insert_inode_locked4(struct inode *inode, unsigned long hashval, 1207 int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1209 int (*test)(struct inode *, void *), void *data) 1208 int (*test)(struct inode *, void *), void *data)
1210 { 1209 {
1211 struct super_block *sb = inode->i_sb; 1210 struct super_block *sb = inode->i_sb;
1212 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1211 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1213 1212
1214 while (1) { 1213 while (1) {
1215 struct hlist_node *node; 1214 struct hlist_node *node;
1216 struct inode *old = NULL; 1215 struct inode *old = NULL;
1217 1216
1218 spin_lock(&inode_hash_lock); 1217 spin_lock(&inode_hash_lock);
1219 hlist_for_each_entry(old, node, head, i_hash) { 1218 hlist_for_each_entry(old, node, head, i_hash) {
1220 if (old->i_sb != sb) 1219 if (old->i_sb != sb)
1221 continue; 1220 continue;
1222 if (!test(old, data)) 1221 if (!test(old, data))
1223 continue; 1222 continue;
1224 spin_lock(&old->i_lock); 1223 spin_lock(&old->i_lock);
1225 if (old->i_state & (I_FREEING|I_WILL_FREE)) { 1224 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1226 spin_unlock(&old->i_lock); 1225 spin_unlock(&old->i_lock);
1227 continue; 1226 continue;
1228 } 1227 }
1229 break; 1228 break;
1230 } 1229 }
1231 if (likely(!node)) { 1230 if (likely(!node)) {
1232 spin_lock(&inode->i_lock); 1231 spin_lock(&inode->i_lock);
1233 inode->i_state |= I_NEW; 1232 inode->i_state |= I_NEW;
1234 hlist_add_head(&inode->i_hash, head); 1233 hlist_add_head(&inode->i_hash, head);
1235 spin_unlock(&inode->i_lock); 1234 spin_unlock(&inode->i_lock);
1236 spin_unlock(&inode_hash_lock); 1235 spin_unlock(&inode_hash_lock);
1237 return 0; 1236 return 0;
1238 } 1237 }
1239 __iget(old); 1238 __iget(old);
1240 spin_unlock(&old->i_lock); 1239 spin_unlock(&old->i_lock);
1241 spin_unlock(&inode_hash_lock); 1240 spin_unlock(&inode_hash_lock);
1242 wait_on_inode(old); 1241 wait_on_inode(old);
1243 if (unlikely(!inode_unhashed(old))) { 1242 if (unlikely(!inode_unhashed(old))) {
1244 iput(old); 1243 iput(old);
1245 return -EBUSY; 1244 return -EBUSY;
1246 } 1245 }
1247 iput(old); 1246 iput(old);
1248 } 1247 }
1249 } 1248 }
1250 EXPORT_SYMBOL(insert_inode_locked4); 1249 EXPORT_SYMBOL(insert_inode_locked4);
1251 1250
1252 1251
1253 int generic_delete_inode(struct inode *inode) 1252 int generic_delete_inode(struct inode *inode)
1254 { 1253 {
1255 return 1; 1254 return 1;
1256 } 1255 }
1257 EXPORT_SYMBOL(generic_delete_inode); 1256 EXPORT_SYMBOL(generic_delete_inode);
1258 1257
1259 /* 1258 /*
1260 * Normal UNIX filesystem behaviour: delete the 1259 * Normal UNIX filesystem behaviour: delete the
1261 * inode when the usage count drops to zero, and 1260 * inode when the usage count drops to zero, and
1262 * i_nlink is zero. 1261 * i_nlink is zero.
1263 */ 1262 */
1264 int generic_drop_inode(struct inode *inode) 1263 int generic_drop_inode(struct inode *inode)
1265 { 1264 {
1266 return !inode->i_nlink || inode_unhashed(inode); 1265 return !inode->i_nlink || inode_unhashed(inode);
1267 } 1266 }
1268 EXPORT_SYMBOL_GPL(generic_drop_inode); 1267 EXPORT_SYMBOL_GPL(generic_drop_inode);
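
/*
 * Sketch: a filesystem that does not want unreferenced inodes to linger on
 * the LRU can point ->drop_inode at generic_delete_inode(), so iput_final()
 * below always takes the eviction path. Only the .drop_inode line matters
 * here; the other foofs_* entries are hypothetical placeholders.
 */
static const struct super_operations foofs_super_ops = {
	.statfs		= simple_statfs,
	.drop_inode	= generic_delete_inode,	/* never cache unused inodes */
	.evict_inode	= foofs_evict_inode,	/* hypothetical, sketched above */
};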
1269 1268
1270 /* 1269 /*
1271 * Called when we're dropping the last reference 1270 * Called when we're dropping the last reference
1272 * to an inode. 1271 * to an inode.
1273 * 1272 *
1274 * Call the FS "drop_inode()" function, defaulting to 1273 * Call the FS "drop_inode()" function, defaulting to
1275 * the legacy UNIX filesystem behaviour. If it tells 1274 * the legacy UNIX filesystem behaviour. If it tells
1276 * us to evict the inode, do so. Otherwise, retain the inode 1275 * us to evict the inode, do so. Otherwise, retain the inode
1277 * in the cache if the fs is alive; sync and evict if the fs is 1276 * in the cache if the fs is alive; sync and evict if the fs is
1278 * shutting down. 1277 * shutting down.
1279 */ 1278 */
1280 static void iput_final(struct inode *inode) 1279 static void iput_final(struct inode *inode)
1281 { 1280 {
1282 struct super_block *sb = inode->i_sb; 1281 struct super_block *sb = inode->i_sb;
1283 const struct super_operations *op = inode->i_sb->s_op; 1282 const struct super_operations *op = inode->i_sb->s_op;
1284 int drop; 1283 int drop;
1285 1284
1286 WARN_ON(inode->i_state & I_NEW); 1285 WARN_ON(inode->i_state & I_NEW);
1287 1286
1288 if (op->drop_inode) 1287 if (op->drop_inode)
1289 drop = op->drop_inode(inode); 1288 drop = op->drop_inode(inode);
1290 else 1289 else
1291 drop = generic_drop_inode(inode); 1290 drop = generic_drop_inode(inode);
1292 1291
1293 if (!drop && (sb->s_flags & MS_ACTIVE)) { 1292 if (!drop && (sb->s_flags & MS_ACTIVE)) {
1294 inode->i_state |= I_REFERENCED; 1293 inode->i_state |= I_REFERENCED;
1295 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 1294 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1296 inode_lru_list_add(inode); 1295 inode_lru_list_add(inode);
1297 spin_unlock(&inode->i_lock); 1296 spin_unlock(&inode->i_lock);
1298 return; 1297 return;
1299 } 1298 }
1300 1299
1301 if (!drop) { 1300 if (!drop) {
1302 inode->i_state |= I_WILL_FREE; 1301 inode->i_state |= I_WILL_FREE;
1303 spin_unlock(&inode->i_lock); 1302 spin_unlock(&inode->i_lock);
1304 write_inode_now(inode, 1); 1303 write_inode_now(inode, 1);
1305 spin_lock(&inode->i_lock); 1304 spin_lock(&inode->i_lock);
1306 WARN_ON(inode->i_state & I_NEW); 1305 WARN_ON(inode->i_state & I_NEW);
1307 inode->i_state &= ~I_WILL_FREE; 1306 inode->i_state &= ~I_WILL_FREE;
1308 } 1307 }
1309 1308
1310 inode->i_state |= I_FREEING; 1309 inode->i_state |= I_FREEING;
1311 inode_lru_list_del(inode); 1310 inode_lru_list_del(inode);
1312 spin_unlock(&inode->i_lock); 1311 spin_unlock(&inode->i_lock);
1313 1312
1314 evict(inode); 1313 evict(inode);
1315 } 1314 }
1316 1315
1317 /** 1316 /**
1318 * iput - put an inode 1317 * iput - put an inode
1319 * @inode: inode to put 1318 * @inode: inode to put
1320 * 1319 *
1321 * Puts an inode, dropping its usage count. If the inode use count hits 1320 * Puts an inode, dropping its usage count. If the inode use count hits
1322 * zero, the inode is then freed and may also be destroyed. 1321 * zero, the inode is then freed and may also be destroyed.
1323 * 1322 *
1324 * Consequently, iput() can sleep. 1323 * Consequently, iput() can sleep.
1325 */ 1324 */
1326 void iput(struct inode *inode) 1325 void iput(struct inode *inode)
1327 { 1326 {
1328 if (inode) { 1327 if (inode) {
1329 BUG_ON(inode->i_state & I_CLEAR); 1328 BUG_ON(inode->i_state & I_CLEAR);
1330 1329
1331 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) 1330 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
1332 iput_final(inode); 1331 iput_final(inode);
1333 } 1332 }
1334 } 1333 }
1335 EXPORT_SYMBOL(iput); 1334 EXPORT_SYMBOL(iput);
1336 1335
1337 /** 1336 /**
1338 * bmap - find a block number in a file 1337 * bmap - find a block number in a file
1339 * @inode: inode of file 1338 * @inode: inode of file
1340 * @block: block to find 1339 * @block: block to find
1341 * 1340 *
1342 * Returns the block number on the device holding the inode that 1341 * Returns the block number on the device holding the inode that
1343 * is the disk block number for the block of the file requested. 1342 * is the disk block number for the block of the file requested.
1344 * That is, if asked for block 4 of inode 1, the function will return the 1343 * That is, if asked for block 4 of inode 1, the function will return the
1345 * disk block relative to the disk start that holds that block of the 1344 * disk block relative to the disk start that holds that block of the
1346 * file. 1345 * file.
1347 */ 1346 */
1348 sector_t bmap(struct inode *inode, sector_t block) 1347 sector_t bmap(struct inode *inode, sector_t block)
1349 { 1348 {
1350 sector_t res = 0; 1349 sector_t res = 0;
1351 if (inode->i_mapping->a_ops->bmap) 1350 if (inode->i_mapping->a_ops->bmap)
1352 res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); 1351 res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
1353 return res; 1352 return res;
1354 } 1353 }
1355 EXPORT_SYMBOL(bmap); 1354 EXPORT_SYMBOL(bmap);
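
/*
 * Sketch: mapping a file-relative block to a device-relative block, as the
 * kernel-doc above describes. A return of 0 means the block could not be
 * mapped (a hole, or no ->bmap method). foofs_show_block() is hypothetical.
 */
static void foofs_show_block(struct inode *inode, sector_t file_block)
{
	sector_t phys = bmap(inode, file_block);

	if (phys)
		printk(KERN_DEBUG "block %llu of inode %lu is at device block %llu\n",
		       (unsigned long long)file_block, inode->i_ino,
		       (unsigned long long)phys);
	else
		printk(KERN_DEBUG "block %llu is a hole or cannot be mapped\n",
		       (unsigned long long)file_block);
}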
1356 1355
1357 /* 1356 /*
1358 * With relative atime, only update atime if the previous atime is 1357 * With relative atime, only update atime if the previous atime is
1359 * earlier than either the ctime or mtime or if at least a day has 1358 * earlier than either the ctime or mtime or if at least a day has
1360 * passed since the last atime update. 1359 * passed since the last atime update.
1361 */ 1360 */
1362 static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, 1361 static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1363 struct timespec now) 1362 struct timespec now)
1364 { 1363 {
1365 1364
1366 if (!(mnt->mnt_flags & MNT_RELATIME)) 1365 if (!(mnt->mnt_flags & MNT_RELATIME))
1367 return 1; 1366 return 1;
1368 /* 1367 /*
1369 * Is mtime younger than atime? If yes, update atime: 1368 * Is mtime younger than atime? If yes, update atime:
1370 */ 1369 */
1371 if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) 1370 if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
1372 return 1; 1371 return 1;
1373 /* 1372 /*
1374 * Is ctime younger than atime? If yes, update atime: 1373 * Is ctime younger than atime? If yes, update atime:
1375 */ 1374 */
1376 if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) 1375 if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
1377 return 1; 1376 return 1;
1378 1377
1379 /* 1378 /*
1380 * Is the previous atime value older than a day? If yes, 1379 * Is the previous atime value older than a day? If yes,
1381 * update atime: 1380 * update atime:
1382 */ 1381 */
1383 if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) 1382 if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
1384 return 1; 1383 return 1;
1385 /* 1384 /*
1386 * Good, we can skip the atime update: 1385 * Good, we can skip the atime update:
1387 */ 1386 */
1388 return 0; 1387 return 0;
1389 } 1388 }
1390 1389
1391 /** 1390 /**
1392 * touch_atime - update the access time 1391 * touch_atime - update the access time
1393 * @mnt: mount the inode is accessed on 1392 * @mnt: mount the inode is accessed on
1394 * @dentry: dentry accessed 1393 * @dentry: dentry accessed
1395 * 1394 *
1396 * Update the accessed time on an inode and mark it for writeback. 1395 * Update the accessed time on an inode and mark it for writeback.
1397 * This function automatically handles read only file systems and media, 1396 * This function automatically handles read only file systems and media,
1398 * as well as the "noatime" flag and inode specific "noatime" markers. 1397 * as well as the "noatime" flag and inode specific "noatime" markers.
1399 */ 1398 */
1400 void touch_atime(struct vfsmount *mnt, struct dentry *dentry) 1399 void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
1401 { 1400 {
1402 struct inode *inode = dentry->d_inode; 1401 struct inode *inode = dentry->d_inode;
1403 struct timespec now; 1402 struct timespec now;
1404 1403
1405 if (inode->i_flags & S_NOATIME) 1404 if (inode->i_flags & S_NOATIME)
1406 return; 1405 return;
1407 if (IS_NOATIME(inode)) 1406 if (IS_NOATIME(inode))
1408 return; 1407 return;
1409 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) 1408 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1410 return; 1409 return;
1411 1410
1412 if (mnt->mnt_flags & MNT_NOATIME) 1411 if (mnt->mnt_flags & MNT_NOATIME)
1413 return; 1412 return;
1414 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) 1413 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1415 return; 1414 return;
1416 1415
1417 now = current_fs_time(inode->i_sb); 1416 now = current_fs_time(inode->i_sb);
1418 1417
1419 if (!relatime_need_update(mnt, inode, now)) 1418 if (!relatime_need_update(mnt, inode, now))
1420 return; 1419 return;
1421 1420
1422 if (timespec_equal(&inode->i_atime, &now)) 1421 if (timespec_equal(&inode->i_atime, &now))
1423 return; 1422 return;
1424 1423
1425 if (mnt_want_write(mnt)) 1424 if (mnt_want_write(mnt))
1426 return; 1425 return;
1427 1426
1428 inode->i_atime = now; 1427 inode->i_atime = now;
1429 mark_inode_dirty_sync(inode); 1428 mark_inode_dirty_sync(inode);
1430 mnt_drop_write(mnt); 1429 mnt_drop_write(mnt);
1431 } 1430 }
1432 EXPORT_SYMBOL(touch_atime); 1431 EXPORT_SYMBOL(touch_atime);
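/*
 * Sketch of a typical caller (illustrative only, assuming a read path
 * that holds a valid struct file *filp):
 *
 *	touch_atime(filp->f_path.mnt, filp->f_path.dentry);
 */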
1433 1432
1434 /** 1433 /**
1435 * file_update_time - update mtime and ctime time 1434 * file_update_time - update mtime and ctime time
1436 * @file: file accessed 1435 * @file: file accessed
1437 * 1436 *
1438 * Update the mtime and ctime members of an inode and mark the inode 1437 * Update the mtime and ctime members of an inode and mark the inode
1439 * for writeback. Note that this function is meant exclusively for 1438 * for writeback. Note that this function is meant exclusively for
1440 * usage in the file write path of filesystems, and filesystems may 1439 * usage in the file write path of filesystems, and filesystems may
1441 * choose to explicitly ignore updates via this function with the 1440 * choose to explicitly ignore updates via this function with the
1442 * S_NOCMTIME inode flag, e.g. for network filesystems where these 1441 * S_NOCMTIME inode flag, e.g. for network filesystems where these
1443 * timestamps are handled by the server. 1442 * timestamps are handled by the server.
1444 */ 1443 */
1445 1444
1446 void file_update_time(struct file *file) 1445 void file_update_time(struct file *file)
1447 { 1446 {
1448 struct inode *inode = file->f_path.dentry->d_inode; 1447 struct inode *inode = file->f_path.dentry->d_inode;
1449 struct timespec now; 1448 struct timespec now;
1450 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; 1449 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
1451 1450
1452 /* First try to exhaust all avenues to not sync */ 1451 /* First try to exhaust all avenues to not sync */
1453 if (IS_NOCMTIME(inode)) 1452 if (IS_NOCMTIME(inode))
1454 return; 1453 return;
1455 1454
1456 now = current_fs_time(inode->i_sb); 1455 now = current_fs_time(inode->i_sb);
1457 if (!timespec_equal(&inode->i_mtime, &now)) 1456 if (!timespec_equal(&inode->i_mtime, &now))
1458 sync_it = S_MTIME; 1457 sync_it = S_MTIME;
1459 1458
1460 if (!timespec_equal(&inode->i_ctime, &now)) 1459 if (!timespec_equal(&inode->i_ctime, &now))
1461 sync_it |= S_CTIME; 1460 sync_it |= S_CTIME;
1462 1461
1463 if (IS_I_VERSION(inode)) 1462 if (IS_I_VERSION(inode))
1464 sync_it |= S_VERSION; 1463 sync_it |= S_VERSION;
1465 1464
1466 if (!sync_it) 1465 if (!sync_it)
1467 return; 1466 return;
1468 1467
1469 /* Finally allowed to write? Takes lock. */ 1468 /* Finally allowed to write? Takes lock. */
1470 if (mnt_want_write_file(file)) 1469 if (mnt_want_write_file(file))
1471 return; 1470 return;
1472 1471
1473 /* Only change inode inside the lock region */ 1472 /* Only change inode inside the lock region */
1474 if (sync_it & S_VERSION) 1473 if (sync_it & S_VERSION)
1475 inode_inc_iversion(inode); 1474 inode_inc_iversion(inode);
1476 if (sync_it & S_CTIME) 1475 if (sync_it & S_CTIME)
1477 inode->i_ctime = now; 1476 inode->i_ctime = now;
1478 if (sync_it & S_MTIME) 1477 if (sync_it & S_MTIME)
1479 inode->i_mtime = now; 1478 inode->i_mtime = now;
1480 mark_inode_dirty_sync(inode); 1479 mark_inode_dirty_sync(inode);
1481 mnt_drop_write(file->f_path.mnt); 1480 mnt_drop_write(file->f_path.mnt);
1482 } 1481 }
1483 EXPORT_SYMBOL(file_update_time); 1482 EXPORT_SYMBOL(file_update_time);
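/*
 * Sketch of a typical caller (illustrative only, assuming a write path
 * that is about to copy data into the page cache for struct file *filp):
 *
 *	file_update_time(filp);
 */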
1484 1483
1485 int inode_needs_sync(struct inode *inode) 1484 int inode_needs_sync(struct inode *inode)
1486 { 1485 {
1487 if (IS_SYNC(inode)) 1486 if (IS_SYNC(inode))
1488 return 1; 1487 return 1;
1489 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) 1488 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
1490 return 1; 1489 return 1;
1491 return 0; 1490 return 0;
1492 } 1491 }
1493 EXPORT_SYMBOL(inode_needs_sync); 1492 EXPORT_SYMBOL(inode_needs_sync);
1494 1493
1495 int inode_wait(void *word) 1494 int inode_wait(void *word)
1496 { 1495 {
1497 schedule(); 1496 schedule();
1498 return 0; 1497 return 0;
1499 } 1498 }
1500 EXPORT_SYMBOL(inode_wait); 1499 EXPORT_SYMBOL(inode_wait);
1501 1500
1502 /* 1501 /*
1503 * If we try to find an inode in the inode hash while it is being 1502 * If we try to find an inode in the inode hash while it is being
1504 * deleted, we have to wait until the filesystem completes its 1503 * deleted, we have to wait until the filesystem completes its
1505 * deletion before reporting that it isn't found. This function waits 1504 * deletion before reporting that it isn't found. This function waits
1506 * until the deletion _might_ have completed. Callers are responsible 1505 * until the deletion _might_ have completed. Callers are responsible
1507 * for rechecking the inode state. 1506 * for rechecking the inode state.
1508 * 1507 *
1509 * It doesn't matter if I_NEW is not set initially, a call to 1508 * It doesn't matter if I_NEW is not set initially, a call to
1510 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list 1509 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
1511 * will DTRT. 1510 * will DTRT.
1512 */ 1511 */
1513 static void __wait_on_freeing_inode(struct inode *inode) 1512 static void __wait_on_freeing_inode(struct inode *inode)
1514 { 1513 {
1515 wait_queue_head_t *wq; 1514 wait_queue_head_t *wq;
1516 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1515 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1517 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1516 wq = bit_waitqueue(&inode->i_state, __I_NEW);
1518 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1517 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1519 spin_unlock(&inode->i_lock); 1518 spin_unlock(&inode->i_lock);
1520 spin_unlock(&inode_hash_lock); 1519 spin_unlock(&inode_hash_lock);
1521 schedule(); 1520 schedule();
1522 finish_wait(wq, &wait.wait); 1521 finish_wait(wq, &wait.wait);
1523 spin_lock(&inode_hash_lock); 1522 spin_lock(&inode_hash_lock);
1524 } 1523 }
1525 1524
1526 static __initdata unsigned long ihash_entries; 1525 static __initdata unsigned long ihash_entries;
1527 static int __init set_ihash_entries(char *str) 1526 static int __init set_ihash_entries(char *str)
1528 { 1527 {
1529 if (!str) 1528 if (!str)
1530 return 0; 1529 return 0;
1531 ihash_entries = simple_strtoul(str, &str, 0); 1530 ihash_entries = simple_strtoul(str, &str, 0);
1532 return 1; 1531 return 1;
1533 } 1532 }
1534 __setup("ihash_entries=", set_ihash_entries); 1533 __setup("ihash_entries=", set_ihash_entries);
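/*
 * Example (illustrative): the inode hash size can be forced from the
 * kernel command line, e.g.
 *
 *	ihash_entries=131072
 *
 * otherwise alloc_large_system_hash() below sizes the table from the
 * amount of memory present.
 */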
1535 1534
1536 /* 1535 /*
1537 * Initialize the waitqueues and inode hash table. 1536 * Initialize the waitqueues and inode hash table.
1538 */ 1537 */
1539 void __init inode_init_early(void) 1538 void __init inode_init_early(void)
1540 { 1539 {
1541 int loop; 1540 int loop;
1542 1541
1543 /* If hashes are distributed across NUMA nodes, defer 1542 /* If hashes are distributed across NUMA nodes, defer
1544 * hash allocation until vmalloc space is available. 1543 * hash allocation until vmalloc space is available.
1545 */ 1544 */
1546 if (hashdist) 1545 if (hashdist)
1547 return; 1546 return;
1548 1547
1549 inode_hashtable = 1548 inode_hashtable =
1550 alloc_large_system_hash("Inode-cache", 1549 alloc_large_system_hash("Inode-cache",
1551 sizeof(struct hlist_head), 1550 sizeof(struct hlist_head),
1552 ihash_entries, 1551 ihash_entries,
1553 14, 1552 14,
1554 HASH_EARLY, 1553 HASH_EARLY,
1555 &i_hash_shift, 1554 &i_hash_shift,
1556 &i_hash_mask, 1555 &i_hash_mask,
1557 0); 1556 0);
1558 1557
1559 for (loop = 0; loop < (1 << i_hash_shift); loop++) 1558 for (loop = 0; loop < (1 << i_hash_shift); loop++)
1560 INIT_HLIST_HEAD(&inode_hashtable[loop]); 1559 INIT_HLIST_HEAD(&inode_hashtable[loop]);
1561 } 1560 }
1562 1561
1563 void __init inode_init(void) 1562 void __init inode_init(void)
1564 { 1563 {
1565 int loop; 1564 int loop;
1566 1565
1567 /* inode slab cache */ 1566 /* inode slab cache */
1568 inode_cachep = kmem_cache_create("inode_cache", 1567 inode_cachep = kmem_cache_create("inode_cache",
1569 sizeof(struct inode), 1568 sizeof(struct inode),
1570 0, 1569 0,
1571 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 1570 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
1572 SLAB_MEM_SPREAD), 1571 SLAB_MEM_SPREAD),
1573 init_once); 1572 init_once);
1574 1573
1575 /* Hash may have been set up in inode_init_early */ 1574 /* Hash may have been set up in inode_init_early */
1576 if (!hashdist) 1575 if (!hashdist)
1577 return; 1576 return;
1578 1577
1579 inode_hashtable = 1578 inode_hashtable =
1580 alloc_large_system_hash("Inode-cache", 1579 alloc_large_system_hash("Inode-cache",
1581 sizeof(struct hlist_head), 1580 sizeof(struct hlist_head),
1582 ihash_entries, 1581 ihash_entries,
1583 14, 1582 14,
1584 0, 1583 0,
1585 &i_hash_shift, 1584 &i_hash_shift,
1586 &i_hash_mask, 1585 &i_hash_mask,
1587 0); 1586 0);
1588 1587
1589 for (loop = 0; loop < (1 << i_hash_shift); loop++) 1588 for (loop = 0; loop < (1 << i_hash_shift); loop++)
1590 INIT_HLIST_HEAD(&inode_hashtable[loop]); 1589 INIT_HLIST_HEAD(&inode_hashtable[loop]);
1591 } 1590 }
1592 1591
1593 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) 1592 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1594 { 1593 {
1595 inode->i_mode = mode; 1594 inode->i_mode = mode;
1596 if (S_ISCHR(mode)) { 1595 if (S_ISCHR(mode)) {
1597 inode->i_fop = &def_chr_fops; 1596 inode->i_fop = &def_chr_fops;
1598 inode->i_rdev = rdev; 1597 inode->i_rdev = rdev;
1599 } else if (S_ISBLK(mode)) { 1598 } else if (S_ISBLK(mode)) {
1600 inode->i_fop = &def_blk_fops; 1599 inode->i_fop = &def_blk_fops;
1601 inode->i_rdev = rdev; 1600 inode->i_rdev = rdev;
1602 } else if (S_ISFIFO(mode)) 1601 } else if (S_ISFIFO(mode))
1603 inode->i_fop = &def_fifo_fops; 1602 inode->i_fop = &def_fifo_fops;
1604 else if (S_ISSOCK(mode)) 1603 else if (S_ISSOCK(mode))
1605 inode->i_fop = &bad_sock_fops; 1604 inode->i_fop = &bad_sock_fops;
1606 else 1605 else
1607 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" 1606 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
1608 " inode %s:%lu\n", mode, inode->i_sb->s_id, 1607 " inode %s:%lu\n", mode, inode->i_sb->s_id,
1609 inode->i_ino); 1608 inode->i_ino);
1610 } 1609 }
1611 EXPORT_SYMBOL(init_special_inode); 1610 EXPORT_SYMBOL(init_special_inode);
1612 1611
1613 /** 1612 /**
1614 * inode_init_owner - Init uid, gid, mode for new inode according to POSIX standards 1613 * inode_init_owner - Init uid, gid, mode for new inode according to POSIX standards
1615 * @inode: New inode 1614 * @inode: New inode
1616 * @dir: Directory inode 1615 * @dir: Directory inode
1617 * @mode: mode of the new inode 1616 * @mode: mode of the new inode
1618 */ 1617 */
1619 void inode_init_owner(struct inode *inode, const struct inode *dir, 1618 void inode_init_owner(struct inode *inode, const struct inode *dir,
1620 mode_t mode) 1619 mode_t mode)
1621 { 1620 {
1622 inode->i_uid = current_fsuid(); 1621 inode->i_uid = current_fsuid();
1623 if (dir && dir->i_mode & S_ISGID) { 1622 if (dir && dir->i_mode & S_ISGID) {
1624 inode->i_gid = dir->i_gid; 1623 inode->i_gid = dir->i_gid;
1625 if (S_ISDIR(mode)) 1624 if (S_ISDIR(mode))
1626 mode |= S_ISGID; 1625 mode |= S_ISGID;
1627 } else 1626 } else
1628 inode->i_gid = current_fsgid(); 1627 inode->i_gid = current_fsgid();
1629 inode->i_mode = mode; 1628 inode->i_mode = mode;
1630 } 1629 }
1631 EXPORT_SYMBOL(inode_init_owner); 1630 EXPORT_SYMBOL(inode_init_owner);
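/*
 * Example (illustrative sketch): a filesystem creating a subdirectory
 * under a setgid parent would typically call
 *
 *	inode_init_owner(inode, dir, S_IFDIR | 0755);
 *
 * and, per the logic above, the new inode inherits dir->i_gid and gets
 * S_ISGID set because it is a directory.
 */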
1632 1631
1633 /** 1632 /**
1634 * inode_owner_or_capable - check current task permissions to inode 1633 * inode_owner_or_capable - check current task permissions to inode
1635 * @inode: inode being checked 1634 * @inode: inode being checked
1636 * 1635 *
1637 * Return true if current either has CAP_FOWNER in the user namespace 1636 * Return true if current either has CAP_FOWNER in the user namespace
1638 * of the inode, or owns the file. 1637 * of the inode, or owns the file.
1639 */ 1638 */
1640 bool inode_owner_or_capable(const struct inode *inode) 1639 bool inode_owner_or_capable(const struct inode *inode)
1641 { 1640 {
1642 struct user_namespace *ns = inode_userns(inode); 1641 struct user_namespace *ns = inode_userns(inode);
1643 1642
1644 if (current_user_ns() == ns && current_fsuid() == inode->i_uid) 1643 if (current_user_ns() == ns && current_fsuid() == inode->i_uid)
1645 return true; 1644 return true;
1646 if (ns_capable(ns, CAP_FOWNER)) 1645 if (ns_capable(ns, CAP_FOWNER))
1647 return true; 1646 return true;
1648 return false; 1647 return false;
1649 } 1648 }
1650 EXPORT_SYMBOL(inode_owner_or_capable); 1649 EXPORT_SYMBOL(inode_owner_or_capable);
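/*
 * Sketch of a typical permission check (illustrative only, e.g. in a
 * handler that changes inode attributes or flags):
 *
 *	if (!inode_owner_or_capable(inode))
 *		return -EPERM;
 */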
1651 1650
1 /* 1 /*
2 * linux/fs/nfs/write.c 2 * linux/fs/nfs/write.c
3 * 3 *
4 * Write file data over NFS. 4 * Write file data over NFS.
5 * 5 *
6 * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de> 6 * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
7 */ 7 */
8 8
9 #include <linux/types.h> 9 #include <linux/types.h>
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/mm.h> 11 #include <linux/mm.h>
12 #include <linux/pagemap.h> 12 #include <linux/pagemap.h>
13 #include <linux/file.h> 13 #include <linux/file.h>
14 #include <linux/writeback.h> 14 #include <linux/writeback.h>
15 #include <linux/swap.h> 15 #include <linux/swap.h>
16 #include <linux/migrate.h> 16 #include <linux/migrate.h>
17 17
18 #include <linux/sunrpc/clnt.h> 18 #include <linux/sunrpc/clnt.h>
19 #include <linux/nfs_fs.h> 19 #include <linux/nfs_fs.h>
20 #include <linux/nfs_mount.h> 20 #include <linux/nfs_mount.h>
21 #include <linux/nfs_page.h> 21 #include <linux/nfs_page.h>
22 #include <linux/backing-dev.h> 22 #include <linux/backing-dev.h>
23 23
24 #include <asm/uaccess.h> 24 #include <asm/uaccess.h>
25 25
26 #include "delegation.h" 26 #include "delegation.h"
27 #include "internal.h" 27 #include "internal.h"
28 #include "iostat.h" 28 #include "iostat.h"
29 #include "nfs4_fs.h" 29 #include "nfs4_fs.h"
30 #include "fscache.h" 30 #include "fscache.h"
31 #include "pnfs.h" 31 #include "pnfs.h"
32 32
33 #define NFSDBG_FACILITY NFSDBG_PAGECACHE 33 #define NFSDBG_FACILITY NFSDBG_PAGECACHE
34 34
35 #define MIN_POOL_WRITE (32) 35 #define MIN_POOL_WRITE (32)
36 #define MIN_POOL_COMMIT (4) 36 #define MIN_POOL_COMMIT (4)
37 37
38 /* 38 /*
39 * Local function declarations 39 * Local function declarations
40 */ 40 */
41 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, 41 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
42 struct inode *inode, int ioflags); 42 struct inode *inode, int ioflags);
43 static void nfs_redirty_request(struct nfs_page *req); 43 static void nfs_redirty_request(struct nfs_page *req);
44 static const struct rpc_call_ops nfs_write_partial_ops; 44 static const struct rpc_call_ops nfs_write_partial_ops;
45 static const struct rpc_call_ops nfs_write_full_ops; 45 static const struct rpc_call_ops nfs_write_full_ops;
46 static const struct rpc_call_ops nfs_commit_ops; 46 static const struct rpc_call_ops nfs_commit_ops;
47 47
48 static struct kmem_cache *nfs_wdata_cachep; 48 static struct kmem_cache *nfs_wdata_cachep;
49 static mempool_t *nfs_wdata_mempool; 49 static mempool_t *nfs_wdata_mempool;
50 static mempool_t *nfs_commit_mempool; 50 static mempool_t *nfs_commit_mempool;
51 51
52 struct nfs_write_data *nfs_commitdata_alloc(void) 52 struct nfs_write_data *nfs_commitdata_alloc(void)
53 { 53 {
54 struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); 54 struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
55 55
56 if (p) { 56 if (p) {
57 memset(p, 0, sizeof(*p)); 57 memset(p, 0, sizeof(*p));
58 INIT_LIST_HEAD(&p->pages); 58 INIT_LIST_HEAD(&p->pages);
59 } 59 }
60 return p; 60 return p;
61 } 61 }
62 EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); 62 EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
63 63
64 void nfs_commit_free(struct nfs_write_data *p) 64 void nfs_commit_free(struct nfs_write_data *p)
65 { 65 {
66 if (p && (p->pagevec != &p->page_array[0])) 66 if (p && (p->pagevec != &p->page_array[0]))
67 kfree(p->pagevec); 67 kfree(p->pagevec);
68 mempool_free(p, nfs_commit_mempool); 68 mempool_free(p, nfs_commit_mempool);
69 } 69 }
70 EXPORT_SYMBOL_GPL(nfs_commit_free); 70 EXPORT_SYMBOL_GPL(nfs_commit_free);
71 71
72 struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) 72 struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
73 { 73 {
74 struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); 74 struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
75 75
76 if (p) { 76 if (p) {
77 memset(p, 0, sizeof(*p)); 77 memset(p, 0, sizeof(*p));
78 INIT_LIST_HEAD(&p->pages); 78 INIT_LIST_HEAD(&p->pages);
79 p->npages = pagecount; 79 p->npages = pagecount;
80 if (pagecount <= ARRAY_SIZE(p->page_array)) 80 if (pagecount <= ARRAY_SIZE(p->page_array))
81 p->pagevec = p->page_array; 81 p->pagevec = p->page_array;
82 else { 82 else {
83 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); 83 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
84 if (!p->pagevec) { 84 if (!p->pagevec) {
85 mempool_free(p, nfs_wdata_mempool); 85 mempool_free(p, nfs_wdata_mempool);
86 p = NULL; 86 p = NULL;
87 } 87 }
88 } 88 }
89 } 89 }
90 return p; 90 return p;
91 } 91 }
92 92
93 void nfs_writedata_free(struct nfs_write_data *p) 93 void nfs_writedata_free(struct nfs_write_data *p)
94 { 94 {
95 if (p && (p->pagevec != &p->page_array[0])) 95 if (p && (p->pagevec != &p->page_array[0]))
96 kfree(p->pagevec); 96 kfree(p->pagevec);
97 mempool_free(p, nfs_wdata_mempool); 97 mempool_free(p, nfs_wdata_mempool);
98 } 98 }
99 99
100 static void nfs_writedata_release(struct nfs_write_data *wdata) 100 static void nfs_writedata_release(struct nfs_write_data *wdata)
101 { 101 {
102 put_lseg(wdata->lseg); 102 put_lseg(wdata->lseg);
103 put_nfs_open_context(wdata->args.context); 103 put_nfs_open_context(wdata->args.context);
104 nfs_writedata_free(wdata); 104 nfs_writedata_free(wdata);
105 } 105 }
106 106
107 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) 107 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
108 { 108 {
109 ctx->error = error; 109 ctx->error = error;
110 smp_wmb(); 110 smp_wmb();
111 set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 111 set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
112 } 112 }
113 113
114 static struct nfs_page *nfs_page_find_request_locked(struct page *page) 114 static struct nfs_page *nfs_page_find_request_locked(struct page *page)
115 { 115 {
116 struct nfs_page *req = NULL; 116 struct nfs_page *req = NULL;
117 117
118 if (PagePrivate(page)) { 118 if (PagePrivate(page)) {
119 req = (struct nfs_page *)page_private(page); 119 req = (struct nfs_page *)page_private(page);
120 if (req != NULL) 120 if (req != NULL)
121 kref_get(&req->wb_kref); 121 kref_get(&req->wb_kref);
122 } 122 }
123 return req; 123 return req;
124 } 124 }
125 125
126 static struct nfs_page *nfs_page_find_request(struct page *page) 126 static struct nfs_page *nfs_page_find_request(struct page *page)
127 { 127 {
128 struct inode *inode = page->mapping->host; 128 struct inode *inode = page->mapping->host;
129 struct nfs_page *req = NULL; 129 struct nfs_page *req = NULL;
130 130
131 spin_lock(&inode->i_lock); 131 spin_lock(&inode->i_lock);
132 req = nfs_page_find_request_locked(page); 132 req = nfs_page_find_request_locked(page);
133 spin_unlock(&inode->i_lock); 133 spin_unlock(&inode->i_lock);
134 return req; 134 return req;
135 } 135 }
136 136
137 /* Adjust the file length if we're writing beyond the end */ 137 /* Adjust the file length if we're writing beyond the end */
138 static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) 138 static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
139 { 139 {
140 struct inode *inode = page->mapping->host; 140 struct inode *inode = page->mapping->host;
141 loff_t end, i_size; 141 loff_t end, i_size;
142 pgoff_t end_index; 142 pgoff_t end_index;
143 143
144 spin_lock(&inode->i_lock); 144 spin_lock(&inode->i_lock);
145 i_size = i_size_read(inode); 145 i_size = i_size_read(inode);
146 end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; 146 end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
147 if (i_size > 0 && page->index < end_index) 147 if (i_size > 0 && page->index < end_index)
148 goto out; 148 goto out;
149 end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); 149 end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
150 if (i_size >= end) 150 if (i_size >= end)
151 goto out; 151 goto out;
152 i_size_write(inode, end); 152 i_size_write(inode, end);
153 nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); 153 nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
154 out: 154 out:
155 spin_unlock(&inode->i_lock); 155 spin_unlock(&inode->i_lock);
156 } 156 }
157 157
158 /* A writeback failed: mark the page as bad, and invalidate the page cache */ 158 /* A writeback failed: mark the page as bad, and invalidate the page cache */
159 static void nfs_set_pageerror(struct page *page) 159 static void nfs_set_pageerror(struct page *page)
160 { 160 {
161 SetPageError(page); 161 SetPageError(page);
162 nfs_zap_mapping(page->mapping->host, page->mapping); 162 nfs_zap_mapping(page->mapping->host, page->mapping);
163 } 163 }
164 164
165 /* We can set the PG_uptodate flag if we see that a write request 165 /* We can set the PG_uptodate flag if we see that a write request
166 * covers the full page. 166 * covers the full page.
167 */ 167 */
168 static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) 168 static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count)
169 { 169 {
170 if (PageUptodate(page)) 170 if (PageUptodate(page))
171 return; 171 return;
172 if (base != 0) 172 if (base != 0)
173 return; 173 return;
174 if (count != nfs_page_length(page)) 174 if (count != nfs_page_length(page))
175 return; 175 return;
176 SetPageUptodate(page); 176 SetPageUptodate(page);
177 } 177 }
178 178
179 static int wb_priority(struct writeback_control *wbc) 179 static int wb_priority(struct writeback_control *wbc)
180 { 180 {
181 if (wbc->for_reclaim) 181 if (wbc->for_reclaim)
182 return FLUSH_HIGHPRI | FLUSH_STABLE; 182 return FLUSH_HIGHPRI | FLUSH_STABLE;
183 if (wbc->for_kupdate || wbc->for_background) 183 if (wbc->for_kupdate || wbc->for_background)
184 return FLUSH_LOWPRI | FLUSH_COND_STABLE; 184 return FLUSH_LOWPRI | FLUSH_COND_STABLE;
185 return FLUSH_COND_STABLE; 185 return FLUSH_COND_STABLE;
186 } 186 }
187 187
188 /* 188 /*
189 * NFS congestion control 189 * NFS congestion control
190 */ 190 */
191 191
192 int nfs_congestion_kb; 192 int nfs_congestion_kb;
193 193
194 #define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) 194 #define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10))
195 #define NFS_CONGESTION_OFF_THRESH \ 195 #define NFS_CONGESTION_OFF_THRESH \
196 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) 196 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
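/*
 * Worked example (assuming 4K pages, i.e. PAGE_SHIFT == 12, and
 * nfs_congestion_kb == 16384, i.e. 16MB):
 *
 *	NFS_CONGESTION_ON_THRESH  = 16384 >> (12 - 10)  = 4096 pages
 *	NFS_CONGESTION_OFF_THRESH = 4096 - (4096 >> 2)  = 3072 pages
 *
 * so the bdi is flagged congested once roughly 16MB of writeback is in
 * flight to the server, and the flag is cleared again below ~12MB.
 */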
197 197
198 static int nfs_set_page_writeback(struct page *page) 198 static int nfs_set_page_writeback(struct page *page)
199 { 199 {
200 int ret = test_set_page_writeback(page); 200 int ret = test_set_page_writeback(page);
201 201
202 if (!ret) { 202 if (!ret) {
203 struct inode *inode = page->mapping->host; 203 struct inode *inode = page->mapping->host;
204 struct nfs_server *nfss = NFS_SERVER(inode); 204 struct nfs_server *nfss = NFS_SERVER(inode);
205 205
206 page_cache_get(page); 206 page_cache_get(page);
207 if (atomic_long_inc_return(&nfss->writeback) > 207 if (atomic_long_inc_return(&nfss->writeback) >
208 NFS_CONGESTION_ON_THRESH) { 208 NFS_CONGESTION_ON_THRESH) {
209 set_bdi_congested(&nfss->backing_dev_info, 209 set_bdi_congested(&nfss->backing_dev_info,
210 BLK_RW_ASYNC); 210 BLK_RW_ASYNC);
211 } 211 }
212 } 212 }
213 return ret; 213 return ret;
214 } 214 }
215 215
216 static void nfs_end_page_writeback(struct page *page) 216 static void nfs_end_page_writeback(struct page *page)
217 { 217 {
218 struct inode *inode = page->mapping->host; 218 struct inode *inode = page->mapping->host;
219 struct nfs_server *nfss = NFS_SERVER(inode); 219 struct nfs_server *nfss = NFS_SERVER(inode);
220 220
221 end_page_writeback(page); 221 end_page_writeback(page);
222 page_cache_release(page); 222 page_cache_release(page);
223 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) 223 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
224 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 224 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
225 } 225 }
226 226
227 static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) 227 static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
228 { 228 {
229 struct inode *inode = page->mapping->host; 229 struct inode *inode = page->mapping->host;
230 struct nfs_page *req; 230 struct nfs_page *req;
231 int ret; 231 int ret;
232 232
233 spin_lock(&inode->i_lock); 233 spin_lock(&inode->i_lock);
234 for (;;) { 234 for (;;) {
235 req = nfs_page_find_request_locked(page); 235 req = nfs_page_find_request_locked(page);
236 if (req == NULL) 236 if (req == NULL)
237 break; 237 break;
238 if (nfs_set_page_tag_locked(req)) 238 if (nfs_set_page_tag_locked(req))
239 break; 239 break;
240 /* Note: If we hold the page lock, as is the case in nfs_writepage, 240 /* Note: If we hold the page lock, as is the case in nfs_writepage,
241 * then the call to nfs_set_page_tag_locked() will always 241 * then the call to nfs_set_page_tag_locked() will always
242 * succeed provided that someone hasn't already marked the 242 * succeed provided that someone hasn't already marked the
243 * request as dirty (in which case we don't care). 243 * request as dirty (in which case we don't care).
244 */ 244 */
245 spin_unlock(&inode->i_lock); 245 spin_unlock(&inode->i_lock);
246 if (!nonblock) 246 if (!nonblock)
247 ret = nfs_wait_on_request(req); 247 ret = nfs_wait_on_request(req);
248 else 248 else
249 ret = -EAGAIN; 249 ret = -EAGAIN;
250 nfs_release_request(req); 250 nfs_release_request(req);
251 if (ret != 0) 251 if (ret != 0)
252 return ERR_PTR(ret); 252 return ERR_PTR(ret);
253 spin_lock(&inode->i_lock); 253 spin_lock(&inode->i_lock);
254 } 254 }
255 spin_unlock(&inode->i_lock); 255 spin_unlock(&inode->i_lock);
256 return req; 256 return req;
257 } 257 }
258 258
259 /* 259 /*
260 * Find an associated nfs write request, and prepare to flush it out 260 * Find an associated nfs write request, and prepare to flush it out
261 * May return an error if the user signalled nfs_wait_on_request(). 261 * May return an error if the user signalled nfs_wait_on_request().
262 */ 262 */
263 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, 263 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
264 struct page *page, bool nonblock) 264 struct page *page, bool nonblock)
265 { 265 {
266 struct nfs_page *req; 266 struct nfs_page *req;
267 int ret = 0; 267 int ret = 0;
268 268
269 req = nfs_find_and_lock_request(page, nonblock); 269 req = nfs_find_and_lock_request(page, nonblock);
270 if (!req) 270 if (!req)
271 goto out; 271 goto out;
272 ret = PTR_ERR(req); 272 ret = PTR_ERR(req);
273 if (IS_ERR(req)) 273 if (IS_ERR(req))
274 goto out; 274 goto out;
275 275
276 ret = nfs_set_page_writeback(page); 276 ret = nfs_set_page_writeback(page);
277 BUG_ON(ret != 0); 277 BUG_ON(ret != 0);
278 BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); 278 BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
279 279
280 if (!nfs_pageio_add_request(pgio, req)) { 280 if (!nfs_pageio_add_request(pgio, req)) {
281 nfs_redirty_request(req); 281 nfs_redirty_request(req);
282 ret = pgio->pg_error; 282 ret = pgio->pg_error;
283 } 283 }
284 out: 284 out:
285 return ret; 285 return ret;
286 } 286 }
287 287
288 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) 288 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
289 { 289 {
290 struct inode *inode = page->mapping->host; 290 struct inode *inode = page->mapping->host;
291 int ret; 291 int ret;
292 292
293 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 293 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
294 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 294 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
295 295
296 nfs_pageio_cond_complete(pgio, page->index); 296 nfs_pageio_cond_complete(pgio, page->index);
297 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); 297 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
298 if (ret == -EAGAIN) { 298 if (ret == -EAGAIN) {
299 redirty_page_for_writepage(wbc, page); 299 redirty_page_for_writepage(wbc, page);
300 ret = 0; 300 ret = 0;
301 } 301 }
302 return ret; 302 return ret;
303 } 303 }
304 304
305 /* 305 /*
306 * Write an mmapped page to the server. 306 * Write an mmapped page to the server.
307 */ 307 */
308 static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) 308 static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
309 { 309 {
310 struct nfs_pageio_descriptor pgio; 310 struct nfs_pageio_descriptor pgio;
311 int err; 311 int err;
312 312
313 nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); 313 nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc));
314 err = nfs_do_writepage(page, wbc, &pgio); 314 err = nfs_do_writepage(page, wbc, &pgio);
315 nfs_pageio_complete(&pgio); 315 nfs_pageio_complete(&pgio);
316 if (err < 0) 316 if (err < 0)
317 return err; 317 return err;
318 if (pgio.pg_error < 0) 318 if (pgio.pg_error < 0)
319 return pgio.pg_error; 319 return pgio.pg_error;
320 return 0; 320 return 0;
321 } 321 }
322 322
323 int nfs_writepage(struct page *page, struct writeback_control *wbc) 323 int nfs_writepage(struct page *page, struct writeback_control *wbc)
324 { 324 {
325 int ret; 325 int ret;
326 326
327 ret = nfs_writepage_locked(page, wbc); 327 ret = nfs_writepage_locked(page, wbc);
328 unlock_page(page); 328 unlock_page(page);
329 return ret; 329 return ret;
330 } 330 }
331 331
332 static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) 332 static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
333 { 333 {
334 int ret; 334 int ret;
335 335
336 ret = nfs_do_writepage(page, wbc, data); 336 ret = nfs_do_writepage(page, wbc, data);
337 unlock_page(page); 337 unlock_page(page);
338 return ret; 338 return ret;
339 } 339 }
340 340
341 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 341 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
342 { 342 {
343 struct inode *inode = mapping->host; 343 struct inode *inode = mapping->host;
344 unsigned long *bitlock = &NFS_I(inode)->flags; 344 unsigned long *bitlock = &NFS_I(inode)->flags;
345 struct nfs_pageio_descriptor pgio; 345 struct nfs_pageio_descriptor pgio;
346 int err; 346 int err;
347 347
348 /* Stop dirtying of new pages while we sync */ 348 /* Stop dirtying of new pages while we sync */
349 err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, 349 err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
350 nfs_wait_bit_killable, TASK_KILLABLE); 350 nfs_wait_bit_killable, TASK_KILLABLE);
351 if (err) 351 if (err)
352 goto out_err; 352 goto out_err;
353 353
354 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 354 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
355 355
356 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); 356 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
357 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 357 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
358 nfs_pageio_complete(&pgio); 358 nfs_pageio_complete(&pgio);
359 359
360 clear_bit_unlock(NFS_INO_FLUSHING, bitlock); 360 clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
361 smp_mb__after_clear_bit(); 361 smp_mb__after_clear_bit();
362 wake_up_bit(bitlock, NFS_INO_FLUSHING); 362 wake_up_bit(bitlock, NFS_INO_FLUSHING);
363 363
364 if (err < 0) 364 if (err < 0)
365 goto out_err; 365 goto out_err;
366 err = pgio.pg_error; 366 err = pgio.pg_error;
367 if (err < 0) 367 if (err < 0)
368 goto out_err; 368 goto out_err;
369 return 0; 369 return 0;
370 out_err: 370 out_err:
371 return err; 371 return err;
372 } 372 }
373 373
374 /* 374 /*
375 * Insert a write request into an inode 375 * Insert a write request into an inode
376 */ 376 */
377 static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) 377 static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
378 { 378 {
379 struct nfs_inode *nfsi = NFS_I(inode); 379 struct nfs_inode *nfsi = NFS_I(inode);
380 int error; 380 int error;
381 381
382 error = radix_tree_preload(GFP_NOFS); 382 error = radix_tree_preload(GFP_NOFS);
383 if (error != 0) 383 if (error != 0)
384 goto out; 384 goto out;
385 385
386 /* Lock the request! */ 386 /* Lock the request! */
387 nfs_lock_request_dontget(req); 387 nfs_lock_request_dontget(req);
388 388
389 spin_lock(&inode->i_lock); 389 spin_lock(&inode->i_lock);
390 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); 390 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
391 BUG_ON(error); 391 BUG_ON(error);
392 if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) 392 if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
393 nfsi->change_attr++; 393 nfsi->change_attr++;
394 set_bit(PG_MAPPED, &req->wb_flags); 394 set_bit(PG_MAPPED, &req->wb_flags);
395 SetPagePrivate(req->wb_page); 395 SetPagePrivate(req->wb_page);
396 set_page_private(req->wb_page, (unsigned long)req); 396 set_page_private(req->wb_page, (unsigned long)req);
397 nfsi->npages++; 397 nfsi->npages++;
398 kref_get(&req->wb_kref); 398 kref_get(&req->wb_kref);
399 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, 399 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
400 NFS_PAGE_TAG_LOCKED); 400 NFS_PAGE_TAG_LOCKED);
401 spin_unlock(&inode->i_lock); 401 spin_unlock(&inode->i_lock);
402 radix_tree_preload_end(); 402 radix_tree_preload_end();
403 out: 403 out:
404 return error; 404 return error;
405 } 405 }
406 406
407 /* 407 /*
408 * Remove a write request from an inode 408 * Remove a write request from an inode
409 */ 409 */
410 static void nfs_inode_remove_request(struct nfs_page *req) 410 static void nfs_inode_remove_request(struct nfs_page *req)
411 { 411 {
412 struct inode *inode = req->wb_context->dentry->d_inode; 412 struct inode *inode = req->wb_context->dentry->d_inode;
413 struct nfs_inode *nfsi = NFS_I(inode); 413 struct nfs_inode *nfsi = NFS_I(inode);
414 414
415 BUG_ON (!NFS_WBACK_BUSY(req)); 415 BUG_ON (!NFS_WBACK_BUSY(req));
416 416
417 spin_lock(&inode->i_lock); 417 spin_lock(&inode->i_lock);
418 set_page_private(req->wb_page, 0); 418 set_page_private(req->wb_page, 0);
419 ClearPagePrivate(req->wb_page); 419 ClearPagePrivate(req->wb_page);
420 clear_bit(PG_MAPPED, &req->wb_flags); 420 clear_bit(PG_MAPPED, &req->wb_flags);
421 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 421 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
422 nfsi->npages--; 422 nfsi->npages--;
423 spin_unlock(&inode->i_lock); 423 spin_unlock(&inode->i_lock);
424 nfs_release_request(req); 424 nfs_release_request(req);
425 } 425 }
426 426
427 static void 427 static void
428 nfs_mark_request_dirty(struct nfs_page *req) 428 nfs_mark_request_dirty(struct nfs_page *req)
429 { 429 {
430 __set_page_dirty_nobuffers(req->wb_page); 430 __set_page_dirty_nobuffers(req->wb_page);
431 __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); 431 __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
432 } 432 }
433 433
434 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 434 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
435 /* 435 /*
436 * Add a request to the inode's commit list. 436 * Add a request to the inode's commit list.
437 */ 437 */
438 static void 438 static void
439 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) 439 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
440 { 440 {
441 struct inode *inode = req->wb_context->dentry->d_inode; 441 struct inode *inode = req->wb_context->dentry->d_inode;
442 struct nfs_inode *nfsi = NFS_I(inode); 442 struct nfs_inode *nfsi = NFS_I(inode);
443 443
444 spin_lock(&inode->i_lock); 444 spin_lock(&inode->i_lock);
445 set_bit(PG_CLEAN, &(req)->wb_flags); 445 set_bit(PG_CLEAN, &(req)->wb_flags);
446 radix_tree_tag_set(&nfsi->nfs_page_tree, 446 radix_tree_tag_set(&nfsi->nfs_page_tree,
447 req->wb_index, 447 req->wb_index,
448 NFS_PAGE_TAG_COMMIT); 448 NFS_PAGE_TAG_COMMIT);
449 nfsi->ncommit++; 449 nfsi->ncommit++;
450 spin_unlock(&inode->i_lock); 450 spin_unlock(&inode->i_lock);
451 pnfs_mark_request_commit(req, lseg); 451 pnfs_mark_request_commit(req, lseg);
452 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 452 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
453 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); 453 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
454 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 454 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
455 } 455 }
456 456
457 static int 457 static int
458 nfs_clear_request_commit(struct nfs_page *req) 458 nfs_clear_request_commit(struct nfs_page *req)
459 { 459 {
460 struct page *page = req->wb_page; 460 struct page *page = req->wb_page;
461 461
462 if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { 462 if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
463 dec_zone_page_state(page, NR_UNSTABLE_NFS); 463 dec_zone_page_state(page, NR_UNSTABLE_NFS);
464 dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); 464 dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
465 return 1; 465 return 1;
466 } 466 }
467 return 0; 467 return 0;
468 } 468 }
469 469
470 static inline 470 static inline
471 int nfs_write_need_commit(struct nfs_write_data *data) 471 int nfs_write_need_commit(struct nfs_write_data *data)
472 { 472 {
473 if (data->verf.committed == NFS_DATA_SYNC) 473 if (data->verf.committed == NFS_DATA_SYNC)
474 return data->lseg == NULL; 474 return data->lseg == NULL;
475 else 475 else
476 return data->verf.committed != NFS_FILE_SYNC; 476 return data->verf.committed != NFS_FILE_SYNC;
477 } 477 }
478 478
479 static inline 479 static inline
480 int nfs_reschedule_unstable_write(struct nfs_page *req, 480 int nfs_reschedule_unstable_write(struct nfs_page *req,
481 struct nfs_write_data *data) 481 struct nfs_write_data *data)
482 { 482 {
483 if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { 483 if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
484 nfs_mark_request_commit(req, data->lseg); 484 nfs_mark_request_commit(req, data->lseg);
485 return 1; 485 return 1;
486 } 486 }
487 if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { 487 if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
488 nfs_mark_request_dirty(req); 488 nfs_mark_request_dirty(req);
489 return 1; 489 return 1;
490 } 490 }
491 return 0; 491 return 0;
492 } 492 }
493 #else 493 #else
494 static inline void 494 static inline void
495 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) 495 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
496 { 496 {
497 } 497 }
498 498
499 static inline int 499 static inline int
500 nfs_clear_request_commit(struct nfs_page *req) 500 nfs_clear_request_commit(struct nfs_page *req)
501 { 501 {
502 return 0; 502 return 0;
503 } 503 }
504 504
505 static inline 505 static inline
506 int nfs_write_need_commit(struct nfs_write_data *data) 506 int nfs_write_need_commit(struct nfs_write_data *data)
507 { 507 {
508 return 0; 508 return 0;
509 } 509 }
510 510
511 static inline 511 static inline
512 int nfs_reschedule_unstable_write(struct nfs_page *req, 512 int nfs_reschedule_unstable_write(struct nfs_page *req,
513 struct nfs_write_data *data) 513 struct nfs_write_data *data)
514 { 514 {
515 return 0; 515 return 0;
516 } 516 }
517 #endif 517 #endif
518 518
519 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 519 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
520 static int 520 static int
521 nfs_need_commit(struct nfs_inode *nfsi) 521 nfs_need_commit(struct nfs_inode *nfsi)
522 { 522 {
523 return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); 523 return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
524 } 524 }
525 525
526 /* 526 /*
527 * nfs_scan_commit - Scan an inode for commit requests 527 * nfs_scan_commit - Scan an inode for commit requests
528 * @inode: NFS inode to scan 528 * @inode: NFS inode to scan
529 * @dst: destination list 529 * @dst: destination list
530 * @idx_start: lower bound of page->index to scan. 530 * @idx_start: lower bound of page->index to scan.
531 * @npages: idx_start + npages sets the upper bound to scan. 531 * @npages: idx_start + npages sets the upper bound to scan.
532 * 532 *
533 * Moves requests from the inode's 'commit' request list. 533 * Moves requests from the inode's 'commit' request list.
534 * The requests are *not* checked to ensure that they form a contiguous set. 534 * The requests are *not* checked to ensure that they form a contiguous set.
535 */ 535 */
536 static int 536 static int
537 nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 537 nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
538 { 538 {
539 struct nfs_inode *nfsi = NFS_I(inode); 539 struct nfs_inode *nfsi = NFS_I(inode);
540 int ret; 540 int ret;
541 541
542 if (!nfs_need_commit(nfsi)) 542 if (!nfs_need_commit(nfsi))
543 return 0; 543 return 0;
544 544
545 spin_lock(&inode->i_lock); 545 spin_lock(&inode->i_lock);
546 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); 546 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
547 if (ret > 0) 547 if (ret > 0)
548 nfsi->ncommit -= ret; 548 nfsi->ncommit -= ret;
549 spin_unlock(&inode->i_lock); 549 spin_unlock(&inode->i_lock);
550 550
551 if (nfs_need_commit(NFS_I(inode))) 551 if (nfs_need_commit(NFS_I(inode)))
552 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 552 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
553 553
554 return ret; 554 return ret;
555 } 555 }
556 #else 556 #else
557 static inline int nfs_need_commit(struct nfs_inode *nfsi) 557 static inline int nfs_need_commit(struct nfs_inode *nfsi)
558 { 558 {
559 return 0; 559 return 0;
560 } 560 }
561 561
562 static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 562 static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
563 { 563 {
564 return 0; 564 return 0;
565 } 565 }
566 #endif 566 #endif
567 567
568 /* 568 /*
569 * Search for an existing write request, and attempt to update 569 * Search for an existing write request, and attempt to update
570 * it to reflect a new dirty region on a given page. 570 * it to reflect a new dirty region on a given page.
571 * 571 *
572 * If the attempt fails, then the existing request is flushed out 572 * If the attempt fails, then the existing request is flushed out
573 * to disk. 573 * to disk.
574 */ 574 */
575 static struct nfs_page *nfs_try_to_update_request(struct inode *inode, 575 static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
576 struct page *page, 576 struct page *page,
577 unsigned int offset, 577 unsigned int offset,
578 unsigned int bytes) 578 unsigned int bytes)
579 { 579 {
580 struct nfs_page *req; 580 struct nfs_page *req;
581 unsigned int rqend; 581 unsigned int rqend;
582 unsigned int end; 582 unsigned int end;
583 int error; 583 int error;
584 584
585 if (!PagePrivate(page)) 585 if (!PagePrivate(page))
586 return NULL; 586 return NULL;
587 587
588 end = offset + bytes; 588 end = offset + bytes;
589 spin_lock(&inode->i_lock); 589 spin_lock(&inode->i_lock);
590 590
591 for (;;) { 591 for (;;) {
592 req = nfs_page_find_request_locked(page); 592 req = nfs_page_find_request_locked(page);
593 if (req == NULL) 593 if (req == NULL)
594 goto out_unlock; 594 goto out_unlock;
595 595
596 rqend = req->wb_offset + req->wb_bytes; 596 rqend = req->wb_offset + req->wb_bytes;
597 /* 597 /*
598 * Tell the caller to flush out the request if 598 * Tell the caller to flush out the request if
599 * the offsets are non-contiguous. 599 * the offsets are non-contiguous.
600 * Note: nfs_flush_incompatible() will already 600 * Note: nfs_flush_incompatible() will already
601 * have flushed out requests having wrong owners. 601 * have flushed out requests having wrong owners.
602 */ 602 */
603 if (offset > rqend 603 if (offset > rqend
604 || end < req->wb_offset) 604 || end < req->wb_offset)
605 goto out_flushme; 605 goto out_flushme;
606 606
607 if (nfs_set_page_tag_locked(req)) 607 if (nfs_set_page_tag_locked(req))
608 break; 608 break;
609 609
610 /* The request is locked, so wait and then retry */ 610 /* The request is locked, so wait and then retry */
611 spin_unlock(&inode->i_lock); 611 spin_unlock(&inode->i_lock);
612 error = nfs_wait_on_request(req); 612 error = nfs_wait_on_request(req);
613 nfs_release_request(req); 613 nfs_release_request(req);
614 if (error != 0) 614 if (error != 0)
615 goto out_err; 615 goto out_err;
616 spin_lock(&inode->i_lock); 616 spin_lock(&inode->i_lock);
617 } 617 }
618 618
619 if (nfs_clear_request_commit(req) && 619 if (nfs_clear_request_commit(req) &&
620 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, 620 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
621 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { 621 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) {
622 NFS_I(inode)->ncommit--; 622 NFS_I(inode)->ncommit--;
623 pnfs_clear_request_commit(req); 623 pnfs_clear_request_commit(req);
624 } 624 }
625 625
626 /* Okay, the request matches. Update the region */ 626 /* Okay, the request matches. Update the region */
627 if (offset < req->wb_offset) { 627 if (offset < req->wb_offset) {
628 req->wb_offset = offset; 628 req->wb_offset = offset;
629 req->wb_pgbase = offset; 629 req->wb_pgbase = offset;
630 } 630 }
631 if (end > rqend) 631 if (end > rqend)
632 req->wb_bytes = end - req->wb_offset; 632 req->wb_bytes = end - req->wb_offset;
633 else 633 else
634 req->wb_bytes = rqend - req->wb_offset; 634 req->wb_bytes = rqend - req->wb_offset;
635 out_unlock: 635 out_unlock:
636 spin_unlock(&inode->i_lock); 636 spin_unlock(&inode->i_lock);
637 return req; 637 return req;
638 out_flushme: 638 out_flushme:
639 spin_unlock(&inode->i_lock); 639 spin_unlock(&inode->i_lock);
640 nfs_release_request(req); 640 nfs_release_request(req);
641 error = nfs_wb_page(inode, page); 641 error = nfs_wb_page(inode, page);
642 out_err: 642 out_err:
643 return ERR_PTR(error); 643 return ERR_PTR(error);
644 } 644 }
645 645
646 /* 646 /*
647 * Try to update an existing write request, or create one if there is none. 647 * Try to update an existing write request, or create one if there is none.
648 * 648 *
649 * Note: Should always be called with the Page Lock held to prevent races 649 * Note: Should always be called with the Page Lock held to prevent races
650 * if we have to add a new request. Also assumes that the caller has 650 * if we have to add a new request. Also assumes that the caller has
651 * already called nfs_flush_incompatible() if necessary. 651 * already called nfs_flush_incompatible() if necessary.
652 */ 652 */
653 static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, 653 static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
654 struct page *page, unsigned int offset, unsigned int bytes) 654 struct page *page, unsigned int offset, unsigned int bytes)
655 { 655 {
656 struct inode *inode = page->mapping->host; 656 struct inode *inode = page->mapping->host;
657 struct nfs_page *req; 657 struct nfs_page *req;
658 int error; 658 int error;
659 659
660 req = nfs_try_to_update_request(inode, page, offset, bytes); 660 req = nfs_try_to_update_request(inode, page, offset, bytes);
661 if (req != NULL) 661 if (req != NULL)
662 goto out; 662 goto out;
663 req = nfs_create_request(ctx, inode, page, offset, bytes); 663 req = nfs_create_request(ctx, inode, page, offset, bytes);
664 if (IS_ERR(req)) 664 if (IS_ERR(req))
665 goto out; 665 goto out;
666 error = nfs_inode_add_request(inode, req); 666 error = nfs_inode_add_request(inode, req);
667 if (error != 0) { 667 if (error != 0) {
668 nfs_release_request(req); 668 nfs_release_request(req);
669 req = ERR_PTR(error); 669 req = ERR_PTR(error);
670 } 670 }
671 out: 671 out:
672 return req; 672 return req;
673 } 673 }
674 674
675 static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, 675 static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
676 unsigned int offset, unsigned int count) 676 unsigned int offset, unsigned int count)
677 { 677 {
678 struct nfs_page *req; 678 struct nfs_page *req;
679 679
680 req = nfs_setup_write_request(ctx, page, offset, count); 680 req = nfs_setup_write_request(ctx, page, offset, count);
681 if (IS_ERR(req)) 681 if (IS_ERR(req))
682 return PTR_ERR(req); 682 return PTR_ERR(req);
683 /* Update file length */ 683 /* Update file length */
684 nfs_grow_file(page, offset, count); 684 nfs_grow_file(page, offset, count);
685 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); 685 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
686 nfs_mark_request_dirty(req); 686 nfs_mark_request_dirty(req);
687 nfs_clear_page_tag_locked(req); 687 nfs_clear_page_tag_locked(req);
688 return 0; 688 return 0;
689 } 689 }
690 690
691 int nfs_flush_incompatible(struct file *file, struct page *page) 691 int nfs_flush_incompatible(struct file *file, struct page *page)
692 { 692 {
693 struct nfs_open_context *ctx = nfs_file_open_context(file); 693 struct nfs_open_context *ctx = nfs_file_open_context(file);
694 struct nfs_page *req; 694 struct nfs_page *req;
695 int do_flush, status; 695 int do_flush, status;
696 /* 696 /*
697 * Look for a request corresponding to this page. If there 697 * Look for a request corresponding to this page. If there
698 * is one, and it belongs to another file, we flush it out 698 * is one, and it belongs to another file, we flush it out
699 * before we try to copy anything into the page. Do this 699 * before we try to copy anything into the page. Do this
700 * due to the lack of an ACCESS-type call in NFSv2. 700 * due to the lack of an ACCESS-type call in NFSv2.
701 * Also do the same if we find a request from an existing 701 * Also do the same if we find a request from an existing
702 * dropped page. 702 * dropped page.
703 */ 703 */
704 do { 704 do {
705 req = nfs_page_find_request(page); 705 req = nfs_page_find_request(page);
706 if (req == NULL) 706 if (req == NULL)
707 return 0; 707 return 0;
708 do_flush = req->wb_page != page || req->wb_context != ctx || 708 do_flush = req->wb_page != page || req->wb_context != ctx ||
709 req->wb_lock_context->lockowner != current->files || 709 req->wb_lock_context->lockowner != current->files ||
710 req->wb_lock_context->pid != current->tgid; 710 req->wb_lock_context->pid != current->tgid;
711 nfs_release_request(req); 711 nfs_release_request(req);
712 if (!do_flush) 712 if (!do_flush)
713 return 0; 713 return 0;
714 status = nfs_wb_page(page->mapping->host, page); 714 status = nfs_wb_page(page->mapping->host, page);
715 } while (status == 0); 715 } while (status == 0);
716 return status; 716 return status;
717 } 717 }
718 718
719 /* 719 /*
720 * If the page cache is marked as unsafe or invalid, then we can't rely on 720 * If the page cache is marked as unsafe or invalid, then we can't rely on
721 * the PageUptodate() flag. In this case, we will need to turn off 721 * the PageUptodate() flag. In this case, we will need to turn off
722 * write optimisations that depend on the page contents being correct. 722 * write optimisations that depend on the page contents being correct.
723 */ 723 */
724 static int nfs_write_pageuptodate(struct page *page, struct inode *inode) 724 static int nfs_write_pageuptodate(struct page *page, struct inode *inode)
725 { 725 {
726 return PageUptodate(page) && 726 return PageUptodate(page) &&
727 !(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA)); 727 !(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA));
728 } 728 }
729 729
730 /* 730 /*
731 * Update and possibly write a cached page of an NFS file. 731 * Update and possibly write a cached page of an NFS file.
732 * 732 *
733 * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad 733 * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad
734 * things with a page scheduled for an RPC call (e.g. invalidate it). 734 * things with a page scheduled for an RPC call (e.g. invalidate it).
735 */ 735 */
736 int nfs_updatepage(struct file *file, struct page *page, 736 int nfs_updatepage(struct file *file, struct page *page,
737 unsigned int offset, unsigned int count) 737 unsigned int offset, unsigned int count)
738 { 738 {
739 struct nfs_open_context *ctx = nfs_file_open_context(file); 739 struct nfs_open_context *ctx = nfs_file_open_context(file);
740 struct inode *inode = page->mapping->host; 740 struct inode *inode = page->mapping->host;
741 int status = 0; 741 int status = 0;
742 742
743 nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); 743 nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
744 744
745 dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", 745 dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n",
746 file->f_path.dentry->d_parent->d_name.name, 746 file->f_path.dentry->d_parent->d_name.name,
747 file->f_path.dentry->d_name.name, count, 747 file->f_path.dentry->d_name.name, count,
748 (long long)(page_offset(page) + offset)); 748 (long long)(page_offset(page) + offset));
749 749
750 /* If we're not using byte range locks, and we know the page 750 /* If we're not using byte range locks, and we know the page
751 * is up to date, it may be more efficient to extend the write 751 * is up to date, it may be more efficient to extend the write
752 * to cover the entire page in order to avoid fragmentation 752 * to cover the entire page in order to avoid fragmentation
753 * inefficiencies. 753 * inefficiencies.
754 */ 754 */
755 if (nfs_write_pageuptodate(page, inode) && 755 if (nfs_write_pageuptodate(page, inode) &&
756 inode->i_flock == NULL && 756 inode->i_flock == NULL &&
757 !(file->f_flags & O_DSYNC)) { 757 !(file->f_flags & O_DSYNC)) {
758 count = max(count + offset, nfs_page_length(page)); 758 count = max(count + offset, nfs_page_length(page));
759 offset = 0; 759 offset = 0;
760 } 760 }
761 761
762 status = nfs_writepage_setup(ctx, page, offset, count); 762 status = nfs_writepage_setup(ctx, page, offset, count);
763 if (status < 0) 763 if (status < 0)
764 nfs_set_pageerror(page); 764 nfs_set_pageerror(page);
765 765
766 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", 766 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
767 status, (long long)i_size_read(inode)); 767 status, (long long)i_size_read(inode));
768 return status; 768 return status;
769 } 769 }
770 770
771 static void nfs_writepage_release(struct nfs_page *req, 771 static void nfs_writepage_release(struct nfs_page *req,
772 struct nfs_write_data *data) 772 struct nfs_write_data *data)
773 { 773 {
774 struct page *page = req->wb_page; 774 struct page *page = req->wb_page;
775 775
776 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) 776 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))
777 nfs_inode_remove_request(req); 777 nfs_inode_remove_request(req);
778 nfs_clear_page_tag_locked(req); 778 nfs_clear_page_tag_locked(req);
779 nfs_end_page_writeback(page); 779 nfs_end_page_writeback(page);
780 } 780 }
781 781
782 static int flush_task_priority(int how) 782 static int flush_task_priority(int how)
783 { 783 {
784 switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { 784 switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) {
785 case FLUSH_HIGHPRI: 785 case FLUSH_HIGHPRI:
786 return RPC_PRIORITY_HIGH; 786 return RPC_PRIORITY_HIGH;
787 case FLUSH_LOWPRI: 787 case FLUSH_LOWPRI:
788 return RPC_PRIORITY_LOW; 788 return RPC_PRIORITY_LOW;
789 } 789 }
790 return RPC_PRIORITY_NORMAL; 790 return RPC_PRIORITY_NORMAL;
791 } 791 }
792 792
793 int nfs_initiate_write(struct nfs_write_data *data, 793 int nfs_initiate_write(struct nfs_write_data *data,
794 struct rpc_clnt *clnt, 794 struct rpc_clnt *clnt,
795 const struct rpc_call_ops *call_ops, 795 const struct rpc_call_ops *call_ops,
796 int how) 796 int how)
797 { 797 {
798 struct inode *inode = data->inode; 798 struct inode *inode = data->inode;
799 int priority = flush_task_priority(how); 799 int priority = flush_task_priority(how);
800 struct rpc_task *task; 800 struct rpc_task *task;
801 struct rpc_message msg = { 801 struct rpc_message msg = {
802 .rpc_argp = &data->args, 802 .rpc_argp = &data->args,
803 .rpc_resp = &data->res, 803 .rpc_resp = &data->res,
804 .rpc_cred = data->cred, 804 .rpc_cred = data->cred,
805 }; 805 };
806 struct rpc_task_setup task_setup_data = { 806 struct rpc_task_setup task_setup_data = {
807 .rpc_client = clnt, 807 .rpc_client = clnt,
808 .task = &data->task, 808 .task = &data->task,
809 .rpc_message = &msg, 809 .rpc_message = &msg,
810 .callback_ops = call_ops, 810 .callback_ops = call_ops,
811 .callback_data = data, 811 .callback_data = data,
812 .workqueue = nfsiod_workqueue, 812 .workqueue = nfsiod_workqueue,
813 .flags = RPC_TASK_ASYNC, 813 .flags = RPC_TASK_ASYNC,
814 .priority = priority, 814 .priority = priority,
815 }; 815 };
816 int ret = 0; 816 int ret = 0;
817 817
818 /* Set up the initial task struct. */ 818 /* Set up the initial task struct. */
819 NFS_PROTO(inode)->write_setup(data, &msg); 819 NFS_PROTO(inode)->write_setup(data, &msg);
820 820
821 dprintk("NFS: %5u initiated write call " 821 dprintk("NFS: %5u initiated write call "
822 "(req %s/%lld, %u bytes @ offset %llu)\n", 822 "(req %s/%lld, %u bytes @ offset %llu)\n",
823 data->task.tk_pid, 823 data->task.tk_pid,
824 inode->i_sb->s_id, 824 inode->i_sb->s_id,
825 (long long)NFS_FILEID(inode), 825 (long long)NFS_FILEID(inode),
826 data->args.count, 826 data->args.count,
827 (unsigned long long)data->args.offset); 827 (unsigned long long)data->args.offset);
828 828
829 task = rpc_run_task(&task_setup_data); 829 task = rpc_run_task(&task_setup_data);
830 if (IS_ERR(task)) { 830 if (IS_ERR(task)) {
831 ret = PTR_ERR(task); 831 ret = PTR_ERR(task);
832 goto out; 832 goto out;
833 } 833 }
834 if (how & FLUSH_SYNC) { 834 if (how & FLUSH_SYNC) {
835 ret = rpc_wait_for_completion_task(task); 835 ret = rpc_wait_for_completion_task(task);
836 if (ret == 0) 836 if (ret == 0)
837 ret = task->tk_status; 837 ret = task->tk_status;
838 } 838 }
839 rpc_put_task(task); 839 rpc_put_task(task);
840 out: 840 out:
841 return ret; 841 return ret;
842 } 842 }
843 EXPORT_SYMBOL_GPL(nfs_initiate_write); 843 EXPORT_SYMBOL_GPL(nfs_initiate_write);
844 844
845 /* 845 /*
846 * Set up the argument/result storage required for the RPC call. 846 * Set up the argument/result storage required for the RPC call.
847 */ 847 */
848 static int nfs_write_rpcsetup(struct nfs_page *req, 848 static int nfs_write_rpcsetup(struct nfs_page *req,
849 struct nfs_write_data *data, 849 struct nfs_write_data *data,
850 const struct rpc_call_ops *call_ops, 850 const struct rpc_call_ops *call_ops,
851 unsigned int count, unsigned int offset, 851 unsigned int count, unsigned int offset,
852 struct pnfs_layout_segment *lseg, 852 struct pnfs_layout_segment *lseg,
853 int how) 853 int how)
854 { 854 {
855 struct inode *inode = req->wb_context->dentry->d_inode; 855 struct inode *inode = req->wb_context->dentry->d_inode;
856 856
857 /* Set up the RPC argument and reply structs 857 /* Set up the RPC argument and reply structs
858 * NB: take care not to mess about with data->commit et al. */ 858 * NB: take care not to mess about with data->commit et al. */
859 859
860 data->req = req; 860 data->req = req;
861 data->inode = inode = req->wb_context->dentry->d_inode; 861 data->inode = inode = req->wb_context->dentry->d_inode;
862 data->cred = req->wb_context->cred; 862 data->cred = req->wb_context->cred;
863 data->lseg = get_lseg(lseg); 863 data->lseg = get_lseg(lseg);
864 864
865 data->args.fh = NFS_FH(inode); 865 data->args.fh = NFS_FH(inode);
866 data->args.offset = req_offset(req) + offset; 866 data->args.offset = req_offset(req) + offset;
867 /* pnfs_set_layoutcommit needs this */ 867 /* pnfs_set_layoutcommit needs this */
868 data->mds_offset = data->args.offset; 868 data->mds_offset = data->args.offset;
869 data->args.pgbase = req->wb_pgbase + offset; 869 data->args.pgbase = req->wb_pgbase + offset;
870 data->args.pages = data->pagevec; 870 data->args.pages = data->pagevec;
871 data->args.count = count; 871 data->args.count = count;
872 data->args.context = get_nfs_open_context(req->wb_context); 872 data->args.context = get_nfs_open_context(req->wb_context);
873 data->args.lock_context = req->wb_lock_context; 873 data->args.lock_context = req->wb_lock_context;
874 data->args.stable = NFS_UNSTABLE; 874 data->args.stable = NFS_UNSTABLE;
875 if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { 875 if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
876 data->args.stable = NFS_DATA_SYNC; 876 data->args.stable = NFS_DATA_SYNC;
877 if (!nfs_need_commit(NFS_I(inode))) 877 if (!nfs_need_commit(NFS_I(inode)))
878 data->args.stable = NFS_FILE_SYNC; 878 data->args.stable = NFS_FILE_SYNC;
879 } 879 }
880 880
881 data->res.fattr = &data->fattr; 881 data->res.fattr = &data->fattr;
882 data->res.count = count; 882 data->res.count = count;
883 data->res.verf = &data->verf; 883 data->res.verf = &data->verf;
884 nfs_fattr_init(&data->fattr); 884 nfs_fattr_init(&data->fattr);
885 885
886 if (data->lseg && 886 if (data->lseg &&
887 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) 887 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
888 return 0; 888 return 0;
889 889
890 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); 890 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
891 } 891 }
892 892
893 /* If a nfs_flush_* function fails, it should remove reqs from @head and 893 /* If a nfs_flush_* function fails, it should remove reqs from @head and
894 * call this on each, which will prepare them to be retried on next 894 * call this on each, which will prepare them to be retried on next
895 * writeback using standard nfs. 895 * writeback using standard nfs.
896 */ 896 */
897 static void nfs_redirty_request(struct nfs_page *req) 897 static void nfs_redirty_request(struct nfs_page *req)
898 { 898 {
899 struct page *page = req->wb_page; 899 struct page *page = req->wb_page;
900 900
901 nfs_mark_request_dirty(req); 901 nfs_mark_request_dirty(req);
902 nfs_clear_page_tag_locked(req); 902 nfs_clear_page_tag_locked(req);
903 nfs_end_page_writeback(page); 903 nfs_end_page_writeback(page);
904 } 904 }
905 905
906 /* 906 /*
907 * Generate multiple small requests to write out a single 907 * Generate multiple small requests to write out a single
908 * contiguous dirty area on one page. 908 * contiguous dirty area on one page.
909 */ 909 */
910 static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) 910 static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
911 { 911 {
912 struct nfs_page *req = nfs_list_entry(desc->pg_list.next); 912 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
913 struct page *page = req->wb_page; 913 struct page *page = req->wb_page;
914 struct nfs_write_data *data; 914 struct nfs_write_data *data;
915 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; 915 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
916 unsigned int offset; 916 unsigned int offset;
917 int requests = 0; 917 int requests = 0;
918 int ret = 0; 918 int ret = 0;
919 struct pnfs_layout_segment *lseg; 919 struct pnfs_layout_segment *lseg;
920 LIST_HEAD(list); 920 LIST_HEAD(list);
921 921
922 nfs_list_remove_request(req); 922 nfs_list_remove_request(req);
923 923
924 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 924 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
925 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || 925 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit ||
926 desc->pg_count > wsize)) 926 desc->pg_count > wsize))
927 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 927 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
928 928
929 929
930 nbytes = desc->pg_count; 930 nbytes = desc->pg_count;
931 do { 931 do {
932 size_t len = min(nbytes, wsize); 932 size_t len = min(nbytes, wsize);
933 933
934 data = nfs_writedata_alloc(1); 934 data = nfs_writedata_alloc(1);
935 if (!data) 935 if (!data)
936 goto out_bad; 936 goto out_bad;
937 list_add(&data->pages, &list); 937 list_add(&data->pages, &list);
938 requests++; 938 requests++;
939 nbytes -= len; 939 nbytes -= len;
940 } while (nbytes != 0); 940 } while (nbytes != 0);
941 atomic_set(&req->wb_complete, requests); 941 atomic_set(&req->wb_complete, requests);
942 942
943 BUG_ON(desc->pg_lseg); 943 BUG_ON(desc->pg_lseg);
944 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, 944 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
945 req_offset(req), desc->pg_count, 945 req_offset(req), desc->pg_count,
946 IOMODE_RW, GFP_NOFS); 946 IOMODE_RW, GFP_NOFS);
947 ClearPageError(page); 947 ClearPageError(page);
948 offset = 0; 948 offset = 0;
949 nbytes = desc->pg_count; 949 nbytes = desc->pg_count;
950 do { 950 do {
951 int ret2; 951 int ret2;
952 952
953 data = list_entry(list.next, struct nfs_write_data, pages); 953 data = list_entry(list.next, struct nfs_write_data, pages);
954 list_del_init(&data->pages); 954 list_del_init(&data->pages);
955 955
956 data->pagevec[0] = page; 956 data->pagevec[0] = page;
957 957
958 if (nbytes < wsize) 958 if (nbytes < wsize)
959 wsize = nbytes; 959 wsize = nbytes;
960 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, 960 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
961 wsize, offset, lseg, desc->pg_ioflags); 961 wsize, offset, lseg, desc->pg_ioflags);
962 if (ret == 0) 962 if (ret == 0)
963 ret = ret2; 963 ret = ret2;
964 offset += wsize; 964 offset += wsize;
965 nbytes -= wsize; 965 nbytes -= wsize;
966 } while (nbytes != 0); 966 } while (nbytes != 0);
967 967
968 put_lseg(lseg); 968 put_lseg(lseg);
969 desc->pg_lseg = NULL; 969 desc->pg_lseg = NULL;
970 return ret; 970 return ret;
971 971
972 out_bad: 972 out_bad:
973 while (!list_empty(&list)) { 973 while (!list_empty(&list)) {
974 data = list_entry(list.next, struct nfs_write_data, pages); 974 data = list_entry(list.next, struct nfs_write_data, pages);
975 list_del(&data->pages); 975 list_del(&data->pages);
976 nfs_writedata_free(data); 976 nfs_writedata_free(data);
977 } 977 }
978 nfs_redirty_request(req); 978 nfs_redirty_request(req);
979 return -ENOMEM; 979 return -ENOMEM;
980 } 980 }
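/*
 * A minimal, standalone sketch of the chunking arithmetic used by
 * nfs_flush_multi() above: one page's dirty span is split into wsize-sized
 * RPC requests.  wsize = 4096 and the 10000-byte span are illustrative
 * assumptions; only the loop arithmetic mirrors the kernel code, which
 * additionally allocates an nfs_write_data per chunk before issuing them.
 */
#include <stdio.h>

int main(void)
{
	size_t wsize = 4096;		/* assumed server write size */
	size_t nbytes = 10000;		/* assumed dirty span on the page */
	size_t offset = 0;
	int requests = 0;

	do {
		size_t len = nbytes < wsize ? nbytes : wsize;

		printf("request %d: %zu bytes at page offset %zu\n",
		       ++requests, len, offset);
		offset += len;
		nbytes -= len;
	} while (nbytes != 0);

	printf("total requests: %d\n", requests);
	return 0;
}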
981 981
982 /* 982 /*
983 * Create an RPC task for the given write request and kick it. 983 * Create an RPC task for the given write request and kick it.
984 * The page must have been locked by the caller. 984 * The page must have been locked by the caller.
985 * 985 *
986 * It may happen that the page we're passed is not marked dirty. 986 * It may happen that the page we're passed is not marked dirty.
987 * This is the case if nfs_updatepage detects a conflicting request 987 * This is the case if nfs_updatepage detects a conflicting request
988 * that has been written but not committed. 988 * that has been written but not committed.
989 */ 989 */
990 static int nfs_flush_one(struct nfs_pageio_descriptor *desc) 990 static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
991 { 991 {
992 struct nfs_page *req; 992 struct nfs_page *req;
993 struct page **pages; 993 struct page **pages;
994 struct nfs_write_data *data; 994 struct nfs_write_data *data;
995 struct list_head *head = &desc->pg_list; 995 struct list_head *head = &desc->pg_list;
996 struct pnfs_layout_segment *lseg = desc->pg_lseg; 996 struct pnfs_layout_segment *lseg = desc->pg_lseg;
997 int ret; 997 int ret;
998 998
999 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, 999 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
1000 desc->pg_count)); 1000 desc->pg_count));
1001 if (!data) { 1001 if (!data) {
1002 while (!list_empty(head)) { 1002 while (!list_empty(head)) {
1003 req = nfs_list_entry(head->next); 1003 req = nfs_list_entry(head->next);
1004 nfs_list_remove_request(req); 1004 nfs_list_remove_request(req);
1005 nfs_redirty_request(req); 1005 nfs_redirty_request(req);
1006 } 1006 }
1007 ret = -ENOMEM; 1007 ret = -ENOMEM;
1008 goto out; 1008 goto out;
1009 } 1009 }
1010 pages = data->pagevec; 1010 pages = data->pagevec;
1011 while (!list_empty(head)) { 1011 while (!list_empty(head)) {
1012 req = nfs_list_entry(head->next); 1012 req = nfs_list_entry(head->next);
1013 nfs_list_remove_request(req); 1013 nfs_list_remove_request(req);
1014 nfs_list_add_request(req, &data->pages); 1014 nfs_list_add_request(req, &data->pages);
1015 ClearPageError(req->wb_page); 1015 ClearPageError(req->wb_page);
1016 *pages++ = req->wb_page; 1016 *pages++ = req->wb_page;
1017 } 1017 }
1018 req = nfs_list_entry(data->pages.next); 1018 req = nfs_list_entry(data->pages.next);
1019 if ((!lseg) && list_is_singular(&data->pages)) 1019 if ((!lseg) && list_is_singular(&data->pages))
1020 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, 1020 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
1021 req_offset(req), desc->pg_count, 1021 req_offset(req), desc->pg_count,
1022 IOMODE_RW, GFP_NOFS); 1022 IOMODE_RW, GFP_NOFS);
1023 1023
1024 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 1024 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1025 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) 1025 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
1026 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 1026 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
1027 1027
1028 /* Set up the argument struct */ 1028 /* Set up the argument struct */
1029 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); 1029 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
1030 out: 1030 out:
1031 put_lseg(lseg); /* Cleans any gotten in ->pg_test */ 1031 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
1032 desc->pg_lseg = NULL; 1032 desc->pg_lseg = NULL;
1033 return ret; 1033 return ret;
1034 } 1034 }
1035 1035
1036 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1036 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
1037 struct inode *inode, int ioflags) 1037 struct inode *inode, int ioflags)
1038 { 1038 {
1039 size_t wsize = NFS_SERVER(inode)->wsize; 1039 size_t wsize = NFS_SERVER(inode)->wsize;
1040 1040
1041 if (wsize < PAGE_CACHE_SIZE) 1041 if (wsize < PAGE_CACHE_SIZE)
1042 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1042 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
1043 else 1043 else
1044 nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); 1044 nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
1045 } 1045 }
1046 1046
1047 /* 1047 /*
1048 * Handle a write reply that flushed part of a page. 1048 * Handle a write reply that flushed part of a page.
1049 */ 1049 */
1050 static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) 1050 static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
1051 { 1051 {
1052 struct nfs_write_data *data = calldata; 1052 struct nfs_write_data *data = calldata;
1053 1053
1054 dprintk("NFS: %5u write(%s/%lld %d@%lld)", 1054 dprintk("NFS: %5u write(%s/%lld %d@%lld)",
1055 task->tk_pid, 1055 task->tk_pid,
1056 data->req->wb_context->dentry->d_inode->i_sb->s_id, 1056 data->req->wb_context->dentry->d_inode->i_sb->s_id,
1057 (long long) 1057 (long long)
1058 NFS_FILEID(data->req->wb_context->dentry->d_inode), 1058 NFS_FILEID(data->req->wb_context->dentry->d_inode),
1059 data->req->wb_bytes, (long long)req_offset(data->req)); 1059 data->req->wb_bytes, (long long)req_offset(data->req));
1060 1060
1061 nfs_writeback_done(task, data); 1061 nfs_writeback_done(task, data);
1062 } 1062 }
1063 1063
1064 static void nfs_writeback_release_partial(void *calldata) 1064 static void nfs_writeback_release_partial(void *calldata)
1065 { 1065 {
1066 struct nfs_write_data *data = calldata; 1066 struct nfs_write_data *data = calldata;
1067 struct nfs_page *req = data->req; 1067 struct nfs_page *req = data->req;
1068 struct page *page = req->wb_page; 1068 struct page *page = req->wb_page;
1069 int status = data->task.tk_status; 1069 int status = data->task.tk_status;
1070 1070
1071 if (status < 0) { 1071 if (status < 0) {
1072 nfs_set_pageerror(page); 1072 nfs_set_pageerror(page);
1073 nfs_context_set_write_error(req->wb_context, status); 1073 nfs_context_set_write_error(req->wb_context, status);
1074 dprintk(", error = %d\n", status); 1074 dprintk(", error = %d\n", status);
1075 goto out; 1075 goto out;
1076 } 1076 }
1077 1077
1078 if (nfs_write_need_commit(data)) { 1078 if (nfs_write_need_commit(data)) {
1079 struct inode *inode = page->mapping->host; 1079 struct inode *inode = page->mapping->host;
1080 1080
1081 spin_lock(&inode->i_lock); 1081 spin_lock(&inode->i_lock);
1082 if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { 1082 if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) {
1083 /* Do nothing; we need to resend the writes */ 1083 /* Do nothing; we need to resend the writes */
1084 } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { 1084 } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) {
1085 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1085 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1086 dprintk(" defer commit\n"); 1086 dprintk(" defer commit\n");
1087 } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) { 1087 } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) {
1088 set_bit(PG_NEED_RESCHED, &req->wb_flags); 1088 set_bit(PG_NEED_RESCHED, &req->wb_flags);
1089 clear_bit(PG_NEED_COMMIT, &req->wb_flags); 1089 clear_bit(PG_NEED_COMMIT, &req->wb_flags);
1090 dprintk(" server reboot detected\n"); 1090 dprintk(" server reboot detected\n");
1091 } 1091 }
1092 spin_unlock(&inode->i_lock); 1092 spin_unlock(&inode->i_lock);
1093 } else 1093 } else
1094 dprintk(" OK\n"); 1094 dprintk(" OK\n");
1095 1095
1096 out: 1096 out:
1097 if (atomic_dec_and_test(&req->wb_complete)) 1097 if (atomic_dec_and_test(&req->wb_complete))
1098 nfs_writepage_release(req, data); 1098 nfs_writepage_release(req, data);
1099 nfs_writedata_release(calldata); 1099 nfs_writedata_release(calldata);
1100 } 1100 }
1101 1101
1102 #if defined(CONFIG_NFS_V4_1) 1102 #if defined(CONFIG_NFS_V4_1)
1103 void nfs_write_prepare(struct rpc_task *task, void *calldata) 1103 void nfs_write_prepare(struct rpc_task *task, void *calldata)
1104 { 1104 {
1105 struct nfs_write_data *data = calldata; 1105 struct nfs_write_data *data = calldata;
1106 1106
1107 if (nfs4_setup_sequence(NFS_SERVER(data->inode), 1107 if (nfs4_setup_sequence(NFS_SERVER(data->inode),
1108 &data->args.seq_args, 1108 &data->args.seq_args,
1109 &data->res.seq_res, 1, task)) 1109 &data->res.seq_res, 1, task))
1110 return; 1110 return;
1111 rpc_call_start(task); 1111 rpc_call_start(task);
1112 } 1112 }
1113 #endif /* CONFIG_NFS_V4_1 */ 1113 #endif /* CONFIG_NFS_V4_1 */
1114 1114
1115 static const struct rpc_call_ops nfs_write_partial_ops = { 1115 static const struct rpc_call_ops nfs_write_partial_ops = {
1116 #if defined(CONFIG_NFS_V4_1) 1116 #if defined(CONFIG_NFS_V4_1)
1117 .rpc_call_prepare = nfs_write_prepare, 1117 .rpc_call_prepare = nfs_write_prepare,
1118 #endif /* CONFIG_NFS_V4_1 */ 1118 #endif /* CONFIG_NFS_V4_1 */
1119 .rpc_call_done = nfs_writeback_done_partial, 1119 .rpc_call_done = nfs_writeback_done_partial,
1120 .rpc_release = nfs_writeback_release_partial, 1120 .rpc_release = nfs_writeback_release_partial,
1121 }; 1121 };
1122 1122
1123 /* 1123 /*
1124 * Handle a write reply that flushes a whole page. 1124 * Handle a write reply that flushes a whole page.
1125 * 1125 *
1126 * FIXME: There is an inherent race with invalidate_inode_pages and 1126 * FIXME: There is an inherent race with invalidate_inode_pages and
1127 * writebacks since the page->count is kept > 1 for as long 1127 * writebacks since the page->count is kept > 1 for as long
1128 * as the page has a write request pending. 1128 * as the page has a write request pending.
1129 */ 1129 */
1130 static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) 1130 static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1131 { 1131 {
1132 struct nfs_write_data *data = calldata; 1132 struct nfs_write_data *data = calldata;
1133 1133
1134 nfs_writeback_done(task, data); 1134 nfs_writeback_done(task, data);
1135 } 1135 }
1136 1136
1137 static void nfs_writeback_release_full(void *calldata) 1137 static void nfs_writeback_release_full(void *calldata)
1138 { 1138 {
1139 struct nfs_write_data *data = calldata; 1139 struct nfs_write_data *data = calldata;
1140 int status = data->task.tk_status; 1140 int status = data->task.tk_status;
1141 1141
1142 /* Update attributes as result of writeback. */ 1142 /* Update attributes as result of writeback. */
1143 while (!list_empty(&data->pages)) { 1143 while (!list_empty(&data->pages)) {
1144 struct nfs_page *req = nfs_list_entry(data->pages.next); 1144 struct nfs_page *req = nfs_list_entry(data->pages.next);
1145 struct page *page = req->wb_page; 1145 struct page *page = req->wb_page;
1146 1146
1147 nfs_list_remove_request(req); 1147 nfs_list_remove_request(req);
1148 1148
1149 dprintk("NFS: %5u write (%s/%lld %d@%lld)", 1149 dprintk("NFS: %5u write (%s/%lld %d@%lld)",
1150 data->task.tk_pid, 1150 data->task.tk_pid,
1151 req->wb_context->dentry->d_inode->i_sb->s_id, 1151 req->wb_context->dentry->d_inode->i_sb->s_id,
1152 (long long)NFS_FILEID(req->wb_context->dentry->d_inode), 1152 (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
1153 req->wb_bytes, 1153 req->wb_bytes,
1154 (long long)req_offset(req)); 1154 (long long)req_offset(req));
1155 1155
1156 if (status < 0) { 1156 if (status < 0) {
1157 nfs_set_pageerror(page); 1157 nfs_set_pageerror(page);
1158 nfs_context_set_write_error(req->wb_context, status); 1158 nfs_context_set_write_error(req->wb_context, status);
1159 dprintk(", error = %d\n", status); 1159 dprintk(", error = %d\n", status);
1160 goto remove_request; 1160 goto remove_request;
1161 } 1161 }
1162 1162
1163 if (nfs_write_need_commit(data)) { 1163 if (nfs_write_need_commit(data)) {
1164 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1164 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1165 nfs_mark_request_commit(req, data->lseg); 1165 nfs_mark_request_commit(req, data->lseg);
1166 dprintk(" marked for commit\n"); 1166 dprintk(" marked for commit\n");
1167 goto next; 1167 goto next;
1168 } 1168 }
1169 dprintk(" OK\n"); 1169 dprintk(" OK\n");
1170 remove_request: 1170 remove_request:
1171 nfs_inode_remove_request(req); 1171 nfs_inode_remove_request(req);
1172 next: 1172 next:
1173 nfs_clear_page_tag_locked(req); 1173 nfs_clear_page_tag_locked(req);
1174 nfs_end_page_writeback(page); 1174 nfs_end_page_writeback(page);
1175 } 1175 }
1176 nfs_writedata_release(calldata); 1176 nfs_writedata_release(calldata);
1177 } 1177 }
1178 1178
1179 static const struct rpc_call_ops nfs_write_full_ops = { 1179 static const struct rpc_call_ops nfs_write_full_ops = {
1180 #if defined(CONFIG_NFS_V4_1) 1180 #if defined(CONFIG_NFS_V4_1)
1181 .rpc_call_prepare = nfs_write_prepare, 1181 .rpc_call_prepare = nfs_write_prepare,
1182 #endif /* CONFIG_NFS_V4_1 */ 1182 #endif /* CONFIG_NFS_V4_1 */
1183 .rpc_call_done = nfs_writeback_done_full, 1183 .rpc_call_done = nfs_writeback_done_full,
1184 .rpc_release = nfs_writeback_release_full, 1184 .rpc_release = nfs_writeback_release_full,
1185 }; 1185 };
1186 1186
1187 1187
1188 /* 1188 /*
1189 * This function is called when the WRITE call is complete. 1189 * This function is called when the WRITE call is complete.
1190 */ 1190 */
1191 void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) 1191 void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1192 { 1192 {
1193 struct nfs_writeargs *argp = &data->args; 1193 struct nfs_writeargs *argp = &data->args;
1194 struct nfs_writeres *resp = &data->res; 1194 struct nfs_writeres *resp = &data->res;
1195 struct nfs_server *server = NFS_SERVER(data->inode); 1195 struct nfs_server *server = NFS_SERVER(data->inode);
1196 int status; 1196 int status;
1197 1197
1198 dprintk("NFS: %5u nfs_writeback_done (status %d)\n", 1198 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
1199 task->tk_pid, task->tk_status); 1199 task->tk_pid, task->tk_status);
1200 1200
1201 /* 1201 /*
1202 * ->write_done will attempt to use post-op attributes to detect 1202 * ->write_done will attempt to use post-op attributes to detect
1203 * conflicting writes by other clients. A strict interpretation 1203 * conflicting writes by other clients. A strict interpretation
1204 * of close-to-open would allow us to continue caching even if 1204 * of close-to-open would allow us to continue caching even if
1205 * another writer had changed the file, but some applications 1205 * another writer had changed the file, but some applications
1206 * depend on tighter cache coherency when writing. 1206 * depend on tighter cache coherency when writing.
1207 */ 1207 */
1208 status = NFS_PROTO(data->inode)->write_done(task, data); 1208 status = NFS_PROTO(data->inode)->write_done(task, data);
1209 if (status != 0) 1209 if (status != 0)
1210 return; 1210 return;
1211 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); 1211 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
1212 1212
1213 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1213 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1214 if (resp->verf->committed < argp->stable && task->tk_status >= 0) { 1214 if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
1215 /* We tried a write call, but the server did not 1215 /* We tried a write call, but the server did not
1216 * commit data to stable storage even though we 1216 * commit data to stable storage even though we
1217 * requested it. 1217 * requested it.
1218 * Note: There is a known bug in Tru64 < 5.0 in which 1218 * Note: There is a known bug in Tru64 < 5.0 in which
1219 * the server reports NFS_DATA_SYNC, but performs 1219 * the server reports NFS_DATA_SYNC, but performs
1220 * NFS_FILE_SYNC. We therefore implement this checking 1220 * NFS_FILE_SYNC. We therefore implement this checking
1221 * as a dprintk() in order to avoid filling syslog. 1221 * as a dprintk() in order to avoid filling syslog.
1222 */ 1222 */
1223 static unsigned long complain; 1223 static unsigned long complain;
1224 1224
1225 /* Note this will print the MDS for a DS write */ 1225 /* Note this will print the MDS for a DS write */
1226 if (time_before(complain, jiffies)) { 1226 if (time_before(complain, jiffies)) {
1227 dprintk("NFS: faulty NFS server %s:" 1227 dprintk("NFS: faulty NFS server %s:"
1228 " (committed = %d) != (stable = %d)\n", 1228 " (committed = %d) != (stable = %d)\n",
1229 server->nfs_client->cl_hostname, 1229 server->nfs_client->cl_hostname,
1230 resp->verf->committed, argp->stable); 1230 resp->verf->committed, argp->stable);
1231 complain = jiffies + 300 * HZ; 1231 complain = jiffies + 300 * HZ;
1232 } 1232 }
1233 } 1233 }
1234 #endif 1234 #endif
1235 /* Is this a short write? */ 1235 /* Is this a short write? */
1236 if (task->tk_status >= 0 && resp->count < argp->count) { 1236 if (task->tk_status >= 0 && resp->count < argp->count) {
1237 static unsigned long complain; 1237 static unsigned long complain;
1238 1238
1239 nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); 1239 nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE);
1240 1240
1241 /* Has the server at least made some progress? */ 1241 /* Has the server at least made some progress? */
1242 if (resp->count != 0) { 1242 if (resp->count != 0) {
1243 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1243 /* Was this an NFSv2 write or an NFSv3 stable write? */
1244 if (resp->verf->committed != NFS_UNSTABLE) { 1244 if (resp->verf->committed != NFS_UNSTABLE) {
1245 /* Resend from where the server left off */ 1245 /* Resend from where the server left off */
1246 data->mds_offset += resp->count; 1246 data->mds_offset += resp->count;
1247 argp->offset += resp->count; 1247 argp->offset += resp->count;
1248 argp->pgbase += resp->count; 1248 argp->pgbase += resp->count;
1249 argp->count -= resp->count; 1249 argp->count -= resp->count;
1250 } else { 1250 } else {
1251 /* Resend as a stable write in order to avoid 1251 /* Resend as a stable write in order to avoid
1252 * headaches in the case of a server crash. 1252 * headaches in the case of a server crash.
1253 */ 1253 */
1254 argp->stable = NFS_FILE_SYNC; 1254 argp->stable = NFS_FILE_SYNC;
1255 } 1255 }
1256 nfs_restart_rpc(task, server->nfs_client); 1256 nfs_restart_rpc(task, server->nfs_client);
1257 return; 1257 return;
1258 } 1258 }
1259 if (time_before(complain, jiffies)) { 1259 if (time_before(complain, jiffies)) {
1260 printk(KERN_WARNING 1260 printk(KERN_WARNING
1261 "NFS: Server wrote zero bytes, expected %u.\n", 1261 "NFS: Server wrote zero bytes, expected %u.\n",
1262 argp->count); 1262 argp->count);
1263 complain = jiffies + 300 * HZ; 1263 complain = jiffies + 300 * HZ;
1264 } 1264 }
1265 /* Can't do anything about it except throw an error. */ 1265 /* Can't do anything about it except throw an error. */
1266 task->tk_status = -EIO; 1266 task->tk_status = -EIO;
1267 } 1267 }
1268 return; 1268 return;
1269 } 1269 }
1270 1270
1271 1271
1272 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1272 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1273 static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) 1273 static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
1274 { 1274 {
1275 int ret; 1275 int ret;
1276 1276
1277 if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) 1277 if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
1278 return 1; 1278 return 1;
1279 if (!may_wait) 1279 if (!may_wait)
1280 return 0; 1280 return 0;
1281 ret = out_of_line_wait_on_bit_lock(&nfsi->flags, 1281 ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
1282 NFS_INO_COMMIT, 1282 NFS_INO_COMMIT,
1283 nfs_wait_bit_killable, 1283 nfs_wait_bit_killable,
1284 TASK_KILLABLE); 1284 TASK_KILLABLE);
1285 return (ret < 0) ? ret : 1; 1285 return (ret < 0) ? ret : 1;
1286 } 1286 }
1287 1287
1288 void nfs_commit_clear_lock(struct nfs_inode *nfsi) 1288 void nfs_commit_clear_lock(struct nfs_inode *nfsi)
1289 { 1289 {
1290 clear_bit(NFS_INO_COMMIT, &nfsi->flags); 1290 clear_bit(NFS_INO_COMMIT, &nfsi->flags);
1291 smp_mb__after_clear_bit(); 1291 smp_mb__after_clear_bit();
1292 wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); 1292 wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
1293 } 1293 }
1294 EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); 1294 EXPORT_SYMBOL_GPL(nfs_commit_clear_lock);
1295 1295
1296 void nfs_commitdata_release(void *data) 1296 void nfs_commitdata_release(void *data)
1297 { 1297 {
1298 struct nfs_write_data *wdata = data; 1298 struct nfs_write_data *wdata = data;
1299 1299
1300 put_lseg(wdata->lseg); 1300 put_lseg(wdata->lseg);
1301 put_nfs_open_context(wdata->args.context); 1301 put_nfs_open_context(wdata->args.context);
1302 nfs_commit_free(wdata); 1302 nfs_commit_free(wdata);
1303 } 1303 }
1304 EXPORT_SYMBOL_GPL(nfs_commitdata_release); 1304 EXPORT_SYMBOL_GPL(nfs_commitdata_release);
1305 1305
1306 int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, 1306 int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt,
1307 const struct rpc_call_ops *call_ops, 1307 const struct rpc_call_ops *call_ops,
1308 int how) 1308 int how)
1309 { 1309 {
1310 struct rpc_task *task; 1310 struct rpc_task *task;
1311 int priority = flush_task_priority(how); 1311 int priority = flush_task_priority(how);
1312 struct rpc_message msg = { 1312 struct rpc_message msg = {
1313 .rpc_argp = &data->args, 1313 .rpc_argp = &data->args,
1314 .rpc_resp = &data->res, 1314 .rpc_resp = &data->res,
1315 .rpc_cred = data->cred, 1315 .rpc_cred = data->cred,
1316 }; 1316 };
1317 struct rpc_task_setup task_setup_data = { 1317 struct rpc_task_setup task_setup_data = {
1318 .task = &data->task, 1318 .task = &data->task,
1319 .rpc_client = clnt, 1319 .rpc_client = clnt,
1320 .rpc_message = &msg, 1320 .rpc_message = &msg,
1321 .callback_ops = call_ops, 1321 .callback_ops = call_ops,
1322 .callback_data = data, 1322 .callback_data = data,
1323 .workqueue = nfsiod_workqueue, 1323 .workqueue = nfsiod_workqueue,
1324 .flags = RPC_TASK_ASYNC, 1324 .flags = RPC_TASK_ASYNC,
1325 .priority = priority, 1325 .priority = priority,
1326 }; 1326 };
1327 /* Set up the initial task struct. */ 1327 /* Set up the initial task struct. */
1328 NFS_PROTO(data->inode)->commit_setup(data, &msg); 1328 NFS_PROTO(data->inode)->commit_setup(data, &msg);
1329 1329
1330 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 1330 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
1331 1331
1332 task = rpc_run_task(&task_setup_data); 1332 task = rpc_run_task(&task_setup_data);
1333 if (IS_ERR(task)) 1333 if (IS_ERR(task))
1334 return PTR_ERR(task); 1334 return PTR_ERR(task);
1335 if (how & FLUSH_SYNC) 1335 if (how & FLUSH_SYNC)
1336 rpc_wait_for_completion_task(task); 1336 rpc_wait_for_completion_task(task);
1337 rpc_put_task(task); 1337 rpc_put_task(task);
1338 return 0; 1338 return 0;
1339 } 1339 }
1340 EXPORT_SYMBOL_GPL(nfs_initiate_commit); 1340 EXPORT_SYMBOL_GPL(nfs_initiate_commit);
1341 1341
1342 /* 1342 /*
1343 * Set up the argument/result storage required for the RPC call. 1343 * Set up the argument/result storage required for the RPC call.
1344 */ 1344 */
1345 void nfs_init_commit(struct nfs_write_data *data, 1345 void nfs_init_commit(struct nfs_write_data *data,
1346 struct list_head *head, 1346 struct list_head *head,
1347 struct pnfs_layout_segment *lseg) 1347 struct pnfs_layout_segment *lseg)
1348 { 1348 {
1349 struct nfs_page *first = nfs_list_entry(head->next); 1349 struct nfs_page *first = nfs_list_entry(head->next);
1350 struct inode *inode = first->wb_context->dentry->d_inode; 1350 struct inode *inode = first->wb_context->dentry->d_inode;
1351 1351
1352 /* Set up the RPC argument and reply structs 1352 /* Set up the RPC argument and reply structs
1353 * NB: take care not to mess about with data->commit et al. */ 1353 * NB: take care not to mess about with data->commit et al. */
1354 1354
1355 list_splice_init(head, &data->pages); 1355 list_splice_init(head, &data->pages);
1356 1356
1357 data->inode = inode; 1357 data->inode = inode;
1358 data->cred = first->wb_context->cred; 1358 data->cred = first->wb_context->cred;
1359 data->lseg = lseg; /* reference transferred */ 1359 data->lseg = lseg; /* reference transferred */
1360 data->mds_ops = &nfs_commit_ops; 1360 data->mds_ops = &nfs_commit_ops;
1361 1361
1362 data->args.fh = NFS_FH(data->inode); 1362 data->args.fh = NFS_FH(data->inode);
1363 /* Note: we always request a commit of the entire inode */ 1363 /* Note: we always request a commit of the entire inode */
1364 data->args.offset = 0; 1364 data->args.offset = 0;
1365 data->args.count = 0; 1365 data->args.count = 0;
1366 data->args.context = get_nfs_open_context(first->wb_context); 1366 data->args.context = get_nfs_open_context(first->wb_context);
1367 data->res.count = 0; 1367 data->res.count = 0;
1368 data->res.fattr = &data->fattr; 1368 data->res.fattr = &data->fattr;
1369 data->res.verf = &data->verf; 1369 data->res.verf = &data->verf;
1370 nfs_fattr_init(&data->fattr); 1370 nfs_fattr_init(&data->fattr);
1371 } 1371 }
1372 EXPORT_SYMBOL_GPL(nfs_init_commit); 1372 EXPORT_SYMBOL_GPL(nfs_init_commit);
1373 1373
1374 void nfs_retry_commit(struct list_head *page_list, 1374 void nfs_retry_commit(struct list_head *page_list,
1375 struct pnfs_layout_segment *lseg) 1375 struct pnfs_layout_segment *lseg)
1376 { 1376 {
1377 struct nfs_page *req; 1377 struct nfs_page *req;
1378 1378
1379 while (!list_empty(page_list)) { 1379 while (!list_empty(page_list)) {
1380 req = nfs_list_entry(page_list->next); 1380 req = nfs_list_entry(page_list->next);
1381 nfs_list_remove_request(req); 1381 nfs_list_remove_request(req);
1382 nfs_mark_request_commit(req, lseg); 1382 nfs_mark_request_commit(req, lseg);
1383 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1383 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1384 dec_bdi_stat(req->wb_page->mapping->backing_dev_info, 1384 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
1385 BDI_RECLAIMABLE); 1385 BDI_RECLAIMABLE);
1386 nfs_clear_page_tag_locked(req); 1386 nfs_clear_page_tag_locked(req);
1387 } 1387 }
1388 } 1388 }
1389 EXPORT_SYMBOL_GPL(nfs_retry_commit); 1389 EXPORT_SYMBOL_GPL(nfs_retry_commit);
1390 1390
1391 /* 1391 /*
1392 * Commit dirty pages 1392 * Commit dirty pages
1393 */ 1393 */
1394 static int 1394 static int
1395 nfs_commit_list(struct inode *inode, struct list_head *head, int how) 1395 nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1396 { 1396 {
1397 struct nfs_write_data *data; 1397 struct nfs_write_data *data;
1398 1398
1399 data = nfs_commitdata_alloc(); 1399 data = nfs_commitdata_alloc();
1400 1400
1401 if (!data) 1401 if (!data)
1402 goto out_bad; 1402 goto out_bad;
1403 1403
1404 /* Set up the argument struct */ 1404 /* Set up the argument struct */
1405 nfs_init_commit(data, head, NULL); 1405 nfs_init_commit(data, head, NULL);
1406 return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); 1406 return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how);
1407 out_bad: 1407 out_bad:
1408 nfs_retry_commit(head, NULL); 1408 nfs_retry_commit(head, NULL);
1409 nfs_commit_clear_lock(NFS_I(inode)); 1409 nfs_commit_clear_lock(NFS_I(inode));
1410 return -ENOMEM; 1410 return -ENOMEM;
1411 } 1411 }
1412 1412
1413 /* 1413 /*
1414 * COMMIT call returned 1414 * COMMIT call returned
1415 */ 1415 */
1416 static void nfs_commit_done(struct rpc_task *task, void *calldata) 1416 static void nfs_commit_done(struct rpc_task *task, void *calldata)
1417 { 1417 {
1418 struct nfs_write_data *data = calldata; 1418 struct nfs_write_data *data = calldata;
1419 1419
1420 dprintk("NFS: %5u nfs_commit_done (status %d)\n", 1420 dprintk("NFS: %5u nfs_commit_done (status %d)\n",
1421 task->tk_pid, task->tk_status); 1421 task->tk_pid, task->tk_status);
1422 1422
1423 /* Call the NFS version-specific code */ 1423 /* Call the NFS version-specific code */
1424 NFS_PROTO(data->inode)->commit_done(task, data); 1424 NFS_PROTO(data->inode)->commit_done(task, data);
1425 } 1425 }
1426 1426
1427 void nfs_commit_release_pages(struct nfs_write_data *data) 1427 void nfs_commit_release_pages(struct nfs_write_data *data)
1428 { 1428 {
1429 struct nfs_page *req; 1429 struct nfs_page *req;
1430 int status = data->task.tk_status; 1430 int status = data->task.tk_status;
1431 1431
1432 while (!list_empty(&data->pages)) { 1432 while (!list_empty(&data->pages)) {
1433 req = nfs_list_entry(data->pages.next); 1433 req = nfs_list_entry(data->pages.next);
1434 nfs_list_remove_request(req); 1434 nfs_list_remove_request(req);
1435 nfs_clear_request_commit(req); 1435 nfs_clear_request_commit(req);
1436 1436
1437 dprintk("NFS: commit (%s/%lld %d@%lld)", 1437 dprintk("NFS: commit (%s/%lld %d@%lld)",
1438 req->wb_context->dentry->d_sb->s_id, 1438 req->wb_context->dentry->d_sb->s_id,
1439 (long long)NFS_FILEID(req->wb_context->dentry->d_inode), 1439 (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
1440 req->wb_bytes, 1440 req->wb_bytes,
1441 (long long)req_offset(req)); 1441 (long long)req_offset(req));
1442 if (status < 0) { 1442 if (status < 0) {
1443 nfs_context_set_write_error(req->wb_context, status); 1443 nfs_context_set_write_error(req->wb_context, status);
1444 nfs_inode_remove_request(req); 1444 nfs_inode_remove_request(req);
1445 dprintk(", error = %d\n", status); 1445 dprintk(", error = %d\n", status);
1446 goto next; 1446 goto next;
1447 } 1447 }
1448 1448
1449 /* Okay, COMMIT succeeded, apparently. Check the verifier 1449 /* Okay, COMMIT succeeded, apparently. Check the verifier
1450 * returned by the server against all stored verfs. */ 1450 * returned by the server against all stored verfs. */
1451 if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { 1451 if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) {
1452 /* We have a match */ 1452 /* We have a match */
1453 nfs_inode_remove_request(req); 1453 nfs_inode_remove_request(req);
1454 dprintk(" OK\n"); 1454 dprintk(" OK\n");
1455 goto next; 1455 goto next;
1456 } 1456 }
1457 /* We have a mismatch. Write the page again */ 1457 /* We have a mismatch. Write the page again */
1458 dprintk(" mismatch\n"); 1458 dprintk(" mismatch\n");
1459 nfs_mark_request_dirty(req); 1459 nfs_mark_request_dirty(req);
1460 next: 1460 next:
1461 nfs_clear_page_tag_locked(req); 1461 nfs_clear_page_tag_locked(req);
1462 } 1462 }
1463 } 1463 }
1464 EXPORT_SYMBOL_GPL(nfs_commit_release_pages); 1464 EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
1465 1465
1466 static void nfs_commit_release(void *calldata) 1466 static void nfs_commit_release(void *calldata)
1467 { 1467 {
1468 struct nfs_write_data *data = calldata; 1468 struct nfs_write_data *data = calldata;
1469 1469
1470 nfs_commit_release_pages(data); 1470 nfs_commit_release_pages(data);
1471 nfs_commit_clear_lock(NFS_I(data->inode)); 1471 nfs_commit_clear_lock(NFS_I(data->inode));
1472 nfs_commitdata_release(calldata); 1472 nfs_commitdata_release(calldata);
1473 } 1473 }
1474 1474
1475 static const struct rpc_call_ops nfs_commit_ops = { 1475 static const struct rpc_call_ops nfs_commit_ops = {
1476 #if defined(CONFIG_NFS_V4_1) 1476 #if defined(CONFIG_NFS_V4_1)
1477 .rpc_call_prepare = nfs_write_prepare, 1477 .rpc_call_prepare = nfs_write_prepare,
1478 #endif /* CONFIG_NFS_V4_1 */ 1478 #endif /* CONFIG_NFS_V4_1 */
1479 .rpc_call_done = nfs_commit_done, 1479 .rpc_call_done = nfs_commit_done,
1480 .rpc_release = nfs_commit_release, 1480 .rpc_release = nfs_commit_release,
1481 }; 1481 };
1482 1482
1483 int nfs_commit_inode(struct inode *inode, int how) 1483 int nfs_commit_inode(struct inode *inode, int how)
1484 { 1484 {
1485 LIST_HEAD(head); 1485 LIST_HEAD(head);
1486 int may_wait = how & FLUSH_SYNC; 1486 int may_wait = how & FLUSH_SYNC;
1487 int res; 1487 int res;
1488 1488
1489 res = nfs_commit_set_lock(NFS_I(inode), may_wait); 1489 res = nfs_commit_set_lock(NFS_I(inode), may_wait);
1490 if (res <= 0) 1490 if (res <= 0)
1491 goto out_mark_dirty; 1491 goto out_mark_dirty;
1492 res = nfs_scan_commit(inode, &head, 0, 0); 1492 res = nfs_scan_commit(inode, &head, 0, 0);
1493 if (res) { 1493 if (res) {
1494 int error; 1494 int error;
1495 1495
1496 error = pnfs_commit_list(inode, &head, how); 1496 error = pnfs_commit_list(inode, &head, how);
1497 if (error == PNFS_NOT_ATTEMPTED) 1497 if (error == PNFS_NOT_ATTEMPTED)
1498 error = nfs_commit_list(inode, &head, how); 1498 error = nfs_commit_list(inode, &head, how);
1499 if (error < 0) 1499 if (error < 0)
1500 return error; 1500 return error;
1501 if (!may_wait) 1501 if (!may_wait)
1502 goto out_mark_dirty; 1502 goto out_mark_dirty;
1503 error = wait_on_bit(&NFS_I(inode)->flags, 1503 error = wait_on_bit(&NFS_I(inode)->flags,
1504 NFS_INO_COMMIT, 1504 NFS_INO_COMMIT,
1505 nfs_wait_bit_killable, 1505 nfs_wait_bit_killable,
1506 TASK_KILLABLE); 1506 TASK_KILLABLE);
1507 if (error < 0) 1507 if (error < 0)
1508 return error; 1508 return error;
1509 } else 1509 } else
1510 nfs_commit_clear_lock(NFS_I(inode)); 1510 nfs_commit_clear_lock(NFS_I(inode));
1511 return res; 1511 return res;
1512 /* Note: If we exit without ensuring that the commit is complete, 1512 /* Note: If we exit without ensuring that the commit is complete,
1513 * we must mark the inode as dirty. Otherwise, future calls to 1513 * we must mark the inode as dirty. Otherwise, future calls to
1514 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure 1514 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
1515 * that the data is on the disk. 1515 * that the data is on the disk.
1516 */ 1516 */
1517 out_mark_dirty: 1517 out_mark_dirty:
1518 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1518 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1519 return res; 1519 return res;
1520 } 1520 }
1521 1521
1522 static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) 1522 static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1523 { 1523 {
1524 struct nfs_inode *nfsi = NFS_I(inode); 1524 struct nfs_inode *nfsi = NFS_I(inode);
1525 int flags = FLUSH_SYNC; 1525 int flags = FLUSH_SYNC;
1526 int ret = 0; 1526 int ret = 0;
1527 1527
1528 if (wbc->sync_mode == WB_SYNC_NONE) { 1528 if (wbc->sync_mode == WB_SYNC_NONE) {
1529 /* Don't commit yet if this is a non-blocking flush and there 1529 /* Don't commit yet if this is a non-blocking flush and there
1530 * are a lot of outstanding writes for this mapping. 1530 * are a lot of outstanding writes for this mapping.
1531 */ 1531 */
1532 if (nfsi->ncommit <= (nfsi->npages >> 1)) 1532 if (nfsi->ncommit <= (nfsi->npages >> 1))
1533 goto out_mark_dirty; 1533 goto out_mark_dirty;
1534 1534
1535 /* don't wait for the COMMIT response */ 1535 /* don't wait for the COMMIT response */
1536 flags = 0; 1536 flags = 0;
1537 } 1537 }
1538 1538
1539 ret = nfs_commit_inode(inode, flags); 1539 ret = nfs_commit_inode(inode, flags);
1540 if (ret >= 0) { 1540 if (ret >= 0) {
1541 if (wbc->sync_mode == WB_SYNC_NONE) { 1541 if (wbc->sync_mode == WB_SYNC_NONE) {
1542 if (ret < wbc->nr_to_write) 1542 if (ret < wbc->nr_to_write)
1543 wbc->nr_to_write -= ret; 1543 wbc->nr_to_write -= ret;
1544 else 1544 else
1545 wbc->nr_to_write = 0; 1545 wbc->nr_to_write = 0;
1546 } 1546 }
1547 return 0; 1547 return 0;
1548 } 1548 }
1549 out_mark_dirty: 1549 out_mark_dirty:
1550 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1550 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1551 return ret; 1551 return ret;
1552 } 1552 }
1553 #else 1553 #else
1554 static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) 1554 static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1555 { 1555 {
1556 return 0; 1556 return 0;
1557 } 1557 }
1558 #endif 1558 #endif
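/*
 * An illustrative sketch of the WB_SYNC_NONE heuristic in
 * nfs_commit_unstable_pages() above: a non-blocking flush only issues a
 * COMMIT once more than half of the inode's pages are waiting on one, and
 * then does not wait for the reply.  The page counts here are assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

static bool should_commit_now(unsigned long ncommit, unsigned long npages)
{
	/* mirrors: if (nfsi->ncommit <= (nfsi->npages >> 1)) skip the commit */
	return ncommit > (npages >> 1);
}

int main(void)
{
	printf("%d\n", should_commit_now(100, 1000));	/* 0: keep accumulating */
	printf("%d\n", should_commit_now(600, 1000));	/* 1: issue async COMMIT */
	return 0;
}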
1559 1559
1560 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1560 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1561 { 1561 {
1562 int ret; 1562 int ret;
1563 1563
1564 ret = nfs_commit_unstable_pages(inode, wbc); 1564 ret = nfs_commit_unstable_pages(inode, wbc);
1565 if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { 1565 if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
1566 int status; 1566 int status;
1567 bool sync = true; 1567 bool sync = true;
1568 1568
1569 if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || 1569 if (wbc->sync_mode == WB_SYNC_NONE)
1570 wbc->for_background)
1571 sync = false; 1570 sync = false;
1572 1571
1573 status = pnfs_layoutcommit_inode(inode, sync); 1572 status = pnfs_layoutcommit_inode(inode, sync);
1574 if (status < 0) 1573 if (status < 0)
1575 return status; 1574 return status;
1576 } 1575 }
1577 return ret; 1576 return ret;
1578 } 1577 }
1579 1578
1580 /* 1579 /*
1581 * flush the inode to disk. 1580 * flush the inode to disk.
1582 */ 1581 */
1583 int nfs_wb_all(struct inode *inode) 1582 int nfs_wb_all(struct inode *inode)
1584 { 1583 {
1585 struct writeback_control wbc = { 1584 struct writeback_control wbc = {
1586 .sync_mode = WB_SYNC_ALL, 1585 .sync_mode = WB_SYNC_ALL,
1587 .nr_to_write = LONG_MAX, 1586 .nr_to_write = LONG_MAX,
1588 .range_start = 0, 1587 .range_start = 0,
1589 .range_end = LLONG_MAX, 1588 .range_end = LLONG_MAX,
1590 }; 1589 };
1591 1590
1592 return sync_inode(inode, &wbc); 1591 return sync_inode(inode, &wbc);
1593 } 1592 }
1594 1593
1595 int nfs_wb_page_cancel(struct inode *inode, struct page *page) 1594 int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1596 { 1595 {
1597 struct nfs_page *req; 1596 struct nfs_page *req;
1598 int ret = 0; 1597 int ret = 0;
1599 1598
1600 BUG_ON(!PageLocked(page)); 1599 BUG_ON(!PageLocked(page));
1601 for (;;) { 1600 for (;;) {
1602 wait_on_page_writeback(page); 1601 wait_on_page_writeback(page);
1603 req = nfs_page_find_request(page); 1602 req = nfs_page_find_request(page);
1604 if (req == NULL) 1603 if (req == NULL)
1605 break; 1604 break;
1606 if (nfs_lock_request_dontget(req)) { 1605 if (nfs_lock_request_dontget(req)) {
1607 nfs_inode_remove_request(req); 1606 nfs_inode_remove_request(req);
1608 /* 1607 /*
1609 * In case nfs_inode_remove_request has marked the 1608 * In case nfs_inode_remove_request has marked the
1610 * page as being dirty 1609 * page as being dirty
1611 */ 1610 */
1612 cancel_dirty_page(page, PAGE_CACHE_SIZE); 1611 cancel_dirty_page(page, PAGE_CACHE_SIZE);
1613 nfs_unlock_request(req); 1612 nfs_unlock_request(req);
1614 break; 1613 break;
1615 } 1614 }
1616 ret = nfs_wait_on_request(req); 1615 ret = nfs_wait_on_request(req);
1617 nfs_release_request(req); 1616 nfs_release_request(req);
1618 if (ret < 0) 1617 if (ret < 0)
1619 break; 1618 break;
1620 } 1619 }
1621 return ret; 1620 return ret;
1622 } 1621 }
1623 1622
1624 /* 1623 /*
1625 * Write back all requests on one page - we do this before reading it. 1624 * Write back all requests on one page - we do this before reading it.
1626 */ 1625 */
1627 int nfs_wb_page(struct inode *inode, struct page *page) 1626 int nfs_wb_page(struct inode *inode, struct page *page)
1628 { 1627 {
1629 loff_t range_start = page_offset(page); 1628 loff_t range_start = page_offset(page);
1630 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); 1629 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1631 struct writeback_control wbc = { 1630 struct writeback_control wbc = {
1632 .sync_mode = WB_SYNC_ALL, 1631 .sync_mode = WB_SYNC_ALL,
1633 .nr_to_write = 0, 1632 .nr_to_write = 0,
1634 .range_start = range_start, 1633 .range_start = range_start,
1635 .range_end = range_end, 1634 .range_end = range_end,
1636 }; 1635 };
1637 int ret; 1636 int ret;
1638 1637
1639 for (;;) { 1638 for (;;) {
1640 wait_on_page_writeback(page); 1639 wait_on_page_writeback(page);
1641 if (clear_page_dirty_for_io(page)) { 1640 if (clear_page_dirty_for_io(page)) {
1642 ret = nfs_writepage_locked(page, &wbc); 1641 ret = nfs_writepage_locked(page, &wbc);
1643 if (ret < 0) 1642 if (ret < 0)
1644 goto out_error; 1643 goto out_error;
1645 continue; 1644 continue;
1646 } 1645 }
1647 if (!PagePrivate(page)) 1646 if (!PagePrivate(page))
1648 break; 1647 break;
1649 ret = nfs_commit_inode(inode, FLUSH_SYNC); 1648 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1650 if (ret < 0) 1649 if (ret < 0)
1651 goto out_error; 1650 goto out_error;
1652 } 1651 }
1653 return 0; 1652 return 0;
1654 out_error: 1653 out_error:
1655 return ret; 1654 return ret;
1656 } 1655 }
1657 1656
1658 #ifdef CONFIG_MIGRATION 1657 #ifdef CONFIG_MIGRATION
1659 int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1658 int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1660 struct page *page) 1659 struct page *page)
1661 { 1660 {
1662 struct nfs_page *req; 1661 struct nfs_page *req;
1663 int ret; 1662 int ret;
1664 1663
1665 nfs_fscache_release_page(page, GFP_KERNEL); 1664 nfs_fscache_release_page(page, GFP_KERNEL);
1666 1665
1667 req = nfs_find_and_lock_request(page, false); 1666 req = nfs_find_and_lock_request(page, false);
1668 ret = PTR_ERR(req); 1667 ret = PTR_ERR(req);
1669 if (IS_ERR(req)) 1668 if (IS_ERR(req))
1670 goto out; 1669 goto out;
1671 1670
1672 ret = migrate_page(mapping, newpage, page); 1671 ret = migrate_page(mapping, newpage, page);
1673 if (!req) 1672 if (!req)
1674 goto out; 1673 goto out;
1675 if (ret) 1674 if (ret)
1676 goto out_unlock; 1675 goto out_unlock;
1677 page_cache_get(newpage); 1676 page_cache_get(newpage);
1678 spin_lock(&mapping->host->i_lock); 1677 spin_lock(&mapping->host->i_lock);
1679 req->wb_page = newpage; 1678 req->wb_page = newpage;
1680 SetPagePrivate(newpage); 1679 SetPagePrivate(newpage);
1681 set_page_private(newpage, (unsigned long)req); 1680 set_page_private(newpage, (unsigned long)req);
1682 ClearPagePrivate(page); 1681 ClearPagePrivate(page);
1683 set_page_private(page, 0); 1682 set_page_private(page, 0);
1684 spin_unlock(&mapping->host->i_lock); 1683 spin_unlock(&mapping->host->i_lock);
1685 page_cache_release(page); 1684 page_cache_release(page);
1686 out_unlock: 1685 out_unlock:
1687 nfs_clear_page_tag_locked(req); 1686 nfs_clear_page_tag_locked(req);
1688 out: 1687 out:
1689 return ret; 1688 return ret;
1690 } 1689 }
1691 #endif 1690 #endif
1692 1691
1693 int __init nfs_init_writepagecache(void) 1692 int __init nfs_init_writepagecache(void)
1694 { 1693 {
1695 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1694 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
1696 sizeof(struct nfs_write_data), 1695 sizeof(struct nfs_write_data),
1697 0, SLAB_HWCACHE_ALIGN, 1696 0, SLAB_HWCACHE_ALIGN,
1698 NULL); 1697 NULL);
1699 if (nfs_wdata_cachep == NULL) 1698 if (nfs_wdata_cachep == NULL)
1700 return -ENOMEM; 1699 return -ENOMEM;
1701 1700
1702 nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, 1701 nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
1703 nfs_wdata_cachep); 1702 nfs_wdata_cachep);
1704 if (nfs_wdata_mempool == NULL) 1703 if (nfs_wdata_mempool == NULL)
1705 return -ENOMEM; 1704 return -ENOMEM;
1706 1705
1707 nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, 1706 nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
1708 nfs_wdata_cachep); 1707 nfs_wdata_cachep);
1709 if (nfs_commit_mempool == NULL) 1708 if (nfs_commit_mempool == NULL)
1710 return -ENOMEM; 1709 return -ENOMEM;
1711 1710
1712 /* 1711 /*
1713 * NFS congestion size, scale with available memory. 1712 * NFS congestion size, scale with available memory.
1714 * 1713 *
1715 * 64MB: 8192k 1714 * 64MB: 8192k
1716 * 128MB: 11585k 1715 * 128MB: 11585k
1717 * 256MB: 16384k 1716 * 256MB: 16384k
1718 * 512MB: 23170k 1717 * 512MB: 23170k
1719 * 1GB: 32768k 1718 * 1GB: 32768k
1720 * 2GB: 46340k 1719 * 2GB: 46340k
1721 * 4GB: 65536k 1720 * 4GB: 65536k
1722 * 8GB: 92681k 1721 * 8GB: 92681k
1723 * 16GB: 131072k 1722 * 16GB: 131072k
1724 * 1723 *
1725 * This allows larger machines to have larger/more transfers. 1724 * This allows larger machines to have larger/more transfers.
1726 * Limit the default to 256M 1725 * Limit the default to 256M
1727 */ 1726 */
1728 nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); 1727 nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
1729 if (nfs_congestion_kb > 256*1024) 1728 if (nfs_congestion_kb > 256*1024)
1730 nfs_congestion_kb = 256*1024; 1729 nfs_congestion_kb = 256*1024;
1731 1730
1732 return 0; 1731 return 0;
1733 } 1732 }
1734 1733
1735 void nfs_destroy_writepagecache(void) 1734 void nfs_destroy_writepagecache(void)
1736 { 1735 {
1737 mempool_destroy(nfs_commit_mempool); 1736 mempool_destroy(nfs_commit_mempool);
1738 mempool_destroy(nfs_wdata_mempool); 1737 mempool_destroy(nfs_wdata_mempool);
1739 kmem_cache_destroy(nfs_wdata_cachep); 1738 kmem_cache_destroy(nfs_wdata_cachep);
1740 } 1739 }
1741 1740
1742 1741
include/linux/backing-dev.h
1 /* 1 /*
2 * include/linux/backing-dev.h 2 * include/linux/backing-dev.h
3 * 3 *
4 * low-level device information and state which is propagated up through 4 * low-level device information and state which is propagated up through
5 * to high-level code. 5 * to high-level code.
6 */ 6 */
7 7
8 #ifndef _LINUX_BACKING_DEV_H 8 #ifndef _LINUX_BACKING_DEV_H
9 #define _LINUX_BACKING_DEV_H 9 #define _LINUX_BACKING_DEV_H
10 10
11 #include <linux/percpu_counter.h> 11 #include <linux/percpu_counter.h>
12 #include <linux/log2.h> 12 #include <linux/log2.h>
13 #include <linux/proportions.h> 13 #include <linux/proportions.h>
14 #include <linux/kernel.h> 14 #include <linux/kernel.h>
15 #include <linux/fs.h> 15 #include <linux/fs.h>
16 #include <linux/sched.h> 16 #include <linux/sched.h>
17 #include <linux/timer.h> 17 #include <linux/timer.h>
18 #include <linux/writeback.h> 18 #include <linux/writeback.h>
19 #include <asm/atomic.h> 19 #include <asm/atomic.h>
20 20
21 struct page; 21 struct page;
22 struct device; 22 struct device;
23 struct dentry; 23 struct dentry;
24 24
25 /* 25 /*
26 * Bits in backing_dev_info.state 26 * Bits in backing_dev_info.state
27 */ 27 */
28 enum bdi_state { 28 enum bdi_state {
29 BDI_pending, /* On its way to being activated */ 29 BDI_pending, /* On its way to being activated */
30 BDI_wb_alloc, /* Default embedded wb allocated */ 30 BDI_wb_alloc, /* Default embedded wb allocated */
31 BDI_async_congested, /* The async (write) queue is getting full */ 31 BDI_async_congested, /* The async (write) queue is getting full */
32 BDI_sync_congested, /* The sync queue is getting full */ 32 BDI_sync_congested, /* The sync queue is getting full */
33 BDI_registered, /* bdi_register() was done */ 33 BDI_registered, /* bdi_register() was done */
34 BDI_writeback_running, /* Writeback is in progress */ 34 BDI_writeback_running, /* Writeback is in progress */
35 BDI_unused, /* Available bits start here */ 35 BDI_unused, /* Available bits start here */
36 }; 36 };
37 37
38 typedef int (congested_fn)(void *, int); 38 typedef int (congested_fn)(void *, int);
39 39
40 enum bdi_stat_item { 40 enum bdi_stat_item {
41 BDI_RECLAIMABLE, 41 BDI_RECLAIMABLE,
42 BDI_WRITEBACK, 42 BDI_WRITEBACK,
43 BDI_WRITTEN,
43 NR_BDI_STAT_ITEMS 44 NR_BDI_STAT_ITEMS
44 }; 45 };
45 46
46 #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) 47 #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
47 48
48 struct bdi_writeback { 49 struct bdi_writeback {
49 struct backing_dev_info *bdi; /* our parent bdi */ 50 struct backing_dev_info *bdi; /* our parent bdi */
50 unsigned int nr; 51 unsigned int nr;
51 52
52 unsigned long last_old_flush; /* last old data flush */ 53 unsigned long last_old_flush; /* last old data flush */
53 unsigned long last_active; /* last time bdi thread was active */ 54 unsigned long last_active; /* last time bdi thread was active */
54 55
55 struct task_struct *task; /* writeback thread */ 56 struct task_struct *task; /* writeback thread */
56 struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ 57 struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
57 struct list_head b_dirty; /* dirty inodes */ 58 struct list_head b_dirty; /* dirty inodes */
58 struct list_head b_io; /* parked for writeback */ 59 struct list_head b_io; /* parked for writeback */
59 struct list_head b_more_io; /* parked for more writeback */ 60 struct list_head b_more_io; /* parked for more writeback */
61 spinlock_t list_lock; /* protects the b_* lists */
60 }; 62 };
61 63
62 struct backing_dev_info { 64 struct backing_dev_info {
63 struct list_head bdi_list; 65 struct list_head bdi_list;
64 unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ 66 unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
65 unsigned long state; /* Always use atomic bitops on this */ 67 unsigned long state; /* Always use atomic bitops on this */
66 unsigned int capabilities; /* Device capabilities */ 68 unsigned int capabilities; /* Device capabilities */
67 congested_fn *congested_fn; /* Function pointer if device is md/dm */ 69 congested_fn *congested_fn; /* Function pointer if device is md/dm */
68 void *congested_data; /* Pointer to aux data for congested func */ 70 void *congested_data; /* Pointer to aux data for congested func */
69 71
70 char *name; 72 char *name;
71 73
72 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; 74 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
73 75
76 unsigned long bw_time_stamp; /* last time write bw is updated */
77 unsigned long written_stamp; /* pages written at bw_time_stamp */
78 unsigned long write_bandwidth; /* the estimated write bandwidth */
79 unsigned long avg_write_bandwidth; /* further smoothed write bw */
80
74 struct prop_local_percpu completions; 81 struct prop_local_percpu completions;
75 int dirty_exceeded; 82 int dirty_exceeded;
76 83
77 unsigned int min_ratio; 84 unsigned int min_ratio;
78 unsigned int max_ratio, max_prop_frac; 85 unsigned int max_ratio, max_prop_frac;
79 86
80 struct bdi_writeback wb; /* default writeback info for this bdi */ 87 struct bdi_writeback wb; /* default writeback info for this bdi */
81 spinlock_t wb_lock; /* protects work_list */ 88 spinlock_t wb_lock; /* protects work_list */
82 89
83 struct list_head work_list; 90 struct list_head work_list;
84 91
85 struct device *dev; 92 struct device *dev;
86 93
87 struct timer_list laptop_mode_wb_timer; 94 struct timer_list laptop_mode_wb_timer;
88 95
89 #ifdef CONFIG_DEBUG_FS 96 #ifdef CONFIG_DEBUG_FS
90 struct dentry *debug_dir; 97 struct dentry *debug_dir;
91 struct dentry *debug_stats; 98 struct dentry *debug_stats;
92 #endif 99 #endif
93 }; 100 };
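
The four bandwidth fields added above feed a periodically refreshed, smoothed estimate of how fast the device completes writeback. The following is only a rough userspace sketch of that idea (sample pages written per interval, then smooth it twice); the constants and the update rule are illustrative assumptions, not the kernel's actual __bdi_update_bandwidth() math.

/* Rough, hypothetical sketch of the bandwidth-estimation idea. */
#include <stdio.h>

#define HZ 100				/* assumed tick rate for the example */

struct bw_est {
	unsigned long written_stamp;	/* pages written at the last update */
	unsigned long time_stamp;	/* "jiffies" at the last update */
	unsigned long write_bandwidth;	/* smoothed pages per second */
	unsigned long avg_write_bandwidth; /* further smoothed copy */
};

static void bw_update(struct bw_est *b, unsigned long written, unsigned long now)
{
	unsigned long elapsed = now - b->time_stamp;
	unsigned long sample;

	if (elapsed < HZ / 10)		/* too soon: the sample would be noisy */
		return;

	sample = (written - b->written_stamp) * HZ / elapsed;
	b->write_bandwidth = (7 * b->write_bandwidth + sample) / 8;
	b->avg_write_bandwidth = (15 * b->avg_write_bandwidth + b->write_bandwidth) / 16;
	b->written_stamp = written;
	b->time_stamp = now;
}

int main(void)
{
	struct bw_est b = { .write_bandwidth = 25600, .avg_write_bandwidth = 25600 };
	unsigned long written = 0, now = 0;
	int i;

	for (i = 0; i < 5; i++) {
		now += HZ;		/* one second passes ... */
		written += 20000;	/* ... during which 20000 pages complete */
		bw_update(&b, written, now);
		printf("write_bw=%lu avg_write_bw=%lu\n",
		       b.write_bandwidth, b.avg_write_bandwidth);
	}
	return 0;
}
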
94 101
95 int bdi_init(struct backing_dev_info *bdi); 102 int bdi_init(struct backing_dev_info *bdi);
96 void bdi_destroy(struct backing_dev_info *bdi); 103 void bdi_destroy(struct backing_dev_info *bdi);
97 104
98 int bdi_register(struct backing_dev_info *bdi, struct device *parent, 105 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
99 const char *fmt, ...); 106 const char *fmt, ...);
100 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); 107 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
101 void bdi_unregister(struct backing_dev_info *bdi); 108 void bdi_unregister(struct backing_dev_info *bdi);
102 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); 109 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
103 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); 110 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
104 void bdi_start_background_writeback(struct backing_dev_info *bdi); 111 void bdi_start_background_writeback(struct backing_dev_info *bdi);
105 int bdi_writeback_thread(void *data); 112 int bdi_writeback_thread(void *data);
106 int bdi_has_dirty_io(struct backing_dev_info *bdi); 113 int bdi_has_dirty_io(struct backing_dev_info *bdi);
107 void bdi_arm_supers_timer(void); 114 void bdi_arm_supers_timer(void);
108 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); 115 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
116 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
109 117
110 extern spinlock_t bdi_lock; 118 extern spinlock_t bdi_lock;
111 extern struct list_head bdi_list; 119 extern struct list_head bdi_list;
112 extern struct list_head bdi_pending_list; 120 extern struct list_head bdi_pending_list;
113 121
114 static inline int wb_has_dirty_io(struct bdi_writeback *wb) 122 static inline int wb_has_dirty_io(struct bdi_writeback *wb)
115 { 123 {
116 return !list_empty(&wb->b_dirty) || 124 return !list_empty(&wb->b_dirty) ||
117 !list_empty(&wb->b_io) || 125 !list_empty(&wb->b_io) ||
118 !list_empty(&wb->b_more_io); 126 !list_empty(&wb->b_more_io);
119 } 127 }
120 128
121 static inline void __add_bdi_stat(struct backing_dev_info *bdi, 129 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
122 enum bdi_stat_item item, s64 amount) 130 enum bdi_stat_item item, s64 amount)
123 { 131 {
124 __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); 132 __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
125 } 133 }
126 134
127 static inline void __inc_bdi_stat(struct backing_dev_info *bdi, 135 static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
128 enum bdi_stat_item item) 136 enum bdi_stat_item item)
129 { 137 {
130 __add_bdi_stat(bdi, item, 1); 138 __add_bdi_stat(bdi, item, 1);
131 } 139 }
132 140
133 static inline void inc_bdi_stat(struct backing_dev_info *bdi, 141 static inline void inc_bdi_stat(struct backing_dev_info *bdi,
134 enum bdi_stat_item item) 142 enum bdi_stat_item item)
135 { 143 {
136 unsigned long flags; 144 unsigned long flags;
137 145
138 local_irq_save(flags); 146 local_irq_save(flags);
139 __inc_bdi_stat(bdi, item); 147 __inc_bdi_stat(bdi, item);
140 local_irq_restore(flags); 148 local_irq_restore(flags);
141 } 149 }
142 150
143 static inline void __dec_bdi_stat(struct backing_dev_info *bdi, 151 static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
144 enum bdi_stat_item item) 152 enum bdi_stat_item item)
145 { 153 {
146 __add_bdi_stat(bdi, item, -1); 154 __add_bdi_stat(bdi, item, -1);
147 } 155 }
148 156
149 static inline void dec_bdi_stat(struct backing_dev_info *bdi, 157 static inline void dec_bdi_stat(struct backing_dev_info *bdi,
150 enum bdi_stat_item item) 158 enum bdi_stat_item item)
151 { 159 {
152 unsigned long flags; 160 unsigned long flags;
153 161
154 local_irq_save(flags); 162 local_irq_save(flags);
155 __dec_bdi_stat(bdi, item); 163 __dec_bdi_stat(bdi, item);
156 local_irq_restore(flags); 164 local_irq_restore(flags);
157 } 165 }
158 166
159 static inline s64 bdi_stat(struct backing_dev_info *bdi, 167 static inline s64 bdi_stat(struct backing_dev_info *bdi,
160 enum bdi_stat_item item) 168 enum bdi_stat_item item)
161 { 169 {
162 return percpu_counter_read_positive(&bdi->bdi_stat[item]); 170 return percpu_counter_read_positive(&bdi->bdi_stat[item]);
163 } 171 }
164 172
165 static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, 173 static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
166 enum bdi_stat_item item) 174 enum bdi_stat_item item)
167 { 175 {
168 return percpu_counter_sum_positive(&bdi->bdi_stat[item]); 176 return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
169 } 177 }
170 178
171 static inline s64 bdi_stat_sum(struct backing_dev_info *bdi, 179 static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
172 enum bdi_stat_item item) 180 enum bdi_stat_item item)
173 { 181 {
174 s64 sum; 182 s64 sum;
175 unsigned long flags; 183 unsigned long flags;
176 184
177 local_irq_save(flags); 185 local_irq_save(flags);
178 sum = __bdi_stat_sum(bdi, item); 186 sum = __bdi_stat_sum(bdi, item);
179 local_irq_restore(flags); 187 local_irq_restore(flags);
180 188
181 return sum; 189 return sum;
182 } 190 }
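
A hypothetical use of these helpers, to show why the new BDI_WRITTEN counter exists: when a page leaves writeback, the in-flight count drops while the cumulative written count grows, and it is that cumulative counter the bandwidth estimation samples later. The function below is illustrative only, not the kernel's actual call site.

static inline void example_end_page_writeback(struct backing_dev_info *bdi)
{
	dec_bdi_stat(bdi, BDI_WRITEBACK);	/* one fewer page in flight */
	inc_bdi_stat(bdi, BDI_WRITTEN);		/* one more page completed */
}
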
183 191
184 extern void bdi_writeout_inc(struct backing_dev_info *bdi); 192 extern void bdi_writeout_inc(struct backing_dev_info *bdi);
185 193
186 /* 194 /*
187 * maximal error of a stat counter. 195 * maximal error of a stat counter.
188 */ 196 */
189 static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) 197 static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
190 { 198 {
191 #ifdef CONFIG_SMP 199 #ifdef CONFIG_SMP
192 return nr_cpu_ids * BDI_STAT_BATCH; 200 return nr_cpu_ids * BDI_STAT_BATCH;
193 #else 201 #else
194 return 1; 202 return 1;
195 #endif 203 #endif
196 } 204 }
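
Worked example of this bound: on a machine where nr_cpu_ids is 8, BDI_STAT_BATCH is 8 * (1 + ilog2(8)) = 32, so a cheap bdi_stat() read may lag the exact per-cpu sum by up to 8 * 32 = 256 pages; callers that need the exact value pay the extra cost of bdi_stat_sum() instead.
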
197 205
198 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); 206 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
199 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); 207 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
200 208
201 /* 209 /*
202 * Flags in backing_dev_info::capability 210 * Flags in backing_dev_info::capability
203 * 211 *
204 * The first three flags control whether dirty pages will contribute to the 212 * The first three flags control whether dirty pages will contribute to the
205 * VM's accounting and whether writepages() should be called for dirty pages 213 * VM's accounting and whether writepages() should be called for dirty pages
206 * (something that would not, for example, be appropriate for ramfs) 214 * (something that would not, for example, be appropriate for ramfs)
207 * 215 *
208 * WARNING: these flags are closely related and should not normally be 216 * WARNING: these flags are closely related and should not normally be
209 * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these 217 * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these
210 * three flags into a single convenience macro. 218 * three flags into a single convenience macro.
211 * 219 *
212 * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting 220 * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting
213 * BDI_CAP_NO_WRITEBACK: Don't write pages back 221 * BDI_CAP_NO_WRITEBACK: Don't write pages back
214 * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages 222 * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages
215 * 223 *
216 * These flags let !MMU mmap() govern direct device mapping vs immediate 224 * These flags let !MMU mmap() govern direct device mapping vs immediate
217 * copying more easily for MAP_PRIVATE, especially for ROM filesystems. 225 * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
218 * 226 *
219 * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE) 227 * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE)
220 * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED) 228 * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED)
221 * BDI_CAP_READ_MAP: Can be mapped for reading 229 * BDI_CAP_READ_MAP: Can be mapped for reading
222 * BDI_CAP_WRITE_MAP: Can be mapped for writing 230 * BDI_CAP_WRITE_MAP: Can be mapped for writing
223 * BDI_CAP_EXEC_MAP: Can be mapped for execution 231 * BDI_CAP_EXEC_MAP: Can be mapped for execution
224 * 232 *
225 * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. 233 * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed.
226 */ 234 */
227 #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 235 #define BDI_CAP_NO_ACCT_DIRTY 0x00000001
228 #define BDI_CAP_NO_WRITEBACK 0x00000002 236 #define BDI_CAP_NO_WRITEBACK 0x00000002
229 #define BDI_CAP_MAP_COPY 0x00000004 237 #define BDI_CAP_MAP_COPY 0x00000004
230 #define BDI_CAP_MAP_DIRECT 0x00000008 238 #define BDI_CAP_MAP_DIRECT 0x00000008
231 #define BDI_CAP_READ_MAP 0x00000010 239 #define BDI_CAP_READ_MAP 0x00000010
232 #define BDI_CAP_WRITE_MAP 0x00000020 240 #define BDI_CAP_WRITE_MAP 0x00000020
233 #define BDI_CAP_EXEC_MAP 0x00000040 241 #define BDI_CAP_EXEC_MAP 0x00000040
234 #define BDI_CAP_NO_ACCT_WB 0x00000080 242 #define BDI_CAP_NO_ACCT_WB 0x00000080
235 #define BDI_CAP_SWAP_BACKED 0x00000100 243 #define BDI_CAP_SWAP_BACKED 0x00000100
236 244
237 #define BDI_CAP_VMFLAGS \ 245 #define BDI_CAP_VMFLAGS \
238 (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) 246 (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
239 247
240 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ 248 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
241 (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) 249 (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
242 250
243 #if defined(VM_MAYREAD) && \ 251 #if defined(VM_MAYREAD) && \
244 (BDI_CAP_READ_MAP != VM_MAYREAD || \ 252 (BDI_CAP_READ_MAP != VM_MAYREAD || \
245 BDI_CAP_WRITE_MAP != VM_MAYWRITE || \ 253 BDI_CAP_WRITE_MAP != VM_MAYWRITE || \
246 BDI_CAP_EXEC_MAP != VM_MAYEXEC) 254 BDI_CAP_EXEC_MAP != VM_MAYEXEC)
247 #error please change backing_dev_info::capabilities flags 255 #error please change backing_dev_info::capabilities flags
248 #endif 256 #endif
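
As a hypothetical illustration of how these bits combine (kernel context assumed, field names as in the struct above): a purely in-memory backing device would normally take the convenience macro rather than the individual flags.

static struct backing_dev_info example_ram_bdi = {
	.name		= "example_ram",
	.ra_pages	= 0,	/* readahead buys nothing for RAM */
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

With such a value, bdi_cap_writeback_dirty() and bdi_cap_account_dirty() below both return false, which is what lets the dirty accounting and throttling paths skip devices of this kind.
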
249 257
250 extern struct backing_dev_info default_backing_dev_info; 258 extern struct backing_dev_info default_backing_dev_info;
251 extern struct backing_dev_info noop_backing_dev_info; 259 extern struct backing_dev_info noop_backing_dev_info;
252 260
253 int writeback_in_progress(struct backing_dev_info *bdi); 261 int writeback_in_progress(struct backing_dev_info *bdi);
254 262
255 static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) 263 static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
256 { 264 {
257 if (bdi->congested_fn) 265 if (bdi->congested_fn)
258 return bdi->congested_fn(bdi->congested_data, bdi_bits); 266 return bdi->congested_fn(bdi->congested_data, bdi_bits);
259 return (bdi->state & bdi_bits); 267 return (bdi->state & bdi_bits);
260 } 268 }
261 269
262 static inline int bdi_read_congested(struct backing_dev_info *bdi) 270 static inline int bdi_read_congested(struct backing_dev_info *bdi)
263 { 271 {
264 return bdi_congested(bdi, 1 << BDI_sync_congested); 272 return bdi_congested(bdi, 1 << BDI_sync_congested);
265 } 273 }
266 274
267 static inline int bdi_write_congested(struct backing_dev_info *bdi) 275 static inline int bdi_write_congested(struct backing_dev_info *bdi)
268 { 276 {
269 return bdi_congested(bdi, 1 << BDI_async_congested); 277 return bdi_congested(bdi, 1 << BDI_async_congested);
270 } 278 }
271 279
272 static inline int bdi_rw_congested(struct backing_dev_info *bdi) 280 static inline int bdi_rw_congested(struct backing_dev_info *bdi)
273 { 281 {
274 return bdi_congested(bdi, (1 << BDI_sync_congested) | 282 return bdi_congested(bdi, (1 << BDI_sync_congested) |
275 (1 << BDI_async_congested)); 283 (1 << BDI_async_congested));
276 } 284 }
277 285
278 enum { 286 enum {
279 BLK_RW_ASYNC = 0, 287 BLK_RW_ASYNC = 0,
280 BLK_RW_SYNC = 1, 288 BLK_RW_SYNC = 1,
281 }; 289 };
282 290
283 void clear_bdi_congested(struct backing_dev_info *bdi, int sync); 291 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
284 void set_bdi_congested(struct backing_dev_info *bdi, int sync); 292 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
285 long congestion_wait(int sync, long timeout); 293 long congestion_wait(int sync, long timeout);
286 long wait_iff_congested(struct zone *zone, int sync, long timeout); 294 long wait_iff_congested(struct zone *zone, int sync, long timeout);
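
A hypothetical example of how a caller might use the congestion helpers declared above to back off while a device's async (write) queue is full; the nap length is an illustrative choice.

static void example_throttle(struct backing_dev_info *bdi)
{
	while (bdi_write_congested(bdi))
		congestion_wait(BLK_RW_ASYNC, HZ / 10);	/* up to ~100ms per nap */
}
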
287 295
288 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) 296 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
289 { 297 {
290 return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK); 298 return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK);
291 } 299 }
292 300
293 static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi) 301 static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi)
294 { 302 {
295 return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY); 303 return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY);
296 } 304 }
297 305
298 static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) 306 static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
299 { 307 {
300 /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */ 308 /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */
301 return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB | 309 return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB |
302 BDI_CAP_NO_WRITEBACK)); 310 BDI_CAP_NO_WRITEBACK));
303 } 311 }
304 312
305 static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) 313 static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
306 { 314 {
307 return bdi->capabilities & BDI_CAP_SWAP_BACKED; 315 return bdi->capabilities & BDI_CAP_SWAP_BACKED;
308 } 316 }
309 317
310 static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) 318 static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
311 { 319 {
312 return bdi == &default_backing_dev_info; 320 return bdi == &default_backing_dev_info;
313 } 321 }
314 322
315 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) 323 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
316 { 324 {
317 return bdi_cap_writeback_dirty(mapping->backing_dev_info); 325 return bdi_cap_writeback_dirty(mapping->backing_dev_info);
318 } 326 }
319 327
320 static inline bool mapping_cap_account_dirty(struct address_space *mapping) 328 static inline bool mapping_cap_account_dirty(struct address_space *mapping)
321 { 329 {
322 return bdi_cap_account_dirty(mapping->backing_dev_info); 330 return bdi_cap_account_dirty(mapping->backing_dev_info);
323 } 331 }
324 332
325 static inline bool mapping_cap_swap_backed(struct address_space *mapping) 333 static inline bool mapping_cap_swap_backed(struct address_space *mapping)
326 { 334 {
327 return bdi_cap_swap_backed(mapping->backing_dev_info); 335 return bdi_cap_swap_backed(mapping->backing_dev_info);
328 } 336 }
329 337
330 static inline int bdi_sched_wait(void *word) 338 static inline int bdi_sched_wait(void *word)
331 { 339 {
332 schedule(); 340 schedule();
333 return 0; 341 return 0;
334 } 342 }
335 343
336 #endif /* _LINUX_BACKING_DEV_H */ 344 #endif /* _LINUX_BACKING_DEV_H */
337 345
include/linux/writeback.h
1 /* 1 /*
2 * include/linux/writeback.h 2 * include/linux/writeback.h
3 */ 3 */
4 #ifndef WRITEBACK_H 4 #ifndef WRITEBACK_H
5 #define WRITEBACK_H 5 #define WRITEBACK_H
6 6
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 #include <linux/fs.h> 8 #include <linux/fs.h>
9 9
10 struct backing_dev_info; 10 /*
11 * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
12 *
13 * (thresh - thresh/DIRTY_FULL_SCOPE, thresh)
14 *
15 * The 1/16 region above the global dirty limit will be put to maximum pauses:
16 *
17 * (limit, limit + limit/DIRTY_MAXPAUSE_AREA)
18 *
19 * In the 1/16 region above the max-pause region, bdi's that have exceeded their
20 * dirty limits will be put to loops:
21 *
22 * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA)
23 *
24 * Further beyond, all dirtier tasks will enter a loop, waiting (possibly for a
25 * long time) for the dirty pages to drop, unless they have written enough pages.
26 *
27 * The global dirty threshold is normally equal to the global dirty limit,
28 * except when the system suddenly allocates a lot of anonymous memory and
29 * knocks down the global dirty threshold quickly, in which case the global
30 * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
31 */
32 #define DIRTY_SCOPE 8
33 #define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2)
34 #define DIRTY_MAXPAUSE_AREA 16
35 #define DIRTY_PASSGOOD_AREA 8
11 36
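
Hypothetical numbers to make the regions above concrete, taking thresh == limit == 1600 (in whatever page units the caller uses): smooth throttling covers (1200, 1600), the max-pause area (1600, 1700), and the pass-good area (1700, 1800). The sketch below simply evaluates the same expressions.

#include <stdio.h>

#define DIRTY_SCOPE		8
#define DIRTY_FULL_SCOPE	(DIRTY_SCOPE / 2)
#define DIRTY_MAXPAUSE_AREA	16
#define DIRTY_PASSGOOD_AREA	8

int main(void)
{
	unsigned long thresh = 1600, limit = 1600;	/* illustrative values */

	printf("smooth throttling: (%lu, %lu)\n",
	       thresh - thresh / DIRTY_FULL_SCOPE, thresh);	/* (1200, 1600) */
	printf("max pause:         (%lu, %lu)\n",
	       limit, limit + limit / DIRTY_MAXPAUSE_AREA);	/* (1600, 1700) */
	printf("pass good:         (%lu, %lu)\n",
	       limit + limit / DIRTY_MAXPAUSE_AREA,
	       limit + limit / DIRTY_PASSGOOD_AREA);		/* (1700, 1800) */
	return 0;
}
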
12 extern spinlock_t inode_wb_list_lock; 37 /*
38 * 4MB minimal write chunk size
39 */
40 #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
13 41
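
Worked out: the 4096 here is kilobytes, and the shift converts kilobytes to pages. With 4 KiB pages PAGE_CACHE_SHIFT is 12, so MIN_WRITEBACK_PAGES is 4096 >> 2 = 1024 pages, which is the 4 MB minimum chunk the comment promises.
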
42 struct backing_dev_info;
43
14 /* 44 /*
15 * fs/fs-writeback.c 45 * fs/fs-writeback.c
16 */ 46 */
17 enum writeback_sync_modes { 47 enum writeback_sync_modes {
18 WB_SYNC_NONE, /* Don't wait on anything */ 48 WB_SYNC_NONE, /* Don't wait on anything */
19 WB_SYNC_ALL, /* Wait on every mapping */ 49 WB_SYNC_ALL, /* Wait on every mapping */
20 }; 50 };
21 51
22 /* 52 /*
23 * A control structure which tells the writeback code what to do. These are 53 * A control structure which tells the writeback code what to do. These are
24 * always on the stack, and hence need no locking. They are always initialised 54 * always on the stack, and hence need no locking. They are always initialised
25 * in a manner such that unspecified fields are set to zero. 55 * in a manner such that unspecified fields are set to zero.
26 */ 56 */
27 struct writeback_control { 57 struct writeback_control {
28 enum writeback_sync_modes sync_mode; 58 enum writeback_sync_modes sync_mode;
29 unsigned long *older_than_this; /* If !NULL, only write back inodes
30 older than this */
31 unsigned long wb_start; /* Time writeback_inodes_wb was
32 called. This is needed to avoid
33 extra jobs and livelock */
34 long nr_to_write; /* Write this many pages, and decrement 59 long nr_to_write; /* Write this many pages, and decrement
35 this for each page written */ 60 this for each page written */
36 long pages_skipped; /* Pages which were not written */ 61 long pages_skipped; /* Pages which were not written */
37 62
38 /* 63 /*
39 * For a_ops->writepages(): is start or end are non-zero then this is 64 * For a_ops->writepages(): is start or end are non-zero then this is
40 * a hint that the filesystem need only write out the pages inside that 65 * a hint that the filesystem need only write out the pages inside that
41 * byterange. The byte at `end' is included in the writeout request. 66 * byterange. The byte at `end' is included in the writeout request.
42 */ 67 */
43 loff_t range_start; 68 loff_t range_start;
44 loff_t range_end; 69 loff_t range_end;
45 70
46 unsigned nonblocking:1; /* Don't get stuck on request queues */
47 unsigned encountered_congestion:1; /* An output: a queue is full */
48 unsigned for_kupdate:1; /* A kupdate writeback */ 71 unsigned for_kupdate:1; /* A kupdate writeback */
49 unsigned for_background:1; /* A background writeback */ 72 unsigned for_background:1; /* A background writeback */
73 unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */
50 unsigned for_reclaim:1; /* Invoked from the page allocator */ 74 unsigned for_reclaim:1; /* Invoked from the page allocator */
51 unsigned range_cyclic:1; /* range_start is cyclic */ 75 unsigned range_cyclic:1; /* range_start is cyclic */
52 unsigned more_io:1; /* more io to be dispatched */
53 }; 76 };
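
A hypothetical caller-side sketch (kernel context, this header included) of how such a control structure is typically filled in on the stack for a tagged, non-waiting writeout of a whole mapping; the page budget is an arbitrary illustrative value.

static void example_writeout(struct address_space *mapping)
{
	struct writeback_control wbc = {
		.sync_mode	   = WB_SYNC_NONE,	/* don't wait on each page */
		.nr_to_write	   = 1024,		/* illustrative budget */
		.range_start	   = 0,
		.range_end	   = LLONG_MAX,		/* whole file */
		.tagged_writepages = 1,			/* tag first, then write: avoids livelock */
	};

	do_writepages(mapping, &wbc);
}

do_writepages() is the entry point declared further down in this header.
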
54 77
55 /* 78 /*
56 * fs/fs-writeback.c 79 * fs/fs-writeback.c
57 */ 80 */
58 struct bdi_writeback; 81 struct bdi_writeback;
59 int inode_wait(void *); 82 int inode_wait(void *);
60 void writeback_inodes_sb(struct super_block *); 83 void writeback_inodes_sb(struct super_block *);
61 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); 84 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
62 int writeback_inodes_sb_if_idle(struct super_block *); 85 int writeback_inodes_sb_if_idle(struct super_block *);
63 int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); 86 int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
64 void sync_inodes_sb(struct super_block *); 87 void sync_inodes_sb(struct super_block *);
65 void writeback_inodes_wb(struct bdi_writeback *wb, 88 long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages);
66 struct writeback_control *wbc);
67 long wb_do_writeback(struct bdi_writeback *wb, int force_wait); 89 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
68 void wakeup_flusher_threads(long nr_pages); 90 void wakeup_flusher_threads(long nr_pages);
69 91
70 /* writeback.h requires fs.h; it, too, is not included from here. */ 92 /* writeback.h requires fs.h; it, too, is not included from here. */
71 static inline void wait_on_inode(struct inode *inode) 93 static inline void wait_on_inode(struct inode *inode)
72 { 94 {
73 might_sleep(); 95 might_sleep();
74 wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); 96 wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE);
75 } 97 }
76 static inline void inode_sync_wait(struct inode *inode) 98 static inline void inode_sync_wait(struct inode *inode)
77 { 99 {
78 might_sleep(); 100 might_sleep();
79 wait_on_bit(&inode->i_state, __I_SYNC, inode_wait, 101 wait_on_bit(&inode->i_state, __I_SYNC, inode_wait,
80 TASK_UNINTERRUPTIBLE); 102 TASK_UNINTERRUPTIBLE);
81 } 103 }
82 104
83 105
84 /* 106 /*
85 * mm/page-writeback.c 107 * mm/page-writeback.c
86 */ 108 */
87 #ifdef CONFIG_BLOCK 109 #ifdef CONFIG_BLOCK
88 void laptop_io_completion(struct backing_dev_info *info); 110 void laptop_io_completion(struct backing_dev_info *info);
89 void laptop_sync_completion(void); 111 void laptop_sync_completion(void);
90 void laptop_mode_sync(struct work_struct *work); 112 void laptop_mode_sync(struct work_struct *work);
91 void laptop_mode_timer_fn(unsigned long data); 113 void laptop_mode_timer_fn(unsigned long data);
92 #else 114 #else
93 static inline void laptop_sync_completion(void) { } 115 static inline void laptop_sync_completion(void) { }
94 #endif 116 #endif
95 void throttle_vm_writeout(gfp_t gfp_mask); 117 void throttle_vm_writeout(gfp_t gfp_mask);
96 118
119 extern unsigned long global_dirty_limit;
120
97 /* These are exported to sysctl. */ 121 /* These are exported to sysctl. */
98 extern int dirty_background_ratio; 122 extern int dirty_background_ratio;
99 extern unsigned long dirty_background_bytes; 123 extern unsigned long dirty_background_bytes;
100 extern int vm_dirty_ratio; 124 extern int vm_dirty_ratio;
101 extern unsigned long vm_dirty_bytes; 125 extern unsigned long vm_dirty_bytes;
102 extern unsigned int dirty_writeback_interval; 126 extern unsigned int dirty_writeback_interval;
103 extern unsigned int dirty_expire_interval; 127 extern unsigned int dirty_expire_interval;
104 extern int vm_highmem_is_dirtyable; 128 extern int vm_highmem_is_dirtyable;
105 extern int block_dump; 129 extern int block_dump;
106 extern int laptop_mode; 130 extern int laptop_mode;
107 131
108 extern unsigned long determine_dirtyable_memory(void); 132 extern unsigned long determine_dirtyable_memory(void);
109 133
110 extern int dirty_background_ratio_handler(struct ctl_table *table, int write, 134 extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
111 void __user *buffer, size_t *lenp, 135 void __user *buffer, size_t *lenp,
112 loff_t *ppos); 136 loff_t *ppos);
113 extern int dirty_background_bytes_handler(struct ctl_table *table, int write, 137 extern int dirty_background_bytes_handler(struct ctl_table *table, int write,
114 void __user *buffer, size_t *lenp, 138 void __user *buffer, size_t *lenp,
115 loff_t *ppos); 139 loff_t *ppos);
116 extern int dirty_ratio_handler(struct ctl_table *table, int write, 140 extern int dirty_ratio_handler(struct ctl_table *table, int write,
117 void __user *buffer, size_t *lenp, 141 void __user *buffer, size_t *lenp,
118 loff_t *ppos); 142 loff_t *ppos);
119 extern int dirty_bytes_handler(struct ctl_table *table, int write, 143 extern int dirty_bytes_handler(struct ctl_table *table, int write,
120 void __user *buffer, size_t *lenp, 144 void __user *buffer, size_t *lenp,
121 loff_t *ppos); 145 loff_t *ppos);
122 146
123 struct ctl_table; 147 struct ctl_table;
124 int dirty_writeback_centisecs_handler(struct ctl_table *, int, 148 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
125 void __user *, size_t *, loff_t *); 149 void __user *, size_t *, loff_t *);
126 150
127 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); 151 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
128 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, 152 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
129 unsigned long dirty); 153 unsigned long dirty);
154
155 void __bdi_update_bandwidth(struct backing_dev_info *bdi,
156 unsigned long thresh,
157 unsigned long dirty,
158 unsigned long bdi_thresh,
159 unsigned long bdi_dirty,
160 unsigned long start_time);
130 161
131 void page_writeback_init(void); 162 void page_writeback_init(void);
132 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 163 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
133 unsigned long nr_pages_dirtied); 164 unsigned long nr_pages_dirtied);
134 165
135 static inline void 166 static inline void
136 balance_dirty_pages_ratelimited(struct address_space *mapping) 167 balance_dirty_pages_ratelimited(struct address_space *mapping)
137 { 168 {
138 balance_dirty_pages_ratelimited_nr(mapping, 1); 169 balance_dirty_pages_ratelimited_nr(mapping, 1);
139 } 170 }
140 171
141 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, 172 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
142 void *data); 173 void *data);
143 174
144 int generic_writepages(struct address_space *mapping, 175 int generic_writepages(struct address_space *mapping,
145 struct writeback_control *wbc); 176 struct writeback_control *wbc);
146 void tag_pages_for_writeback(struct address_space *mapping, 177 void tag_pages_for_writeback(struct address_space *mapping,
147 pgoff_t start, pgoff_t end); 178 pgoff_t start, pgoff_t end);
148 int write_cache_pages(struct address_space *mapping, 179 int write_cache_pages(struct address_space *mapping,
149 struct writeback_control *wbc, writepage_t writepage, 180 struct writeback_control *wbc, writepage_t writepage,
150 void *data); 181 void *data);
151 int do_writepages(struct address_space *mapping, struct writeback_control *wbc); 182 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
152 void set_page_dirty_balance(struct page *page, int page_mkwrite); 183 void set_page_dirty_balance(struct page *page, int page_mkwrite);
153 void writeback_set_ratelimit(void); 184 void writeback_set_ratelimit(void);
154 void tag_pages_for_writeback(struct address_space *mapping, 185 void tag_pages_for_writeback(struct address_space *mapping,
include/trace/events/btrfs.h
1 #undef TRACE_SYSTEM 1 #undef TRACE_SYSTEM
2 #define TRACE_SYSTEM btrfs 2 #define TRACE_SYSTEM btrfs
3 3
4 #if !defined(_TRACE_BTRFS_H) || defined(TRACE_HEADER_MULTI_READ) 4 #if !defined(_TRACE_BTRFS_H) || defined(TRACE_HEADER_MULTI_READ)
5 #define _TRACE_BTRFS_H 5 #define _TRACE_BTRFS_H
6 6
7 #include <linux/writeback.h> 7 #include <linux/writeback.h>
8 #include <linux/tracepoint.h> 8 #include <linux/tracepoint.h>
9 9
10 struct btrfs_root; 10 struct btrfs_root;
11 struct btrfs_fs_info; 11 struct btrfs_fs_info;
12 struct btrfs_inode; 12 struct btrfs_inode;
13 struct extent_map; 13 struct extent_map;
14 struct btrfs_ordered_extent; 14 struct btrfs_ordered_extent;
15 struct btrfs_delayed_ref_node; 15 struct btrfs_delayed_ref_node;
16 struct btrfs_delayed_tree_ref; 16 struct btrfs_delayed_tree_ref;
17 struct btrfs_delayed_data_ref; 17 struct btrfs_delayed_data_ref;
18 struct btrfs_delayed_ref_head; 18 struct btrfs_delayed_ref_head;
19 struct map_lookup; 19 struct map_lookup;
20 struct extent_buffer; 20 struct extent_buffer;
21 21
22 #define show_ref_type(type) \ 22 #define show_ref_type(type) \
23 __print_symbolic(type, \ 23 __print_symbolic(type, \
24 { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ 24 { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \
25 { BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \ 25 { BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \
26 { BTRFS_EXTENT_REF_V0_KEY, "EXTENT_REF_V0" }, \ 26 { BTRFS_EXTENT_REF_V0_KEY, "EXTENT_REF_V0" }, \
27 { BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \ 27 { BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \
28 { BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" }) 28 { BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" })
29 29
30 #define __show_root_type(obj) \ 30 #define __show_root_type(obj) \
31 __print_symbolic_u64(obj, \ 31 __print_symbolic_u64(obj, \
32 { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, \ 32 { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, \
33 { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, \ 33 { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, \
34 { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, \ 34 { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, \
35 { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, \ 35 { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, \
36 { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, \ 36 { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, \
37 { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" }, \ 37 { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" }, \
38 { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, \ 38 { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, \
39 { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, \ 39 { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, \
40 { BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \ 40 { BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \
41 { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }) 41 { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" })
42 42
43 #define show_root_type(obj) \ 43 #define show_root_type(obj) \
44 obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ 44 obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \
45 (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" 45 (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-"
46 46
47 TRACE_EVENT(btrfs_transaction_commit, 47 TRACE_EVENT(btrfs_transaction_commit,
48 48
49 TP_PROTO(struct btrfs_root *root), 49 TP_PROTO(struct btrfs_root *root),
50 50
51 TP_ARGS(root), 51 TP_ARGS(root),
52 52
53 TP_STRUCT__entry( 53 TP_STRUCT__entry(
54 __field( u64, generation ) 54 __field( u64, generation )
55 __field( u64, root_objectid ) 55 __field( u64, root_objectid )
56 ), 56 ),
57 57
58 TP_fast_assign( 58 TP_fast_assign(
59 __entry->generation = root->fs_info->generation; 59 __entry->generation = root->fs_info->generation;
60 __entry->root_objectid = root->root_key.objectid; 60 __entry->root_objectid = root->root_key.objectid;
61 ), 61 ),
62 62
63 TP_printk("root = %llu(%s), gen = %llu", 63 TP_printk("root = %llu(%s), gen = %llu",
64 show_root_type(__entry->root_objectid), 64 show_root_type(__entry->root_objectid),
65 (unsigned long long)__entry->generation) 65 (unsigned long long)__entry->generation)
66 ); 66 );
67 67
68 DECLARE_EVENT_CLASS(btrfs__inode, 68 DECLARE_EVENT_CLASS(btrfs__inode,
69 69
70 TP_PROTO(struct inode *inode), 70 TP_PROTO(struct inode *inode),
71 71
72 TP_ARGS(inode), 72 TP_ARGS(inode),
73 73
74 TP_STRUCT__entry( 74 TP_STRUCT__entry(
75 __field( ino_t, ino ) 75 __field( ino_t, ino )
76 __field( blkcnt_t, blocks ) 76 __field( blkcnt_t, blocks )
77 __field( u64, disk_i_size ) 77 __field( u64, disk_i_size )
78 __field( u64, generation ) 78 __field( u64, generation )
79 __field( u64, last_trans ) 79 __field( u64, last_trans )
80 __field( u64, logged_trans ) 80 __field( u64, logged_trans )
81 __field( u64, root_objectid ) 81 __field( u64, root_objectid )
82 ), 82 ),
83 83
84 TP_fast_assign( 84 TP_fast_assign(
85 __entry->ino = inode->i_ino; 85 __entry->ino = inode->i_ino;
86 __entry->blocks = inode->i_blocks; 86 __entry->blocks = inode->i_blocks;
87 __entry->disk_i_size = BTRFS_I(inode)->disk_i_size; 87 __entry->disk_i_size = BTRFS_I(inode)->disk_i_size;
88 __entry->generation = BTRFS_I(inode)->generation; 88 __entry->generation = BTRFS_I(inode)->generation;
89 __entry->last_trans = BTRFS_I(inode)->last_trans; 89 __entry->last_trans = BTRFS_I(inode)->last_trans;
90 __entry->logged_trans = BTRFS_I(inode)->logged_trans; 90 __entry->logged_trans = BTRFS_I(inode)->logged_trans;
91 __entry->root_objectid = 91 __entry->root_objectid =
92 BTRFS_I(inode)->root->root_key.objectid; 92 BTRFS_I(inode)->root->root_key.objectid;
93 ), 93 ),
94 94
95 TP_printk("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, " 95 TP_printk("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, "
96 "disk_i_size = %llu, last_trans = %llu, logged_trans = %llu", 96 "disk_i_size = %llu, last_trans = %llu, logged_trans = %llu",
97 show_root_type(__entry->root_objectid), 97 show_root_type(__entry->root_objectid),
98 (unsigned long long)__entry->generation, 98 (unsigned long long)__entry->generation,
99 (unsigned long)__entry->ino, 99 (unsigned long)__entry->ino,
100 (unsigned long long)__entry->blocks, 100 (unsigned long long)__entry->blocks,
101 (unsigned long long)__entry->disk_i_size, 101 (unsigned long long)__entry->disk_i_size,
102 (unsigned long long)__entry->last_trans, 102 (unsigned long long)__entry->last_trans,
103 (unsigned long long)__entry->logged_trans) 103 (unsigned long long)__entry->logged_trans)
104 ); 104 );
105 105
106 DEFINE_EVENT(btrfs__inode, btrfs_inode_new, 106 DEFINE_EVENT(btrfs__inode, btrfs_inode_new,
107 107
108 TP_PROTO(struct inode *inode), 108 TP_PROTO(struct inode *inode),
109 109
110 TP_ARGS(inode) 110 TP_ARGS(inode)
111 ); 111 );
112 112
113 DEFINE_EVENT(btrfs__inode, btrfs_inode_request, 113 DEFINE_EVENT(btrfs__inode, btrfs_inode_request,
114 114
115 TP_PROTO(struct inode *inode), 115 TP_PROTO(struct inode *inode),
116 116
117 TP_ARGS(inode) 117 TP_ARGS(inode)
118 ); 118 );
119 119
120 DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, 120 DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
121 121
122 TP_PROTO(struct inode *inode), 122 TP_PROTO(struct inode *inode),
123 123
124 TP_ARGS(inode) 124 TP_ARGS(inode)
125 ); 125 );
126 126
127 #define __show_map_type(type) \ 127 #define __show_map_type(type) \
128 __print_symbolic_u64(type, \ 128 __print_symbolic_u64(type, \
129 { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ 129 { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \
130 { EXTENT_MAP_HOLE, "HOLE" }, \ 130 { EXTENT_MAP_HOLE, "HOLE" }, \
131 { EXTENT_MAP_INLINE, "INLINE" }, \ 131 { EXTENT_MAP_INLINE, "INLINE" }, \
132 { EXTENT_MAP_DELALLOC, "DELALLOC" }) 132 { EXTENT_MAP_DELALLOC, "DELALLOC" })
133 133
134 #define show_map_type(type) \ 134 #define show_map_type(type) \
135 type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type) 135 type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type)
136 136
137 #define show_map_flags(flag) \ 137 #define show_map_flags(flag) \
138 __print_flags(flag, "|", \ 138 __print_flags(flag, "|", \
139 { EXTENT_FLAG_PINNED, "PINNED" }, \ 139 { EXTENT_FLAG_PINNED, "PINNED" }, \
140 { EXTENT_FLAG_COMPRESSED, "COMPRESSED" }, \ 140 { EXTENT_FLAG_COMPRESSED, "COMPRESSED" }, \
141 { EXTENT_FLAG_VACANCY, "VACANCY" }, \ 141 { EXTENT_FLAG_VACANCY, "VACANCY" }, \
142 { EXTENT_FLAG_PREALLOC, "PREALLOC" }) 142 { EXTENT_FLAG_PREALLOC, "PREALLOC" })
143 143
144 TRACE_EVENT(btrfs_get_extent, 144 TRACE_EVENT(btrfs_get_extent,
145 145
146 TP_PROTO(struct btrfs_root *root, struct extent_map *map), 146 TP_PROTO(struct btrfs_root *root, struct extent_map *map),
147 147
148 TP_ARGS(root, map), 148 TP_ARGS(root, map),
149 149
150 TP_STRUCT__entry( 150 TP_STRUCT__entry(
151 __field( u64, root_objectid ) 151 __field( u64, root_objectid )
152 __field( u64, start ) 152 __field( u64, start )
153 __field( u64, len ) 153 __field( u64, len )
154 __field( u64, orig_start ) 154 __field( u64, orig_start )
155 __field( u64, block_start ) 155 __field( u64, block_start )
156 __field( u64, block_len ) 156 __field( u64, block_len )
157 __field( unsigned long, flags ) 157 __field( unsigned long, flags )
158 __field( int, refs ) 158 __field( int, refs )
159 __field( unsigned int, compress_type ) 159 __field( unsigned int, compress_type )
160 ), 160 ),
161 161
162 TP_fast_assign( 162 TP_fast_assign(
163 __entry->root_objectid = root->root_key.objectid; 163 __entry->root_objectid = root->root_key.objectid;
164 __entry->start = map->start; 164 __entry->start = map->start;
165 __entry->len = map->len; 165 __entry->len = map->len;
166 __entry->orig_start = map->orig_start; 166 __entry->orig_start = map->orig_start;
167 __entry->block_start = map->block_start; 167 __entry->block_start = map->block_start;
168 __entry->block_len = map->block_len; 168 __entry->block_len = map->block_len;
169 __entry->flags = map->flags; 169 __entry->flags = map->flags;
170 __entry->refs = atomic_read(&map->refs); 170 __entry->refs = atomic_read(&map->refs);
171 __entry->compress_type = map->compress_type; 171 __entry->compress_type = map->compress_type;
172 ), 172 ),
173 173
174 TP_printk("root = %llu(%s), start = %llu, len = %llu, " 174 TP_printk("root = %llu(%s), start = %llu, len = %llu, "
175 "orig_start = %llu, block_start = %llu(%s), " 175 "orig_start = %llu, block_start = %llu(%s), "
176 "block_len = %llu, flags = %s, refs = %u, " 176 "block_len = %llu, flags = %s, refs = %u, "
177 "compress_type = %u", 177 "compress_type = %u",
178 show_root_type(__entry->root_objectid), 178 show_root_type(__entry->root_objectid),
179 (unsigned long long)__entry->start, 179 (unsigned long long)__entry->start,
180 (unsigned long long)__entry->len, 180 (unsigned long long)__entry->len,
181 (unsigned long long)__entry->orig_start, 181 (unsigned long long)__entry->orig_start,
182 show_map_type(__entry->block_start), 182 show_map_type(__entry->block_start),
183 (unsigned long long)__entry->block_len, 183 (unsigned long long)__entry->block_len,
184 show_map_flags(__entry->flags), 184 show_map_flags(__entry->flags),
185 __entry->refs, __entry->compress_type) 185 __entry->refs, __entry->compress_type)
186 ); 186 );
187 187
188 #define show_ordered_flags(flags) \ 188 #define show_ordered_flags(flags) \
189 __print_symbolic(flags, \ 189 __print_symbolic(flags, \
190 { BTRFS_ORDERED_IO_DONE, "IO_DONE" }, \ 190 { BTRFS_ORDERED_IO_DONE, "IO_DONE" }, \
191 { BTRFS_ORDERED_COMPLETE, "COMPLETE" }, \ 191 { BTRFS_ORDERED_COMPLETE, "COMPLETE" }, \
192 { BTRFS_ORDERED_NOCOW, "NOCOW" }, \ 192 { BTRFS_ORDERED_NOCOW, "NOCOW" }, \
193 { BTRFS_ORDERED_COMPRESSED, "COMPRESSED" }, \ 193 { BTRFS_ORDERED_COMPRESSED, "COMPRESSED" }, \
194 { BTRFS_ORDERED_PREALLOC, "PREALLOC" }, \ 194 { BTRFS_ORDERED_PREALLOC, "PREALLOC" }, \
195 { BTRFS_ORDERED_DIRECT, "DIRECT" }) 195 { BTRFS_ORDERED_DIRECT, "DIRECT" })
196 196
197 DECLARE_EVENT_CLASS(btrfs__ordered_extent, 197 DECLARE_EVENT_CLASS(btrfs__ordered_extent,
198 198
199 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), 199 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
200 200
201 TP_ARGS(inode, ordered), 201 TP_ARGS(inode, ordered),
202 202
203 TP_STRUCT__entry( 203 TP_STRUCT__entry(
204 __field( ino_t, ino ) 204 __field( ino_t, ino )
205 __field( u64, file_offset ) 205 __field( u64, file_offset )
206 __field( u64, start ) 206 __field( u64, start )
207 __field( u64, len ) 207 __field( u64, len )
208 __field( u64, disk_len ) 208 __field( u64, disk_len )
209 __field( u64, bytes_left ) 209 __field( u64, bytes_left )
210 __field( unsigned long, flags ) 210 __field( unsigned long, flags )
211 __field( int, compress_type ) 211 __field( int, compress_type )
212 __field( int, refs ) 212 __field( int, refs )
213 __field( u64, root_objectid ) 213 __field( u64, root_objectid )
214 ), 214 ),
215 215
216 TP_fast_assign( 216 TP_fast_assign(
217 __entry->ino = inode->i_ino; 217 __entry->ino = inode->i_ino;
218 __entry->file_offset = ordered->file_offset; 218 __entry->file_offset = ordered->file_offset;
219 __entry->start = ordered->start; 219 __entry->start = ordered->start;
220 __entry->len = ordered->len; 220 __entry->len = ordered->len;
221 __entry->disk_len = ordered->disk_len; 221 __entry->disk_len = ordered->disk_len;
222 __entry->bytes_left = ordered->bytes_left; 222 __entry->bytes_left = ordered->bytes_left;
223 __entry->flags = ordered->flags; 223 __entry->flags = ordered->flags;
224 __entry->compress_type = ordered->compress_type; 224 __entry->compress_type = ordered->compress_type;
225 __entry->refs = atomic_read(&ordered->refs); 225 __entry->refs = atomic_read(&ordered->refs);
226 __entry->root_objectid = 226 __entry->root_objectid =
227 BTRFS_I(inode)->root->root_key.objectid; 227 BTRFS_I(inode)->root->root_key.objectid;
228 ), 228 ),
229 229
230 TP_printk("root = %llu(%s), ino = %llu, file_offset = %llu, " 230 TP_printk("root = %llu(%s), ino = %llu, file_offset = %llu, "
231 "start = %llu, len = %llu, disk_len = %llu, " 231 "start = %llu, len = %llu, disk_len = %llu, "
232 "bytes_left = %llu, flags = %s, compress_type = %d, " 232 "bytes_left = %llu, flags = %s, compress_type = %d, "
233 "refs = %d", 233 "refs = %d",
234 show_root_type(__entry->root_objectid), 234 show_root_type(__entry->root_objectid),
235 (unsigned long long)__entry->ino, 235 (unsigned long long)__entry->ino,
236 (unsigned long long)__entry->file_offset, 236 (unsigned long long)__entry->file_offset,
237 (unsigned long long)__entry->start, 237 (unsigned long long)__entry->start,
238 (unsigned long long)__entry->len, 238 (unsigned long long)__entry->len,
239 (unsigned long long)__entry->disk_len, 239 (unsigned long long)__entry->disk_len,
240 (unsigned long long)__entry->bytes_left, 240 (unsigned long long)__entry->bytes_left,
241 show_ordered_flags(__entry->flags), 241 show_ordered_flags(__entry->flags),
242 __entry->compress_type, __entry->refs) 242 __entry->compress_type, __entry->refs)
243 ); 243 );
244 244
245 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, 245 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add,
246 246
247 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), 247 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
248 248
249 TP_ARGS(inode, ordered) 249 TP_ARGS(inode, ordered)
250 ); 250 );
251 251
252 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, 252 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove,
253 253
254 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), 254 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
255 255
256 TP_ARGS(inode, ordered) 256 TP_ARGS(inode, ordered)
257 ); 257 );
258 258
259 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, 259 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start,
260 260
261 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), 261 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
262 262
263 TP_ARGS(inode, ordered) 263 TP_ARGS(inode, ordered)
264 ); 264 );
265 265
266 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, 266 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put,
267 267
268 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), 268 TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
269 269
270 TP_ARGS(inode, ordered) 270 TP_ARGS(inode, ordered)
271 ); 271 );
272 272
273 DECLARE_EVENT_CLASS(btrfs__writepage, 273 DECLARE_EVENT_CLASS(btrfs__writepage,
274 274
275 TP_PROTO(struct page *page, struct inode *inode, 275 TP_PROTO(struct page *page, struct inode *inode,
276 struct writeback_control *wbc), 276 struct writeback_control *wbc),
277 277
278 TP_ARGS(page, inode, wbc), 278 TP_ARGS(page, inode, wbc),
279 279
280 TP_STRUCT__entry( 280 TP_STRUCT__entry(
281 __field( ino_t, ino ) 281 __field( ino_t, ino )
282 __field( pgoff_t, index ) 282 __field( pgoff_t, index )
283 __field( long, nr_to_write ) 283 __field( long, nr_to_write )
284 __field( long, pages_skipped ) 284 __field( long, pages_skipped )
285 __field( loff_t, range_start ) 285 __field( loff_t, range_start )
286 __field( loff_t, range_end ) 286 __field( loff_t, range_end )
287 __field( char, nonblocking )
288 __field( char, for_kupdate ) 287 __field( char, for_kupdate )
289 __field( char, for_reclaim ) 288 __field( char, for_reclaim )
290 __field( char, range_cyclic ) 289 __field( char, range_cyclic )
291 __field( pgoff_t, writeback_index ) 290 __field( pgoff_t, writeback_index )
292 __field( u64, root_objectid ) 291 __field( u64, root_objectid )
293 ), 292 ),
294 293
295 TP_fast_assign( 294 TP_fast_assign(
296 __entry->ino = inode->i_ino; 295 __entry->ino = inode->i_ino;
297 __entry->index = page->index; 296 __entry->index = page->index;
298 __entry->nr_to_write = wbc->nr_to_write; 297 __entry->nr_to_write = wbc->nr_to_write;
299 __entry->pages_skipped = wbc->pages_skipped; 298 __entry->pages_skipped = wbc->pages_skipped;
300 __entry->range_start = wbc->range_start; 299 __entry->range_start = wbc->range_start;
301 __entry->range_end = wbc->range_end; 300 __entry->range_end = wbc->range_end;
302 __entry->nonblocking = wbc->nonblocking;
303 __entry->for_kupdate = wbc->for_kupdate; 301 __entry->for_kupdate = wbc->for_kupdate;
304 __entry->for_reclaim = wbc->for_reclaim; 302 __entry->for_reclaim = wbc->for_reclaim;
305 __entry->range_cyclic = wbc->range_cyclic; 303 __entry->range_cyclic = wbc->range_cyclic;
306 __entry->writeback_index = inode->i_mapping->writeback_index; 304 __entry->writeback_index = inode->i_mapping->writeback_index;
307 __entry->root_objectid = 305 __entry->root_objectid =
308 BTRFS_I(inode)->root->root_key.objectid; 306 BTRFS_I(inode)->root->root_key.objectid;
309 ), 307 ),
310 308
311 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " 309 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, "
312 "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " 310 "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, "
313 "range_end = %llu, nonblocking = %d, for_kupdate = %d, " 311 "range_end = %llu, for_kupdate = %d, "
314 "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", 312 "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu",
315 show_root_type(__entry->root_objectid), 313 show_root_type(__entry->root_objectid),
316 (unsigned long)__entry->ino, __entry->index, 314 (unsigned long)__entry->ino, __entry->index,
317 __entry->nr_to_write, __entry->pages_skipped, 315 __entry->nr_to_write, __entry->pages_skipped,
318 __entry->range_start, __entry->range_end, 316 __entry->range_start, __entry->range_end,
319 __entry->nonblocking, __entry->for_kupdate, 317 __entry->for_kupdate,
320 __entry->for_reclaim, __entry->range_cyclic, 318 __entry->for_reclaim, __entry->range_cyclic,
321 (unsigned long)__entry->writeback_index) 319 (unsigned long)__entry->writeback_index)
322 ); 320 );
323 321
324 DEFINE_EVENT(btrfs__writepage, __extent_writepage, 322 DEFINE_EVENT(btrfs__writepage, __extent_writepage,
325 323
326 TP_PROTO(struct page *page, struct inode *inode, 324 TP_PROTO(struct page *page, struct inode *inode,
327 struct writeback_control *wbc), 325 struct writeback_control *wbc),
328 326
329 TP_ARGS(page, inode, wbc) 327 TP_ARGS(page, inode, wbc)
330 ); 328 );
331 329
332 TRACE_EVENT(btrfs_writepage_end_io_hook, 330 TRACE_EVENT(btrfs_writepage_end_io_hook,
333 331
334 TP_PROTO(struct page *page, u64 start, u64 end, int uptodate), 332 TP_PROTO(struct page *page, u64 start, u64 end, int uptodate),
335 333
336 TP_ARGS(page, start, end, uptodate), 334 TP_ARGS(page, start, end, uptodate),
337 335
338 TP_STRUCT__entry( 336 TP_STRUCT__entry(
339 __field( ino_t, ino ) 337 __field( ino_t, ino )
340 __field( pgoff_t, index ) 338 __field( pgoff_t, index )
341 __field( u64, start ) 339 __field( u64, start )
342 __field( u64, end ) 340 __field( u64, end )
343 __field( int, uptodate ) 341 __field( int, uptodate )
344 __field( u64, root_objectid ) 342 __field( u64, root_objectid )
345 ), 343 ),
346 344
347 TP_fast_assign( 345 TP_fast_assign(
348 __entry->ino = page->mapping->host->i_ino; 346 __entry->ino = page->mapping->host->i_ino;
349 __entry->index = page->index; 347 __entry->index = page->index;
350 __entry->start = start; 348 __entry->start = start;
351 __entry->end = end; 349 __entry->end = end;
352 __entry->uptodate = uptodate; 350 __entry->uptodate = uptodate;
353 __entry->root_objectid = 351 __entry->root_objectid =
354 BTRFS_I(page->mapping->host)->root->root_key.objectid; 352 BTRFS_I(page->mapping->host)->root->root_key.objectid;
355 ), 353 ),
356 354
357 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, " 355 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, "
358 "end = %llu, uptodate = %d", 356 "end = %llu, uptodate = %d",
359 show_root_type(__entry->root_objectid), 357 show_root_type(__entry->root_objectid),
360 (unsigned long)__entry->ino, (unsigned long)__entry->index, 358 (unsigned long)__entry->ino, (unsigned long)__entry->index,
361 (unsigned long long)__entry->start, 359 (unsigned long long)__entry->start,
362 (unsigned long long)__entry->end, __entry->uptodate) 360 (unsigned long long)__entry->end, __entry->uptodate)
363 ); 361 );
364 362
365 TRACE_EVENT(btrfs_sync_file, 363 TRACE_EVENT(btrfs_sync_file,
366 364
367 TP_PROTO(struct file *file, int datasync), 365 TP_PROTO(struct file *file, int datasync),
368 366
369 TP_ARGS(file, datasync), 367 TP_ARGS(file, datasync),
370 368
371 TP_STRUCT__entry( 369 TP_STRUCT__entry(
372 __field( ino_t, ino ) 370 __field( ino_t, ino )
373 __field( ino_t, parent ) 371 __field( ino_t, parent )
374 __field( int, datasync ) 372 __field( int, datasync )
375 __field( u64, root_objectid ) 373 __field( u64, root_objectid )
376 ), 374 ),
377 375
378 TP_fast_assign( 376 TP_fast_assign(
379 struct dentry *dentry = file->f_path.dentry; 377 struct dentry *dentry = file->f_path.dentry;
380 struct inode *inode = dentry->d_inode; 378 struct inode *inode = dentry->d_inode;
381 379
382 __entry->ino = inode->i_ino; 380 __entry->ino = inode->i_ino;
383 __entry->parent = dentry->d_parent->d_inode->i_ino; 381 __entry->parent = dentry->d_parent->d_inode->i_ino;
384 __entry->datasync = datasync; 382 __entry->datasync = datasync;
385 __entry->root_objectid = 383 __entry->root_objectid =
386 BTRFS_I(inode)->root->root_key.objectid; 384 BTRFS_I(inode)->root->root_key.objectid;
387 ), 385 ),
388 386
389 TP_printk("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d", 387 TP_printk("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d",
390 show_root_type(__entry->root_objectid), 388 show_root_type(__entry->root_objectid),
391 (unsigned long)__entry->ino, (unsigned long)__entry->parent, 389 (unsigned long)__entry->ino, (unsigned long)__entry->parent,
392 __entry->datasync) 390 __entry->datasync)
393 ); 391 );
394 392
395 TRACE_EVENT(btrfs_sync_fs, 393 TRACE_EVENT(btrfs_sync_fs,
396 394
397 TP_PROTO(int wait), 395 TP_PROTO(int wait),
398 396
399 TP_ARGS(wait), 397 TP_ARGS(wait),
400 398
401 TP_STRUCT__entry( 399 TP_STRUCT__entry(
402 __field( int, wait ) 400 __field( int, wait )
403 ), 401 ),
404 402
405 TP_fast_assign( 403 TP_fast_assign(
406 __entry->wait = wait; 404 __entry->wait = wait;
407 ), 405 ),
408 406
409 TP_printk("wait = %d", __entry->wait) 407 TP_printk("wait = %d", __entry->wait)
410 ); 408 );
411 409
412 #define show_ref_action(action) \ 410 #define show_ref_action(action) \
413 __print_symbolic(action, \ 411 __print_symbolic(action, \
414 { BTRFS_ADD_DELAYED_REF, "ADD_DELAYED_REF" }, \ 412 { BTRFS_ADD_DELAYED_REF, "ADD_DELAYED_REF" }, \
415 { BTRFS_DROP_DELAYED_REF, "DROP_DELAYED_REF" }, \ 413 { BTRFS_DROP_DELAYED_REF, "DROP_DELAYED_REF" }, \
416 { BTRFS_ADD_DELAYED_EXTENT, "ADD_DELAYED_EXTENT" }, \ 414 { BTRFS_ADD_DELAYED_EXTENT, "ADD_DELAYED_EXTENT" }, \
417 { BTRFS_UPDATE_DELAYED_HEAD, "UPDATE_DELAYED_HEAD" }) 415 { BTRFS_UPDATE_DELAYED_HEAD, "UPDATE_DELAYED_HEAD" })
418 416
419 417
420 TRACE_EVENT(btrfs_delayed_tree_ref, 418 TRACE_EVENT(btrfs_delayed_tree_ref,
421 419
422 TP_PROTO(struct btrfs_delayed_ref_node *ref, 420 TP_PROTO(struct btrfs_delayed_ref_node *ref,
423 struct btrfs_delayed_tree_ref *full_ref, 421 struct btrfs_delayed_tree_ref *full_ref,
424 int action), 422 int action),
425 423
426 TP_ARGS(ref, full_ref, action), 424 TP_ARGS(ref, full_ref, action),
427 425
428 TP_STRUCT__entry( 426 TP_STRUCT__entry(
429 __field( u64, bytenr ) 427 __field( u64, bytenr )
430 __field( u64, num_bytes ) 428 __field( u64, num_bytes )
431 __field( int, action ) 429 __field( int, action )
432 __field( u64, parent ) 430 __field( u64, parent )
433 __field( u64, ref_root ) 431 __field( u64, ref_root )
434 __field( int, level ) 432 __field( int, level )
435 __field( int, type ) 433 __field( int, type )
436 ), 434 ),
437 435
438 TP_fast_assign( 436 TP_fast_assign(
439 __entry->bytenr = ref->bytenr; 437 __entry->bytenr = ref->bytenr;
440 __entry->num_bytes = ref->num_bytes; 438 __entry->num_bytes = ref->num_bytes;
441 __entry->action = action; 439 __entry->action = action;
442 __entry->parent = full_ref->parent; 440 __entry->parent = full_ref->parent;
443 __entry->ref_root = full_ref->root; 441 __entry->ref_root = full_ref->root;
444 __entry->level = full_ref->level; 442 __entry->level = full_ref->level;
445 __entry->type = ref->type; 443 __entry->type = ref->type;
446 ), 444 ),
447 445
448 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " 446 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
449 "parent = %llu(%s), ref_root = %llu(%s), level = %d, " 447 "parent = %llu(%s), ref_root = %llu(%s), level = %d, "
450 "type = %s", 448 "type = %s",
451 (unsigned long long)__entry->bytenr, 449 (unsigned long long)__entry->bytenr,
452 (unsigned long long)__entry->num_bytes, 450 (unsigned long long)__entry->num_bytes,
453 show_ref_action(__entry->action), 451 show_ref_action(__entry->action),
454 show_root_type(__entry->parent), 452 show_root_type(__entry->parent),
455 show_root_type(__entry->ref_root), 453 show_root_type(__entry->ref_root),
456 __entry->level, show_ref_type(__entry->type)) 454 __entry->level, show_ref_type(__entry->type))
457 ); 455 );
458 456
459 TRACE_EVENT(btrfs_delayed_data_ref, 457 TRACE_EVENT(btrfs_delayed_data_ref,
460 458
461 TP_PROTO(struct btrfs_delayed_ref_node *ref, 459 TP_PROTO(struct btrfs_delayed_ref_node *ref,
462 struct btrfs_delayed_data_ref *full_ref, 460 struct btrfs_delayed_data_ref *full_ref,
463 int action), 461 int action),
464 462
465 TP_ARGS(ref, full_ref, action), 463 TP_ARGS(ref, full_ref, action),
466 464
467 TP_STRUCT__entry( 465 TP_STRUCT__entry(
468 __field( u64, bytenr ) 466 __field( u64, bytenr )
469 __field( u64, num_bytes ) 467 __field( u64, num_bytes )
470 __field( int, action ) 468 __field( int, action )
471 __field( u64, parent ) 469 __field( u64, parent )
472 __field( u64, ref_root ) 470 __field( u64, ref_root )
473 __field( u64, owner ) 471 __field( u64, owner )
474 __field( u64, offset ) 472 __field( u64, offset )
475 __field( int, type ) 473 __field( int, type )
476 ), 474 ),
477 475
478 TP_fast_assign( 476 TP_fast_assign(
479 __entry->bytenr = ref->bytenr; 477 __entry->bytenr = ref->bytenr;
480 __entry->num_bytes = ref->num_bytes; 478 __entry->num_bytes = ref->num_bytes;
481 __entry->action = action; 479 __entry->action = action;
482 __entry->parent = full_ref->parent; 480 __entry->parent = full_ref->parent;
483 __entry->ref_root = full_ref->root; 481 __entry->ref_root = full_ref->root;
484 __entry->owner = full_ref->objectid; 482 __entry->owner = full_ref->objectid;
485 __entry->offset = full_ref->offset; 483 __entry->offset = full_ref->offset;
486 __entry->type = ref->type; 484 __entry->type = ref->type;
487 ), 485 ),
488 486
489 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " 487 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
490 "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, " 488 "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, "
491 "offset = %llu, type = %s", 489 "offset = %llu, type = %s",
492 (unsigned long long)__entry->bytenr, 490 (unsigned long long)__entry->bytenr,
493 (unsigned long long)__entry->num_bytes, 491 (unsigned long long)__entry->num_bytes,
494 show_ref_action(__entry->action), 492 show_ref_action(__entry->action),
495 show_root_type(__entry->parent), 493 show_root_type(__entry->parent),
496 show_root_type(__entry->ref_root), 494 show_root_type(__entry->ref_root),
497 (unsigned long long)__entry->owner, 495 (unsigned long long)__entry->owner,
498 (unsigned long long)__entry->offset, 496 (unsigned long long)__entry->offset,
499 show_ref_type(__entry->type)) 497 show_ref_type(__entry->type))
500 ); 498 );
501 499
502 TRACE_EVENT(btrfs_delayed_ref_head, 500 TRACE_EVENT(btrfs_delayed_ref_head,
503 501
504 TP_PROTO(struct btrfs_delayed_ref_node *ref, 502 TP_PROTO(struct btrfs_delayed_ref_node *ref,
505 struct btrfs_delayed_ref_head *head_ref, 503 struct btrfs_delayed_ref_head *head_ref,
506 int action), 504 int action),
507 505
508 TP_ARGS(ref, head_ref, action), 506 TP_ARGS(ref, head_ref, action),
509 507
510 TP_STRUCT__entry( 508 TP_STRUCT__entry(
511 __field( u64, bytenr ) 509 __field( u64, bytenr )
512 __field( u64, num_bytes ) 510 __field( u64, num_bytes )
513 __field( int, action ) 511 __field( int, action )
514 __field( int, is_data ) 512 __field( int, is_data )
515 ), 513 ),
516 514
517 TP_fast_assign( 515 TP_fast_assign(
518 __entry->bytenr = ref->bytenr; 516 __entry->bytenr = ref->bytenr;
519 __entry->num_bytes = ref->num_bytes; 517 __entry->num_bytes = ref->num_bytes;
520 __entry->action = action; 518 __entry->action = action;
521 __entry->is_data = head_ref->is_data; 519 __entry->is_data = head_ref->is_data;
522 ), 520 ),
523 521
524 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d", 522 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d",
525 (unsigned long long)__entry->bytenr, 523 (unsigned long long)__entry->bytenr,
526 (unsigned long long)__entry->num_bytes, 524 (unsigned long long)__entry->num_bytes,
527 show_ref_action(__entry->action), 525 show_ref_action(__entry->action),
528 __entry->is_data) 526 __entry->is_data)
529 ); 527 );
530 528
531 #define show_chunk_type(type) \ 529 #define show_chunk_type(type) \
532 __print_flags(type, "|", \ 530 __print_flags(type, "|", \
533 { BTRFS_BLOCK_GROUP_DATA, "DATA" }, \ 531 { BTRFS_BLOCK_GROUP_DATA, "DATA" }, \
534 { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ 532 { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \
535 { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ 533 { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \
536 { BTRFS_BLOCK_GROUP_RAID0, "RAID0" }, \ 534 { BTRFS_BLOCK_GROUP_RAID0, "RAID0" }, \
537 { BTRFS_BLOCK_GROUP_RAID1, "RAID1" }, \ 535 { BTRFS_BLOCK_GROUP_RAID1, "RAID1" }, \
538 { BTRFS_BLOCK_GROUP_DUP, "DUP" }, \ 536 { BTRFS_BLOCK_GROUP_DUP, "DUP" }, \
539 { BTRFS_BLOCK_GROUP_RAID10, "RAID10"}) 537 { BTRFS_BLOCK_GROUP_RAID10, "RAID10"})
540 538
541 DECLARE_EVENT_CLASS(btrfs__chunk, 539 DECLARE_EVENT_CLASS(btrfs__chunk,
542 540
543 TP_PROTO(struct btrfs_root *root, struct map_lookup *map, 541 TP_PROTO(struct btrfs_root *root, struct map_lookup *map,
544 u64 offset, u64 size), 542 u64 offset, u64 size),
545 543
546 TP_ARGS(root, map, offset, size), 544 TP_ARGS(root, map, offset, size),
547 545
548 TP_STRUCT__entry( 546 TP_STRUCT__entry(
549 __field( int, num_stripes ) 547 __field( int, num_stripes )
550 __field( u64, type ) 548 __field( u64, type )
551 __field( int, sub_stripes ) 549 __field( int, sub_stripes )
552 __field( u64, offset ) 550 __field( u64, offset )
553 __field( u64, size ) 551 __field( u64, size )
554 __field( u64, root_objectid ) 552 __field( u64, root_objectid )
555 ), 553 ),
556 554
557 TP_fast_assign( 555 TP_fast_assign(
558 __entry->num_stripes = map->num_stripes; 556 __entry->num_stripes = map->num_stripes;
559 __entry->type = map->type; 557 __entry->type = map->type;
560 __entry->sub_stripes = map->sub_stripes; 558 __entry->sub_stripes = map->sub_stripes;
561 __entry->offset = offset; 559 __entry->offset = offset;
562 __entry->size = size; 560 __entry->size = size;
563 __entry->root_objectid = root->root_key.objectid; 561 __entry->root_objectid = root->root_key.objectid;
564 ), 562 ),
565 563
566 TP_printk("root = %llu(%s), offset = %llu, size = %llu, " 564 TP_printk("root = %llu(%s), offset = %llu, size = %llu, "
567 "num_stripes = %d, sub_stripes = %d, type = %s", 565 "num_stripes = %d, sub_stripes = %d, type = %s",
568 show_root_type(__entry->root_objectid), 566 show_root_type(__entry->root_objectid),
569 (unsigned long long)__entry->offset, 567 (unsigned long long)__entry->offset,
570 (unsigned long long)__entry->size, 568 (unsigned long long)__entry->size,
571 __entry->num_stripes, __entry->sub_stripes, 569 __entry->num_stripes, __entry->sub_stripes,
572 show_chunk_type(__entry->type)) 570 show_chunk_type(__entry->type))
573 ); 571 );
574 572
575 DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc, 573 DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc,
576 574
577 TP_PROTO(struct btrfs_root *root, struct map_lookup *map, 575 TP_PROTO(struct btrfs_root *root, struct map_lookup *map,
578 u64 offset, u64 size), 576 u64 offset, u64 size),
579 577
580 TP_ARGS(root, map, offset, size) 578 TP_ARGS(root, map, offset, size)
581 ); 579 );
582 580
583 DEFINE_EVENT(btrfs__chunk, btrfs_chunk_free, 581 DEFINE_EVENT(btrfs__chunk, btrfs_chunk_free,
584 582
585 TP_PROTO(struct btrfs_root *root, struct map_lookup *map, 583 TP_PROTO(struct btrfs_root *root, struct map_lookup *map,
586 u64 offset, u64 size), 584 u64 offset, u64 size),
587 585
588 TP_ARGS(root, map, offset, size) 586 TP_ARGS(root, map, offset, size)
589 ); 587 );
590 588
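A note on the pattern above, for readers new to the tracing macros: DECLARE_EVENT_CLASS() defines the record layout, assignment and formatting once, and each DEFINE_EVENT() stamps out a concrete event (here trace_btrfs_chunk_alloc() and trace_btrfs_chunk_free()) that shares that template. Purely as a hypothetical illustration, a further chunk event would only need the few lines below; "btrfs_chunk_relocate" is not a real event in this tree.

DEFINE_EVENT(btrfs__chunk, btrfs_chunk_relocate,

	TP_PROTO(struct btrfs_root *root, struct map_lookup *map,
		 u64 offset, u64 size),

	TP_ARGS(root, map, offset, size)
);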
591 TRACE_EVENT(btrfs_cow_block, 589 TRACE_EVENT(btrfs_cow_block,
592 590
593 TP_PROTO(struct btrfs_root *root, struct extent_buffer *buf, 591 TP_PROTO(struct btrfs_root *root, struct extent_buffer *buf,
594 struct extent_buffer *cow), 592 struct extent_buffer *cow),
595 593
596 TP_ARGS(root, buf, cow), 594 TP_ARGS(root, buf, cow),
597 595
598 TP_STRUCT__entry( 596 TP_STRUCT__entry(
599 __field( u64, root_objectid ) 597 __field( u64, root_objectid )
600 __field( u64, buf_start ) 598 __field( u64, buf_start )
601 __field( int, refs ) 599 __field( int, refs )
602 __field( u64, cow_start ) 600 __field( u64, cow_start )
603 __field( int, buf_level ) 601 __field( int, buf_level )
604 __field( int, cow_level ) 602 __field( int, cow_level )
605 ), 603 ),
606 604
607 TP_fast_assign( 605 TP_fast_assign(
608 __entry->root_objectid = root->root_key.objectid; 606 __entry->root_objectid = root->root_key.objectid;
609 __entry->buf_start = buf->start; 607 __entry->buf_start = buf->start;
610 __entry->refs = atomic_read(&buf->refs); 608 __entry->refs = atomic_read(&buf->refs);
611 __entry->cow_start = cow->start; 609 __entry->cow_start = cow->start;
612 __entry->buf_level = btrfs_header_level(buf); 610 __entry->buf_level = btrfs_header_level(buf);
613 __entry->cow_level = btrfs_header_level(cow); 611 __entry->cow_level = btrfs_header_level(cow);
614 ), 612 ),
615 613
616 TP_printk("root = %llu(%s), refs = %d, orig_buf = %llu " 614 TP_printk("root = %llu(%s), refs = %d, orig_buf = %llu "
617 "(orig_level = %d), cow_buf = %llu (cow_level = %d)", 615 "(orig_level = %d), cow_buf = %llu (cow_level = %d)",
618 show_root_type(__entry->root_objectid), 616 show_root_type(__entry->root_objectid),
619 __entry->refs, 617 __entry->refs,
620 (unsigned long long)__entry->buf_start, 618 (unsigned long long)__entry->buf_start,
621 __entry->buf_level, 619 __entry->buf_level,
622 (unsigned long long)__entry->cow_start, 620 (unsigned long long)__entry->cow_start,
623 __entry->cow_level) 621 __entry->cow_level)
624 ); 622 );
625 623
626 DECLARE_EVENT_CLASS(btrfs__reserved_extent, 624 DECLARE_EVENT_CLASS(btrfs__reserved_extent,
627 625
628 TP_PROTO(struct btrfs_root *root, u64 start, u64 len), 626 TP_PROTO(struct btrfs_root *root, u64 start, u64 len),
629 627
630 TP_ARGS(root, start, len), 628 TP_ARGS(root, start, len),
631 629
632 TP_STRUCT__entry( 630 TP_STRUCT__entry(
633 __field( u64, root_objectid ) 631 __field( u64, root_objectid )
634 __field( u64, start ) 632 __field( u64, start )
635 __field( u64, len ) 633 __field( u64, len )
636 ), 634 ),
637 635
638 TP_fast_assign( 636 TP_fast_assign(
639 __entry->root_objectid = root->root_key.objectid; 637 __entry->root_objectid = root->root_key.objectid;
640 __entry->start = start; 638 __entry->start = start;
641 __entry->len = len; 639 __entry->len = len;
642 ), 640 ),
643 641
644 TP_printk("root = %llu(%s), start = %llu, len = %llu", 642 TP_printk("root = %llu(%s), start = %llu, len = %llu",
645 show_root_type(__entry->root_objectid), 643 show_root_type(__entry->root_objectid),
646 (unsigned long long)__entry->start, 644 (unsigned long long)__entry->start,
647 (unsigned long long)__entry->len) 645 (unsigned long long)__entry->len)
648 ); 646 );
649 647
650 DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_alloc, 648 DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_alloc,
651 649
652 TP_PROTO(struct btrfs_root *root, u64 start, u64 len), 650 TP_PROTO(struct btrfs_root *root, u64 start, u64 len),
653 651
654 TP_ARGS(root, start, len) 652 TP_ARGS(root, start, len)
655 ); 653 );
656 654
657 DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, 655 DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free,
658 656
659 TP_PROTO(struct btrfs_root *root, u64 start, u64 len), 657 TP_PROTO(struct btrfs_root *root, u64 start, u64 len),
660 658
661 TP_ARGS(root, start, len) 659 TP_ARGS(root, start, len)
662 ); 660 );
663 661
664 #endif /* _TRACE_BTRFS_H */ 662 #endif /* _TRACE_BTRFS_H */
665 663
666 /* This part must be outside protection */ 664 /* This part must be outside protection */
667 #include <trace/define_trace.h> 665 #include <trace/define_trace.h>
668 666
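These btrfs events are consumed through the usual TRACE_EVENT machinery: each definition above expands into a trace_<name>() helper that is a near no-op until the event is enabled, and exactly one compilation unit in the filesystem defines CREATE_TRACE_POINTS before including this header to instantiate the tracepoints. A minimal sketch of such a call site, assuming btrfs wires trace_btrfs_sync_fs() into its sync_fs handler (the surrounding code is illustrative, not the real implementation):

#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>

static int btrfs_sync_fs(struct super_block *sb, int wait)
{
	/* Fires the btrfs_sync_fs event defined above; compiles to a
	 * static-branch no-op while the event is disabled. */
	trace_btrfs_sync_fs(wait);

	/* ... actual transaction commit / delalloc flushing elided ... */
	return 0;
}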
include/trace/events/ext4.h
1 #undef TRACE_SYSTEM 1 #undef TRACE_SYSTEM
2 #define TRACE_SYSTEM ext4 2 #define TRACE_SYSTEM ext4
3 3
4 #if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) 4 #if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ)
5 #define _TRACE_EXT4_H 5 #define _TRACE_EXT4_H
6 6
7 #include <linux/writeback.h> 7 #include <linux/writeback.h>
8 #include <linux/tracepoint.h> 8 #include <linux/tracepoint.h>
9 9
10 struct ext4_allocation_context; 10 struct ext4_allocation_context;
11 struct ext4_allocation_request; 11 struct ext4_allocation_request;
12 struct ext4_prealloc_space; 12 struct ext4_prealloc_space;
13 struct ext4_inode_info; 13 struct ext4_inode_info;
14 struct mpage_da_data; 14 struct mpage_da_data;
15 15
16 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) 16 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
17 17
18 TRACE_EVENT(ext4_free_inode, 18 TRACE_EVENT(ext4_free_inode,
19 TP_PROTO(struct inode *inode), 19 TP_PROTO(struct inode *inode),
20 20
21 TP_ARGS(inode), 21 TP_ARGS(inode),
22 22
23 TP_STRUCT__entry( 23 TP_STRUCT__entry(
24 __field( dev_t, dev ) 24 __field( dev_t, dev )
25 __field( ino_t, ino ) 25 __field( ino_t, ino )
26 __field( umode_t, mode ) 26 __field( umode_t, mode )
27 __field( uid_t, uid ) 27 __field( uid_t, uid )
28 __field( gid_t, gid ) 28 __field( gid_t, gid )
29 __field( __u64, blocks ) 29 __field( __u64, blocks )
30 ), 30 ),
31 31
32 TP_fast_assign( 32 TP_fast_assign(
33 __entry->dev = inode->i_sb->s_dev; 33 __entry->dev = inode->i_sb->s_dev;
34 __entry->ino = inode->i_ino; 34 __entry->ino = inode->i_ino;
35 __entry->mode = inode->i_mode; 35 __entry->mode = inode->i_mode;
36 __entry->uid = inode->i_uid; 36 __entry->uid = inode->i_uid;
37 __entry->gid = inode->i_gid; 37 __entry->gid = inode->i_gid;
38 __entry->blocks = inode->i_blocks; 38 __entry->blocks = inode->i_blocks;
39 ), 39 ),
40 40
41 TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", 41 TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
42 MAJOR(__entry->dev), MINOR(__entry->dev), 42 MAJOR(__entry->dev), MINOR(__entry->dev),
43 (unsigned long) __entry->ino, __entry->mode, 43 (unsigned long) __entry->ino, __entry->mode,
44 __entry->uid, __entry->gid, __entry->blocks) 44 __entry->uid, __entry->gid, __entry->blocks)
45 ); 45 );
46 46
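Each of these ext4 events likewise becomes a trace_<name>() call in the filesystem proper; all of the field capture happens in TP_fast_assign() above, so the call site only hands over objects it already holds. A hedged sketch of where trace_ext4_free_inode() would sit, assuming the inode-freeing path in fs/ext4/ialloc.c (the body shown is a placeholder, not the real function):

void ext4_free_inode(handle_t *handle, struct inode *inode)
{
	/* Record dev/ino/mode/uid/gid/blocks before the inode is torn down. */
	trace_ext4_free_inode(inode);

	/* ... bitmap updates, orphan-list handling, etc. elided ... */
}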
47 TRACE_EVENT(ext4_request_inode, 47 TRACE_EVENT(ext4_request_inode,
48 TP_PROTO(struct inode *dir, int mode), 48 TP_PROTO(struct inode *dir, int mode),
49 49
50 TP_ARGS(dir, mode), 50 TP_ARGS(dir, mode),
51 51
52 TP_STRUCT__entry( 52 TP_STRUCT__entry(
53 __field( dev_t, dev ) 53 __field( dev_t, dev )
54 __field( ino_t, dir ) 54 __field( ino_t, dir )
55 __field( umode_t, mode ) 55 __field( umode_t, mode )
56 ), 56 ),
57 57
58 TP_fast_assign( 58 TP_fast_assign(
59 __entry->dev = dir->i_sb->s_dev; 59 __entry->dev = dir->i_sb->s_dev;
60 __entry->dir = dir->i_ino; 60 __entry->dir = dir->i_ino;
61 __entry->mode = mode; 61 __entry->mode = mode;
62 ), 62 ),
63 63
64 TP_printk("dev %d,%d dir %lu mode 0%o", 64 TP_printk("dev %d,%d dir %lu mode 0%o",
65 MAJOR(__entry->dev), MINOR(__entry->dev), 65 MAJOR(__entry->dev), MINOR(__entry->dev),
66 (unsigned long) __entry->dir, __entry->mode) 66 (unsigned long) __entry->dir, __entry->mode)
67 ); 67 );
68 68
69 TRACE_EVENT(ext4_allocate_inode, 69 TRACE_EVENT(ext4_allocate_inode,
70 TP_PROTO(struct inode *inode, struct inode *dir, int mode), 70 TP_PROTO(struct inode *inode, struct inode *dir, int mode),
71 71
72 TP_ARGS(inode, dir, mode), 72 TP_ARGS(inode, dir, mode),
73 73
74 TP_STRUCT__entry( 74 TP_STRUCT__entry(
75 __field( dev_t, dev ) 75 __field( dev_t, dev )
76 __field( ino_t, ino ) 76 __field( ino_t, ino )
77 __field( ino_t, dir ) 77 __field( ino_t, dir )
78 __field( umode_t, mode ) 78 __field( umode_t, mode )
79 ), 79 ),
80 80
81 TP_fast_assign( 81 TP_fast_assign(
82 __entry->dev = inode->i_sb->s_dev; 82 __entry->dev = inode->i_sb->s_dev;
83 __entry->ino = inode->i_ino; 83 __entry->ino = inode->i_ino;
84 __entry->dir = dir->i_ino; 84 __entry->dir = dir->i_ino;
85 __entry->mode = mode; 85 __entry->mode = mode;
86 ), 86 ),
87 87
88 TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", 88 TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
89 MAJOR(__entry->dev), MINOR(__entry->dev), 89 MAJOR(__entry->dev), MINOR(__entry->dev),
90 (unsigned long) __entry->ino, 90 (unsigned long) __entry->ino,
91 (unsigned long) __entry->dir, __entry->mode) 91 (unsigned long) __entry->dir, __entry->mode)
92 ); 92 );
93 93
94 TRACE_EVENT(ext4_evict_inode, 94 TRACE_EVENT(ext4_evict_inode,
95 TP_PROTO(struct inode *inode), 95 TP_PROTO(struct inode *inode),
96 96
97 TP_ARGS(inode), 97 TP_ARGS(inode),
98 98
99 TP_STRUCT__entry( 99 TP_STRUCT__entry(
100 __field( dev_t, dev ) 100 __field( dev_t, dev )
101 __field( ino_t, ino ) 101 __field( ino_t, ino )
102 __field( int, nlink ) 102 __field( int, nlink )
103 ), 103 ),
104 104
105 TP_fast_assign( 105 TP_fast_assign(
106 __entry->dev = inode->i_sb->s_dev; 106 __entry->dev = inode->i_sb->s_dev;
107 __entry->ino = inode->i_ino; 107 __entry->ino = inode->i_ino;
108 __entry->nlink = inode->i_nlink; 108 __entry->nlink = inode->i_nlink;
109 ), 109 ),
110 110
111 TP_printk("dev %d,%d ino %lu nlink %d", 111 TP_printk("dev %d,%d ino %lu nlink %d",
112 MAJOR(__entry->dev), MINOR(__entry->dev), 112 MAJOR(__entry->dev), MINOR(__entry->dev),
113 (unsigned long) __entry->ino, __entry->nlink) 113 (unsigned long) __entry->ino, __entry->nlink)
114 ); 114 );
115 115
116 TRACE_EVENT(ext4_drop_inode, 116 TRACE_EVENT(ext4_drop_inode,
117 TP_PROTO(struct inode *inode, int drop), 117 TP_PROTO(struct inode *inode, int drop),
118 118
119 TP_ARGS(inode, drop), 119 TP_ARGS(inode, drop),
120 120
121 TP_STRUCT__entry( 121 TP_STRUCT__entry(
122 __field( dev_t, dev ) 122 __field( dev_t, dev )
123 __field( ino_t, ino ) 123 __field( ino_t, ino )
124 __field( int, drop ) 124 __field( int, drop )
125 ), 125 ),
126 126
127 TP_fast_assign( 127 TP_fast_assign(
128 __entry->dev = inode->i_sb->s_dev; 128 __entry->dev = inode->i_sb->s_dev;
129 __entry->ino = inode->i_ino; 129 __entry->ino = inode->i_ino;
130 __entry->drop = drop; 130 __entry->drop = drop;
131 ), 131 ),
132 132
133 TP_printk("dev %d,%d ino %lu drop %d", 133 TP_printk("dev %d,%d ino %lu drop %d",
134 MAJOR(__entry->dev), MINOR(__entry->dev), 134 MAJOR(__entry->dev), MINOR(__entry->dev),
135 (unsigned long) __entry->ino, __entry->drop) 135 (unsigned long) __entry->ino, __entry->drop)
136 ); 136 );
137 137
138 TRACE_EVENT(ext4_mark_inode_dirty, 138 TRACE_EVENT(ext4_mark_inode_dirty,
139 TP_PROTO(struct inode *inode, unsigned long IP), 139 TP_PROTO(struct inode *inode, unsigned long IP),
140 140
141 TP_ARGS(inode, IP), 141 TP_ARGS(inode, IP),
142 142
143 TP_STRUCT__entry( 143 TP_STRUCT__entry(
144 __field( dev_t, dev ) 144 __field( dev_t, dev )
145 __field( ino_t, ino ) 145 __field( ino_t, ino )
146 __field(unsigned long, ip ) 146 __field(unsigned long, ip )
147 ), 147 ),
148 148
149 TP_fast_assign( 149 TP_fast_assign(
150 __entry->dev = inode->i_sb->s_dev; 150 __entry->dev = inode->i_sb->s_dev;
151 __entry->ino = inode->i_ino; 151 __entry->ino = inode->i_ino;
152 __entry->ip = IP; 152 __entry->ip = IP;
153 ), 153 ),
154 154
155 TP_printk("dev %d,%d ino %lu caller %pF", 155 TP_printk("dev %d,%d ino %lu caller %pF",
156 MAJOR(__entry->dev), MINOR(__entry->dev), 156 MAJOR(__entry->dev), MINOR(__entry->dev),
157 (unsigned long) __entry->ino, (void *)__entry->ip) 157 (unsigned long) __entry->ino, (void *)__entry->ip)
158 ); 158 );
159 159
160 TRACE_EVENT(ext4_begin_ordered_truncate, 160 TRACE_EVENT(ext4_begin_ordered_truncate,
161 TP_PROTO(struct inode *inode, loff_t new_size), 161 TP_PROTO(struct inode *inode, loff_t new_size),
162 162
163 TP_ARGS(inode, new_size), 163 TP_ARGS(inode, new_size),
164 164
165 TP_STRUCT__entry( 165 TP_STRUCT__entry(
166 __field( dev_t, dev ) 166 __field( dev_t, dev )
167 __field( ino_t, ino ) 167 __field( ino_t, ino )
168 __field( loff_t, new_size ) 168 __field( loff_t, new_size )
169 ), 169 ),
170 170
171 TP_fast_assign( 171 TP_fast_assign(
172 __entry->dev = inode->i_sb->s_dev; 172 __entry->dev = inode->i_sb->s_dev;
173 __entry->ino = inode->i_ino; 173 __entry->ino = inode->i_ino;
174 __entry->new_size = new_size; 174 __entry->new_size = new_size;
175 ), 175 ),
176 176
177 TP_printk("dev %d,%d ino %lu new_size %lld", 177 TP_printk("dev %d,%d ino %lu new_size %lld",
178 MAJOR(__entry->dev), MINOR(__entry->dev), 178 MAJOR(__entry->dev), MINOR(__entry->dev),
179 (unsigned long) __entry->ino, 179 (unsigned long) __entry->ino,
180 __entry->new_size) 180 __entry->new_size)
181 ); 181 );
182 182
183 DECLARE_EVENT_CLASS(ext4__write_begin, 183 DECLARE_EVENT_CLASS(ext4__write_begin,
184 184
185 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 185 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
186 unsigned int flags), 186 unsigned int flags),
187 187
188 TP_ARGS(inode, pos, len, flags), 188 TP_ARGS(inode, pos, len, flags),
189 189
190 TP_STRUCT__entry( 190 TP_STRUCT__entry(
191 __field( dev_t, dev ) 191 __field( dev_t, dev )
192 __field( ino_t, ino ) 192 __field( ino_t, ino )
193 __field( loff_t, pos ) 193 __field( loff_t, pos )
194 __field( unsigned int, len ) 194 __field( unsigned int, len )
195 __field( unsigned int, flags ) 195 __field( unsigned int, flags )
196 ), 196 ),
197 197
198 TP_fast_assign( 198 TP_fast_assign(
199 __entry->dev = inode->i_sb->s_dev; 199 __entry->dev = inode->i_sb->s_dev;
200 __entry->ino = inode->i_ino; 200 __entry->ino = inode->i_ino;
201 __entry->pos = pos; 201 __entry->pos = pos;
202 __entry->len = len; 202 __entry->len = len;
203 __entry->flags = flags; 203 __entry->flags = flags;
204 ), 204 ),
205 205
206 TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u", 206 TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u",
207 MAJOR(__entry->dev), MINOR(__entry->dev), 207 MAJOR(__entry->dev), MINOR(__entry->dev),
208 (unsigned long) __entry->ino, 208 (unsigned long) __entry->ino,
209 __entry->pos, __entry->len, __entry->flags) 209 __entry->pos, __entry->len, __entry->flags)
210 ); 210 );
211 211
212 DEFINE_EVENT(ext4__write_begin, ext4_write_begin, 212 DEFINE_EVENT(ext4__write_begin, ext4_write_begin,
213 213
214 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 214 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
215 unsigned int flags), 215 unsigned int flags),
216 216
217 TP_ARGS(inode, pos, len, flags) 217 TP_ARGS(inode, pos, len, flags)
218 ); 218 );
219 219
220 DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin, 220 DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,
221 221
222 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 222 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
223 unsigned int flags), 223 unsigned int flags),
224 224
225 TP_ARGS(inode, pos, len, flags) 225 TP_ARGS(inode, pos, len, flags)
226 ); 226 );
227 227
228 DECLARE_EVENT_CLASS(ext4__write_end, 228 DECLARE_EVENT_CLASS(ext4__write_end,
229 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 229 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
230 unsigned int copied), 230 unsigned int copied),
231 231
232 TP_ARGS(inode, pos, len, copied), 232 TP_ARGS(inode, pos, len, copied),
233 233
234 TP_STRUCT__entry( 234 TP_STRUCT__entry(
235 __field( dev_t, dev ) 235 __field( dev_t, dev )
236 __field( ino_t, ino ) 236 __field( ino_t, ino )
237 __field( loff_t, pos ) 237 __field( loff_t, pos )
238 __field( unsigned int, len ) 238 __field( unsigned int, len )
239 __field( unsigned int, copied ) 239 __field( unsigned int, copied )
240 ), 240 ),
241 241
242 TP_fast_assign( 242 TP_fast_assign(
243 __entry->dev = inode->i_sb->s_dev; 243 __entry->dev = inode->i_sb->s_dev;
244 __entry->ino = inode->i_ino; 244 __entry->ino = inode->i_ino;
245 __entry->pos = pos; 245 __entry->pos = pos;
246 __entry->len = len; 246 __entry->len = len;
247 __entry->copied = copied; 247 __entry->copied = copied;
248 ), 248 ),
249 249
250 TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u", 250 TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
251 MAJOR(__entry->dev), MINOR(__entry->dev), 251 MAJOR(__entry->dev), MINOR(__entry->dev),
252 (unsigned long) __entry->ino, 252 (unsigned long) __entry->ino,
253 __entry->pos, __entry->len, __entry->copied) 253 __entry->pos, __entry->len, __entry->copied)
254 ); 254 );
255 255
256 DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, 256 DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end,
257 257
258 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 258 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
259 unsigned int copied), 259 unsigned int copied),
260 260
261 TP_ARGS(inode, pos, len, copied) 261 TP_ARGS(inode, pos, len, copied)
262 ); 262 );
263 263
264 DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end, 264 DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end,
265 265
266 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 266 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
267 unsigned int copied), 267 unsigned int copied),
268 268
269 TP_ARGS(inode, pos, len, copied) 269 TP_ARGS(inode, pos, len, copied)
270 ); 270 );
271 271
272 DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end, 272 DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end,
273 273
274 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 274 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
275 unsigned int copied), 275 unsigned int copied),
276 276
277 TP_ARGS(inode, pos, len, copied) 277 TP_ARGS(inode, pos, len, copied)
278 ); 278 );
279 279
280 DEFINE_EVENT(ext4__write_end, ext4_da_write_end, 280 DEFINE_EVENT(ext4__write_end, ext4_da_write_end,
281 281
282 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 282 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
283 unsigned int copied), 283 unsigned int copied),
284 284
285 TP_ARGS(inode, pos, len, copied) 285 TP_ARGS(inode, pos, len, copied)
286 ); 286 );
287 287
288 TRACE_EVENT(ext4_da_writepages, 288 TRACE_EVENT(ext4_da_writepages,
289 TP_PROTO(struct inode *inode, struct writeback_control *wbc), 289 TP_PROTO(struct inode *inode, struct writeback_control *wbc),
290 290
291 TP_ARGS(inode, wbc), 291 TP_ARGS(inode, wbc),
292 292
293 TP_STRUCT__entry( 293 TP_STRUCT__entry(
294 __field( dev_t, dev ) 294 __field( dev_t, dev )
295 __field( ino_t, ino ) 295 __field( ino_t, ino )
296 __field( long, nr_to_write ) 296 __field( long, nr_to_write )
297 __field( long, pages_skipped ) 297 __field( long, pages_skipped )
298 __field( loff_t, range_start ) 298 __field( loff_t, range_start )
299 __field( loff_t, range_end ) 299 __field( loff_t, range_end )
300 __field( int, sync_mode ) 300 __field( int, sync_mode )
301 __field( char, for_kupdate ) 301 __field( char, for_kupdate )
302 __field( char, range_cyclic ) 302 __field( char, range_cyclic )
303 __field( pgoff_t, writeback_index ) 303 __field( pgoff_t, writeback_index )
304 ), 304 ),
305 305
306 TP_fast_assign( 306 TP_fast_assign(
307 __entry->dev = inode->i_sb->s_dev; 307 __entry->dev = inode->i_sb->s_dev;
308 __entry->ino = inode->i_ino; 308 __entry->ino = inode->i_ino;
309 __entry->nr_to_write = wbc->nr_to_write; 309 __entry->nr_to_write = wbc->nr_to_write;
310 __entry->pages_skipped = wbc->pages_skipped; 310 __entry->pages_skipped = wbc->pages_skipped;
311 __entry->range_start = wbc->range_start; 311 __entry->range_start = wbc->range_start;
312 __entry->range_end = wbc->range_end; 312 __entry->range_end = wbc->range_end;
313 __entry->sync_mode = wbc->sync_mode; 313 __entry->sync_mode = wbc->sync_mode;
314 __entry->for_kupdate = wbc->for_kupdate; 314 __entry->for_kupdate = wbc->for_kupdate;
315 __entry->range_cyclic = wbc->range_cyclic; 315 __entry->range_cyclic = wbc->range_cyclic;
316 __entry->writeback_index = inode->i_mapping->writeback_index; 316 __entry->writeback_index = inode->i_mapping->writeback_index;
317 ), 317 ),
318 318
319 TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " 319 TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
320 "range_start %lld range_end %lld sync_mode %d" 320 "range_start %lld range_end %lld sync_mode %d"
321 "for_kupdate %d range_cyclic %d writeback_index %lu", 321 "for_kupdate %d range_cyclic %d writeback_index %lu",
322 MAJOR(__entry->dev), MINOR(__entry->dev), 322 MAJOR(__entry->dev), MINOR(__entry->dev),
323 (unsigned long) __entry->ino, __entry->nr_to_write, 323 (unsigned long) __entry->ino, __entry->nr_to_write,
324 __entry->pages_skipped, __entry->range_start, 324 __entry->pages_skipped, __entry->range_start,
325 __entry->range_end, __entry->sync_mode, 325 __entry->range_end, __entry->sync_mode,
326 __entry->for_kupdate, __entry->range_cyclic, 326 __entry->for_kupdate, __entry->range_cyclic,
327 (unsigned long) __entry->writeback_index) 327 (unsigned long) __entry->writeback_index)
328 ); 328 );
329 329
330 TRACE_EVENT(ext4_da_write_pages, 330 TRACE_EVENT(ext4_da_write_pages,
331 TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), 331 TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
332 332
333 TP_ARGS(inode, mpd), 333 TP_ARGS(inode, mpd),
334 334
335 TP_STRUCT__entry( 335 TP_STRUCT__entry(
336 __field( dev_t, dev ) 336 __field( dev_t, dev )
337 __field( ino_t, ino ) 337 __field( ino_t, ino )
338 __field( __u64, b_blocknr ) 338 __field( __u64, b_blocknr )
339 __field( __u32, b_size ) 339 __field( __u32, b_size )
340 __field( __u32, b_state ) 340 __field( __u32, b_state )
341 __field( unsigned long, first_page ) 341 __field( unsigned long, first_page )
342 __field( int, io_done ) 342 __field( int, io_done )
343 __field( int, pages_written ) 343 __field( int, pages_written )
344 __field( int, sync_mode ) 344 __field( int, sync_mode )
345 ), 345 ),
346 346
347 TP_fast_assign( 347 TP_fast_assign(
348 __entry->dev = inode->i_sb->s_dev; 348 __entry->dev = inode->i_sb->s_dev;
349 __entry->ino = inode->i_ino; 349 __entry->ino = inode->i_ino;
350 __entry->b_blocknr = mpd->b_blocknr; 350 __entry->b_blocknr = mpd->b_blocknr;
351 __entry->b_size = mpd->b_size; 351 __entry->b_size = mpd->b_size;
352 __entry->b_state = mpd->b_state; 352 __entry->b_state = mpd->b_state;
353 __entry->first_page = mpd->first_page; 353 __entry->first_page = mpd->first_page;
354 __entry->io_done = mpd->io_done; 354 __entry->io_done = mpd->io_done;
355 __entry->pages_written = mpd->pages_written; 355 __entry->pages_written = mpd->pages_written;
356 __entry->sync_mode = mpd->wbc->sync_mode; 356 __entry->sync_mode = mpd->wbc->sync_mode;
357 ), 357 ),
358 358
359 TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x " 359 TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
360 "first_page %lu io_done %d pages_written %d sync_mode %d", 360 "first_page %lu io_done %d pages_written %d sync_mode %d",
361 MAJOR(__entry->dev), MINOR(__entry->dev), 361 MAJOR(__entry->dev), MINOR(__entry->dev),
362 (unsigned long) __entry->ino, 362 (unsigned long) __entry->ino,
363 __entry->b_blocknr, __entry->b_size, 363 __entry->b_blocknr, __entry->b_size,
364 __entry->b_state, __entry->first_page, 364 __entry->b_state, __entry->first_page,
365 __entry->io_done, __entry->pages_written, 365 __entry->io_done, __entry->pages_written,
366 __entry->sync_mode 366 __entry->sync_mode
367 ) 367 )
368 ); 368 );
369 369
370 TRACE_EVENT(ext4_da_writepages_result, 370 TRACE_EVENT(ext4_da_writepages_result,
371 TP_PROTO(struct inode *inode, struct writeback_control *wbc, 371 TP_PROTO(struct inode *inode, struct writeback_control *wbc,
372 int ret, int pages_written), 372 int ret, int pages_written),
373 373
374 TP_ARGS(inode, wbc, ret, pages_written), 374 TP_ARGS(inode, wbc, ret, pages_written),
375 375
376 TP_STRUCT__entry( 376 TP_STRUCT__entry(
377 __field( dev_t, dev ) 377 __field( dev_t, dev )
378 __field( ino_t, ino ) 378 __field( ino_t, ino )
379 __field( int, ret ) 379 __field( int, ret )
380 __field( int, pages_written ) 380 __field( int, pages_written )
381 __field( long, pages_skipped ) 381 __field( long, pages_skipped )
382 __field( int, sync_mode ) 382 __field( int, sync_mode )
383 __field( char, more_io )
384 __field( pgoff_t, writeback_index ) 383 __field( pgoff_t, writeback_index )
385 ), 384 ),
386 385
387 TP_fast_assign( 386 TP_fast_assign(
388 __entry->dev = inode->i_sb->s_dev; 387 __entry->dev = inode->i_sb->s_dev;
389 __entry->ino = inode->i_ino; 388 __entry->ino = inode->i_ino;
390 __entry->ret = ret; 389 __entry->ret = ret;
391 __entry->pages_written = pages_written; 390 __entry->pages_written = pages_written;
392 __entry->pages_skipped = wbc->pages_skipped; 391 __entry->pages_skipped = wbc->pages_skipped;
393 __entry->sync_mode = wbc->sync_mode; 392 __entry->sync_mode = wbc->sync_mode;
394 __entry->more_io = wbc->more_io;
395 __entry->writeback_index = inode->i_mapping->writeback_index; 393 __entry->writeback_index = inode->i_mapping->writeback_index;
396 ), 394 ),
397 395
398 TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " 396 TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
399 " more_io %d sync_mode %d writeback_index %lu", 397 "sync_mode %d writeback_index %lu",
400 MAJOR(__entry->dev), MINOR(__entry->dev), 398 MAJOR(__entry->dev), MINOR(__entry->dev),
401 (unsigned long) __entry->ino, __entry->ret, 399 (unsigned long) __entry->ino, __entry->ret,
402 __entry->pages_written, __entry->pages_skipped, 400 __entry->pages_written, __entry->pages_skipped,
403 __entry->more_io, __entry->sync_mode, 401 __entry->sync_mode,
404 (unsigned long) __entry->writeback_index) 402 (unsigned long) __entry->writeback_index)
405 ); 403 );
406 404
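The ext4_da_writepages / ext4_da_writepages_result pair brackets a delayed-allocation writeback pass, which is why this hunk drops the more_io field: writeback_control.more_io is removed elsewhere in this series, so the result event can no longer report it. A rough sketch of the bracketing pattern, assuming the entry point in fs/ext4/inode.c looks something like this (locals and control flow are illustrative only):

static int ext4_da_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	int pages_written = 0, ret = 0;

	trace_ext4_da_writepages(inode, wbc);

	/* ... mpage_da_data setup and writeout loop elided ... */

	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
	return ret;
}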
407 DECLARE_EVENT_CLASS(ext4__page_op, 405 DECLARE_EVENT_CLASS(ext4__page_op,
408 TP_PROTO(struct page *page), 406 TP_PROTO(struct page *page),
409 407
410 TP_ARGS(page), 408 TP_ARGS(page),
411 409
412 TP_STRUCT__entry( 410 TP_STRUCT__entry(
413 __field( pgoff_t, index ) 411 __field( pgoff_t, index )
414 __field( ino_t, ino ) 412 __field( ino_t, ino )
415 __field( dev_t, dev ) 413 __field( dev_t, dev )
416 414
417 ), 415 ),
418 416
419 TP_fast_assign( 417 TP_fast_assign(
420 __entry->index = page->index; 418 __entry->index = page->index;
421 __entry->ino = page->mapping->host->i_ino; 419 __entry->ino = page->mapping->host->i_ino;
422 __entry->dev = page->mapping->host->i_sb->s_dev; 420 __entry->dev = page->mapping->host->i_sb->s_dev;
423 ), 421 ),
424 422
425 TP_printk("dev %d,%d ino %lu page_index %lu", 423 TP_printk("dev %d,%d ino %lu page_index %lu",
426 MAJOR(__entry->dev), MINOR(__entry->dev), 424 MAJOR(__entry->dev), MINOR(__entry->dev),
427 (unsigned long) __entry->ino, 425 (unsigned long) __entry->ino,
428 (unsigned long) __entry->index) 426 (unsigned long) __entry->index)
429 ); 427 );
430 428
431 DEFINE_EVENT(ext4__page_op, ext4_writepage, 429 DEFINE_EVENT(ext4__page_op, ext4_writepage,
432 430
433 TP_PROTO(struct page *page), 431 TP_PROTO(struct page *page),
434 432
435 TP_ARGS(page) 433 TP_ARGS(page)
436 ); 434 );
437 435
438 DEFINE_EVENT(ext4__page_op, ext4_readpage, 436 DEFINE_EVENT(ext4__page_op, ext4_readpage,
439 437
440 TP_PROTO(struct page *page), 438 TP_PROTO(struct page *page),
441 439
442 TP_ARGS(page) 440 TP_ARGS(page)
443 ); 441 );
444 442
445 DEFINE_EVENT(ext4__page_op, ext4_releasepage, 443 DEFINE_EVENT(ext4__page_op, ext4_releasepage,
446 444
447 TP_PROTO(struct page *page), 445 TP_PROTO(struct page *page),
448 446
449 TP_ARGS(page) 447 TP_ARGS(page)
450 ); 448 );
451 449
452 TRACE_EVENT(ext4_invalidatepage, 450 TRACE_EVENT(ext4_invalidatepage,
453 TP_PROTO(struct page *page, unsigned long offset), 451 TP_PROTO(struct page *page, unsigned long offset),
454 452
455 TP_ARGS(page, offset), 453 TP_ARGS(page, offset),
456 454
457 TP_STRUCT__entry( 455 TP_STRUCT__entry(
458 __field( pgoff_t, index ) 456 __field( pgoff_t, index )
459 __field( unsigned long, offset ) 457 __field( unsigned long, offset )
460 __field( ino_t, ino ) 458 __field( ino_t, ino )
461 __field( dev_t, dev ) 459 __field( dev_t, dev )
462 460
463 ), 461 ),
464 462
465 TP_fast_assign( 463 TP_fast_assign(
466 __entry->index = page->index; 464 __entry->index = page->index;
467 __entry->offset = offset; 465 __entry->offset = offset;
468 __entry->ino = page->mapping->host->i_ino; 466 __entry->ino = page->mapping->host->i_ino;
469 __entry->dev = page->mapping->host->i_sb->s_dev; 467 __entry->dev = page->mapping->host->i_sb->s_dev;
470 ), 468 ),
471 469
472 TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", 470 TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
473 MAJOR(__entry->dev), MINOR(__entry->dev), 471 MAJOR(__entry->dev), MINOR(__entry->dev),
474 (unsigned long) __entry->ino, 472 (unsigned long) __entry->ino,
475 (unsigned long) __entry->index, __entry->offset) 473 (unsigned long) __entry->index, __entry->offset)
476 ); 474 );
477 475
478 TRACE_EVENT(ext4_discard_blocks, 476 TRACE_EVENT(ext4_discard_blocks,
479 TP_PROTO(struct super_block *sb, unsigned long long blk, 477 TP_PROTO(struct super_block *sb, unsigned long long blk,
480 unsigned long long count), 478 unsigned long long count),
481 479
482 TP_ARGS(sb, blk, count), 480 TP_ARGS(sb, blk, count),
483 481
484 TP_STRUCT__entry( 482 TP_STRUCT__entry(
485 __field( dev_t, dev ) 483 __field( dev_t, dev )
486 __field( __u64, blk ) 484 __field( __u64, blk )
487 __field( __u64, count ) 485 __field( __u64, count )
488 486
489 ), 487 ),
490 488
491 TP_fast_assign( 489 TP_fast_assign(
492 __entry->dev = sb->s_dev; 490 __entry->dev = sb->s_dev;
493 __entry->blk = blk; 491 __entry->blk = blk;
494 __entry->count = count; 492 __entry->count = count;
495 ), 493 ),
496 494
497 TP_printk("dev %d,%d blk %llu count %llu", 495 TP_printk("dev %d,%d blk %llu count %llu",
498 MAJOR(__entry->dev), MINOR(__entry->dev), 496 MAJOR(__entry->dev), MINOR(__entry->dev),
499 __entry->blk, __entry->count) 497 __entry->blk, __entry->count)
500 ); 498 );
501 499
502 DECLARE_EVENT_CLASS(ext4__mb_new_pa, 500 DECLARE_EVENT_CLASS(ext4__mb_new_pa,
503 TP_PROTO(struct ext4_allocation_context *ac, 501 TP_PROTO(struct ext4_allocation_context *ac,
504 struct ext4_prealloc_space *pa), 502 struct ext4_prealloc_space *pa),
505 503
506 TP_ARGS(ac, pa), 504 TP_ARGS(ac, pa),
507 505
508 TP_STRUCT__entry( 506 TP_STRUCT__entry(
509 __field( dev_t, dev ) 507 __field( dev_t, dev )
510 __field( ino_t, ino ) 508 __field( ino_t, ino )
511 __field( __u64, pa_pstart ) 509 __field( __u64, pa_pstart )
512 __field( __u32, pa_len ) 510 __field( __u32, pa_len )
513 __field( __u64, pa_lstart ) 511 __field( __u64, pa_lstart )
514 512
515 ), 513 ),
516 514
517 TP_fast_assign( 515 TP_fast_assign(
518 __entry->dev = ac->ac_sb->s_dev; 516 __entry->dev = ac->ac_sb->s_dev;
519 __entry->ino = ac->ac_inode->i_ino; 517 __entry->ino = ac->ac_inode->i_ino;
520 __entry->pa_pstart = pa->pa_pstart; 518 __entry->pa_pstart = pa->pa_pstart;
521 __entry->pa_len = pa->pa_len; 519 __entry->pa_len = pa->pa_len;
522 __entry->pa_lstart = pa->pa_lstart; 520 __entry->pa_lstart = pa->pa_lstart;
523 ), 521 ),
524 522
525 TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", 523 TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
526 MAJOR(__entry->dev), MINOR(__entry->dev), 524 MAJOR(__entry->dev), MINOR(__entry->dev),
527 (unsigned long) __entry->ino, 525 (unsigned long) __entry->ino,
528 __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) 526 __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
529 ); 527 );
530 528
531 DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, 529 DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,
532 530
533 TP_PROTO(struct ext4_allocation_context *ac, 531 TP_PROTO(struct ext4_allocation_context *ac,
534 struct ext4_prealloc_space *pa), 532 struct ext4_prealloc_space *pa),
535 533
536 TP_ARGS(ac, pa) 534 TP_ARGS(ac, pa)
537 ); 535 );
538 536
539 DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, 537 DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa,
540 538
541 TP_PROTO(struct ext4_allocation_context *ac, 539 TP_PROTO(struct ext4_allocation_context *ac,
542 struct ext4_prealloc_space *pa), 540 struct ext4_prealloc_space *pa),
543 541
544 TP_ARGS(ac, pa) 542 TP_ARGS(ac, pa)
545 ); 543 );
546 544
547 TRACE_EVENT(ext4_mb_release_inode_pa, 545 TRACE_EVENT(ext4_mb_release_inode_pa,
548 TP_PROTO(struct ext4_prealloc_space *pa, 546 TP_PROTO(struct ext4_prealloc_space *pa,
549 unsigned long long block, unsigned int count), 547 unsigned long long block, unsigned int count),
550 548
551 TP_ARGS(pa, block, count), 549 TP_ARGS(pa, block, count),
552 550
553 TP_STRUCT__entry( 551 TP_STRUCT__entry(
554 __field( dev_t, dev ) 552 __field( dev_t, dev )
555 __field( ino_t, ino ) 553 __field( ino_t, ino )
556 __field( __u64, block ) 554 __field( __u64, block )
557 __field( __u32, count ) 555 __field( __u32, count )
558 556
559 ), 557 ),
560 558
561 TP_fast_assign( 559 TP_fast_assign(
562 __entry->dev = pa->pa_inode->i_sb->s_dev; 560 __entry->dev = pa->pa_inode->i_sb->s_dev;
563 __entry->ino = pa->pa_inode->i_ino; 561 __entry->ino = pa->pa_inode->i_ino;
564 __entry->block = block; 562 __entry->block = block;
565 __entry->count = count; 563 __entry->count = count;
566 ), 564 ),
567 565
568 TP_printk("dev %d,%d ino %lu block %llu count %u", 566 TP_printk("dev %d,%d ino %lu block %llu count %u",
569 MAJOR(__entry->dev), MINOR(__entry->dev), 567 MAJOR(__entry->dev), MINOR(__entry->dev),
570 (unsigned long) __entry->ino, 568 (unsigned long) __entry->ino,
571 __entry->block, __entry->count) 569 __entry->block, __entry->count)
572 ); 570 );
573 571
574 TRACE_EVENT(ext4_mb_release_group_pa, 572 TRACE_EVENT(ext4_mb_release_group_pa,
575 TP_PROTO(struct ext4_prealloc_space *pa), 573 TP_PROTO(struct ext4_prealloc_space *pa),
576 574
577 TP_ARGS(pa), 575 TP_ARGS(pa),
578 576
579 TP_STRUCT__entry( 577 TP_STRUCT__entry(
580 __field( dev_t, dev ) 578 __field( dev_t, dev )
581 __field( __u64, pa_pstart ) 579 __field( __u64, pa_pstart )
582 __field( __u32, pa_len ) 580 __field( __u32, pa_len )
583 581
584 ), 582 ),
585 583
586 TP_fast_assign( 584 TP_fast_assign(
587 __entry->dev = pa->pa_inode->i_sb->s_dev; 585 __entry->dev = pa->pa_inode->i_sb->s_dev;
588 __entry->pa_pstart = pa->pa_pstart; 586 __entry->pa_pstart = pa->pa_pstart;
589 __entry->pa_len = pa->pa_len; 587 __entry->pa_len = pa->pa_len;
590 ), 588 ),
591 589
592 TP_printk("dev %d,%d pstart %llu len %u", 590 TP_printk("dev %d,%d pstart %llu len %u",
593 MAJOR(__entry->dev), MINOR(__entry->dev), 591 MAJOR(__entry->dev), MINOR(__entry->dev),
594 __entry->pa_pstart, __entry->pa_len) 592 __entry->pa_pstart, __entry->pa_len)
595 ); 593 );
596 594
597 TRACE_EVENT(ext4_discard_preallocations, 595 TRACE_EVENT(ext4_discard_preallocations,
598 TP_PROTO(struct inode *inode), 596 TP_PROTO(struct inode *inode),
599 597
600 TP_ARGS(inode), 598 TP_ARGS(inode),
601 599
602 TP_STRUCT__entry( 600 TP_STRUCT__entry(
603 __field( dev_t, dev ) 601 __field( dev_t, dev )
604 __field( ino_t, ino ) 602 __field( ino_t, ino )
605 603
606 ), 604 ),
607 605
608 TP_fast_assign( 606 TP_fast_assign(
609 __entry->dev = inode->i_sb->s_dev; 607 __entry->dev = inode->i_sb->s_dev;
610 __entry->ino = inode->i_ino; 608 __entry->ino = inode->i_ino;
611 ), 609 ),
612 610
613 TP_printk("dev %d,%d ino %lu", 611 TP_printk("dev %d,%d ino %lu",
614 MAJOR(__entry->dev), MINOR(__entry->dev), 612 MAJOR(__entry->dev), MINOR(__entry->dev),
615 (unsigned long) __entry->ino) 613 (unsigned long) __entry->ino)
616 ); 614 );
617 615
618 TRACE_EVENT(ext4_mb_discard_preallocations, 616 TRACE_EVENT(ext4_mb_discard_preallocations,
619 TP_PROTO(struct super_block *sb, int needed), 617 TP_PROTO(struct super_block *sb, int needed),
620 618
621 TP_ARGS(sb, needed), 619 TP_ARGS(sb, needed),
622 620
623 TP_STRUCT__entry( 621 TP_STRUCT__entry(
624 __field( dev_t, dev ) 622 __field( dev_t, dev )
625 __field( int, needed ) 623 __field( int, needed )
626 624
627 ), 625 ),
628 626
629 TP_fast_assign( 627 TP_fast_assign(
630 __entry->dev = sb->s_dev; 628 __entry->dev = sb->s_dev;
631 __entry->needed = needed; 629 __entry->needed = needed;
632 ), 630 ),
633 631
634 TP_printk("dev %d,%d needed %d", 632 TP_printk("dev %d,%d needed %d",
635 MAJOR(__entry->dev), MINOR(__entry->dev), 633 MAJOR(__entry->dev), MINOR(__entry->dev),
636 __entry->needed) 634 __entry->needed)
637 ); 635 );
638 636
639 TRACE_EVENT(ext4_request_blocks, 637 TRACE_EVENT(ext4_request_blocks,
640 TP_PROTO(struct ext4_allocation_request *ar), 638 TP_PROTO(struct ext4_allocation_request *ar),
641 639
642 TP_ARGS(ar), 640 TP_ARGS(ar),
643 641
644 TP_STRUCT__entry( 642 TP_STRUCT__entry(
645 __field( dev_t, dev ) 643 __field( dev_t, dev )
646 __field( ino_t, ino ) 644 __field( ino_t, ino )
647 __field( unsigned int, flags ) 645 __field( unsigned int, flags )
648 __field( unsigned int, len ) 646 __field( unsigned int, len )
649 __field( __u32, logical ) 647 __field( __u32, logical )
650 __field( __u32, lleft ) 648 __field( __u32, lleft )
651 __field( __u32, lright ) 649 __field( __u32, lright )
652 __field( __u64, goal ) 650 __field( __u64, goal )
653 __field( __u64, pleft ) 651 __field( __u64, pleft )
654 __field( __u64, pright ) 652 __field( __u64, pright )
655 ), 653 ),
656 654
657 TP_fast_assign( 655 TP_fast_assign(
658 __entry->dev = ar->inode->i_sb->s_dev; 656 __entry->dev = ar->inode->i_sb->s_dev;
659 __entry->ino = ar->inode->i_ino; 657 __entry->ino = ar->inode->i_ino;
660 __entry->flags = ar->flags; 658 __entry->flags = ar->flags;
661 __entry->len = ar->len; 659 __entry->len = ar->len;
662 __entry->logical = ar->logical; 660 __entry->logical = ar->logical;
663 __entry->goal = ar->goal; 661 __entry->goal = ar->goal;
664 __entry->lleft = ar->lleft; 662 __entry->lleft = ar->lleft;
665 __entry->lright = ar->lright; 663 __entry->lright = ar->lright;
666 __entry->pleft = ar->pleft; 664 __entry->pleft = ar->pleft;
667 __entry->pright = ar->pright; 665 __entry->pright = ar->pright;
668 ), 666 ),
669 667
670 TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu " 668 TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu "
671 "lleft %u lright %u pleft %llu pright %llu ", 669 "lleft %u lright %u pleft %llu pright %llu ",
672 MAJOR(__entry->dev), MINOR(__entry->dev), 670 MAJOR(__entry->dev), MINOR(__entry->dev),
673 (unsigned long) __entry->ino, __entry->flags, 671 (unsigned long) __entry->ino, __entry->flags,
674 __entry->len, __entry->logical, __entry->goal, 672 __entry->len, __entry->logical, __entry->goal,
675 __entry->lleft, __entry->lright, __entry->pleft, 673 __entry->lleft, __entry->lright, __entry->pleft,
676 __entry->pright) 674 __entry->pright)
677 ); 675 );
678 676
679 TRACE_EVENT(ext4_allocate_blocks, 677 TRACE_EVENT(ext4_allocate_blocks,
680 TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), 678 TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block),
681 679
682 TP_ARGS(ar, block), 680 TP_ARGS(ar, block),
683 681
684 TP_STRUCT__entry( 682 TP_STRUCT__entry(
685 __field( dev_t, dev ) 683 __field( dev_t, dev )
686 __field( ino_t, ino ) 684 __field( ino_t, ino )
687 __field( __u64, block ) 685 __field( __u64, block )
688 __field( unsigned int, flags ) 686 __field( unsigned int, flags )
689 __field( unsigned int, len ) 687 __field( unsigned int, len )
690 __field( __u32, logical ) 688 __field( __u32, logical )
691 __field( __u32, lleft ) 689 __field( __u32, lleft )
692 __field( __u32, lright ) 690 __field( __u32, lright )
693 __field( __u64, goal ) 691 __field( __u64, goal )
694 __field( __u64, pleft ) 692 __field( __u64, pleft )
695 __field( __u64, pright ) 693 __field( __u64, pright )
696 ), 694 ),
697 695
698 TP_fast_assign( 696 TP_fast_assign(
699 __entry->dev = ar->inode->i_sb->s_dev; 697 __entry->dev = ar->inode->i_sb->s_dev;
700 __entry->ino = ar->inode->i_ino; 698 __entry->ino = ar->inode->i_ino;
701 __entry->block = block; 699 __entry->block = block;
702 __entry->flags = ar->flags; 700 __entry->flags = ar->flags;
703 __entry->len = ar->len; 701 __entry->len = ar->len;
704 __entry->logical = ar->logical; 702 __entry->logical = ar->logical;
705 __entry->goal = ar->goal; 703 __entry->goal = ar->goal;
706 __entry->lleft = ar->lleft; 704 __entry->lleft = ar->lleft;
707 __entry->lright = ar->lright; 705 __entry->lright = ar->lright;
708 __entry->pleft = ar->pleft; 706 __entry->pleft = ar->pleft;
709 __entry->pright = ar->pright; 707 __entry->pright = ar->pright;
710 ), 708 ),
711 709
712 TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u " 710 TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u "
713 "goal %llu lleft %u lright %u pleft %llu pright %llu", 711 "goal %llu lleft %u lright %u pleft %llu pright %llu",
714 MAJOR(__entry->dev), MINOR(__entry->dev), 712 MAJOR(__entry->dev), MINOR(__entry->dev),
715 (unsigned long) __entry->ino, __entry->flags, 713 (unsigned long) __entry->ino, __entry->flags,
716 __entry->len, __entry->block, __entry->logical, 714 __entry->len, __entry->block, __entry->logical,
717 __entry->goal, __entry->lleft, __entry->lright, 715 __entry->goal, __entry->lleft, __entry->lright,
718 __entry->pleft, __entry->pright) 716 __entry->pleft, __entry->pright)
719 ); 717 );
720 718
721 TRACE_EVENT(ext4_free_blocks, 719 TRACE_EVENT(ext4_free_blocks,
722 TP_PROTO(struct inode *inode, __u64 block, unsigned long count, 720 TP_PROTO(struct inode *inode, __u64 block, unsigned long count,
723 int flags), 721 int flags),
724 722
725 TP_ARGS(inode, block, count, flags), 723 TP_ARGS(inode, block, count, flags),
726 724
727 TP_STRUCT__entry( 725 TP_STRUCT__entry(
728 __field( dev_t, dev ) 726 __field( dev_t, dev )
729 __field( ino_t, ino ) 727 __field( ino_t, ino )
730 __field( umode_t, mode ) 728 __field( umode_t, mode )
731 __field( __u64, block ) 729 __field( __u64, block )
732 __field( unsigned long, count ) 730 __field( unsigned long, count )
733 __field( int, flags ) 731 __field( int, flags )
734 ), 732 ),
735 733
736 TP_fast_assign( 734 TP_fast_assign(
737 __entry->dev = inode->i_sb->s_dev; 735 __entry->dev = inode->i_sb->s_dev;
738 __entry->ino = inode->i_ino; 736 __entry->ino = inode->i_ino;
739 __entry->mode = inode->i_mode; 737 __entry->mode = inode->i_mode;
740 __entry->block = block; 738 __entry->block = block;
741 __entry->count = count; 739 __entry->count = count;
742 __entry->flags = flags; 740 __entry->flags = flags;
743 ), 741 ),
744 742
745 TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", 743 TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d",
746 MAJOR(__entry->dev), MINOR(__entry->dev), 744 MAJOR(__entry->dev), MINOR(__entry->dev),
747 (unsigned long) __entry->ino, 745 (unsigned long) __entry->ino,
748 __entry->mode, __entry->block, __entry->count, 746 __entry->mode, __entry->block, __entry->count,
749 __entry->flags) 747 __entry->flags)
750 ); 748 );
751 749
752 TRACE_EVENT(ext4_sync_file_enter, 750 TRACE_EVENT(ext4_sync_file_enter,
753 TP_PROTO(struct file *file, int datasync), 751 TP_PROTO(struct file *file, int datasync),
754 752
755 TP_ARGS(file, datasync), 753 TP_ARGS(file, datasync),
756 754
757 TP_STRUCT__entry( 755 TP_STRUCT__entry(
758 __field( dev_t, dev ) 756 __field( dev_t, dev )
759 __field( ino_t, ino ) 757 __field( ino_t, ino )
760 __field( ino_t, parent ) 758 __field( ino_t, parent )
761 __field( int, datasync ) 759 __field( int, datasync )
762 ), 760 ),
763 761
764 TP_fast_assign( 762 TP_fast_assign(
765 struct dentry *dentry = file->f_path.dentry; 763 struct dentry *dentry = file->f_path.dentry;
766 764
767 __entry->dev = dentry->d_inode->i_sb->s_dev; 765 __entry->dev = dentry->d_inode->i_sb->s_dev;
768 __entry->ino = dentry->d_inode->i_ino; 766 __entry->ino = dentry->d_inode->i_ino;
769 __entry->datasync = datasync; 767 __entry->datasync = datasync;
770 __entry->parent = dentry->d_parent->d_inode->i_ino; 768 __entry->parent = dentry->d_parent->d_inode->i_ino;
771 ), 769 ),
772 770
773 TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", 771 TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
774 MAJOR(__entry->dev), MINOR(__entry->dev), 772 MAJOR(__entry->dev), MINOR(__entry->dev),
775 (unsigned long) __entry->ino, 773 (unsigned long) __entry->ino,
776 (unsigned long) __entry->parent, __entry->datasync) 774 (unsigned long) __entry->parent, __entry->datasync)
777 ); 775 );
778 776
779 TRACE_EVENT(ext4_sync_file_exit, 777 TRACE_EVENT(ext4_sync_file_exit,
780 TP_PROTO(struct inode *inode, int ret), 778 TP_PROTO(struct inode *inode, int ret),
781 779
782 TP_ARGS(inode, ret), 780 TP_ARGS(inode, ret),
783 781
784 TP_STRUCT__entry( 782 TP_STRUCT__entry(
785 __field( int, ret ) 783 __field( int, ret )
786 __field( ino_t, ino ) 784 __field( ino_t, ino )
787 __field( dev_t, dev ) 785 __field( dev_t, dev )
788 ), 786 ),
789 787
790 TP_fast_assign( 788 TP_fast_assign(
791 __entry->ret = ret; 789 __entry->ret = ret;
792 __entry->ino = inode->i_ino; 790 __entry->ino = inode->i_ino;
793 __entry->dev = inode->i_sb->s_dev; 791 __entry->dev = inode->i_sb->s_dev;
794 ), 792 ),
795 793
796 TP_printk("dev %d,%d ino %lu ret %d", 794 TP_printk("dev %d,%d ino %lu ret %d",
797 MAJOR(__entry->dev), MINOR(__entry->dev), 795 MAJOR(__entry->dev), MINOR(__entry->dev),
798 (unsigned long) __entry->ino, 796 (unsigned long) __entry->ino,
799 __entry->ret) 797 __entry->ret)
800 ); 798 );
801 799
802 TRACE_EVENT(ext4_sync_fs, 800 TRACE_EVENT(ext4_sync_fs,
803 TP_PROTO(struct super_block *sb, int wait), 801 TP_PROTO(struct super_block *sb, int wait),
804 802
805 TP_ARGS(sb, wait), 803 TP_ARGS(sb, wait),
806 804
807 TP_STRUCT__entry( 805 TP_STRUCT__entry(
808 __field( dev_t, dev ) 806 __field( dev_t, dev )
809 __field( int, wait ) 807 __field( int, wait )
810 808
811 ), 809 ),
812 810
813 TP_fast_assign( 811 TP_fast_assign(
814 __entry->dev = sb->s_dev; 812 __entry->dev = sb->s_dev;
815 __entry->wait = wait; 813 __entry->wait = wait;
816 ), 814 ),
817 815
818 TP_printk("dev %d,%d wait %d", 816 TP_printk("dev %d,%d wait %d",
819 MAJOR(__entry->dev), MINOR(__entry->dev), 817 MAJOR(__entry->dev), MINOR(__entry->dev),
820 __entry->wait) 818 __entry->wait)
821 ); 819 );
822 820
823 TRACE_EVENT(ext4_alloc_da_blocks, 821 TRACE_EVENT(ext4_alloc_da_blocks,
824 TP_PROTO(struct inode *inode), 822 TP_PROTO(struct inode *inode),
825 823
826 TP_ARGS(inode), 824 TP_ARGS(inode),
827 825
828 TP_STRUCT__entry( 826 TP_STRUCT__entry(
829 __field( dev_t, dev ) 827 __field( dev_t, dev )
830 __field( ino_t, ino ) 828 __field( ino_t, ino )
831 __field( unsigned int, data_blocks ) 829 __field( unsigned int, data_blocks )
832 __field( unsigned int, meta_blocks ) 830 __field( unsigned int, meta_blocks )
833 ), 831 ),
834 832
835 TP_fast_assign( 833 TP_fast_assign(
836 __entry->dev = inode->i_sb->s_dev; 834 __entry->dev = inode->i_sb->s_dev;
837 __entry->ino = inode->i_ino; 835 __entry->ino = inode->i_ino;
838 __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 836 __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
839 __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 837 __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
840 ), 838 ),
841 839
842 TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u", 840 TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u",
843 MAJOR(__entry->dev), MINOR(__entry->dev), 841 MAJOR(__entry->dev), MINOR(__entry->dev),
844 (unsigned long) __entry->ino, 842 (unsigned long) __entry->ino,
845 __entry->data_blocks, __entry->meta_blocks) 843 __entry->data_blocks, __entry->meta_blocks)
846 ); 844 );
847 845
848 TRACE_EVENT(ext4_mballoc_alloc, 846 TRACE_EVENT(ext4_mballoc_alloc,
849 TP_PROTO(struct ext4_allocation_context *ac), 847 TP_PROTO(struct ext4_allocation_context *ac),
850 848
851 TP_ARGS(ac), 849 TP_ARGS(ac),
852 850
853 TP_STRUCT__entry( 851 TP_STRUCT__entry(
854 __field( dev_t, dev ) 852 __field( dev_t, dev )
855 __field( ino_t, ino ) 853 __field( ino_t, ino )
856 __field( __u16, found ) 854 __field( __u16, found )
857 __field( __u16, groups ) 855 __field( __u16, groups )
858 __field( __u16, buddy ) 856 __field( __u16, buddy )
859 __field( __u16, flags ) 857 __field( __u16, flags )
860 __field( __u16, tail ) 858 __field( __u16, tail )
861 __field( __u8, cr ) 859 __field( __u8, cr )
862 __field( __u32, orig_logical ) 860 __field( __u32, orig_logical )
863 __field( int, orig_start ) 861 __field( int, orig_start )
864 __field( __u32, orig_group ) 862 __field( __u32, orig_group )
865 __field( int, orig_len ) 863 __field( int, orig_len )
866 __field( __u32, goal_logical ) 864 __field( __u32, goal_logical )
867 __field( int, goal_start ) 865 __field( int, goal_start )
868 __field( __u32, goal_group ) 866 __field( __u32, goal_group )
869 __field( int, goal_len ) 867 __field( int, goal_len )
870 __field( __u32, result_logical ) 868 __field( __u32, result_logical )
871 __field( int, result_start ) 869 __field( int, result_start )
872 __field( __u32, result_group ) 870 __field( __u32, result_group )
873 __field( int, result_len ) 871 __field( int, result_len )
874 ), 872 ),
875 873
876 TP_fast_assign( 874 TP_fast_assign(
877 __entry->dev = ac->ac_inode->i_sb->s_dev; 875 __entry->dev = ac->ac_inode->i_sb->s_dev;
878 __entry->ino = ac->ac_inode->i_ino; 876 __entry->ino = ac->ac_inode->i_ino;
879 __entry->found = ac->ac_found; 877 __entry->found = ac->ac_found;
880 __entry->flags = ac->ac_flags; 878 __entry->flags = ac->ac_flags;
881 __entry->groups = ac->ac_groups_scanned; 879 __entry->groups = ac->ac_groups_scanned;
882 __entry->buddy = ac->ac_buddy; 880 __entry->buddy = ac->ac_buddy;
883 __entry->tail = ac->ac_tail; 881 __entry->tail = ac->ac_tail;
884 __entry->cr = ac->ac_criteria; 882 __entry->cr = ac->ac_criteria;
885 __entry->orig_logical = ac->ac_o_ex.fe_logical; 883 __entry->orig_logical = ac->ac_o_ex.fe_logical;
886 __entry->orig_start = ac->ac_o_ex.fe_start; 884 __entry->orig_start = ac->ac_o_ex.fe_start;
887 __entry->orig_group = ac->ac_o_ex.fe_group; 885 __entry->orig_group = ac->ac_o_ex.fe_group;
888 __entry->orig_len = ac->ac_o_ex.fe_len; 886 __entry->orig_len = ac->ac_o_ex.fe_len;
889 __entry->goal_logical = ac->ac_g_ex.fe_logical; 887 __entry->goal_logical = ac->ac_g_ex.fe_logical;
890 __entry->goal_start = ac->ac_g_ex.fe_start; 888 __entry->goal_start = ac->ac_g_ex.fe_start;
891 __entry->goal_group = ac->ac_g_ex.fe_group; 889 __entry->goal_group = ac->ac_g_ex.fe_group;
892 __entry->goal_len = ac->ac_g_ex.fe_len; 890 __entry->goal_len = ac->ac_g_ex.fe_len;
893 __entry->result_logical = ac->ac_f_ex.fe_logical; 891 __entry->result_logical = ac->ac_f_ex.fe_logical;
894 __entry->result_start = ac->ac_f_ex.fe_start; 892 __entry->result_start = ac->ac_f_ex.fe_start;
895 __entry->result_group = ac->ac_f_ex.fe_group; 893 __entry->result_group = ac->ac_f_ex.fe_group;
896 __entry->result_len = ac->ac_f_ex.fe_len; 894 __entry->result_len = ac->ac_f_ex.fe_len;
897 ), 895 ),
898 896
899 TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " 897 TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
900 "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " 898 "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x "
901 "tail %u broken %u", 899 "tail %u broken %u",
902 MAJOR(__entry->dev), MINOR(__entry->dev), 900 MAJOR(__entry->dev), MINOR(__entry->dev),
903 (unsigned long) __entry->ino, 901 (unsigned long) __entry->ino,
904 __entry->orig_group, __entry->orig_start, 902 __entry->orig_group, __entry->orig_start,
905 __entry->orig_len, __entry->orig_logical, 903 __entry->orig_len, __entry->orig_logical,
906 __entry->goal_group, __entry->goal_start, 904 __entry->goal_group, __entry->goal_start,
907 __entry->goal_len, __entry->goal_logical, 905 __entry->goal_len, __entry->goal_logical,
908 __entry->result_group, __entry->result_start, 906 __entry->result_group, __entry->result_start,
909 __entry->result_len, __entry->result_logical, 907 __entry->result_len, __entry->result_logical,
910 __entry->found, __entry->groups, __entry->cr, 908 __entry->found, __entry->groups, __entry->cr,
911 __entry->flags, __entry->tail, 909 __entry->flags, __entry->tail,
912 __entry->buddy ? 1 << __entry->buddy : 0) 910 __entry->buddy ? 1 << __entry->buddy : 0)
913 ); 911 );
914 912
915 TRACE_EVENT(ext4_mballoc_prealloc, 913 TRACE_EVENT(ext4_mballoc_prealloc,
916 TP_PROTO(struct ext4_allocation_context *ac), 914 TP_PROTO(struct ext4_allocation_context *ac),
917 915
918 TP_ARGS(ac), 916 TP_ARGS(ac),
919 917
920 TP_STRUCT__entry( 918 TP_STRUCT__entry(
921 __field( dev_t, dev ) 919 __field( dev_t, dev )
922 __field( ino_t, ino ) 920 __field( ino_t, ino )
923 __field( __u32, orig_logical ) 921 __field( __u32, orig_logical )
924 __field( int, orig_start ) 922 __field( int, orig_start )
925 __field( __u32, orig_group ) 923 __field( __u32, orig_group )
926 __field( int, orig_len ) 924 __field( int, orig_len )
927 __field( __u32, result_logical ) 925 __field( __u32, result_logical )
928 __field( int, result_start ) 926 __field( int, result_start )
929 __field( __u32, result_group ) 927 __field( __u32, result_group )
930 __field( int, result_len ) 928 __field( int, result_len )
931 ), 929 ),
932 930
933 TP_fast_assign( 931 TP_fast_assign(
934 __entry->dev = ac->ac_inode->i_sb->s_dev; 932 __entry->dev = ac->ac_inode->i_sb->s_dev;
935 __entry->ino = ac->ac_inode->i_ino; 933 __entry->ino = ac->ac_inode->i_ino;
936 __entry->orig_logical = ac->ac_o_ex.fe_logical; 934 __entry->orig_logical = ac->ac_o_ex.fe_logical;
937 __entry->orig_start = ac->ac_o_ex.fe_start; 935 __entry->orig_start = ac->ac_o_ex.fe_start;
938 __entry->orig_group = ac->ac_o_ex.fe_group; 936 __entry->orig_group = ac->ac_o_ex.fe_group;
939 __entry->orig_len = ac->ac_o_ex.fe_len; 937 __entry->orig_len = ac->ac_o_ex.fe_len;
940 __entry->result_logical = ac->ac_b_ex.fe_logical; 938 __entry->result_logical = ac->ac_b_ex.fe_logical;
941 __entry->result_start = ac->ac_b_ex.fe_start; 939 __entry->result_start = ac->ac_b_ex.fe_start;
942 __entry->result_group = ac->ac_b_ex.fe_group; 940 __entry->result_group = ac->ac_b_ex.fe_group;
943 __entry->result_len = ac->ac_b_ex.fe_len; 941 __entry->result_len = ac->ac_b_ex.fe_len;
944 ), 942 ),
945 943
946 TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", 944 TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
947 MAJOR(__entry->dev), MINOR(__entry->dev), 945 MAJOR(__entry->dev), MINOR(__entry->dev),
948 (unsigned long) __entry->ino, 946 (unsigned long) __entry->ino,
949 __entry->orig_group, __entry->orig_start, 947 __entry->orig_group, __entry->orig_start,
950 __entry->orig_len, __entry->orig_logical, 948 __entry->orig_len, __entry->orig_logical,
951 __entry->result_group, __entry->result_start, 949 __entry->result_group, __entry->result_start,
952 __entry->result_len, __entry->result_logical) 950 __entry->result_len, __entry->result_logical)
953 ); 951 );
954 952
955 DECLARE_EVENT_CLASS(ext4__mballoc, 953 DECLARE_EVENT_CLASS(ext4__mballoc,
956 TP_PROTO(struct super_block *sb, 954 TP_PROTO(struct super_block *sb,
957 struct inode *inode, 955 struct inode *inode,
958 ext4_group_t group, 956 ext4_group_t group,
959 ext4_grpblk_t start, 957 ext4_grpblk_t start,
960 ext4_grpblk_t len), 958 ext4_grpblk_t len),
961 959
962 TP_ARGS(sb, inode, group, start, len), 960 TP_ARGS(sb, inode, group, start, len),
963 961
964 TP_STRUCT__entry( 962 TP_STRUCT__entry(
965 __field( dev_t, dev ) 963 __field( dev_t, dev )
966 __field( ino_t, ino ) 964 __field( ino_t, ino )
967 __field( int, result_start ) 965 __field( int, result_start )
968 __field( __u32, result_group ) 966 __field( __u32, result_group )
969 __field( int, result_len ) 967 __field( int, result_len )
970 ), 968 ),
971 969
972 TP_fast_assign( 970 TP_fast_assign(
973 __entry->dev = sb->s_dev; 971 __entry->dev = sb->s_dev;
974 __entry->ino = inode ? inode->i_ino : 0; 972 __entry->ino = inode ? inode->i_ino : 0;
975 __entry->result_start = start; 973 __entry->result_start = start;
976 __entry->result_group = group; 974 __entry->result_group = group;
977 __entry->result_len = len; 975 __entry->result_len = len;
978 ), 976 ),
979 977
980 TP_printk("dev %d,%d inode %lu extent %u/%d/%d ", 978 TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
981 MAJOR(__entry->dev), MINOR(__entry->dev), 979 MAJOR(__entry->dev), MINOR(__entry->dev),
982 (unsigned long) __entry->ino, 980 (unsigned long) __entry->ino,
983 __entry->result_group, __entry->result_start, 981 __entry->result_group, __entry->result_start,
984 __entry->result_len) 982 __entry->result_len)
985 ); 983 );
986 984
987 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, 985 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,
988 986
989 TP_PROTO(struct super_block *sb, 987 TP_PROTO(struct super_block *sb,
990 struct inode *inode, 988 struct inode *inode,
991 ext4_group_t group, 989 ext4_group_t group,
992 ext4_grpblk_t start, 990 ext4_grpblk_t start,
993 ext4_grpblk_t len), 991 ext4_grpblk_t len),
994 992
995 TP_ARGS(sb, inode, group, start, len) 993 TP_ARGS(sb, inode, group, start, len)
996 ); 994 );
997 995
998 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, 996 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,
999 997
1000 TP_PROTO(struct super_block *sb, 998 TP_PROTO(struct super_block *sb,
1001 struct inode *inode, 999 struct inode *inode,
1002 ext4_group_t group, 1000 ext4_group_t group,
1003 ext4_grpblk_t start, 1001 ext4_grpblk_t start,
1004 ext4_grpblk_t len), 1002 ext4_grpblk_t len),
1005 1003
1006 TP_ARGS(sb, inode, group, start, len) 1004 TP_ARGS(sb, inode, group, start, len)
1007 ); 1005 );
1008 1006
1009 TRACE_EVENT(ext4_forget, 1007 TRACE_EVENT(ext4_forget,
1010 TP_PROTO(struct inode *inode, int is_metadata, __u64 block), 1008 TP_PROTO(struct inode *inode, int is_metadata, __u64 block),
1011 1009
1012 TP_ARGS(inode, is_metadata, block), 1010 TP_ARGS(inode, is_metadata, block),
1013 1011
1014 TP_STRUCT__entry( 1012 TP_STRUCT__entry(
1015 __field( dev_t, dev ) 1013 __field( dev_t, dev )
1016 __field( ino_t, ino ) 1014 __field( ino_t, ino )
1017 __field( umode_t, mode ) 1015 __field( umode_t, mode )
1018 __field( int, is_metadata ) 1016 __field( int, is_metadata )
1019 __field( __u64, block ) 1017 __field( __u64, block )
1020 ), 1018 ),
1021 1019
1022 TP_fast_assign( 1020 TP_fast_assign(
1023 __entry->dev = inode->i_sb->s_dev; 1021 __entry->dev = inode->i_sb->s_dev;
1024 __entry->ino = inode->i_ino; 1022 __entry->ino = inode->i_ino;
1025 __entry->mode = inode->i_mode; 1023 __entry->mode = inode->i_mode;
1026 __entry->is_metadata = is_metadata; 1024 __entry->is_metadata = is_metadata;
1027 __entry->block = block; 1025 __entry->block = block;
1028 ), 1026 ),
1029 1027
1030 TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", 1028 TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
1031 MAJOR(__entry->dev), MINOR(__entry->dev), 1029 MAJOR(__entry->dev), MINOR(__entry->dev),
1032 (unsigned long) __entry->ino, 1030 (unsigned long) __entry->ino,
1033 __entry->mode, __entry->is_metadata, __entry->block) 1031 __entry->mode, __entry->is_metadata, __entry->block)
1034 ); 1032 );
1035 1033
1036 TRACE_EVENT(ext4_da_update_reserve_space, 1034 TRACE_EVENT(ext4_da_update_reserve_space,
1037 TP_PROTO(struct inode *inode, int used_blocks), 1035 TP_PROTO(struct inode *inode, int used_blocks),
1038 1036
1039 TP_ARGS(inode, used_blocks), 1037 TP_ARGS(inode, used_blocks),
1040 1038
1041 TP_STRUCT__entry( 1039 TP_STRUCT__entry(
1042 __field( dev_t, dev ) 1040 __field( dev_t, dev )
1043 __field( ino_t, ino ) 1041 __field( ino_t, ino )
1044 __field( umode_t, mode ) 1042 __field( umode_t, mode )
1045 __field( __u64, i_blocks ) 1043 __field( __u64, i_blocks )
1046 __field( int, used_blocks ) 1044 __field( int, used_blocks )
1047 __field( int, reserved_data_blocks ) 1045 __field( int, reserved_data_blocks )
1048 __field( int, reserved_meta_blocks ) 1046 __field( int, reserved_meta_blocks )
1049 __field( int, allocated_meta_blocks ) 1047 __field( int, allocated_meta_blocks )
1050 ), 1048 ),
1051 1049
1052 TP_fast_assign( 1050 TP_fast_assign(
1053 __entry->dev = inode->i_sb->s_dev; 1051 __entry->dev = inode->i_sb->s_dev;
1054 __entry->ino = inode->i_ino; 1052 __entry->ino = inode->i_ino;
1055 __entry->mode = inode->i_mode; 1053 __entry->mode = inode->i_mode;
1056 __entry->i_blocks = inode->i_blocks; 1054 __entry->i_blocks = inode->i_blocks;
1057 __entry->used_blocks = used_blocks; 1055 __entry->used_blocks = used_blocks;
1058 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1056 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
1059 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 1057 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
1060 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; 1058 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
1061 ), 1059 ),
1062 1060
1063 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " 1061 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
1064 "reserved_data_blocks %d reserved_meta_blocks %d " 1062 "reserved_data_blocks %d reserved_meta_blocks %d "
1065 "allocated_meta_blocks %d", 1063 "allocated_meta_blocks %d",
1066 MAJOR(__entry->dev), MINOR(__entry->dev), 1064 MAJOR(__entry->dev), MINOR(__entry->dev),
1067 (unsigned long) __entry->ino, 1065 (unsigned long) __entry->ino,
1068 __entry->mode, __entry->i_blocks, 1066 __entry->mode, __entry->i_blocks,
1069 __entry->used_blocks, __entry->reserved_data_blocks, 1067 __entry->used_blocks, __entry->reserved_data_blocks,
1070 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) 1068 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
1071 ); 1069 );
1072 1070
1073 TRACE_EVENT(ext4_da_reserve_space, 1071 TRACE_EVENT(ext4_da_reserve_space,
1074 TP_PROTO(struct inode *inode, int md_needed), 1072 TP_PROTO(struct inode *inode, int md_needed),
1075 1073
1076 TP_ARGS(inode, md_needed), 1074 TP_ARGS(inode, md_needed),
1077 1075
1078 TP_STRUCT__entry( 1076 TP_STRUCT__entry(
1079 __field( dev_t, dev ) 1077 __field( dev_t, dev )
1080 __field( ino_t, ino ) 1078 __field( ino_t, ino )
1081 __field( umode_t, mode ) 1079 __field( umode_t, mode )
1082 __field( __u64, i_blocks ) 1080 __field( __u64, i_blocks )
1083 __field( int, md_needed ) 1081 __field( int, md_needed )
1084 __field( int, reserved_data_blocks ) 1082 __field( int, reserved_data_blocks )
1085 __field( int, reserved_meta_blocks ) 1083 __field( int, reserved_meta_blocks )
1086 ), 1084 ),
1087 1085
1088 TP_fast_assign( 1086 TP_fast_assign(
1089 __entry->dev = inode->i_sb->s_dev; 1087 __entry->dev = inode->i_sb->s_dev;
1090 __entry->ino = inode->i_ino; 1088 __entry->ino = inode->i_ino;
1091 __entry->mode = inode->i_mode; 1089 __entry->mode = inode->i_mode;
1092 __entry->i_blocks = inode->i_blocks; 1090 __entry->i_blocks = inode->i_blocks;
1093 __entry->md_needed = md_needed; 1091 __entry->md_needed = md_needed;
1094 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1092 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
1095 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 1093 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
1096 ), 1094 ),
1097 1095
1098 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d " 1096 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d "
1099 "reserved_data_blocks %d reserved_meta_blocks %d", 1097 "reserved_data_blocks %d reserved_meta_blocks %d",
1100 MAJOR(__entry->dev), MINOR(__entry->dev), 1098 MAJOR(__entry->dev), MINOR(__entry->dev),
1101 (unsigned long) __entry->ino, 1099 (unsigned long) __entry->ino,
1102 __entry->mode, __entry->i_blocks, 1100 __entry->mode, __entry->i_blocks,
1103 __entry->md_needed, __entry->reserved_data_blocks, 1101 __entry->md_needed, __entry->reserved_data_blocks,
1104 __entry->reserved_meta_blocks) 1102 __entry->reserved_meta_blocks)
1105 ); 1103 );
1106 1104
1107 TRACE_EVENT(ext4_da_release_space, 1105 TRACE_EVENT(ext4_da_release_space,
1108 TP_PROTO(struct inode *inode, int freed_blocks), 1106 TP_PROTO(struct inode *inode, int freed_blocks),
1109 1107
1110 TP_ARGS(inode, freed_blocks), 1108 TP_ARGS(inode, freed_blocks),
1111 1109
1112 TP_STRUCT__entry( 1110 TP_STRUCT__entry(
1113 __field( dev_t, dev ) 1111 __field( dev_t, dev )
1114 __field( ino_t, ino ) 1112 __field( ino_t, ino )
1115 __field( umode_t, mode ) 1113 __field( umode_t, mode )
1116 __field( __u64, i_blocks ) 1114 __field( __u64, i_blocks )
1117 __field( int, freed_blocks ) 1115 __field( int, freed_blocks )
1118 __field( int, reserved_data_blocks ) 1116 __field( int, reserved_data_blocks )
1119 __field( int, reserved_meta_blocks ) 1117 __field( int, reserved_meta_blocks )
1120 __field( int, allocated_meta_blocks ) 1118 __field( int, allocated_meta_blocks )
1121 ), 1119 ),
1122 1120
1123 TP_fast_assign( 1121 TP_fast_assign(
1124 __entry->dev = inode->i_sb->s_dev; 1122 __entry->dev = inode->i_sb->s_dev;
1125 __entry->ino = inode->i_ino; 1123 __entry->ino = inode->i_ino;
1126 __entry->mode = inode->i_mode; 1124 __entry->mode = inode->i_mode;
1127 __entry->i_blocks = inode->i_blocks; 1125 __entry->i_blocks = inode->i_blocks;
1128 __entry->freed_blocks = freed_blocks; 1126 __entry->freed_blocks = freed_blocks;
1129 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1127 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
1130 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 1128 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
1131 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; 1129 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
1132 ), 1130 ),
1133 1131
1134 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " 1132 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d "
1135 "reserved_data_blocks %d reserved_meta_blocks %d " 1133 "reserved_data_blocks %d reserved_meta_blocks %d "
1136 "allocated_meta_blocks %d", 1134 "allocated_meta_blocks %d",
1137 MAJOR(__entry->dev), MINOR(__entry->dev), 1135 MAJOR(__entry->dev), MINOR(__entry->dev),
1138 (unsigned long) __entry->ino, 1136 (unsigned long) __entry->ino,
1139 __entry->mode, __entry->i_blocks, 1137 __entry->mode, __entry->i_blocks,
1140 __entry->freed_blocks, __entry->reserved_data_blocks, 1138 __entry->freed_blocks, __entry->reserved_data_blocks,
1141 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) 1139 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
1142 ); 1140 );
1143 1141
1144 DECLARE_EVENT_CLASS(ext4__bitmap_load, 1142 DECLARE_EVENT_CLASS(ext4__bitmap_load,
1145 TP_PROTO(struct super_block *sb, unsigned long group), 1143 TP_PROTO(struct super_block *sb, unsigned long group),
1146 1144
1147 TP_ARGS(sb, group), 1145 TP_ARGS(sb, group),
1148 1146
1149 TP_STRUCT__entry( 1147 TP_STRUCT__entry(
1150 __field( dev_t, dev ) 1148 __field( dev_t, dev )
1151 __field( __u32, group ) 1149 __field( __u32, group )
1152 1150
1153 ), 1151 ),
1154 1152
1155 TP_fast_assign( 1153 TP_fast_assign(
1156 __entry->dev = sb->s_dev; 1154 __entry->dev = sb->s_dev;
1157 __entry->group = group; 1155 __entry->group = group;
1158 ), 1156 ),
1159 1157
1160 TP_printk("dev %d,%d group %u", 1158 TP_printk("dev %d,%d group %u",
1161 MAJOR(__entry->dev), MINOR(__entry->dev), 1159 MAJOR(__entry->dev), MINOR(__entry->dev),
1162 __entry->group) 1160 __entry->group)
1163 ); 1161 );
1164 1162
1165 DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, 1163 DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,
1166 1164
1167 TP_PROTO(struct super_block *sb, unsigned long group), 1165 TP_PROTO(struct super_block *sb, unsigned long group),
1168 1166
1169 TP_ARGS(sb, group) 1167 TP_ARGS(sb, group)
1170 ); 1168 );
1171 1169
1172 DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load, 1170 DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load,
1173 1171
1174 TP_PROTO(struct super_block *sb, unsigned long group), 1172 TP_PROTO(struct super_block *sb, unsigned long group),
1175 1173
1176 TP_ARGS(sb, group) 1174 TP_ARGS(sb, group)
1177 ); 1175 );
1178 1176
1179 DEFINE_EVENT(ext4__bitmap_load, ext4_read_block_bitmap_load, 1177 DEFINE_EVENT(ext4__bitmap_load, ext4_read_block_bitmap_load,
1180 1178
1181 TP_PROTO(struct super_block *sb, unsigned long group), 1179 TP_PROTO(struct super_block *sb, unsigned long group),
1182 1180
1183 TP_ARGS(sb, group) 1181 TP_ARGS(sb, group)
1184 ); 1182 );
1185 1183
1186 DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap, 1184 DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap,
1187 1185
1188 TP_PROTO(struct super_block *sb, unsigned long group), 1186 TP_PROTO(struct super_block *sb, unsigned long group),
1189 1187
1190 TP_ARGS(sb, group) 1188 TP_ARGS(sb, group)
1191 ); 1189 );
1192 1190
1193 TRACE_EVENT(ext4_direct_IO_enter, 1191 TRACE_EVENT(ext4_direct_IO_enter,
1194 TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), 1192 TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw),
1195 1193
1196 TP_ARGS(inode, offset, len, rw), 1194 TP_ARGS(inode, offset, len, rw),
1197 1195
1198 TP_STRUCT__entry( 1196 TP_STRUCT__entry(
1199 __field( ino_t, ino ) 1197 __field( ino_t, ino )
1200 __field( dev_t, dev ) 1198 __field( dev_t, dev )
1201 __field( loff_t, pos ) 1199 __field( loff_t, pos )
1202 __field( unsigned long, len ) 1200 __field( unsigned long, len )
1203 __field( int, rw ) 1201 __field( int, rw )
1204 ), 1202 ),
1205 1203
1206 TP_fast_assign( 1204 TP_fast_assign(
1207 __entry->ino = inode->i_ino; 1205 __entry->ino = inode->i_ino;
1208 __entry->dev = inode->i_sb->s_dev; 1206 __entry->dev = inode->i_sb->s_dev;
1209 __entry->pos = offset; 1207 __entry->pos = offset;
1210 __entry->len = len; 1208 __entry->len = len;
1211 __entry->rw = rw; 1209 __entry->rw = rw;
1212 ), 1210 ),
1213 1211
1214 TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d", 1212 TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d",
1215 MAJOR(__entry->dev), MINOR(__entry->dev), 1213 MAJOR(__entry->dev), MINOR(__entry->dev),
1216 (unsigned long) __entry->ino, 1214 (unsigned long) __entry->ino,
1217 __entry->pos, __entry->len, __entry->rw) 1215 __entry->pos, __entry->len, __entry->rw)
1218 ); 1216 );
1219 1217
1220 TRACE_EVENT(ext4_direct_IO_exit, 1218 TRACE_EVENT(ext4_direct_IO_exit,
1221 TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, 1219 TP_PROTO(struct inode *inode, loff_t offset, unsigned long len,
1222 int rw, int ret), 1220 int rw, int ret),
1223 1221
1224 TP_ARGS(inode, offset, len, rw, ret), 1222 TP_ARGS(inode, offset, len, rw, ret),
1225 1223
1226 TP_STRUCT__entry( 1224 TP_STRUCT__entry(
1227 __field( ino_t, ino ) 1225 __field( ino_t, ino )
1228 __field( dev_t, dev ) 1226 __field( dev_t, dev )
1229 __field( loff_t, pos ) 1227 __field( loff_t, pos )
1230 __field( unsigned long, len ) 1228 __field( unsigned long, len )
1231 __field( int, rw ) 1229 __field( int, rw )
1232 __field( int, ret ) 1230 __field( int, ret )
1233 ), 1231 ),
1234 1232
1235 TP_fast_assign( 1233 TP_fast_assign(
1236 __entry->ino = inode->i_ino; 1234 __entry->ino = inode->i_ino;
1237 __entry->dev = inode->i_sb->s_dev; 1235 __entry->dev = inode->i_sb->s_dev;
1238 __entry->pos = offset; 1236 __entry->pos = offset;
1239 __entry->len = len; 1237 __entry->len = len;
1240 __entry->rw = rw; 1238 __entry->rw = rw;
1241 __entry->ret = ret; 1239 __entry->ret = ret;
1242 ), 1240 ),
1243 1241
1244 TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d", 1242 TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d",
1245 MAJOR(__entry->dev), MINOR(__entry->dev), 1243 MAJOR(__entry->dev), MINOR(__entry->dev),
1246 (unsigned long) __entry->ino, 1244 (unsigned long) __entry->ino,
1247 __entry->pos, __entry->len, 1245 __entry->pos, __entry->len,
1248 __entry->rw, __entry->ret) 1246 __entry->rw, __entry->ret)
1249 ); 1247 );
1250 1248
1251 TRACE_EVENT(ext4_fallocate_enter, 1249 TRACE_EVENT(ext4_fallocate_enter,
1252 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), 1250 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
1253 1251
1254 TP_ARGS(inode, offset, len, mode), 1252 TP_ARGS(inode, offset, len, mode),
1255 1253
1256 TP_STRUCT__entry( 1254 TP_STRUCT__entry(
1257 __field( ino_t, ino ) 1255 __field( ino_t, ino )
1258 __field( dev_t, dev ) 1256 __field( dev_t, dev )
1259 __field( loff_t, pos ) 1257 __field( loff_t, pos )
1260 __field( loff_t, len ) 1258 __field( loff_t, len )
1261 __field( int, mode ) 1259 __field( int, mode )
1262 ), 1260 ),
1263 1261
1264 TP_fast_assign( 1262 TP_fast_assign(
1265 __entry->ino = inode->i_ino; 1263 __entry->ino = inode->i_ino;
1266 __entry->dev = inode->i_sb->s_dev; 1264 __entry->dev = inode->i_sb->s_dev;
1267 __entry->pos = offset; 1265 __entry->pos = offset;
1268 __entry->len = len; 1266 __entry->len = len;
1269 __entry->mode = mode; 1267 __entry->mode = mode;
1270 ), 1268 ),
1271 1269
1272 TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d", 1270 TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d",
1273 MAJOR(__entry->dev), MINOR(__entry->dev), 1271 MAJOR(__entry->dev), MINOR(__entry->dev),
1274 (unsigned long) __entry->ino, __entry->pos, 1272 (unsigned long) __entry->ino, __entry->pos,
1275 __entry->len, __entry->mode) 1273 __entry->len, __entry->mode)
1276 ); 1274 );
1277 1275
1278 TRACE_EVENT(ext4_fallocate_exit, 1276 TRACE_EVENT(ext4_fallocate_exit,
1279 TP_PROTO(struct inode *inode, loff_t offset, 1277 TP_PROTO(struct inode *inode, loff_t offset,
1280 unsigned int max_blocks, int ret), 1278 unsigned int max_blocks, int ret),
1281 1279
1282 TP_ARGS(inode, offset, max_blocks, ret), 1280 TP_ARGS(inode, offset, max_blocks, ret),
1283 1281
1284 TP_STRUCT__entry( 1282 TP_STRUCT__entry(
1285 __field( ino_t, ino ) 1283 __field( ino_t, ino )
1286 __field( dev_t, dev ) 1284 __field( dev_t, dev )
1287 __field( loff_t, pos ) 1285 __field( loff_t, pos )
1288 __field( unsigned int, blocks ) 1286 __field( unsigned int, blocks )
1289 __field( int, ret ) 1287 __field( int, ret )
1290 ), 1288 ),
1291 1289
1292 TP_fast_assign( 1290 TP_fast_assign(
1293 __entry->ino = inode->i_ino; 1291 __entry->ino = inode->i_ino;
1294 __entry->dev = inode->i_sb->s_dev; 1292 __entry->dev = inode->i_sb->s_dev;
1295 __entry->pos = offset; 1293 __entry->pos = offset;
1296 __entry->blocks = max_blocks; 1294 __entry->blocks = max_blocks;
1297 __entry->ret = ret; 1295 __entry->ret = ret;
1298 ), 1296 ),
1299 1297
1300 TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d", 1298 TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
1301 MAJOR(__entry->dev), MINOR(__entry->dev), 1299 MAJOR(__entry->dev), MINOR(__entry->dev),
1302 (unsigned long) __entry->ino, 1300 (unsigned long) __entry->ino,
1303 __entry->pos, __entry->blocks, 1301 __entry->pos, __entry->blocks,
1304 __entry->ret) 1302 __entry->ret)
1305 ); 1303 );
1306 1304
1307 TRACE_EVENT(ext4_unlink_enter, 1305 TRACE_EVENT(ext4_unlink_enter,
1308 TP_PROTO(struct inode *parent, struct dentry *dentry), 1306 TP_PROTO(struct inode *parent, struct dentry *dentry),
1309 1307
1310 TP_ARGS(parent, dentry), 1308 TP_ARGS(parent, dentry),
1311 1309
1312 TP_STRUCT__entry( 1310 TP_STRUCT__entry(
1313 __field( ino_t, parent ) 1311 __field( ino_t, parent )
1314 __field( ino_t, ino ) 1312 __field( ino_t, ino )
1315 __field( loff_t, size ) 1313 __field( loff_t, size )
1316 __field( dev_t, dev ) 1314 __field( dev_t, dev )
1317 ), 1315 ),
1318 1316
1319 TP_fast_assign( 1317 TP_fast_assign(
1320 __entry->parent = parent->i_ino; 1318 __entry->parent = parent->i_ino;
1321 __entry->ino = dentry->d_inode->i_ino; 1319 __entry->ino = dentry->d_inode->i_ino;
1322 __entry->size = dentry->d_inode->i_size; 1320 __entry->size = dentry->d_inode->i_size;
1323 __entry->dev = dentry->d_inode->i_sb->s_dev; 1321 __entry->dev = dentry->d_inode->i_sb->s_dev;
1324 ), 1322 ),
1325 1323
1326 TP_printk("dev %d,%d ino %lu size %lld parent %lu", 1324 TP_printk("dev %d,%d ino %lu size %lld parent %lu",
1327 MAJOR(__entry->dev), MINOR(__entry->dev), 1325 MAJOR(__entry->dev), MINOR(__entry->dev),
1328 (unsigned long) __entry->ino, __entry->size, 1326 (unsigned long) __entry->ino, __entry->size,
1329 (unsigned long) __entry->parent) 1327 (unsigned long) __entry->parent)
1330 ); 1328 );
1331 1329
1332 TRACE_EVENT(ext4_unlink_exit, 1330 TRACE_EVENT(ext4_unlink_exit,
1333 TP_PROTO(struct dentry *dentry, int ret), 1331 TP_PROTO(struct dentry *dentry, int ret),
1334 1332
1335 TP_ARGS(dentry, ret), 1333 TP_ARGS(dentry, ret),
1336 1334
1337 TP_STRUCT__entry( 1335 TP_STRUCT__entry(
1338 __field( ino_t, ino ) 1336 __field( ino_t, ino )
1339 __field( dev_t, dev ) 1337 __field( dev_t, dev )
1340 __field( int, ret ) 1338 __field( int, ret )
1341 ), 1339 ),
1342 1340
1343 TP_fast_assign( 1341 TP_fast_assign(
1344 __entry->ino = dentry->d_inode->i_ino; 1342 __entry->ino = dentry->d_inode->i_ino;
1345 __entry->dev = dentry->d_inode->i_sb->s_dev; 1343 __entry->dev = dentry->d_inode->i_sb->s_dev;
1346 __entry->ret = ret; 1344 __entry->ret = ret;
1347 ), 1345 ),
1348 1346
1349 TP_printk("dev %d,%d ino %lu ret %d", 1347 TP_printk("dev %d,%d ino %lu ret %d",
1350 MAJOR(__entry->dev), MINOR(__entry->dev), 1348 MAJOR(__entry->dev), MINOR(__entry->dev),
1351 (unsigned long) __entry->ino, 1349 (unsigned long) __entry->ino,
1352 __entry->ret) 1350 __entry->ret)
1353 ); 1351 );
1354 1352
1355 DECLARE_EVENT_CLASS(ext4__truncate, 1353 DECLARE_EVENT_CLASS(ext4__truncate,
1356 TP_PROTO(struct inode *inode), 1354 TP_PROTO(struct inode *inode),
1357 1355
1358 TP_ARGS(inode), 1356 TP_ARGS(inode),
1359 1357
1360 TP_STRUCT__entry( 1358 TP_STRUCT__entry(
1361 __field( ino_t, ino ) 1359 __field( ino_t, ino )
1362 __field( dev_t, dev ) 1360 __field( dev_t, dev )
1363 __field( __u64, blocks ) 1361 __field( __u64, blocks )
1364 ), 1362 ),
1365 1363
1366 TP_fast_assign( 1364 TP_fast_assign(
1367 __entry->ino = inode->i_ino; 1365 __entry->ino = inode->i_ino;
1368 __entry->dev = inode->i_sb->s_dev; 1366 __entry->dev = inode->i_sb->s_dev;
1369 __entry->blocks = inode->i_blocks; 1367 __entry->blocks = inode->i_blocks;
1370 ), 1368 ),
1371 1369
1372 TP_printk("dev %d,%d ino %lu blocks %llu", 1370 TP_printk("dev %d,%d ino %lu blocks %llu",
1373 MAJOR(__entry->dev), MINOR(__entry->dev), 1371 MAJOR(__entry->dev), MINOR(__entry->dev),
1374 (unsigned long) __entry->ino, __entry->blocks) 1372 (unsigned long) __entry->ino, __entry->blocks)
1375 ); 1373 );
1376 1374
1377 DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, 1375 DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,
1378 1376
1379 TP_PROTO(struct inode *inode), 1377 TP_PROTO(struct inode *inode),
1380 1378
1381 TP_ARGS(inode) 1379 TP_ARGS(inode)
1382 ); 1380 );
1383 1381
1384 DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, 1382 DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,
1385 1383
1386 TP_PROTO(struct inode *inode), 1384 TP_PROTO(struct inode *inode),
1387 1385
1388 TP_ARGS(inode) 1386 TP_ARGS(inode)
1389 ); 1387 );
1390 1388
1391 DECLARE_EVENT_CLASS(ext4__map_blocks_enter, 1389 DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
1392 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1390 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1393 unsigned int len, unsigned int flags), 1391 unsigned int len, unsigned int flags),
1394 1392
1395 TP_ARGS(inode, lblk, len, flags), 1393 TP_ARGS(inode, lblk, len, flags),
1396 1394
1397 TP_STRUCT__entry( 1395 TP_STRUCT__entry(
1398 __field( ino_t, ino ) 1396 __field( ino_t, ino )
1399 __field( dev_t, dev ) 1397 __field( dev_t, dev )
1400 __field( ext4_lblk_t, lblk ) 1398 __field( ext4_lblk_t, lblk )
1401 __field( unsigned int, len ) 1399 __field( unsigned int, len )
1402 __field( unsigned int, flags ) 1400 __field( unsigned int, flags )
1403 ), 1401 ),
1404 1402
1405 TP_fast_assign( 1403 TP_fast_assign(
1406 __entry->ino = inode->i_ino; 1404 __entry->ino = inode->i_ino;
1407 __entry->dev = inode->i_sb->s_dev; 1405 __entry->dev = inode->i_sb->s_dev;
1408 __entry->lblk = lblk; 1406 __entry->lblk = lblk;
1409 __entry->len = len; 1407 __entry->len = len;
1410 __entry->flags = flags; 1408 __entry->flags = flags;
1411 ), 1409 ),
1412 1410
1413 TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u", 1411 TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u",
1414 MAJOR(__entry->dev), MINOR(__entry->dev), 1412 MAJOR(__entry->dev), MINOR(__entry->dev),
1415 (unsigned long) __entry->ino, 1413 (unsigned long) __entry->ino,
1416 __entry->lblk, __entry->len, __entry->flags) 1414 __entry->lblk, __entry->len, __entry->flags)
1417 ); 1415 );
1418 1416
1419 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, 1417 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
1420 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1418 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1421 unsigned len, unsigned flags), 1419 unsigned len, unsigned flags),
1422 1420
1423 TP_ARGS(inode, lblk, len, flags) 1421 TP_ARGS(inode, lblk, len, flags)
1424 ); 1422 );
1425 1423
1426 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, 1424 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter,
1427 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1425 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1428 unsigned len, unsigned flags), 1426 unsigned len, unsigned flags),
1429 1427
1430 TP_ARGS(inode, lblk, len, flags) 1428 TP_ARGS(inode, lblk, len, flags)
1431 ); 1429 );
1432 1430
1433 DECLARE_EVENT_CLASS(ext4__map_blocks_exit, 1431 DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
1434 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1432 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1435 ext4_fsblk_t pblk, unsigned int len, int ret), 1433 ext4_fsblk_t pblk, unsigned int len, int ret),
1436 1434
1437 TP_ARGS(inode, lblk, pblk, len, ret), 1435 TP_ARGS(inode, lblk, pblk, len, ret),
1438 1436
1439 TP_STRUCT__entry( 1437 TP_STRUCT__entry(
1440 __field( ino_t, ino ) 1438 __field( ino_t, ino )
1441 __field( dev_t, dev ) 1439 __field( dev_t, dev )
1442 __field( ext4_lblk_t, lblk ) 1440 __field( ext4_lblk_t, lblk )
1443 __field( ext4_fsblk_t, pblk ) 1441 __field( ext4_fsblk_t, pblk )
1444 __field( unsigned int, len ) 1442 __field( unsigned int, len )
1445 __field( int, ret ) 1443 __field( int, ret )
1446 ), 1444 ),
1447 1445
1448 TP_fast_assign( 1446 TP_fast_assign(
1449 __entry->ino = inode->i_ino; 1447 __entry->ino = inode->i_ino;
1450 __entry->dev = inode->i_sb->s_dev; 1448 __entry->dev = inode->i_sb->s_dev;
1451 __entry->lblk = lblk; 1449 __entry->lblk = lblk;
1452 __entry->pblk = pblk; 1450 __entry->pblk = pblk;
1453 __entry->len = len; 1451 __entry->len = len;
1454 __entry->ret = ret; 1452 __entry->ret = ret;
1455 ), 1453 ),
1456 1454
1457 TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", 1455 TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d",
1458 MAJOR(__entry->dev), MINOR(__entry->dev), 1456 MAJOR(__entry->dev), MINOR(__entry->dev),
1459 (unsigned long) __entry->ino, 1457 (unsigned long) __entry->ino,
1460 __entry->lblk, __entry->pblk, 1458 __entry->lblk, __entry->pblk,
1461 __entry->len, __entry->ret) 1459 __entry->len, __entry->ret)
1462 ); 1460 );
1463 1461
1464 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, 1462 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit,
1465 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1463 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1466 ext4_fsblk_t pblk, unsigned len, int ret), 1464 ext4_fsblk_t pblk, unsigned len, int ret),
1467 1465
1468 TP_ARGS(inode, lblk, pblk, len, ret) 1466 TP_ARGS(inode, lblk, pblk, len, ret)
1469 ); 1467 );
1470 1468
1471 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, 1469 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit,
1472 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1470 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1473 ext4_fsblk_t pblk, unsigned len, int ret), 1471 ext4_fsblk_t pblk, unsigned len, int ret),
1474 1472
1475 TP_ARGS(inode, lblk, pblk, len, ret) 1473 TP_ARGS(inode, lblk, pblk, len, ret)
1476 ); 1474 );
1477 1475
1478 TRACE_EVENT(ext4_ext_load_extent, 1476 TRACE_EVENT(ext4_ext_load_extent,
1479 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk), 1477 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk),
1480 1478
1481 TP_ARGS(inode, lblk, pblk), 1479 TP_ARGS(inode, lblk, pblk),
1482 1480
1483 TP_STRUCT__entry( 1481 TP_STRUCT__entry(
1484 __field( ino_t, ino ) 1482 __field( ino_t, ino )
1485 __field( dev_t, dev ) 1483 __field( dev_t, dev )
1486 __field( ext4_lblk_t, lblk ) 1484 __field( ext4_lblk_t, lblk )
1487 __field( ext4_fsblk_t, pblk ) 1485 __field( ext4_fsblk_t, pblk )
1488 ), 1486 ),
1489 1487
1490 TP_fast_assign( 1488 TP_fast_assign(
1491 __entry->ino = inode->i_ino; 1489 __entry->ino = inode->i_ino;
1492 __entry->dev = inode->i_sb->s_dev; 1490 __entry->dev = inode->i_sb->s_dev;
1493 __entry->lblk = lblk; 1491 __entry->lblk = lblk;
1494 __entry->pblk = pblk; 1492 __entry->pblk = pblk;
1495 ), 1493 ),
1496 1494
1497 TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", 1495 TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
1498 MAJOR(__entry->dev), MINOR(__entry->dev), 1496 MAJOR(__entry->dev), MINOR(__entry->dev),
1499 (unsigned long) __entry->ino, 1497 (unsigned long) __entry->ino,
1500 __entry->lblk, __entry->pblk) 1498 __entry->lblk, __entry->pblk)
1501 ); 1499 );
1502 1500
1503 TRACE_EVENT(ext4_load_inode, 1501 TRACE_EVENT(ext4_load_inode,
1504 TP_PROTO(struct inode *inode), 1502 TP_PROTO(struct inode *inode),
1505 1503
1506 TP_ARGS(inode), 1504 TP_ARGS(inode),
1507 1505
1508 TP_STRUCT__entry( 1506 TP_STRUCT__entry(
1509 __field( ino_t, ino ) 1507 __field( ino_t, ino )
1510 __field( dev_t, dev ) 1508 __field( dev_t, dev )
1511 ), 1509 ),
1512 1510
1513 TP_fast_assign( 1511 TP_fast_assign(
1514 __entry->ino = inode->i_ino; 1512 __entry->ino = inode->i_ino;
1515 __entry->dev = inode->i_sb->s_dev; 1513 __entry->dev = inode->i_sb->s_dev;
1516 ), 1514 ),
1517 1515
1518 TP_printk("dev %d,%d ino %ld", 1516 TP_printk("dev %d,%d ino %ld",
1519 MAJOR(__entry->dev), MINOR(__entry->dev), 1517 MAJOR(__entry->dev), MINOR(__entry->dev),
1520 (unsigned long) __entry->ino) 1518 (unsigned long) __entry->ino)
1521 ); 1519 );
1522 1520
1523 #endif /* _TRACE_EXT4_H */ 1521 #endif /* _TRACE_EXT4_H */
1524 1522
1525 /* This part must be outside protection */ 1523 /* This part must be outside protection */
1526 #include <trace/define_trace.h> 1524 #include <trace/define_trace.h>
1527 1525
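For context only: each TRACE_EVENT(name, ...) in the ext4 header above generates a trace_name() static call that the filesystem invokes at the matching point, after one compilation unit of ext4 instantiates the events with CREATE_TRACE_POINTS. The fragment below is a minimal illustrative sketch under that assumption; example_report_fsync_exit() is a hypothetical caller, not a hunk from this series.

	#define CREATE_TRACE_POINTS	/* defined once, in a single ext4 .c file, to instantiate the events */
	#include <trace/events/ext4.h>

	/* hypothetical call site: fire the exit event with the inode and fsync's return value */
	static int example_report_fsync_exit(struct inode *inode, int ret)
	{
		trace_ext4_sync_file_exit(inode, ret);	/* no-op unless the event is enabled */
		return ret;
	}
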
include/trace/events/writeback.h
1 #undef TRACE_SYSTEM 1 #undef TRACE_SYSTEM
2 #define TRACE_SYSTEM writeback 2 #define TRACE_SYSTEM writeback
3 3
4 #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) 4 #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
5 #define _TRACE_WRITEBACK_H 5 #define _TRACE_WRITEBACK_H
6 6
7 #include <linux/backing-dev.h> 7 #include <linux/backing-dev.h>
8 #include <linux/device.h> 8 #include <linux/device.h>
9 #include <linux/writeback.h> 9 #include <linux/writeback.h>
10 10
11 #define show_inode_state(state) \
12 __print_flags(state, "|", \
13 {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \
14 {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \
15 {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \
16 {I_NEW, "I_NEW"}, \
17 {I_WILL_FREE, "I_WILL_FREE"}, \
18 {I_FREEING, "I_FREEING"}, \
19 {I_CLEAR, "I_CLEAR"}, \
20 {I_SYNC, "I_SYNC"}, \
21 {I_REFERENCED, "I_REFERENCED"} \
22 )
23
11 struct wb_writeback_work; 24 struct wb_writeback_work;
12 25
13 DECLARE_EVENT_CLASS(writeback_work_class, 26 DECLARE_EVENT_CLASS(writeback_work_class,
14 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), 27 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
15 TP_ARGS(bdi, work), 28 TP_ARGS(bdi, work),
16 TP_STRUCT__entry( 29 TP_STRUCT__entry(
17 __array(char, name, 32) 30 __array(char, name, 32)
18 __field(long, nr_pages) 31 __field(long, nr_pages)
19 __field(dev_t, sb_dev) 32 __field(dev_t, sb_dev)
20 __field(int, sync_mode) 33 __field(int, sync_mode)
21 __field(int, for_kupdate) 34 __field(int, for_kupdate)
22 __field(int, range_cyclic) 35 __field(int, range_cyclic)
23 __field(int, for_background) 36 __field(int, for_background)
24 ), 37 ),
25 TP_fast_assign( 38 TP_fast_assign(
26 strncpy(__entry->name, dev_name(bdi->dev), 32); 39 strncpy(__entry->name, dev_name(bdi->dev), 32);
27 __entry->nr_pages = work->nr_pages; 40 __entry->nr_pages = work->nr_pages;
28 __entry->sb_dev = work->sb ? work->sb->s_dev : 0; 41 __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
29 __entry->sync_mode = work->sync_mode; 42 __entry->sync_mode = work->sync_mode;
30 __entry->for_kupdate = work->for_kupdate; 43 __entry->for_kupdate = work->for_kupdate;
31 __entry->range_cyclic = work->range_cyclic; 44 __entry->range_cyclic = work->range_cyclic;
32 __entry->for_background = work->for_background; 45 __entry->for_background = work->for_background;
33 ), 46 ),
34 TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " 47 TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
35 "kupdate=%d range_cyclic=%d background=%d", 48 "kupdate=%d range_cyclic=%d background=%d",
36 __entry->name, 49 __entry->name,
37 MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), 50 MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
38 __entry->nr_pages, 51 __entry->nr_pages,
39 __entry->sync_mode, 52 __entry->sync_mode,
40 __entry->for_kupdate, 53 __entry->for_kupdate,
41 __entry->range_cyclic, 54 __entry->range_cyclic,
42 __entry->for_background 55 __entry->for_background
43 ) 56 )
44 ); 57 );
45 #define DEFINE_WRITEBACK_WORK_EVENT(name) \ 58 #define DEFINE_WRITEBACK_WORK_EVENT(name) \
46 DEFINE_EVENT(writeback_work_class, name, \ 59 DEFINE_EVENT(writeback_work_class, name, \
47 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ 60 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
48 TP_ARGS(bdi, work)) 61 TP_ARGS(bdi, work))
49 DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); 62 DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
50 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); 63 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
51 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); 64 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
65 DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
66 DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
67 DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);
52 68
53 TRACE_EVENT(writeback_pages_written, 69 TRACE_EVENT(writeback_pages_written,
54 TP_PROTO(long pages_written), 70 TP_PROTO(long pages_written),
55 TP_ARGS(pages_written), 71 TP_ARGS(pages_written),
56 TP_STRUCT__entry( 72 TP_STRUCT__entry(
57 __field(long, pages) 73 __field(long, pages)
58 ), 74 ),
59 TP_fast_assign( 75 TP_fast_assign(
60 __entry->pages = pages_written; 76 __entry->pages = pages_written;
61 ), 77 ),
62 TP_printk("%ld", __entry->pages) 78 TP_printk("%ld", __entry->pages)
63 ); 79 );
64 80
65 DECLARE_EVENT_CLASS(writeback_class, 81 DECLARE_EVENT_CLASS(writeback_class,
66 TP_PROTO(struct backing_dev_info *bdi), 82 TP_PROTO(struct backing_dev_info *bdi),
67 TP_ARGS(bdi), 83 TP_ARGS(bdi),
68 TP_STRUCT__entry( 84 TP_STRUCT__entry(
69 __array(char, name, 32) 85 __array(char, name, 32)
70 ), 86 ),
71 TP_fast_assign( 87 TP_fast_assign(
72 strncpy(__entry->name, dev_name(bdi->dev), 32); 88 strncpy(__entry->name, dev_name(bdi->dev), 32);
73 ), 89 ),
74 TP_printk("bdi %s", 90 TP_printk("bdi %s",
75 __entry->name 91 __entry->name
76 ) 92 )
77 ); 93 );
78 #define DEFINE_WRITEBACK_EVENT(name) \ 94 #define DEFINE_WRITEBACK_EVENT(name) \
79 DEFINE_EVENT(writeback_class, name, \ 95 DEFINE_EVENT(writeback_class, name, \
80 TP_PROTO(struct backing_dev_info *bdi), \ 96 TP_PROTO(struct backing_dev_info *bdi), \
81 TP_ARGS(bdi)) 97 TP_ARGS(bdi))
82 98
83 DEFINE_WRITEBACK_EVENT(writeback_nowork); 99 DEFINE_WRITEBACK_EVENT(writeback_nowork);
84 DEFINE_WRITEBACK_EVENT(writeback_wake_background); 100 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
85 DEFINE_WRITEBACK_EVENT(writeback_wake_thread); 101 DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
86 DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); 102 DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
87 DEFINE_WRITEBACK_EVENT(writeback_bdi_register); 103 DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
88 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); 104 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
89 DEFINE_WRITEBACK_EVENT(writeback_thread_start); 105 DEFINE_WRITEBACK_EVENT(writeback_thread_start);
90 DEFINE_WRITEBACK_EVENT(writeback_thread_stop); 106 DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
107 DEFINE_WRITEBACK_EVENT(balance_dirty_start);
108 DEFINE_WRITEBACK_EVENT(balance_dirty_wait);
91 109
110 TRACE_EVENT(balance_dirty_written,
111
112 TP_PROTO(struct backing_dev_info *bdi, int written),
113
114 TP_ARGS(bdi, written),
115
116 TP_STRUCT__entry(
117 __array(char, name, 32)
118 __field(int, written)
119 ),
120
121 TP_fast_assign(
122 strncpy(__entry->name, dev_name(bdi->dev), 32);
123 __entry->written = written;
124 ),
125
126 TP_printk("bdi %s written %d",
127 __entry->name,
128 __entry->written
129 )
130 );
131
92 DECLARE_EVENT_CLASS(wbc_class, 132 DECLARE_EVENT_CLASS(wbc_class,
93 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), 133 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
94 TP_ARGS(wbc, bdi), 134 TP_ARGS(wbc, bdi),
95 TP_STRUCT__entry( 135 TP_STRUCT__entry(
96 __array(char, name, 32) 136 __array(char, name, 32)
97 __field(long, nr_to_write) 137 __field(long, nr_to_write)
98 __field(long, pages_skipped) 138 __field(long, pages_skipped)
99 __field(int, sync_mode) 139 __field(int, sync_mode)
100 __field(int, for_kupdate) 140 __field(int, for_kupdate)
101 __field(int, for_background) 141 __field(int, for_background)
102 __field(int, for_reclaim) 142 __field(int, for_reclaim)
103 __field(int, range_cyclic) 143 __field(int, range_cyclic)
104 __field(int, more_io)
105 __field(unsigned long, older_than_this)
106 __field(long, range_start) 144 __field(long, range_start)
107 __field(long, range_end) 145 __field(long, range_end)
108 ), 146 ),
109 147
110 TP_fast_assign( 148 TP_fast_assign(
111 strncpy(__entry->name, dev_name(bdi->dev), 32); 149 strncpy(__entry->name, dev_name(bdi->dev), 32);
112 __entry->nr_to_write = wbc->nr_to_write; 150 __entry->nr_to_write = wbc->nr_to_write;
113 __entry->pages_skipped = wbc->pages_skipped; 151 __entry->pages_skipped = wbc->pages_skipped;
114 __entry->sync_mode = wbc->sync_mode; 152 __entry->sync_mode = wbc->sync_mode;
115 __entry->for_kupdate = wbc->for_kupdate; 153 __entry->for_kupdate = wbc->for_kupdate;
116 __entry->for_background = wbc->for_background; 154 __entry->for_background = wbc->for_background;
117 __entry->for_reclaim = wbc->for_reclaim; 155 __entry->for_reclaim = wbc->for_reclaim;
118 __entry->range_cyclic = wbc->range_cyclic; 156 __entry->range_cyclic = wbc->range_cyclic;
119 __entry->more_io = wbc->more_io;
120 __entry->older_than_this = wbc->older_than_this ?
121 *wbc->older_than_this : 0;
122 __entry->range_start = (long)wbc->range_start; 157 __entry->range_start = (long)wbc->range_start;
123 __entry->range_end = (long)wbc->range_end; 158 __entry->range_end = (long)wbc->range_end;
124 ), 159 ),
125 160
126 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " 161 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
127 "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " 162 "bgrd=%d reclm=%d cyclic=%d "
128 "start=0x%lx end=0x%lx", 163 "start=0x%lx end=0x%lx",
129 __entry->name, 164 __entry->name,
130 __entry->nr_to_write, 165 __entry->nr_to_write,
131 __entry->pages_skipped, 166 __entry->pages_skipped,
132 __entry->sync_mode, 167 __entry->sync_mode,
133 __entry->for_kupdate, 168 __entry->for_kupdate,
134 __entry->for_background, 169 __entry->for_background,
135 __entry->for_reclaim, 170 __entry->for_reclaim,
136 __entry->range_cyclic, 171 __entry->range_cyclic,
137 __entry->more_io,
138 __entry->older_than_this,
139 __entry->range_start, 172 __entry->range_start,
140 __entry->range_end) 173 __entry->range_end)
141 ) 174 )
142 175
143 #define DEFINE_WBC_EVENT(name) \ 176 #define DEFINE_WBC_EVENT(name) \
144 DEFINE_EVENT(wbc_class, name, \ 177 DEFINE_EVENT(wbc_class, name, \
145 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ 178 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
146 TP_ARGS(wbc, bdi)) 179 TP_ARGS(wbc, bdi))
147 DEFINE_WBC_EVENT(wbc_writeback_start);
148 DEFINE_WBC_EVENT(wbc_writeback_written);
149 DEFINE_WBC_EVENT(wbc_writeback_wait);
150 DEFINE_WBC_EVENT(wbc_balance_dirty_start);
151 DEFINE_WBC_EVENT(wbc_balance_dirty_written);
152 DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
153 DEFINE_WBC_EVENT(wbc_writepage); 180 DEFINE_WBC_EVENT(wbc_writepage);
154 181
182 TRACE_EVENT(writeback_queue_io,
183 TP_PROTO(struct bdi_writeback *wb,
184 unsigned long *older_than_this,
185 int moved),
186 TP_ARGS(wb, older_than_this, moved),
187 TP_STRUCT__entry(
188 __array(char, name, 32)
189 __field(unsigned long, older)
190 __field(long, age)
191 __field(int, moved)
192 ),
193 TP_fast_assign(
194 strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
195 __entry->older = older_than_this ? *older_than_this : 0;
196 __entry->age = older_than_this ?
197 (jiffies - *older_than_this) * 1000 / HZ : -1;
198 __entry->moved = moved;
199 ),
200 TP_printk("bdi %s: older=%lu age=%ld enqueue=%d",
201 __entry->name,
202 __entry->older, /* older_than_this in jiffies */
203 __entry->age, /* older_than_this in relative milliseconds */
204 __entry->moved)
205 );
206
207 TRACE_EVENT(global_dirty_state,
208
209 TP_PROTO(unsigned long background_thresh,
210 unsigned long dirty_thresh
211 ),
212
213 TP_ARGS(background_thresh,
214 dirty_thresh
215 ),
216
217 TP_STRUCT__entry(
218 __field(unsigned long, nr_dirty)
219 __field(unsigned long, nr_writeback)
220 __field(unsigned long, nr_unstable)
221 __field(unsigned long, background_thresh)
222 __field(unsigned long, dirty_thresh)
223 __field(unsigned long, dirty_limit)
224 __field(unsigned long, nr_dirtied)
225 __field(unsigned long, nr_written)
226 ),
227
228 TP_fast_assign(
229 __entry->nr_dirty = global_page_state(NR_FILE_DIRTY);
230 __entry->nr_writeback = global_page_state(NR_WRITEBACK);
231 __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
232 __entry->nr_dirtied = global_page_state(NR_DIRTIED);
233 __entry->nr_written = global_page_state(NR_WRITTEN);
234 __entry->background_thresh = background_thresh;
235 __entry->dirty_thresh = dirty_thresh;
236 __entry->dirty_limit = global_dirty_limit;
237 ),
238
239 TP_printk("dirty=%lu writeback=%lu unstable=%lu "
240 "bg_thresh=%lu thresh=%lu limit=%lu "
241 "dirtied=%lu written=%lu",
242 __entry->nr_dirty,
243 __entry->nr_writeback,
244 __entry->nr_unstable,
245 __entry->background_thresh,
246 __entry->dirty_thresh,
247 __entry->dirty_limit,
248 __entry->nr_dirtied,
249 __entry->nr_written
250 )
251 );
252
155 DECLARE_EVENT_CLASS(writeback_congest_waited_template, 253 DECLARE_EVENT_CLASS(writeback_congest_waited_template,
156 254
157 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), 255 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
158 256
159 TP_ARGS(usec_timeout, usec_delayed), 257 TP_ARGS(usec_timeout, usec_delayed),
160 258
161 TP_STRUCT__entry( 259 TP_STRUCT__entry(
162 __field( unsigned int, usec_timeout ) 260 __field( unsigned int, usec_timeout )
163 __field( unsigned int, usec_delayed ) 261 __field( unsigned int, usec_delayed )
164 ), 262 ),
165 263
166 TP_fast_assign( 264 TP_fast_assign(
167 __entry->usec_timeout = usec_timeout; 265 __entry->usec_timeout = usec_timeout;
168 __entry->usec_delayed = usec_delayed; 266 __entry->usec_delayed = usec_delayed;
169 ), 267 ),
170 268
171 TP_printk("usec_timeout=%u usec_delayed=%u", 269 TP_printk("usec_timeout=%u usec_delayed=%u",
172 __entry->usec_timeout, 270 __entry->usec_timeout,
173 __entry->usec_delayed) 271 __entry->usec_delayed)
174 ); 272 );
175 273
176 DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, 274 DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
177 275
178 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), 276 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
179 277
180 TP_ARGS(usec_timeout, usec_delayed) 278 TP_ARGS(usec_timeout, usec_delayed)
181 ); 279 );
182 280
183 DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, 281 DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
184 282
185 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), 283 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
186 284
187 TP_ARGS(usec_timeout, usec_delayed) 285 TP_ARGS(usec_timeout, usec_delayed)
286 );
287
288 DECLARE_EVENT_CLASS(writeback_single_inode_template,
289
290 TP_PROTO(struct inode *inode,
291 struct writeback_control *wbc,
292 unsigned long nr_to_write
293 ),
294
295 TP_ARGS(inode, wbc, nr_to_write),
296
297 TP_STRUCT__entry(
298 __array(char, name, 32)
299 __field(unsigned long, ino)
300 __field(unsigned long, state)
301 __field(unsigned long, age)
302 __field(unsigned long, writeback_index)
303 __field(long, nr_to_write)
304 __field(unsigned long, wrote)
305 ),
306
307 TP_fast_assign(
308 strncpy(__entry->name,
309 dev_name(inode->i_mapping->backing_dev_info->dev), 32);
310 __entry->ino = inode->i_ino;
311 __entry->state = inode->i_state;
312 __entry->age = (jiffies - inode->dirtied_when) *
313 1000 / HZ;
314 __entry->writeback_index = inode->i_mapping->writeback_index;
315 __entry->nr_to_write = nr_to_write;
316 __entry->wrote = nr_to_write - wbc->nr_to_write;
317 ),
318
319 TP_printk("bdi %s: ino=%lu state=%s age=%lu "
320 "index=%lu to_write=%ld wrote=%lu",
321 __entry->name,
322 __entry->ino,
323 show_inode_state(__entry->state),
324 __entry->age,
325 __entry->writeback_index,
326 __entry->nr_to_write,
327 __entry->wrote
328 )
329 );
330
331 DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue,
332 TP_PROTO(struct inode *inode,
333 struct writeback_control *wbc,
334 unsigned long nr_to_write),
335 TP_ARGS(inode, wbc, nr_to_write)
336 );
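For context only: the new writeback_queue_io event above takes the bdi_writeback, the optional expiry cutoff, and the number of inodes moved onto b_io, matching its TP_PROTO. The sketch below is a hypothetical wrapper showing the intended call shape; the real hook sits in the fs-writeback queue_io() path and may differ in detail.

	#include <trace/events/writeback.h>

	/* hypothetical helper: report how many inodes one queue_io() pass moved to b_io */
	static void example_trace_queue_io(struct bdi_writeback *wb,
					   unsigned long *older_than_this,
					   int moved)
	{
		trace_writeback_queue_io(wb, older_than_this, moved);
	}
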
mm/backing-dev.c
 1 1
2 #include <linux/wait.h> 2 #include <linux/wait.h>
3 #include <linux/backing-dev.h> 3 #include <linux/backing-dev.h>
4 #include <linux/kthread.h> 4 #include <linux/kthread.h>
5 #include <linux/freezer.h> 5 #include <linux/freezer.h>
6 #include <linux/fs.h> 6 #include <linux/fs.h>
7 #include <linux/pagemap.h> 7 #include <linux/pagemap.h>
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/sched.h> 9 #include <linux/sched.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/writeback.h> 11 #include <linux/writeback.h>
12 #include <linux/device.h> 12 #include <linux/device.h>
13 #include <trace/events/writeback.h> 13 #include <trace/events/writeback.h>
14 14
15 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 15 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
16 16
17 struct backing_dev_info default_backing_dev_info = { 17 struct backing_dev_info default_backing_dev_info = {
18 .name = "default", 18 .name = "default",
19 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 19 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
20 .state = 0, 20 .state = 0,
21 .capabilities = BDI_CAP_MAP_COPY, 21 .capabilities = BDI_CAP_MAP_COPY,
22 }; 22 };
23 EXPORT_SYMBOL_GPL(default_backing_dev_info); 23 EXPORT_SYMBOL_GPL(default_backing_dev_info);
24 24
25 struct backing_dev_info noop_backing_dev_info = { 25 struct backing_dev_info noop_backing_dev_info = {
26 .name = "noop", 26 .name = "noop",
27 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 27 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
28 }; 28 };
29 EXPORT_SYMBOL_GPL(noop_backing_dev_info); 29 EXPORT_SYMBOL_GPL(noop_backing_dev_info);
30 30
31 static struct class *bdi_class; 31 static struct class *bdi_class;
32 32
33 /* 33 /*
34 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as 34 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
35 * reader side protection for bdi_pending_list. bdi_list has RCU reader side 35 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
36 * locking. 36 * locking.
37 */ 37 */
38 DEFINE_SPINLOCK(bdi_lock); 38 DEFINE_SPINLOCK(bdi_lock);
39 LIST_HEAD(bdi_list); 39 LIST_HEAD(bdi_list);
40 LIST_HEAD(bdi_pending_list); 40 LIST_HEAD(bdi_pending_list);
41 41
42 static struct task_struct *sync_supers_tsk; 42 static struct task_struct *sync_supers_tsk;
43 static struct timer_list sync_supers_timer; 43 static struct timer_list sync_supers_timer;
44 44
45 static int bdi_sync_supers(void *); 45 static int bdi_sync_supers(void *);
46 static void sync_supers_timer_fn(unsigned long); 46 static void sync_supers_timer_fn(unsigned long);
47 47
48 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49 {
50 if (wb1 < wb2) {
51 spin_lock(&wb1->list_lock);
52 spin_lock_nested(&wb2->list_lock, 1);
53 } else {
54 spin_lock(&wb2->list_lock);
55 spin_lock_nested(&wb1->list_lock, 1);
56 }
57 }
58
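bdi_lock_two() gives every caller the same lock order by comparing the two bdi_writeback pointers, so two tasks that each need both per-wb list locks can never deadlock against one another. A minimal sketch of the intended pattern (move_dirty_inode() is hypothetical; only the lock/unlock discipline is the point):

/* sketch: take both list locks in a stable order, then move an inode */
static void move_dirty_inode(struct bdi_writeback *src,
			     struct bdi_writeback *dst,
			     struct inode *inode)
{
	bdi_lock_two(src, dst);			/* lower address is locked first */
	list_move(&inode->i_wb_list, &dst->b_dirty);
	spin_unlock(&src->list_lock);		/* unlock order does not matter */
	spin_unlock(&dst->list_lock);
}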
48 #ifdef CONFIG_DEBUG_FS 59 #ifdef CONFIG_DEBUG_FS
49 #include <linux/debugfs.h> 60 #include <linux/debugfs.h>
50 #include <linux/seq_file.h> 61 #include <linux/seq_file.h>
51 62
52 static struct dentry *bdi_debug_root; 63 static struct dentry *bdi_debug_root;
53 64
54 static void bdi_debug_init(void) 65 static void bdi_debug_init(void)
55 { 66 {
56 bdi_debug_root = debugfs_create_dir("bdi", NULL); 67 bdi_debug_root = debugfs_create_dir("bdi", NULL);
57 } 68 }
58 69
59 static int bdi_debug_stats_show(struct seq_file *m, void *v) 70 static int bdi_debug_stats_show(struct seq_file *m, void *v)
60 { 71 {
61 struct backing_dev_info *bdi = m->private; 72 struct backing_dev_info *bdi = m->private;
62 struct bdi_writeback *wb = &bdi->wb; 73 struct bdi_writeback *wb = &bdi->wb;
63 unsigned long background_thresh; 74 unsigned long background_thresh;
64 unsigned long dirty_thresh; 75 unsigned long dirty_thresh;
65 unsigned long bdi_thresh; 76 unsigned long bdi_thresh;
66 unsigned long nr_dirty, nr_io, nr_more_io; 77 unsigned long nr_dirty, nr_io, nr_more_io;
67 struct inode *inode; 78 struct inode *inode;
68 79
69 nr_dirty = nr_io = nr_more_io = 0; 80 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 81 spin_lock(&wb->list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 83 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 nr_io++; 85 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 nr_more_io++; 87 nr_more_io++;
77 spin_unlock(&inode_wb_list_lock); 88 spin_unlock(&wb->list_lock);
78 89
79 global_dirty_limits(&background_thresh, &dirty_thresh); 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92
82 #define K(x) ((x) << (PAGE_SHIFT - 10)) 93 #define K(x) ((x) << (PAGE_SHIFT - 10))
83 seq_printf(m, 94 seq_printf(m,
84 "BdiWriteback: %8lu kB\n" 95 "BdiWriteback: %10lu kB\n"
85 "BdiReclaimable: %8lu kB\n" 96 "BdiReclaimable: %10lu kB\n"
86 "BdiDirtyThresh: %8lu kB\n" 97 "BdiDirtyThresh: %10lu kB\n"
87 "DirtyThresh: %8lu kB\n" 98 "DirtyThresh: %10lu kB\n"
88 "BackgroundThresh: %8lu kB\n" 99 "BackgroundThresh: %10lu kB\n"
89 "b_dirty: %8lu\n" 100 "BdiWritten: %10lu kB\n"
90 "b_io: %8lu\n" 101 "BdiWriteBandwidth: %10lu kBps\n"
91 "b_more_io: %8lu\n" 102 "b_dirty: %10lu\n"
92 "bdi_list: %8u\n" 103 "b_io: %10lu\n"
93 "state: %8lx\n", 104 "b_more_io: %10lu\n"
105 "bdi_list: %10u\n"
106 "state: %10lx\n",
94 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 107 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 108 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96 K(bdi_thresh), K(dirty_thresh), 109 K(bdi_thresh),
97 K(background_thresh), nr_dirty, nr_io, nr_more_io, 110 K(dirty_thresh),
111 K(background_thresh),
112 (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
113 (unsigned long) K(bdi->write_bandwidth),
114 nr_dirty,
115 nr_io,
116 nr_more_io,
98 !list_empty(&bdi->bdi_list), bdi->state); 117 !list_empty(&bdi->bdi_list), bdi->state);
99 #undef K 118 #undef K
100 119
101 return 0; 120 return 0;
102 } 121 }
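With the widened %10lu fields and the new BdiWritten/BdiWriteBandwidth lines, reading a device's stats file (e.g. /sys/kernel/debug/bdi/8:0/stats, with debugfs mounted in the usual place) yields output shaped like the following; all values here are purely illustrative:

BdiWriteback:              128 kB
BdiReclaimable:          12288 kB
BdiDirtyThresh:          65536 kB
DirtyThresh:            262144 kB
BackgroundThresh:       131072 kB
BdiWritten:            4194304 kB
BdiWriteBandwidth:      102400 kBps
b_dirty:                    17
b_io:                        0
b_more_io:                   2
bdi_list:                    1
state:                       8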
103 122
104 static int bdi_debug_stats_open(struct inode *inode, struct file *file) 123 static int bdi_debug_stats_open(struct inode *inode, struct file *file)
105 { 124 {
106 return single_open(file, bdi_debug_stats_show, inode->i_private); 125 return single_open(file, bdi_debug_stats_show, inode->i_private);
107 } 126 }
108 127
109 static const struct file_operations bdi_debug_stats_fops = { 128 static const struct file_operations bdi_debug_stats_fops = {
110 .open = bdi_debug_stats_open, 129 .open = bdi_debug_stats_open,
111 .read = seq_read, 130 .read = seq_read,
112 .llseek = seq_lseek, 131 .llseek = seq_lseek,
113 .release = single_release, 132 .release = single_release,
114 }; 133 };
115 134
116 static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) 135 static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
117 { 136 {
118 bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); 137 bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
119 bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir, 138 bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
120 bdi, &bdi_debug_stats_fops); 139 bdi, &bdi_debug_stats_fops);
121 } 140 }
122 141
123 static void bdi_debug_unregister(struct backing_dev_info *bdi) 142 static void bdi_debug_unregister(struct backing_dev_info *bdi)
124 { 143 {
125 debugfs_remove(bdi->debug_stats); 144 debugfs_remove(bdi->debug_stats);
126 debugfs_remove(bdi->debug_dir); 145 debugfs_remove(bdi->debug_dir);
127 } 146 }
128 #else 147 #else
129 static inline void bdi_debug_init(void) 148 static inline void bdi_debug_init(void)
130 { 149 {
131 } 150 }
132 static inline void bdi_debug_register(struct backing_dev_info *bdi, 151 static inline void bdi_debug_register(struct backing_dev_info *bdi,
133 const char *name) 152 const char *name)
134 { 153 {
135 } 154 }
136 static inline void bdi_debug_unregister(struct backing_dev_info *bdi) 155 static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
137 { 156 {
138 } 157 }
139 #endif 158 #endif
140 159
141 static ssize_t read_ahead_kb_store(struct device *dev, 160 static ssize_t read_ahead_kb_store(struct device *dev,
142 struct device_attribute *attr, 161 struct device_attribute *attr,
143 const char *buf, size_t count) 162 const char *buf, size_t count)
144 { 163 {
145 struct backing_dev_info *bdi = dev_get_drvdata(dev); 164 struct backing_dev_info *bdi = dev_get_drvdata(dev);
146 char *end; 165 char *end;
147 unsigned long read_ahead_kb; 166 unsigned long read_ahead_kb;
148 ssize_t ret = -EINVAL; 167 ssize_t ret = -EINVAL;
149 168
150 read_ahead_kb = simple_strtoul(buf, &end, 10); 169 read_ahead_kb = simple_strtoul(buf, &end, 10);
151 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { 170 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
152 bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); 171 bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
153 ret = count; 172 ret = count;
154 } 173 }
155 return ret; 174 return ret;
156 } 175 }
157 176
158 #define K(pages) ((pages) << (PAGE_SHIFT - 10)) 177 #define K(pages) ((pages) << (PAGE_SHIFT - 10))
159 178
160 #define BDI_SHOW(name, expr) \ 179 #define BDI_SHOW(name, expr) \
161 static ssize_t name##_show(struct device *dev, \ 180 static ssize_t name##_show(struct device *dev, \
162 struct device_attribute *attr, char *page) \ 181 struct device_attribute *attr, char *page) \
163 { \ 182 { \
164 struct backing_dev_info *bdi = dev_get_drvdata(dev); \ 183 struct backing_dev_info *bdi = dev_get_drvdata(dev); \
165 \ 184 \
166 return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ 185 return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
167 } 186 }
168 187
169 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) 188 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
170 189
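For reference, the BDI_SHOW() invocation above expands to roughly the following (whitespace aside); this is the function the bdi_dev_attrs table further down installs as the read_ahead_kb sysfs read handler:

/* approximate expansion of BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) */
static ssize_t read_ahead_kb_show(struct device *dev,
				  struct device_attribute *attr, char *page)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return snprintf(page, PAGE_SIZE-1, "%lld\n",
			(long long)K(bdi->ra_pages));
}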
171 static ssize_t min_ratio_store(struct device *dev, 190 static ssize_t min_ratio_store(struct device *dev,
172 struct device_attribute *attr, const char *buf, size_t count) 191 struct device_attribute *attr, const char *buf, size_t count)
173 { 192 {
174 struct backing_dev_info *bdi = dev_get_drvdata(dev); 193 struct backing_dev_info *bdi = dev_get_drvdata(dev);
175 char *end; 194 char *end;
176 unsigned int ratio; 195 unsigned int ratio;
177 ssize_t ret = -EINVAL; 196 ssize_t ret = -EINVAL;
178 197
179 ratio = simple_strtoul(buf, &end, 10); 198 ratio = simple_strtoul(buf, &end, 10);
180 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { 199 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
181 ret = bdi_set_min_ratio(bdi, ratio); 200 ret = bdi_set_min_ratio(bdi, ratio);
182 if (!ret) 201 if (!ret)
183 ret = count; 202 ret = count;
184 } 203 }
185 return ret; 204 return ret;
186 } 205 }
187 BDI_SHOW(min_ratio, bdi->min_ratio) 206 BDI_SHOW(min_ratio, bdi->min_ratio)
188 207
189 static ssize_t max_ratio_store(struct device *dev, 208 static ssize_t max_ratio_store(struct device *dev,
190 struct device_attribute *attr, const char *buf, size_t count) 209 struct device_attribute *attr, const char *buf, size_t count)
191 { 210 {
192 struct backing_dev_info *bdi = dev_get_drvdata(dev); 211 struct backing_dev_info *bdi = dev_get_drvdata(dev);
193 char *end; 212 char *end;
194 unsigned int ratio; 213 unsigned int ratio;
195 ssize_t ret = -EINVAL; 214 ssize_t ret = -EINVAL;
196 215
197 ratio = simple_strtoul(buf, &end, 10); 216 ratio = simple_strtoul(buf, &end, 10);
198 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { 217 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
199 ret = bdi_set_max_ratio(bdi, ratio); 218 ret = bdi_set_max_ratio(bdi, ratio);
200 if (!ret) 219 if (!ret)
201 ret = count; 220 ret = count;
202 } 221 }
203 return ret; 222 return ret;
204 } 223 }
205 BDI_SHOW(max_ratio, bdi->max_ratio) 224 BDI_SHOW(max_ratio, bdi->max_ratio)
206 225
207 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) 226 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
208 227
209 static struct device_attribute bdi_dev_attrs[] = { 228 static struct device_attribute bdi_dev_attrs[] = {
210 __ATTR_RW(read_ahead_kb), 229 __ATTR_RW(read_ahead_kb),
211 __ATTR_RW(min_ratio), 230 __ATTR_RW(min_ratio),
212 __ATTR_RW(max_ratio), 231 __ATTR_RW(max_ratio),
213 __ATTR_NULL, 232 __ATTR_NULL,
214 }; 233 };
215 234
216 static __init int bdi_class_init(void) 235 static __init int bdi_class_init(void)
217 { 236 {
218 bdi_class = class_create(THIS_MODULE, "bdi"); 237 bdi_class = class_create(THIS_MODULE, "bdi");
219 if (IS_ERR(bdi_class)) 238 if (IS_ERR(bdi_class))
220 return PTR_ERR(bdi_class); 239 return PTR_ERR(bdi_class);
221 240
222 bdi_class->dev_attrs = bdi_dev_attrs; 241 bdi_class->dev_attrs = bdi_dev_attrs;
223 bdi_debug_init(); 242 bdi_debug_init();
224 return 0; 243 return 0;
225 } 244 }
226 postcore_initcall(bdi_class_init); 245 postcore_initcall(bdi_class_init);
227 246
228 static int __init default_bdi_init(void) 247 static int __init default_bdi_init(void)
229 { 248 {
230 int err; 249 int err;
231 250
232 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); 251 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
233 BUG_ON(IS_ERR(sync_supers_tsk)); 252 BUG_ON(IS_ERR(sync_supers_tsk));
234 253
235 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); 254 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
236 bdi_arm_supers_timer(); 255 bdi_arm_supers_timer();
237 256
238 err = bdi_init(&default_backing_dev_info); 257 err = bdi_init(&default_backing_dev_info);
239 if (!err) 258 if (!err)
240 bdi_register(&default_backing_dev_info, NULL, "default"); 259 bdi_register(&default_backing_dev_info, NULL, "default");
241 err = bdi_init(&noop_backing_dev_info); 260 err = bdi_init(&noop_backing_dev_info);
242 261
243 return err; 262 return err;
244 } 263 }
245 subsys_initcall(default_bdi_init); 264 subsys_initcall(default_bdi_init);
246 265
247 int bdi_has_dirty_io(struct backing_dev_info *bdi) 266 int bdi_has_dirty_io(struct backing_dev_info *bdi)
248 { 267 {
249 return wb_has_dirty_io(&bdi->wb); 268 return wb_has_dirty_io(&bdi->wb);
250 } 269 }
251 270
252 static void bdi_flush_io(struct backing_dev_info *bdi)
253 {
254 struct writeback_control wbc = {
255 .sync_mode = WB_SYNC_NONE,
256 .older_than_this = NULL,
257 .range_cyclic = 1,
258 .nr_to_write = 1024,
259 };
260
261 writeback_inodes_wb(&bdi->wb, &wbc);
262 }
263
264 /* 271 /*
265 * kupdated() used to do this. We cannot do it from the bdi_forker_thread() 272 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 * or we risk deadlocking on ->s_umount. The longer term solution would be 273 * or we risk deadlocking on ->s_umount. The longer term solution would be
267 * to implement sync_supers_bdi() or similar and simply do it from the 274 * to implement sync_supers_bdi() or similar and simply do it from the
268 * bdi writeback thread individually. 275 * bdi writeback thread individually.
269 */ 276 */
270 static int bdi_sync_supers(void *unused) 277 static int bdi_sync_supers(void *unused)
271 { 278 {
272 set_user_nice(current, 0); 279 set_user_nice(current, 0);
273 280
274 while (!kthread_should_stop()) { 281 while (!kthread_should_stop()) {
275 set_current_state(TASK_INTERRUPTIBLE); 282 set_current_state(TASK_INTERRUPTIBLE);
276 schedule(); 283 schedule();
277 284
278 /* 285 /*
279 * Do this periodically, like kupdated() did before. 286 * Do this periodically, like kupdated() did before.
280 */ 287 */
281 sync_supers(); 288 sync_supers();
282 } 289 }
283 290
284 return 0; 291 return 0;
285 } 292 }
286 293
287 void bdi_arm_supers_timer(void) 294 void bdi_arm_supers_timer(void)
288 { 295 {
289 unsigned long next; 296 unsigned long next;
290 297
291 if (!dirty_writeback_interval) 298 if (!dirty_writeback_interval)
292 return; 299 return;
293 300
294 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; 301 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
295 mod_timer(&sync_supers_timer, round_jiffies_up(next)); 302 mod_timer(&sync_supers_timer, round_jiffies_up(next));
296 } 303 }
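dirty_writeback_interval is kept in centiseconds, so the '* 10' converts it to milliseconds before msecs_to_jiffies(). With the typical default of 500 centiseconds that is 500 * 10 = 5000 ms, i.e. the supers timer re-arms roughly every five seconds, and round_jiffies_up() batches the expiry onto a whole-second boundary to avoid extra wakeups.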
297 304
298 static void sync_supers_timer_fn(unsigned long unused) 305 static void sync_supers_timer_fn(unsigned long unused)
299 { 306 {
300 wake_up_process(sync_supers_tsk); 307 wake_up_process(sync_supers_tsk);
301 bdi_arm_supers_timer(); 308 bdi_arm_supers_timer();
302 } 309 }
303 310
304 static void wakeup_timer_fn(unsigned long data) 311 static void wakeup_timer_fn(unsigned long data)
305 { 312 {
306 struct backing_dev_info *bdi = (struct backing_dev_info *)data; 313 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
307 314
308 spin_lock_bh(&bdi->wb_lock); 315 spin_lock_bh(&bdi->wb_lock);
309 if (bdi->wb.task) { 316 if (bdi->wb.task) {
310 trace_writeback_wake_thread(bdi); 317 trace_writeback_wake_thread(bdi);
311 wake_up_process(bdi->wb.task); 318 wake_up_process(bdi->wb.task);
312 } else { 319 } else {
313 /* 320 /*
 314 * When bdi tasks are inactive for a long time, they are killed. 321 * When bdi tasks are inactive for a long time, they are killed.
315 * In this case we have to wake-up the forker thread which 322 * In this case we have to wake-up the forker thread which
316 * should create and run the bdi thread. 323 * should create and run the bdi thread.
317 */ 324 */
318 trace_writeback_wake_forker_thread(bdi); 325 trace_writeback_wake_forker_thread(bdi);
319 wake_up_process(default_backing_dev_info.wb.task); 326 wake_up_process(default_backing_dev_info.wb.task);
320 } 327 }
321 spin_unlock_bh(&bdi->wb_lock); 328 spin_unlock_bh(&bdi->wb_lock);
322 } 329 }
323 330
324 /* 331 /*
325 * This function is used when the first inode for this bdi is marked dirty. It 332 * This function is used when the first inode for this bdi is marked dirty. It
326 * wakes-up the corresponding bdi thread which should then take care of the 333 * wakes-up the corresponding bdi thread which should then take care of the
327 * periodic background write-out of dirty inodes. Since the write-out would 334 * periodic background write-out of dirty inodes. Since the write-out would
 328 * start only 'dirty_writeback_interval' centisecs from now anyway, we just 335 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
329 * set up a timer which wakes the bdi thread up later. 336 * set up a timer which wakes the bdi thread up later.
330 * 337 *
331 * Note, we wouldn't bother setting up the timer, but this function is on the 338 * Note, we wouldn't bother setting up the timer, but this function is on the
 332 * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches 339 * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
333 * by delaying the wake-up. 340 * by delaying the wake-up.
334 */ 341 */
335 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) 342 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
336 { 343 {
337 unsigned long timeout; 344 unsigned long timeout;
338 345
339 timeout = msecs_to_jiffies(dirty_writeback_interval * 10); 346 timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
340 mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); 347 mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
341 } 348 }
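The comment above describes the fast path; for context, the dirtying code in fs/fs-writeback.c defers the flusher wakeup along these lines (a condensed sketch, not the literal call site):

	/* tail of __mark_inode_dirty(): first dirty inode on this bdi */
	if (wakeup_bdi)
		bdi_wakeup_thread_delayed(bdi);	/* arm wb.wakeup_timer, don't wake now */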
342 349
343 /* 350 /*
344 * Calculate the longest interval (jiffies) bdi threads are allowed to be 351 * Calculate the longest interval (jiffies) bdi threads are allowed to be
345 * inactive. 352 * inactive.
346 */ 353 */
347 static unsigned long bdi_longest_inactive(void) 354 static unsigned long bdi_longest_inactive(void)
348 { 355 {
349 unsigned long interval; 356 unsigned long interval;
350 357
351 interval = msecs_to_jiffies(dirty_writeback_interval * 10); 358 interval = msecs_to_jiffies(dirty_writeback_interval * 10);
352 return max(5UL * 60 * HZ, interval); 359 return max(5UL * 60 * HZ, interval);
353 } 360 }
354 361
355 static int bdi_forker_thread(void *ptr) 362 static int bdi_forker_thread(void *ptr)
356 { 363 {
357 struct bdi_writeback *me = ptr; 364 struct bdi_writeback *me = ptr;
358 365
359 current->flags |= PF_SWAPWRITE; 366 current->flags |= PF_SWAPWRITE;
360 set_freezable(); 367 set_freezable();
361 368
362 /* 369 /*
363 * Our parent may run at a different priority, just set us to normal 370 * Our parent may run at a different priority, just set us to normal
364 */ 371 */
365 set_user_nice(current, 0); 372 set_user_nice(current, 0);
366 373
367 for (;;) { 374 for (;;) {
368 struct task_struct *task = NULL; 375 struct task_struct *task = NULL;
369 struct backing_dev_info *bdi; 376 struct backing_dev_info *bdi;
370 enum { 377 enum {
371 NO_ACTION, /* Nothing to do */ 378 NO_ACTION, /* Nothing to do */
372 FORK_THREAD, /* Fork bdi thread */ 379 FORK_THREAD, /* Fork bdi thread */
373 KILL_THREAD, /* Kill inactive bdi thread */ 380 KILL_THREAD, /* Kill inactive bdi thread */
374 } action = NO_ACTION; 381 } action = NO_ACTION;
375 382
376 /* 383 /*
377 * Temporary measure, we want to make sure we don't see 384 * Temporary measure, we want to make sure we don't see
378 * dirty data on the default backing_dev_info 385 * dirty data on the default backing_dev_info
379 */ 386 */
380 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { 387 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
381 del_timer(&me->wakeup_timer); 388 del_timer(&me->wakeup_timer);
382 wb_do_writeback(me, 0); 389 wb_do_writeback(me, 0);
383 } 390 }
384 391
385 spin_lock_bh(&bdi_lock); 392 spin_lock_bh(&bdi_lock);
386 set_current_state(TASK_INTERRUPTIBLE); 393 set_current_state(TASK_INTERRUPTIBLE);
387 394
388 list_for_each_entry(bdi, &bdi_list, bdi_list) { 395 list_for_each_entry(bdi, &bdi_list, bdi_list) {
389 bool have_dirty_io; 396 bool have_dirty_io;
390 397
391 if (!bdi_cap_writeback_dirty(bdi) || 398 if (!bdi_cap_writeback_dirty(bdi) ||
392 bdi_cap_flush_forker(bdi)) 399 bdi_cap_flush_forker(bdi))
393 continue; 400 continue;
394 401
395 WARN(!test_bit(BDI_registered, &bdi->state), 402 WARN(!test_bit(BDI_registered, &bdi->state),
396 "bdi %p/%s is not registered!\n", bdi, bdi->name); 403 "bdi %p/%s is not registered!\n", bdi, bdi->name);
397 404
398 have_dirty_io = !list_empty(&bdi->work_list) || 405 have_dirty_io = !list_empty(&bdi->work_list) ||
399 wb_has_dirty_io(&bdi->wb); 406 wb_has_dirty_io(&bdi->wb);
400 407
401 /* 408 /*
402 * If the bdi has work to do, but the thread does not 409 * If the bdi has work to do, but the thread does not
403 * exist - create it. 410 * exist - create it.
404 */ 411 */
405 if (!bdi->wb.task && have_dirty_io) { 412 if (!bdi->wb.task && have_dirty_io) {
406 /* 413 /*
407 * Set the pending bit - if someone will try to 414 * Set the pending bit - if someone will try to
408 * unregister this bdi - it'll wait on this bit. 415 * unregister this bdi - it'll wait on this bit.
409 */ 416 */
410 set_bit(BDI_pending, &bdi->state); 417 set_bit(BDI_pending, &bdi->state);
411 action = FORK_THREAD; 418 action = FORK_THREAD;
412 break; 419 break;
413 } 420 }
414 421
415 spin_lock(&bdi->wb_lock); 422 spin_lock(&bdi->wb_lock);
416 423
417 /* 424 /*
418 * If there is no work to do and the bdi thread was 425 * If there is no work to do and the bdi thread was
419 * inactive long enough - kill it. The wb_lock is taken 426 * inactive long enough - kill it. The wb_lock is taken
420 * to make sure no-one adds more work to this bdi and 427 * to make sure no-one adds more work to this bdi and
421 * wakes the bdi thread up. 428 * wakes the bdi thread up.
422 */ 429 */
423 if (bdi->wb.task && !have_dirty_io && 430 if (bdi->wb.task && !have_dirty_io &&
424 time_after(jiffies, bdi->wb.last_active + 431 time_after(jiffies, bdi->wb.last_active +
425 bdi_longest_inactive())) { 432 bdi_longest_inactive())) {
426 task = bdi->wb.task; 433 task = bdi->wb.task;
427 bdi->wb.task = NULL; 434 bdi->wb.task = NULL;
428 spin_unlock(&bdi->wb_lock); 435 spin_unlock(&bdi->wb_lock);
429 set_bit(BDI_pending, &bdi->state); 436 set_bit(BDI_pending, &bdi->state);
430 action = KILL_THREAD; 437 action = KILL_THREAD;
431 break; 438 break;
432 } 439 }
433 spin_unlock(&bdi->wb_lock); 440 spin_unlock(&bdi->wb_lock);
434 } 441 }
435 spin_unlock_bh(&bdi_lock); 442 spin_unlock_bh(&bdi_lock);
436 443
437 /* Keep working if default bdi still has things to do */ 444 /* Keep working if default bdi still has things to do */
438 if (!list_empty(&me->bdi->work_list)) 445 if (!list_empty(&me->bdi->work_list))
439 __set_current_state(TASK_RUNNING); 446 __set_current_state(TASK_RUNNING);
440 447
441 switch (action) { 448 switch (action) {
442 case FORK_THREAD: 449 case FORK_THREAD:
443 __set_current_state(TASK_RUNNING); 450 __set_current_state(TASK_RUNNING);
444 task = kthread_create(bdi_writeback_thread, &bdi->wb, 451 task = kthread_create(bdi_writeback_thread, &bdi->wb,
445 "flush-%s", dev_name(bdi->dev)); 452 "flush-%s", dev_name(bdi->dev));
446 if (IS_ERR(task)) { 453 if (IS_ERR(task)) {
447 /* 454 /*
448 * If thread creation fails, force writeout of 455 * If thread creation fails, force writeout of
449 * the bdi from the thread. 456 * the bdi from the thread. Hopefully 1024 is
457 * large enough for efficient IO.
450 */ 458 */
451 bdi_flush_io(bdi); 459 writeback_inodes_wb(&bdi->wb, 1024);
452 } else { 460 } else {
453 /* 461 /*
454 * The spinlock makes sure we do not lose 462 * The spinlock makes sure we do not lose
455 * wake-ups when racing with 'bdi_queue_work()'. 463 * wake-ups when racing with 'bdi_queue_work()'.
456 * And as soon as the bdi thread is visible, we 464 * And as soon as the bdi thread is visible, we
457 * can start it. 465 * can start it.
458 */ 466 */
459 spin_lock_bh(&bdi->wb_lock); 467 spin_lock_bh(&bdi->wb_lock);
460 bdi->wb.task = task; 468 bdi->wb.task = task;
461 spin_unlock_bh(&bdi->wb_lock); 469 spin_unlock_bh(&bdi->wb_lock);
462 wake_up_process(task); 470 wake_up_process(task);
463 } 471 }
464 break; 472 break;
465 473
466 case KILL_THREAD: 474 case KILL_THREAD:
467 __set_current_state(TASK_RUNNING); 475 __set_current_state(TASK_RUNNING);
468 kthread_stop(task); 476 kthread_stop(task);
469 break; 477 break;
470 478
471 case NO_ACTION: 479 case NO_ACTION:
472 if (!wb_has_dirty_io(me) || !dirty_writeback_interval) 480 if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
473 /* 481 /*
474 * There are no dirty data. The only thing we 482 * There are no dirty data. The only thing we
475 * should now care about is checking for 483 * should now care about is checking for
476 * inactive bdi threads and killing them. Thus, 484 * inactive bdi threads and killing them. Thus,
477 * let's sleep for longer time, save energy and 485 * let's sleep for longer time, save energy and
478 * be friendly for battery-driven devices. 486 * be friendly for battery-driven devices.
479 */ 487 */
480 schedule_timeout(bdi_longest_inactive()); 488 schedule_timeout(bdi_longest_inactive());
481 else 489 else
482 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 490 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
483 try_to_freeze(); 491 try_to_freeze();
484 /* Back to the main loop */ 492 /* Back to the main loop */
485 continue; 493 continue;
486 } 494 }
487 495
488 /* 496 /*
489 * Clear pending bit and wakeup anybody waiting to tear us down. 497 * Clear pending bit and wakeup anybody waiting to tear us down.
490 */ 498 */
491 clear_bit(BDI_pending, &bdi->state); 499 clear_bit(BDI_pending, &bdi->state);
492 smp_mb__after_clear_bit(); 500 smp_mb__after_clear_bit();
493 wake_up_bit(&bdi->state, BDI_pending); 501 wake_up_bit(&bdi->state, BDI_pending);
494 } 502 }
495 503
496 return 0; 504 return 0;
497 } 505 }
498 506
499 /* 507 /*
500 * Remove bdi from bdi_list, and ensure that it is no longer visible 508 * Remove bdi from bdi_list, and ensure that it is no longer visible
501 */ 509 */
502 static void bdi_remove_from_list(struct backing_dev_info *bdi) 510 static void bdi_remove_from_list(struct backing_dev_info *bdi)
503 { 511 {
504 spin_lock_bh(&bdi_lock); 512 spin_lock_bh(&bdi_lock);
505 list_del_rcu(&bdi->bdi_list); 513 list_del_rcu(&bdi->bdi_list);
506 spin_unlock_bh(&bdi_lock); 514 spin_unlock_bh(&bdi_lock);
507 515
508 synchronize_rcu_expedited(); 516 synchronize_rcu_expedited();
509 } 517 }
510 518
511 int bdi_register(struct backing_dev_info *bdi, struct device *parent, 519 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
512 const char *fmt, ...) 520 const char *fmt, ...)
513 { 521 {
514 va_list args; 522 va_list args;
515 struct device *dev; 523 struct device *dev;
516 524
517 if (bdi->dev) /* The driver needs to use separate queues per device */ 525 if (bdi->dev) /* The driver needs to use separate queues per device */
518 return 0; 526 return 0;
519 527
520 va_start(args, fmt); 528 va_start(args, fmt);
521 dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); 529 dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
522 va_end(args); 530 va_end(args);
523 if (IS_ERR(dev)) 531 if (IS_ERR(dev))
524 return PTR_ERR(dev); 532 return PTR_ERR(dev);
525 533
526 bdi->dev = dev; 534 bdi->dev = dev;
527 535
528 /* 536 /*
529 * Just start the forker thread for our default backing_dev_info, 537 * Just start the forker thread for our default backing_dev_info,
530 * and add other bdi's to the list. They will get a thread created 538 * and add other bdi's to the list. They will get a thread created
531 * on-demand when they need it. 539 * on-demand when they need it.
532 */ 540 */
533 if (bdi_cap_flush_forker(bdi)) { 541 if (bdi_cap_flush_forker(bdi)) {
534 struct bdi_writeback *wb = &bdi->wb; 542 struct bdi_writeback *wb = &bdi->wb;
535 543
536 wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", 544 wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
537 dev_name(dev)); 545 dev_name(dev));
538 if (IS_ERR(wb->task)) 546 if (IS_ERR(wb->task))
539 return PTR_ERR(wb->task); 547 return PTR_ERR(wb->task);
540 } 548 }
541 549
542 bdi_debug_register(bdi, dev_name(dev)); 550 bdi_debug_register(bdi, dev_name(dev));
543 set_bit(BDI_registered, &bdi->state); 551 set_bit(BDI_registered, &bdi->state);
544 552
545 spin_lock_bh(&bdi_lock); 553 spin_lock_bh(&bdi_lock);
546 list_add_tail_rcu(&bdi->bdi_list, &bdi_list); 554 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
547 spin_unlock_bh(&bdi_lock); 555 spin_unlock_bh(&bdi_lock);
548 556
549 trace_writeback_bdi_register(bdi); 557 trace_writeback_bdi_register(bdi);
550 return 0; 558 return 0;
551 } 559 }
552 EXPORT_SYMBOL(bdi_register); 560 EXPORT_SYMBOL(bdi_register);
553 561
554 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) 562 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
555 { 563 {
556 return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); 564 return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
557 } 565 }
558 EXPORT_SYMBOL(bdi_register_dev); 566 EXPORT_SYMBOL(bdi_register_dev);
559 567
560 /* 568 /*
561 * Remove bdi from the global list and shutdown any threads we have running 569 * Remove bdi from the global list and shutdown any threads we have running
562 */ 570 */
563 static void bdi_wb_shutdown(struct backing_dev_info *bdi) 571 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
564 { 572 {
565 if (!bdi_cap_writeback_dirty(bdi)) 573 if (!bdi_cap_writeback_dirty(bdi))
566 return; 574 return;
567 575
568 /* 576 /*
569 * Make sure nobody finds us on the bdi_list anymore 577 * Make sure nobody finds us on the bdi_list anymore
570 */ 578 */
571 bdi_remove_from_list(bdi); 579 bdi_remove_from_list(bdi);
572 580
573 /* 581 /*
574 * If setup is pending, wait for that to complete first 582 * If setup is pending, wait for that to complete first
575 */ 583 */
576 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, 584 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
577 TASK_UNINTERRUPTIBLE); 585 TASK_UNINTERRUPTIBLE);
578 586
579 /* 587 /*
580 * Finally, kill the kernel thread. We don't need to be RCU 588 * Finally, kill the kernel thread. We don't need to be RCU
581 * safe anymore, since the bdi is gone from visibility. Force 589 * safe anymore, since the bdi is gone from visibility. Force
582 * unfreeze of the thread before calling kthread_stop(), otherwise 590 * unfreeze of the thread before calling kthread_stop(), otherwise
 583 * it would never exit if it is currently stuck in the refrigerator. 591 * it would never exit if it is currently stuck in the refrigerator.
584 */ 592 */
585 if (bdi->wb.task) { 593 if (bdi->wb.task) {
586 thaw_process(bdi->wb.task); 594 thaw_process(bdi->wb.task);
587 kthread_stop(bdi->wb.task); 595 kthread_stop(bdi->wb.task);
588 } 596 }
589 } 597 }
590 598
591 /* 599 /*
592 * This bdi is going away now, make sure that no super_blocks point to it 600 * This bdi is going away now, make sure that no super_blocks point to it
593 */ 601 */
594 static void bdi_prune_sb(struct backing_dev_info *bdi) 602 static void bdi_prune_sb(struct backing_dev_info *bdi)
595 { 603 {
596 struct super_block *sb; 604 struct super_block *sb;
597 605
598 spin_lock(&sb_lock); 606 spin_lock(&sb_lock);
599 list_for_each_entry(sb, &super_blocks, s_list) { 607 list_for_each_entry(sb, &super_blocks, s_list) {
600 if (sb->s_bdi == bdi) 608 if (sb->s_bdi == bdi)
601 sb->s_bdi = &default_backing_dev_info; 609 sb->s_bdi = &default_backing_dev_info;
602 } 610 }
603 spin_unlock(&sb_lock); 611 spin_unlock(&sb_lock);
604 } 612 }
605 613
606 void bdi_unregister(struct backing_dev_info *bdi) 614 void bdi_unregister(struct backing_dev_info *bdi)
607 { 615 {
608 if (bdi->dev) { 616 if (bdi->dev) {
609 bdi_set_min_ratio(bdi, 0); 617 bdi_set_min_ratio(bdi, 0);
610 trace_writeback_bdi_unregister(bdi); 618 trace_writeback_bdi_unregister(bdi);
611 bdi_prune_sb(bdi); 619 bdi_prune_sb(bdi);
612 del_timer_sync(&bdi->wb.wakeup_timer); 620 del_timer_sync(&bdi->wb.wakeup_timer);
613 621
614 if (!bdi_cap_flush_forker(bdi)) 622 if (!bdi_cap_flush_forker(bdi))
615 bdi_wb_shutdown(bdi); 623 bdi_wb_shutdown(bdi);
616 bdi_debug_unregister(bdi); 624 bdi_debug_unregister(bdi);
617 device_unregister(bdi->dev); 625 device_unregister(bdi->dev);
618 bdi->dev = NULL; 626 bdi->dev = NULL;
619 } 627 }
620 } 628 }
621 EXPORT_SYMBOL(bdi_unregister); 629 EXPORT_SYMBOL(bdi_unregister);
622 630
623 static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) 631 static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
624 { 632 {
625 memset(wb, 0, sizeof(*wb)); 633 memset(wb, 0, sizeof(*wb));
626 634
627 wb->bdi = bdi; 635 wb->bdi = bdi;
628 wb->last_old_flush = jiffies; 636 wb->last_old_flush = jiffies;
629 INIT_LIST_HEAD(&wb->b_dirty); 637 INIT_LIST_HEAD(&wb->b_dirty);
630 INIT_LIST_HEAD(&wb->b_io); 638 INIT_LIST_HEAD(&wb->b_io);
631 INIT_LIST_HEAD(&wb->b_more_io); 639 INIT_LIST_HEAD(&wb->b_more_io);
640 spin_lock_init(&wb->list_lock);
632 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 641 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
633 } 642 }
634 643
644 /*
645 * Initial write bandwidth: 100 MB/s
646 */
647 #define INIT_BW (100 << (20 - PAGE_SHIFT))
648
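As a quick check of the comment: with 4 KiB pages PAGE_SHIFT is 12, so INIT_BW = 100 << (20 - 12) = 100 * 256 = 25600 pages/s, and 25600 pages * 4 kB = 102400 kB/s, i.e. the promised 100 MB/s. The bandwidth fields initialised below are therefore kept in pages per second, which is why the debugfs code prints K(bdi->write_bandwidth) as kBps.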
635 int bdi_init(struct backing_dev_info *bdi) 649 int bdi_init(struct backing_dev_info *bdi)
636 { 650 {
637 int i, err; 651 int i, err;
638 652
639 bdi->dev = NULL; 653 bdi->dev = NULL;
640 654
641 bdi->min_ratio = 0; 655 bdi->min_ratio = 0;
642 bdi->max_ratio = 100; 656 bdi->max_ratio = 100;
643 bdi->max_prop_frac = PROP_FRAC_BASE; 657 bdi->max_prop_frac = PROP_FRAC_BASE;
644 spin_lock_init(&bdi->wb_lock); 658 spin_lock_init(&bdi->wb_lock);
645 INIT_LIST_HEAD(&bdi->bdi_list); 659 INIT_LIST_HEAD(&bdi->bdi_list);
646 INIT_LIST_HEAD(&bdi->work_list); 660 INIT_LIST_HEAD(&bdi->work_list);
647 661
648 bdi_wb_init(&bdi->wb, bdi); 662 bdi_wb_init(&bdi->wb, bdi);
649 663
650 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 664 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
651 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 665 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
652 if (err) 666 if (err)
653 goto err; 667 goto err;
654 } 668 }
655 669
656 bdi->dirty_exceeded = 0; 670 bdi->dirty_exceeded = 0;
671
672 bdi->bw_time_stamp = jiffies;
673 bdi->written_stamp = 0;
674
675 bdi->write_bandwidth = INIT_BW;
676 bdi->avg_write_bandwidth = INIT_BW;
677
657 err = prop_local_init_percpu(&bdi->completions); 678 err = prop_local_init_percpu(&bdi->completions);
658 679
659 if (err) { 680 if (err) {
660 err: 681 err:
661 while (i--) 682 while (i--)
662 percpu_counter_destroy(&bdi->bdi_stat[i]); 683 percpu_counter_destroy(&bdi->bdi_stat[i]);
663 } 684 }
664 685
665 return err; 686 return err;
666 } 687 }
667 EXPORT_SYMBOL(bdi_init); 688 EXPORT_SYMBOL(bdi_init);
668 689
669 void bdi_destroy(struct backing_dev_info *bdi) 690 void bdi_destroy(struct backing_dev_info *bdi)
670 { 691 {
671 int i; 692 int i;
672 693
673 /* 694 /*
674 * Splice our entries to the default_backing_dev_info, if this 695 * Splice our entries to the default_backing_dev_info, if this
675 * bdi disappears 696 * bdi disappears
676 */ 697 */
677 if (bdi_has_dirty_io(bdi)) { 698 if (bdi_has_dirty_io(bdi)) {
678 struct bdi_writeback *dst = &default_backing_dev_info.wb; 699 struct bdi_writeback *dst = &default_backing_dev_info.wb;
679 700
680 spin_lock(&inode_wb_list_lock); 701 bdi_lock_two(&bdi->wb, dst);
681 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 702 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
682 list_splice(&bdi->wb.b_io, &dst->b_io); 703 list_splice(&bdi->wb.b_io, &dst->b_io);
683 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 704 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
684 spin_unlock(&inode_wb_list_lock); 705 spin_unlock(&bdi->wb.list_lock);
706 spin_unlock(&dst->list_lock);
685 } 707 }
686 708
687 bdi_unregister(bdi); 709 bdi_unregister(bdi);
688 710
689 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 711 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
690 percpu_counter_destroy(&bdi->bdi_stat[i]); 712 percpu_counter_destroy(&bdi->bdi_stat[i]);
691 713
692 prop_local_destroy_percpu(&bdi->completions); 714 prop_local_destroy_percpu(&bdi->completions);
693 } 715 }
694 EXPORT_SYMBOL(bdi_destroy); 716 EXPORT_SYMBOL(bdi_destroy);
695 717
696 /* 718 /*
697 * For use from filesystems to quickly init and register a bdi associated 719 * For use from filesystems to quickly init and register a bdi associated
698 * with dirty writeback 720 * with dirty writeback
699 */ 721 */
700 int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, 722 int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
701 unsigned int cap) 723 unsigned int cap)
702 { 724 {
703 char tmp[32]; 725 char tmp[32];
704 int err; 726 int err;
705 727
706 bdi->name = name; 728 bdi->name = name;
707 bdi->capabilities = cap; 729 bdi->capabilities = cap;
708 err = bdi_init(bdi); 730 err = bdi_init(bdi);
709 if (err) 731 if (err)
710 return err; 732 return err;
711 733
712 sprintf(tmp, "%.28s%s", name, "-%d"); 734 sprintf(tmp, "%.28s%s", name, "-%d");
713 err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); 735 err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
714 if (err) { 736 if (err) {
715 bdi_destroy(bdi); 737 bdi_destroy(bdi);
716 return err; 738 return err;
717 } 739 }
718 740
719 return 0; 741 return 0;
720 } 742 }
721 EXPORT_SYMBOL(bdi_setup_and_register); 743 EXPORT_SYMBOL(bdi_setup_and_register);
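A rough sketch of how a filesystem might use this helper at mount time; examplefs_fill_super() and the sb_info layout are made-up names, but the call sequence follows the comment above:

/* sketch: per-mount bdi setup (examplefs_* names are hypothetical) */
struct examplefs_sb_info {
	struct backing_dev_info bdi;
	/* other per-sb state */
};

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct examplefs_sb_info *sbi = sb->s_fs_info;
	int err;

	err = bdi_setup_and_register(&sbi->bdi, "examplefs", BDI_CAP_MAP_COPY);
	if (err)
		return err;
	sb->s_bdi = &sbi->bdi;		/* dirty inodes now queue on this bdi */
	return 0;
}

The matching bdi_destroy() would then run from the filesystem's unmount path once the superblock is torn down.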
722 744
723 static wait_queue_head_t congestion_wqh[2] = { 745 static wait_queue_head_t congestion_wqh[2] = {
724 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 746 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
725 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 747 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
726 }; 748 };
727 static atomic_t nr_bdi_congested[2]; 749 static atomic_t nr_bdi_congested[2];
728 750
729 void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 751 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
730 { 752 {
731 enum bdi_state bit; 753 enum bdi_state bit;
732 wait_queue_head_t *wqh = &congestion_wqh[sync]; 754 wait_queue_head_t *wqh = &congestion_wqh[sync];
733 755
734 bit = sync ? BDI_sync_congested : BDI_async_congested; 756 bit = sync ? BDI_sync_congested : BDI_async_congested;
735 if (test_and_clear_bit(bit, &bdi->state)) 757 if (test_and_clear_bit(bit, &bdi->state))
736 atomic_dec(&nr_bdi_congested[sync]); 758 atomic_dec(&nr_bdi_congested[sync]);
737 smp_mb__after_clear_bit(); 759 smp_mb__after_clear_bit();
738 if (waitqueue_active(wqh)) 760 if (waitqueue_active(wqh))
739 wake_up(wqh); 761 wake_up(wqh);
740 } 762 }
741 EXPORT_SYMBOL(clear_bdi_congested); 763 EXPORT_SYMBOL(clear_bdi_congested);
742 764
743 void set_bdi_congested(struct backing_dev_info *bdi, int sync) 765 void set_bdi_congested(struct backing_dev_info *bdi, int sync)
744 { 766 {
745 enum bdi_state bit; 767 enum bdi_state bit;
746 768
747 bit = sync ? BDI_sync_congested : BDI_async_congested; 769 bit = sync ? BDI_sync_congested : BDI_async_congested;
748 if (!test_and_set_bit(bit, &bdi->state)) 770 if (!test_and_set_bit(bit, &bdi->state))
749 atomic_inc(&nr_bdi_congested[sync]); 771 atomic_inc(&nr_bdi_congested[sync]);
750 } 772 }
751 EXPORT_SYMBOL(set_bdi_congested); 773 EXPORT_SYMBOL(set_bdi_congested);
752 774
753 /** 775 /**
754 * congestion_wait - wait for a backing_dev to become uncongested 776 * congestion_wait - wait for a backing_dev to become uncongested
755 * @sync: SYNC or ASYNC IO 777 * @sync: SYNC or ASYNC IO
756 * @timeout: timeout in jiffies 778 * @timeout: timeout in jiffies
757 * 779 *
758 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit 780 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
759 * write congestion. If no backing_devs are congested then just wait for the 781 * write congestion. If no backing_devs are congested then just wait for the
760 * next write to be completed. 782 * next write to be completed.
761 */ 783 */
762 long congestion_wait(int sync, long timeout) 784 long congestion_wait(int sync, long timeout)
763 { 785 {
764 long ret; 786 long ret;
765 unsigned long start = jiffies; 787 unsigned long start = jiffies;
766 DEFINE_WAIT(wait); 788 DEFINE_WAIT(wait);
767 wait_queue_head_t *wqh = &congestion_wqh[sync]; 789 wait_queue_head_t *wqh = &congestion_wqh[sync];
768 790
769 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 791 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
770 ret = io_schedule_timeout(timeout); 792 ret = io_schedule_timeout(timeout);
771 finish_wait(wqh, &wait); 793 finish_wait(wqh, &wait);
772 794
773 trace_writeback_congestion_wait(jiffies_to_usecs(timeout), 795 trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
774 jiffies_to_usecs(jiffies - start)); 796 jiffies_to_usecs(jiffies - start));
775 797
776 return ret; 798 return ret;
777 } 799 }
778 EXPORT_SYMBOL(congestion_wait); 800 EXPORT_SYMBOL(congestion_wait);
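Typical users are reclaim-style paths that want to back off while some backing device is congested; a minimal sketch (the condition is hypothetical, the call itself is the common idiom):

	/* sketch: back off for up to HZ/10 jiffies (~100 ms at HZ=1000) */
	if (too_many_dirty_pages)
		congestion_wait(BLK_RW_ASYNC, HZ / 10);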
779 801
780 /** 802 /**
781 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes 803 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
782 * @zone: A zone to check if it is heavily congested 804 * @zone: A zone to check if it is heavily congested
783 * @sync: SYNC or ASYNC IO 805 * @sync: SYNC or ASYNC IO
784 * @timeout: timeout in jiffies 806 * @timeout: timeout in jiffies
785 * 807 *
786 * In the event of a congested backing_dev (any backing_dev) and the given 808 * In the event of a congested backing_dev (any backing_dev) and the given
787 * @zone has experienced recent congestion, this waits for up to @timeout 809 * @zone has experienced recent congestion, this waits for up to @timeout
788 * jiffies for either a BDI to exit congestion of the given @sync queue 810 * jiffies for either a BDI to exit congestion of the given @sync queue
789 * or a write to complete. 811 * or a write to complete.
790 * 812 *
791 * In the absence of zone congestion, cond_resched() is called to yield 813 * In the absence of zone congestion, cond_resched() is called to yield
792 * the processor if necessary but otherwise does not sleep. 814 * the processor if necessary but otherwise does not sleep.
793 * 815 *
794 * The return value is 0 if the sleep is for the full timeout. Otherwise, 816 * The return value is 0 if the sleep is for the full timeout. Otherwise,
795 * it is the number of jiffies that were still remaining when the function 817 * it is the number of jiffies that were still remaining when the function
796 * returned. return_value == timeout implies the function did not sleep. 818 * returned. return_value == timeout implies the function did not sleep.
797 */ 819 */
798 long wait_iff_congested(struct zone *zone, int sync, long timeout) 820 long wait_iff_congested(struct zone *zone, int sync, long timeout)
799 { 821 {
800 long ret; 822 long ret;
801 unsigned long start = jiffies; 823 unsigned long start = jiffies;
802 DEFINE_WAIT(wait); 824 DEFINE_WAIT(wait);
803 wait_queue_head_t *wqh = &congestion_wqh[sync]; 825 wait_queue_head_t *wqh = &congestion_wqh[sync];
804 826
805 /* 827 /*
806 * If there is no congestion, or heavy congestion is not being 828 * If there is no congestion, or heavy congestion is not being
807 * encountered in the current zone, yield if necessary instead 829 * encountered in the current zone, yield if necessary instead
808 * of sleeping on the congestion queue 830 * of sleeping on the congestion queue
809 */ 831 */
810 if (atomic_read(&nr_bdi_congested[sync]) == 0 || 832 if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
811 !zone_is_reclaim_congested(zone)) { 833 !zone_is_reclaim_congested(zone)) {
812 cond_resched(); 834 cond_resched();
813 835
814 /* In case we scheduled, work out time remaining */ 836 /* In case we scheduled, work out time remaining */
815 ret = timeout - (jiffies - start); 837 ret = timeout - (jiffies - start);
816 if (ret < 0) 838 if (ret < 0)
817 ret = 0; 839 ret = 0;
818 840
819 goto out; 841 goto out;
820 } 842 }
821 843
822 /* Sleep until uncongested or a write happens */ 844 /* Sleep until uncongested or a write happens */
1 /* 1 /*
2 * linux/mm/filemap.c 2 * linux/mm/filemap.c
3 * 3 *
4 * Copyright (C) 1994-1999 Linus Torvalds 4 * Copyright (C) 1994-1999 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * This file handles the generic file mmap semantics used by 8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this: 9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example) 10 * the NFS filesystem used to do this differently, for example)
11 */ 11 */
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/compiler.h> 13 #include <linux/compiler.h>
14 #include <linux/fs.h> 14 #include <linux/fs.h>
15 #include <linux/uaccess.h> 15 #include <linux/uaccess.h>
16 #include <linux/aio.h> 16 #include <linux/aio.h>
17 #include <linux/capability.h> 17 #include <linux/capability.h>
18 #include <linux/kernel_stat.h> 18 #include <linux/kernel_stat.h>
19 #include <linux/gfp.h> 19 #include <linux/gfp.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/swap.h> 21 #include <linux/swap.h>
22 #include <linux/mman.h> 22 #include <linux/mman.h>
23 #include <linux/pagemap.h> 23 #include <linux/pagemap.h>
24 #include <linux/file.h> 24 #include <linux/file.h>
25 #include <linux/uio.h> 25 #include <linux/uio.h>
26 #include <linux/hash.h> 26 #include <linux/hash.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/backing-dev.h> 28 #include <linux/backing-dev.h>
29 #include <linux/pagevec.h> 29 #include <linux/pagevec.h>
30 #include <linux/blkdev.h> 30 #include <linux/blkdev.h>
31 #include <linux/security.h> 31 #include <linux/security.h>
32 #include <linux/syscalls.h> 32 #include <linux/syscalls.h>
33 #include <linux/cpuset.h> 33 #include <linux/cpuset.h>
34 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35 #include <linux/memcontrol.h> 35 #include <linux/memcontrol.h>
36 #include <linux/mm_inline.h> /* for page_is_file_cache() */ 36 #include <linux/mm_inline.h> /* for page_is_file_cache() */
37 #include <linux/cleancache.h> 37 #include <linux/cleancache.h>
38 #include "internal.h" 38 #include "internal.h"
39 39
40 /* 40 /*
41 * FIXME: remove all knowledge of the buffer layer from the core VM 41 * FIXME: remove all knowledge of the buffer layer from the core VM
42 */ 42 */
43 #include <linux/buffer_head.h> /* for try_to_free_buffers */ 43 #include <linux/buffer_head.h> /* for try_to_free_buffers */
44 44
45 #include <asm/mman.h> 45 #include <asm/mman.h>
46 46
47 /* 47 /*
48 * Shared mappings implemented 30.11.1994. It's not fully working yet, 48 * Shared mappings implemented 30.11.1994. It's not fully working yet,
49 * though. 49 * though.
50 * 50 *
51 * Shared mappings now work. 15.8.1995 Bruno. 51 * Shared mappings now work. 15.8.1995 Bruno.
52 * 52 *
53 * finished 'unifying' the page and buffer cache and SMP-threaded the 53 * finished 'unifying' the page and buffer cache and SMP-threaded the
54 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> 54 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
55 * 55 *
56 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> 56 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
57 */ 57 */
58 58
59 /* 59 /*
60 * Lock ordering: 60 * Lock ordering:
61 * 61 *
62 * ->i_mmap_mutex (truncate_pagecache) 62 * ->i_mmap_mutex (truncate_pagecache)
63 * ->private_lock (__free_pte->__set_page_dirty_buffers) 63 * ->private_lock (__free_pte->__set_page_dirty_buffers)
64 * ->swap_lock (exclusive_swap_page, others) 64 * ->swap_lock (exclusive_swap_page, others)
65 * ->mapping->tree_lock 65 * ->mapping->tree_lock
66 * 66 *
67 * ->i_mutex 67 * ->i_mutex
68 * ->i_mmap_mutex (truncate->unmap_mapping_range) 68 * ->i_mmap_mutex (truncate->unmap_mapping_range)
69 * 69 *
70 * ->mmap_sem 70 * ->mmap_sem
71 * ->i_mmap_mutex 71 * ->i_mmap_mutex
72 * ->page_table_lock or pte_lock (various, mainly in memory.c) 72 * ->page_table_lock or pte_lock (various, mainly in memory.c)
73 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 73 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
74 * 74 *
75 * ->mmap_sem 75 * ->mmap_sem
76 * ->lock_page (access_process_vm) 76 * ->lock_page (access_process_vm)
77 * 77 *
78 * ->i_mutex (generic_file_buffered_write) 78 * ->i_mutex (generic_file_buffered_write)
79 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 79 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 * 80 *
81 * inode_wb_list_lock 81 * bdi->wb.list_lock
82 * sb_lock (fs/fs-writeback.c) 82 * sb_lock (fs/fs-writeback.c)
83 * ->mapping->tree_lock (__sync_single_inode) 83 * ->mapping->tree_lock (__sync_single_inode)
84 * 84 *
85 * ->i_mmap_mutex 85 * ->i_mmap_mutex
86 * ->anon_vma.lock (vma_adjust) 86 * ->anon_vma.lock (vma_adjust)
87 * 87 *
88 * ->anon_vma.lock 88 * ->anon_vma.lock
89 * ->page_table_lock or pte_lock (anon_vma_prepare and various) 89 * ->page_table_lock or pte_lock (anon_vma_prepare and various)
90 * 90 *
91 * ->page_table_lock or pte_lock 91 * ->page_table_lock or pte_lock
92 * ->swap_lock (try_to_unmap_one) 92 * ->swap_lock (try_to_unmap_one)
93 * ->private_lock (try_to_unmap_one) 93 * ->private_lock (try_to_unmap_one)
94 * ->tree_lock (try_to_unmap_one) 94 * ->tree_lock (try_to_unmap_one)
95 * ->zone.lru_lock (follow_page->mark_page_accessed) 95 * ->zone.lru_lock (follow_page->mark_page_accessed)
96 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 96 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
97 * ->private_lock (page_remove_rmap->set_page_dirty) 97 * ->private_lock (page_remove_rmap->set_page_dirty)
98 * ->tree_lock (page_remove_rmap->set_page_dirty) 98 * ->tree_lock (page_remove_rmap->set_page_dirty)
99 * inode_wb_list_lock (page_remove_rmap->set_page_dirty) 99 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
100 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 100 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
101 * inode_wb_list_lock (zap_pte_range->set_page_dirty) 101 * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
102 * ->inode->i_lock (zap_pte_range->set_page_dirty) 102 * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 * 104 *
105 * (code doesn't rely on that order, so you could switch it around) 105 * (code doesn't rely on that order, so you could switch it around)
106 * ->tasklist_lock (memory_failure, collect_procs_ao) 106 * ->tasklist_lock (memory_failure, collect_procs_ao)
107 * ->i_mmap_mutex 107 * ->i_mmap_mutex
108 */ 108 */
109 109
110 /* 110 /*
111 * Delete a page from the page cache and free it. Caller has to make 111 * Delete a page from the page cache and free it. Caller has to make
112 * sure the page is locked and that nobody else uses it - or that usage 112 * sure the page is locked and that nobody else uses it - or that usage
113 * is safe. The caller must hold the mapping's tree_lock. 113 * is safe. The caller must hold the mapping's tree_lock.
114 */ 114 */
115 void __delete_from_page_cache(struct page *page) 115 void __delete_from_page_cache(struct page *page)
116 { 116 {
117 struct address_space *mapping = page->mapping; 117 struct address_space *mapping = page->mapping;
118 118
119 /* 119 /*
120 * if we're uptodate, flush out into the cleancache, otherwise 120 * if we're uptodate, flush out into the cleancache, otherwise
121 * invalidate any existing cleancache entries. We can't leave 121 * invalidate any existing cleancache entries. We can't leave
122 * stale data around in the cleancache once our page is gone 122 * stale data around in the cleancache once our page is gone
123 */ 123 */
124 if (PageUptodate(page) && PageMappedToDisk(page)) 124 if (PageUptodate(page) && PageMappedToDisk(page))
125 cleancache_put_page(page); 125 cleancache_put_page(page);
126 else 126 else
127 cleancache_flush_page(mapping, page); 127 cleancache_flush_page(mapping, page);
128 128
129 radix_tree_delete(&mapping->page_tree, page->index); 129 radix_tree_delete(&mapping->page_tree, page->index);
130 page->mapping = NULL; 130 page->mapping = NULL;
131 /* Leave page->index set: truncation lookup relies upon it */ 131 /* Leave page->index set: truncation lookup relies upon it */
132 mapping->nrpages--; 132 mapping->nrpages--;
133 __dec_zone_page_state(page, NR_FILE_PAGES); 133 __dec_zone_page_state(page, NR_FILE_PAGES);
134 if (PageSwapBacked(page)) 134 if (PageSwapBacked(page))
135 __dec_zone_page_state(page, NR_SHMEM); 135 __dec_zone_page_state(page, NR_SHMEM);
136 BUG_ON(page_mapped(page)); 136 BUG_ON(page_mapped(page));
137 137
138 /* 138 /*
139 * Some filesystems seem to re-dirty the page even after 139 * Some filesystems seem to re-dirty the page even after
140 * the VM has canceled the dirty bit (eg ext3 journaling). 140 * the VM has canceled the dirty bit (eg ext3 journaling).
141 * 141 *
142 * Fix it up by doing a final dirty accounting check after 142 * Fix it up by doing a final dirty accounting check after
143 * having removed the page entirely. 143 * having removed the page entirely.
144 */ 144 */
145 if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { 145 if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
146 dec_zone_page_state(page, NR_FILE_DIRTY); 146 dec_zone_page_state(page, NR_FILE_DIRTY);
147 dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 147 dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
148 } 148 }
149 } 149 }
150 150
151 /** 151 /**
152 * delete_from_page_cache - delete page from page cache 152 * delete_from_page_cache - delete page from page cache
153 * @page: the page which the kernel is trying to remove from page cache 153 * @page: the page which the kernel is trying to remove from page cache
154 * 154 *
155 * This must be called only on pages that have been verified to be in the page 155 * This must be called only on pages that have been verified to be in the page
156 * cache and locked. It will never put the page into the free list, the caller 156 * cache and locked. It will never put the page into the free list, the caller
157 * has a reference on the page. 157 * has a reference on the page.
158 */ 158 */
159 void delete_from_page_cache(struct page *page) 159 void delete_from_page_cache(struct page *page)
160 { 160 {
161 struct address_space *mapping = page->mapping; 161 struct address_space *mapping = page->mapping;
162 void (*freepage)(struct page *); 162 void (*freepage)(struct page *);
163 163
164 BUG_ON(!PageLocked(page)); 164 BUG_ON(!PageLocked(page));
165 165
166 freepage = mapping->a_ops->freepage; 166 freepage = mapping->a_ops->freepage;
167 spin_lock_irq(&mapping->tree_lock); 167 spin_lock_irq(&mapping->tree_lock);
168 __delete_from_page_cache(page); 168 __delete_from_page_cache(page);
169 spin_unlock_irq(&mapping->tree_lock); 169 spin_unlock_irq(&mapping->tree_lock);
170 mem_cgroup_uncharge_cache_page(page); 170 mem_cgroup_uncharge_cache_page(page);
171 171
172 if (freepage) 172 if (freepage)
173 freepage(page); 173 freepage(page);
174 page_cache_release(page); 174 page_cache_release(page);
175 } 175 }
176 EXPORT_SYMBOL(delete_from_page_cache); 176 EXPORT_SYMBOL(delete_from_page_cache);
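As the kernel-doc notes, the caller must hold the page lock and its own reference; a minimal sketch of the expected calling pattern (the lookup and the mapping re-check are simplified):

	/* sketch: drop one cached page the caller looked up and locked */
	page = find_get_page(mapping, index);	/* takes our own reference */
	if (page) {
		lock_page(page);
		if (page->mapping == mapping)	/* not truncated meanwhile */
			delete_from_page_cache(page);
		unlock_page(page);
		page_cache_release(page);	/* drop the find_get_page() ref */
	}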
177 177
178 static int sleep_on_page(void *word) 178 static int sleep_on_page(void *word)
179 { 179 {
180 io_schedule(); 180 io_schedule();
181 return 0; 181 return 0;
182 } 182 }
183 183
184 static int sleep_on_page_killable(void *word) 184 static int sleep_on_page_killable(void *word)
185 { 185 {
186 sleep_on_page(word); 186 sleep_on_page(word);
187 return fatal_signal_pending(current) ? -EINTR : 0; 187 return fatal_signal_pending(current) ? -EINTR : 0;
188 } 188 }
189 189
190 /** 190 /**
191 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range 191 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
192 * @mapping: address space structure to write 192 * @mapping: address space structure to write
193 * @start: offset in bytes where the range starts 193 * @start: offset in bytes where the range starts
194 * @end: offset in bytes where the range ends (inclusive) 194 * @end: offset in bytes where the range ends (inclusive)
195 * @sync_mode: enable synchronous operation 195 * @sync_mode: enable synchronous operation
196 * 196 *
197 * Start writeback against all of a mapping's dirty pages that lie 197 * Start writeback against all of a mapping's dirty pages that lie
198 * within the byte offsets <start, end> inclusive. 198 * within the byte offsets <start, end> inclusive.
199 * 199 *
200 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 200 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
201 * opposed to a regular memory cleansing writeback. The difference between 201 * opposed to a regular memory cleansing writeback. The difference between
202 * these two operations is that if a dirty page/buffer is encountered, it must 202 * these two operations is that if a dirty page/buffer is encountered, it must
203 * be waited upon, and not just skipped over. 203 * be waited upon, and not just skipped over.
204 */ 204 */
205 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 205 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
206 loff_t end, int sync_mode) 206 loff_t end, int sync_mode)
207 { 207 {
208 int ret; 208 int ret;
209 struct writeback_control wbc = { 209 struct writeback_control wbc = {
210 .sync_mode = sync_mode, 210 .sync_mode = sync_mode,
211 .nr_to_write = LONG_MAX, 211 .nr_to_write = LONG_MAX,
212 .range_start = start, 212 .range_start = start,
213 .range_end = end, 213 .range_end = end,
214 }; 214 };
215 215
216 if (!mapping_cap_writeback_dirty(mapping)) 216 if (!mapping_cap_writeback_dirty(mapping))
217 return 0; 217 return 0;
218 218
219 ret = do_writepages(mapping, &wbc); 219 ret = do_writepages(mapping, &wbc);
220 return ret; 220 return ret;
221 } 221 }
222 222
223 static inline int __filemap_fdatawrite(struct address_space *mapping, 223 static inline int __filemap_fdatawrite(struct address_space *mapping,
224 int sync_mode) 224 int sync_mode)
225 { 225 {
226 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); 226 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
227 } 227 }
228 228
229 int filemap_fdatawrite(struct address_space *mapping) 229 int filemap_fdatawrite(struct address_space *mapping)
230 { 230 {
231 return __filemap_fdatawrite(mapping, WB_SYNC_ALL); 231 return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
232 } 232 }
233 EXPORT_SYMBOL(filemap_fdatawrite); 233 EXPORT_SYMBOL(filemap_fdatawrite);
234 234
235 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 235 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
236 loff_t end) 236 loff_t end)
237 { 237 {
238 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 238 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
239 } 239 }
240 EXPORT_SYMBOL(filemap_fdatawrite_range); 240 EXPORT_SYMBOL(filemap_fdatawrite_range);
241 241
242 /** 242 /**
243 * filemap_flush - mostly a non-blocking flush 243 * filemap_flush - mostly a non-blocking flush
244 * @mapping: target address_space 244 * @mapping: target address_space
245 * 245 *
246 * This is a mostly non-blocking flush. Not suitable for data-integrity 246 * This is a mostly non-blocking flush. Not suitable for data-integrity
247 * purposes - I/O may not be started against all dirty pages. 247 * purposes - I/O may not be started against all dirty pages.
248 */ 248 */
249 int filemap_flush(struct address_space *mapping) 249 int filemap_flush(struct address_space *mapping)
250 { 250 {
251 return __filemap_fdatawrite(mapping, WB_SYNC_NONE); 251 return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
252 } 252 }
253 EXPORT_SYMBOL(filemap_flush); 253 EXPORT_SYMBOL(filemap_flush);
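/*
 * Editor's usage sketch (hypothetical helper, not part of this file): the
 * two write-out flavours side by side.  filemap_fdatawrite() uses
 * WB_SYNC_ALL (data integrity: pages already under writeback are waited on,
 * not skipped), filemap_flush() uses WB_SYNC_NONE (best-effort cleaning).
 * Return values are ignored here for brevity.
 */
static void example_start_writeback(struct address_space *mapping,
				    bool for_integrity)
{
	if (for_integrity)
		filemap_fdatawrite(mapping);	/* still needs a wait later */
	else
		filemap_flush(mapping);
}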
254 254
255 /** 255 /**
256 * filemap_fdatawait_range - wait for writeback to complete 256 * filemap_fdatawait_range - wait for writeback to complete
257 * @mapping: address space structure to wait for 257 * @mapping: address space structure to wait for
258 * @start_byte: offset in bytes where the range starts 258 * @start_byte: offset in bytes where the range starts
259 * @end_byte: offset in bytes where the range ends (inclusive) 259 * @end_byte: offset in bytes where the range ends (inclusive)
260 * 260 *
261 * Walk the list of under-writeback pages of the given address space 261 * Walk the list of under-writeback pages of the given address space
262 * in the given range and wait for all of them. 262 * in the given range and wait for all of them.
263 */ 263 */
264 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, 264 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
265 loff_t end_byte) 265 loff_t end_byte)
266 { 266 {
267 pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; 267 pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
268 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; 268 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
269 struct pagevec pvec; 269 struct pagevec pvec;
270 int nr_pages; 270 int nr_pages;
271 int ret = 0; 271 int ret = 0;
272 272
273 if (end_byte < start_byte) 273 if (end_byte < start_byte)
274 return 0; 274 return 0;
275 275
276 pagevec_init(&pvec, 0); 276 pagevec_init(&pvec, 0);
277 while ((index <= end) && 277 while ((index <= end) &&
278 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 278 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
279 PAGECACHE_TAG_WRITEBACK, 279 PAGECACHE_TAG_WRITEBACK,
280 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 280 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
281 unsigned i; 281 unsigned i;
282 282
283 for (i = 0; i < nr_pages; i++) { 283 for (i = 0; i < nr_pages; i++) {
284 struct page *page = pvec.pages[i]; 284 struct page *page = pvec.pages[i];
285 285
286 /* until radix tree lookup accepts end_index */ 286 /* until radix tree lookup accepts end_index */
287 if (page->index > end) 287 if (page->index > end)
288 continue; 288 continue;
289 289
290 wait_on_page_writeback(page); 290 wait_on_page_writeback(page);
291 if (TestClearPageError(page)) 291 if (TestClearPageError(page))
292 ret = -EIO; 292 ret = -EIO;
293 } 293 }
294 pagevec_release(&pvec); 294 pagevec_release(&pvec);
295 cond_resched(); 295 cond_resched();
296 } 296 }
297 297
298 /* Check for outstanding write errors */ 298 /* Check for outstanding write errors */
299 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 299 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
300 ret = -ENOSPC; 300 ret = -ENOSPC;
301 if (test_and_clear_bit(AS_EIO, &mapping->flags)) 301 if (test_and_clear_bit(AS_EIO, &mapping->flags))
302 ret = -EIO; 302 ret = -EIO;
303 303
304 return ret; 304 return ret;
305 } 305 }
306 EXPORT_SYMBOL(filemap_fdatawait_range); 306 EXPORT_SYMBOL(filemap_fdatawait_range);
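/*
 * Editor's sketch (hypothetical helper, not part of this file): flush then
 * wait on one byte range - roughly what filemap_write_and_wait_range()
 * below bundles into a single call.
 */
static int example_sync_byte_range(struct address_space *mapping,
				   loff_t pos, loff_t count)
{
	int err;

	err = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (err)
		return err;
	return filemap_fdatawait_range(mapping, pos, pos + count - 1);
}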
307 307
308 /** 308 /**
309 * filemap_fdatawait - wait for all under-writeback pages to complete 309 * filemap_fdatawait - wait for all under-writeback pages to complete
310 * @mapping: address space structure to wait for 310 * @mapping: address space structure to wait for
311 * 311 *
312 * Walk the list of under-writeback pages of the given address space 312 * Walk the list of under-writeback pages of the given address space
313 * and wait for all of them. 313 * and wait for all of them.
314 */ 314 */
315 int filemap_fdatawait(struct address_space *mapping) 315 int filemap_fdatawait(struct address_space *mapping)
316 { 316 {
317 loff_t i_size = i_size_read(mapping->host); 317 loff_t i_size = i_size_read(mapping->host);
318 318
319 if (i_size == 0) 319 if (i_size == 0)
320 return 0; 320 return 0;
321 321
322 return filemap_fdatawait_range(mapping, 0, i_size - 1); 322 return filemap_fdatawait_range(mapping, 0, i_size - 1);
323 } 323 }
324 EXPORT_SYMBOL(filemap_fdatawait); 324 EXPORT_SYMBOL(filemap_fdatawait);
325 325
326 int filemap_write_and_wait(struct address_space *mapping) 326 int filemap_write_and_wait(struct address_space *mapping)
327 { 327 {
328 int err = 0; 328 int err = 0;
329 329
330 if (mapping->nrpages) { 330 if (mapping->nrpages) {
331 err = filemap_fdatawrite(mapping); 331 err = filemap_fdatawrite(mapping);
332 /* 332 /*
333 * Even if the above returned error, the pages may be 333 * Even if the above returned error, the pages may be
334 * written partially (e.g. -ENOSPC), so we wait for it. 334 * written partially (e.g. -ENOSPC), so we wait for it.
335 * But -EIO is a special case; it may indicate that the worst 335 * But -EIO is a special case; it may indicate that the worst
336 * thing (e.g. a bug) happened, so we avoid waiting for it. 336 * thing (e.g. a bug) happened, so we avoid waiting for it.
337 */ 337 */
338 if (err != -EIO) { 338 if (err != -EIO) {
339 int err2 = filemap_fdatawait(mapping); 339 int err2 = filemap_fdatawait(mapping);
340 if (!err) 340 if (!err)
341 err = err2; 341 err = err2;
342 } 342 }
343 } 343 }
344 return err; 344 return err;
345 } 345 }
346 EXPORT_SYMBOL(filemap_write_and_wait); 346 EXPORT_SYMBOL(filemap_write_and_wait);
347 347
348 /** 348 /**
349 * filemap_write_and_wait_range - write out & wait on a file range 349 * filemap_write_and_wait_range - write out & wait on a file range
350 * @mapping: the address_space for the pages 350 * @mapping: the address_space for the pages
351 * @lstart: offset in bytes where the range starts 351 * @lstart: offset in bytes where the range starts
352 * @lend: offset in bytes where the range ends (inclusive) 352 * @lend: offset in bytes where the range ends (inclusive)
353 * 353 *
354 * Write out and wait upon file offsets lstart->lend, inclusive. 354 * Write out and wait upon file offsets lstart->lend, inclusive.
355 * 355 *
356 * Note that `lend' is inclusive (describes the last byte to be written) so 356 * Note that `lend' is inclusive (describes the last byte to be written) so
357 * that this function can be used to write to the very end-of-file (end = -1). 357 * that this function can be used to write to the very end-of-file (end = -1).
358 */ 358 */
359 int filemap_write_and_wait_range(struct address_space *mapping, 359 int filemap_write_and_wait_range(struct address_space *mapping,
360 loff_t lstart, loff_t lend) 360 loff_t lstart, loff_t lend)
361 { 361 {
362 int err = 0; 362 int err = 0;
363 363
364 if (mapping->nrpages) { 364 if (mapping->nrpages) {
365 err = __filemap_fdatawrite_range(mapping, lstart, lend, 365 err = __filemap_fdatawrite_range(mapping, lstart, lend,
366 WB_SYNC_ALL); 366 WB_SYNC_ALL);
367 /* See comment of filemap_write_and_wait() */ 367 /* See comment of filemap_write_and_wait() */
368 if (err != -EIO) { 368 if (err != -EIO) {
369 int err2 = filemap_fdatawait_range(mapping, 369 int err2 = filemap_fdatawait_range(mapping,
370 lstart, lend); 370 lstart, lend);
371 if (!err) 371 if (!err)
372 err = err2; 372 err = err2;
373 } 373 }
374 } 374 }
375 return err; 375 return err;
376 } 376 }
377 EXPORT_SYMBOL(filemap_write_and_wait_range); 377 EXPORT_SYMBOL(filemap_write_and_wait_range);
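/*
 * Editor's sketch of a typical caller (hypothetical ->fsync-style helper,
 * not part of this file): push out and wait on exactly the range the caller
 * asked about; a real filesystem would flush metadata/journal afterwards.
 */
static int example_fsync_range(struct file *file, loff_t start, loff_t end)
{
	return filemap_write_and_wait_range(file->f_mapping, start, end);
}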
378 378
379 /** 379 /**
380 * replace_page_cache_page - replace a pagecache page with a new one 380 * replace_page_cache_page - replace a pagecache page with a new one
381 * @old: page to be replaced 381 * @old: page to be replaced
382 * @new: page to replace with 382 * @new: page to replace with
383 * @gfp_mask: allocation mode 383 * @gfp_mask: allocation mode
384 * 384 *
385 * This function replaces a page in the pagecache with a new one. On 385 * This function replaces a page in the pagecache with a new one. On
386 * success it acquires the pagecache reference for the new page and 386 * success it acquires the pagecache reference for the new page and
387 * drops it for the old page. Both the old and new pages must be 387 * drops it for the old page. Both the old and new pages must be
388 * locked. This function does not add the new page to the LRU, the 388 * locked. This function does not add the new page to the LRU, the
389 * caller must do that. 389 * caller must do that.
390 * 390 *
391 * The remove + add is atomic. The only way this function can fail is 391 * The remove + add is atomic. The only way this function can fail is
392 * memory allocation failure. 392 * memory allocation failure.
393 */ 393 */
394 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 394 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
395 { 395 {
396 int error; 396 int error;
397 struct mem_cgroup *memcg = NULL; 397 struct mem_cgroup *memcg = NULL;
398 398
399 VM_BUG_ON(!PageLocked(old)); 399 VM_BUG_ON(!PageLocked(old));
400 VM_BUG_ON(!PageLocked(new)); 400 VM_BUG_ON(!PageLocked(new));
401 VM_BUG_ON(new->mapping); 401 VM_BUG_ON(new->mapping);
402 402
403 /* 403 /*
404 * This is not page migration, but prepare_migration and 404 * This is not page migration, but prepare_migration and
405 * end_migration do enough work for charge replacement. 405 * end_migration do enough work for charge replacement.
406 * 406 *
407 * In the longer term we probably want a specialized function 407 * In the longer term we probably want a specialized function
408 * for moving the charge from old to new in a more efficient 408 * for moving the charge from old to new in a more efficient
409 * manner. 409 * manner.
410 */ 410 */
411 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); 411 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
412 if (error) 412 if (error)
413 return error; 413 return error;
414 414
415 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 415 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
416 if (!error) { 416 if (!error) {
417 struct address_space *mapping = old->mapping; 417 struct address_space *mapping = old->mapping;
418 void (*freepage)(struct page *); 418 void (*freepage)(struct page *);
419 419
420 pgoff_t offset = old->index; 420 pgoff_t offset = old->index;
421 freepage = mapping->a_ops->freepage; 421 freepage = mapping->a_ops->freepage;
422 422
423 page_cache_get(new); 423 page_cache_get(new);
424 new->mapping = mapping; 424 new->mapping = mapping;
425 new->index = offset; 425 new->index = offset;
426 426
427 spin_lock_irq(&mapping->tree_lock); 427 spin_lock_irq(&mapping->tree_lock);
428 __delete_from_page_cache(old); 428 __delete_from_page_cache(old);
429 error = radix_tree_insert(&mapping->page_tree, offset, new); 429 error = radix_tree_insert(&mapping->page_tree, offset, new);
430 BUG_ON(error); 430 BUG_ON(error);
431 mapping->nrpages++; 431 mapping->nrpages++;
432 __inc_zone_page_state(new, NR_FILE_PAGES); 432 __inc_zone_page_state(new, NR_FILE_PAGES);
433 if (PageSwapBacked(new)) 433 if (PageSwapBacked(new))
434 __inc_zone_page_state(new, NR_SHMEM); 434 __inc_zone_page_state(new, NR_SHMEM);
435 spin_unlock_irq(&mapping->tree_lock); 435 spin_unlock_irq(&mapping->tree_lock);
436 radix_tree_preload_end(); 436 radix_tree_preload_end();
437 if (freepage) 437 if (freepage)
438 freepage(old); 438 freepage(old);
439 page_cache_release(old); 439 page_cache_release(old);
440 mem_cgroup_end_migration(memcg, old, new, true); 440 mem_cgroup_end_migration(memcg, old, new, true);
441 } else { 441 } else {
442 mem_cgroup_end_migration(memcg, old, new, false); 442 mem_cgroup_end_migration(memcg, old, new, false);
443 } 443 }
444 444
445 return error; 445 return error;
446 } 446 }
447 EXPORT_SYMBOL_GPL(replace_page_cache_page); 447 EXPORT_SYMBOL_GPL(replace_page_cache_page);
448 448
449 /** 449 /**
450 * add_to_page_cache_locked - add a locked page to the pagecache 450 * add_to_page_cache_locked - add a locked page to the pagecache
451 * @page: page to add 451 * @page: page to add
452 * @mapping: the page's address_space 452 * @mapping: the page's address_space
453 * @offset: page index 453 * @offset: page index
454 * @gfp_mask: page allocation mode 454 * @gfp_mask: page allocation mode
455 * 455 *
456 * This function is used to add a page to the pagecache. It must be locked. 456 * This function is used to add a page to the pagecache. It must be locked.
457 * This function does not add the page to the LRU. The caller must do that. 457 * This function does not add the page to the LRU. The caller must do that.
458 */ 458 */
459 int add_to_page_cache_locked(struct page *page, struct address_space *mapping, 459 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
460 pgoff_t offset, gfp_t gfp_mask) 460 pgoff_t offset, gfp_t gfp_mask)
461 { 461 {
462 int error; 462 int error;
463 463
464 VM_BUG_ON(!PageLocked(page)); 464 VM_BUG_ON(!PageLocked(page));
465 465
466 error = mem_cgroup_cache_charge(page, current->mm, 466 error = mem_cgroup_cache_charge(page, current->mm,
467 gfp_mask & GFP_RECLAIM_MASK); 467 gfp_mask & GFP_RECLAIM_MASK);
468 if (error) 468 if (error)
469 goto out; 469 goto out;
470 470
471 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 471 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
472 if (error == 0) { 472 if (error == 0) {
473 page_cache_get(page); 473 page_cache_get(page);
474 page->mapping = mapping; 474 page->mapping = mapping;
475 page->index = offset; 475 page->index = offset;
476 476
477 spin_lock_irq(&mapping->tree_lock); 477 spin_lock_irq(&mapping->tree_lock);
478 error = radix_tree_insert(&mapping->page_tree, offset, page); 478 error = radix_tree_insert(&mapping->page_tree, offset, page);
479 if (likely(!error)) { 479 if (likely(!error)) {
480 mapping->nrpages++; 480 mapping->nrpages++;
481 __inc_zone_page_state(page, NR_FILE_PAGES); 481 __inc_zone_page_state(page, NR_FILE_PAGES);
482 if (PageSwapBacked(page)) 482 if (PageSwapBacked(page))
483 __inc_zone_page_state(page, NR_SHMEM); 483 __inc_zone_page_state(page, NR_SHMEM);
484 spin_unlock_irq(&mapping->tree_lock); 484 spin_unlock_irq(&mapping->tree_lock);
485 } else { 485 } else {
486 page->mapping = NULL; 486 page->mapping = NULL;
487 /* Leave page->index set: truncation relies upon it */ 487 /* Leave page->index set: truncation relies upon it */
488 spin_unlock_irq(&mapping->tree_lock); 488 spin_unlock_irq(&mapping->tree_lock);
489 mem_cgroup_uncharge_cache_page(page); 489 mem_cgroup_uncharge_cache_page(page);
490 page_cache_release(page); 490 page_cache_release(page);
491 } 491 }
492 radix_tree_preload_end(); 492 radix_tree_preload_end();
493 } else 493 } else
494 mem_cgroup_uncharge_cache_page(page); 494 mem_cgroup_uncharge_cache_page(page);
495 out: 495 out:
496 return error; 496 return error;
497 } 497 }
498 EXPORT_SYMBOL(add_to_page_cache_locked); 498 EXPORT_SYMBOL(add_to_page_cache_locked);
499 499
500 int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 500 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
501 pgoff_t offset, gfp_t gfp_mask) 501 pgoff_t offset, gfp_t gfp_mask)
502 { 502 {
503 int ret; 503 int ret;
504 504
505 /* 505 /*
506 * Splice_read and readahead add shmem/tmpfs pages into the page cache 506 * Splice_read and readahead add shmem/tmpfs pages into the page cache
507 * before shmem_readpage has a chance to mark them as SwapBacked: they 507 * before shmem_readpage has a chance to mark them as SwapBacked: they
508 * need to go on the anon lru below, and mem_cgroup_cache_charge 508 * need to go on the anon lru below, and mem_cgroup_cache_charge
509 * (called in add_to_page_cache) needs to know where they're going too. 509 * (called in add_to_page_cache) needs to know where they're going too.
510 */ 510 */
511 if (mapping_cap_swap_backed(mapping)) 511 if (mapping_cap_swap_backed(mapping))
512 SetPageSwapBacked(page); 512 SetPageSwapBacked(page);
513 513
514 ret = add_to_page_cache(page, mapping, offset, gfp_mask); 514 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
515 if (ret == 0) { 515 if (ret == 0) {
516 if (page_is_file_cache(page)) 516 if (page_is_file_cache(page))
517 lru_cache_add_file(page); 517 lru_cache_add_file(page);
518 else 518 else
519 lru_cache_add_anon(page); 519 lru_cache_add_anon(page);
520 } 520 }
521 return ret; 521 return ret;
522 } 522 }
523 EXPORT_SYMBOL_GPL(add_to_page_cache_lru); 523 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
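/*
 * Editor's sketch (hypothetical helper, not part of this file): add a fresh
 * page at @index the way readahead does.  On success the page is returned
 * locked (add_to_page_cache() locked it) and already on the LRU; the caller
 * fills it, marks it uptodate and unlocks it.  GFP_KERNEL is assumed here;
 * real callers usually derive the mask from mapping_gfp_mask().
 */
static struct page *example_add_new_page(struct address_space *mapping,
					 pgoff_t index)
{
	struct page *page = page_cache_alloc_cold(mapping);

	if (!page)
		return NULL;

	if (add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
		/* -EEXIST: somebody else added it first; otherwise OOM etc. */
		page_cache_release(page);
		return NULL;
	}
	return page;
}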
524 524
525 #ifdef CONFIG_NUMA 525 #ifdef CONFIG_NUMA
526 struct page *__page_cache_alloc(gfp_t gfp) 526 struct page *__page_cache_alloc(gfp_t gfp)
527 { 527 {
528 int n; 528 int n;
529 struct page *page; 529 struct page *page;
530 530
531 if (cpuset_do_page_mem_spread()) { 531 if (cpuset_do_page_mem_spread()) {
532 get_mems_allowed(); 532 get_mems_allowed();
533 n = cpuset_mem_spread_node(); 533 n = cpuset_mem_spread_node();
534 page = alloc_pages_exact_node(n, gfp, 0); 534 page = alloc_pages_exact_node(n, gfp, 0);
535 put_mems_allowed(); 535 put_mems_allowed();
536 return page; 536 return page;
537 } 537 }
538 return alloc_pages(gfp, 0); 538 return alloc_pages(gfp, 0);
539 } 539 }
540 EXPORT_SYMBOL(__page_cache_alloc); 540 EXPORT_SYMBOL(__page_cache_alloc);
541 #endif 541 #endif
542 542
543 /* 543 /*
544 * In order to wait for pages to become available there must be 544 * In order to wait for pages to become available there must be
545 * waitqueues associated with pages. By using a hash table of 545 * waitqueues associated with pages. By using a hash table of
546 * waitqueues where the bucket discipline is to maintain all 546 * waitqueues where the bucket discipline is to maintain all
547 * waiters on the same queue and wake all when any of the pages 547 * waiters on the same queue and wake all when any of the pages
548 * become available, and for the woken contexts to check to be 548 * become available, and for the woken contexts to check to be
549 * sure the appropriate page became available, this saves space 549 * sure the appropriate page became available, this saves space
550 * at a cost of "thundering herd" phenomena during rare hash 550 * at a cost of "thundering herd" phenomena during rare hash
551 * collisions. 551 * collisions.
552 */ 552 */
553 static wait_queue_head_t *page_waitqueue(struct page *page) 553 static wait_queue_head_t *page_waitqueue(struct page *page)
554 { 554 {
555 const struct zone *zone = page_zone(page); 555 const struct zone *zone = page_zone(page);
556 556
557 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; 557 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
558 } 558 }
559 559
560 static inline void wake_up_page(struct page *page, int bit) 560 static inline void wake_up_page(struct page *page, int bit)
561 { 561 {
562 __wake_up_bit(page_waitqueue(page), &page->flags, bit); 562 __wake_up_bit(page_waitqueue(page), &page->flags, bit);
563 } 563 }
564 564
565 void wait_on_page_bit(struct page *page, int bit_nr) 565 void wait_on_page_bit(struct page *page, int bit_nr)
566 { 566 {
567 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 567 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
568 568
569 if (test_bit(bit_nr, &page->flags)) 569 if (test_bit(bit_nr, &page->flags))
570 __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, 570 __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
571 TASK_UNINTERRUPTIBLE); 571 TASK_UNINTERRUPTIBLE);
572 } 572 }
573 EXPORT_SYMBOL(wait_on_page_bit); 573 EXPORT_SYMBOL(wait_on_page_bit);
574 574
575 int wait_on_page_bit_killable(struct page *page, int bit_nr) 575 int wait_on_page_bit_killable(struct page *page, int bit_nr)
576 { 576 {
577 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 577 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
578 578
579 if (!test_bit(bit_nr, &page->flags)) 579 if (!test_bit(bit_nr, &page->flags))
580 return 0; 580 return 0;
581 581
582 return __wait_on_bit(page_waitqueue(page), &wait, 582 return __wait_on_bit(page_waitqueue(page), &wait,
583 sleep_on_page_killable, TASK_KILLABLE); 583 sleep_on_page_killable, TASK_KILLABLE);
584 } 584 }
585 585
586 /** 586 /**
587 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 587 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
588 * @page: Page defining the wait queue of interest 588 * @page: Page defining the wait queue of interest
589 * @waiter: Waiter to add to the queue 589 * @waiter: Waiter to add to the queue
590 * 590 *
591 * Add an arbitrary @waiter to the wait queue for the nominated @page. 591 * Add an arbitrary @waiter to the wait queue for the nominated @page.
592 */ 592 */
593 void add_page_wait_queue(struct page *page, wait_queue_t *waiter) 593 void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
594 { 594 {
595 wait_queue_head_t *q = page_waitqueue(page); 595 wait_queue_head_t *q = page_waitqueue(page);
596 unsigned long flags; 596 unsigned long flags;
597 597
598 spin_lock_irqsave(&q->lock, flags); 598 spin_lock_irqsave(&q->lock, flags);
599 __add_wait_queue(q, waiter); 599 __add_wait_queue(q, waiter);
600 spin_unlock_irqrestore(&q->lock, flags); 600 spin_unlock_irqrestore(&q->lock, flags);
601 } 601 }
602 EXPORT_SYMBOL_GPL(add_page_wait_queue); 602 EXPORT_SYMBOL_GPL(add_page_wait_queue);
603 603
604 /** 604 /**
605 * unlock_page - unlock a locked page 605 * unlock_page - unlock a locked page
606 * @page: the page 606 * @page: the page
607 * 607 *
608 * Unlocks the page and wakes up sleepers in wait_on_page_locked(). 608 * Unlocks the page and wakes up sleepers in wait_on_page_locked().
609 * Also wakes sleepers in wait_on_page_writeback() because the wakeup 609 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
610 * mechanism between PageLocked pages and PageWriteback pages is shared. 610 * mechanism between PageLocked pages and PageWriteback pages is shared.
611 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 611 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
612 * 612 *
613 * The mb is necessary to enforce ordering between the clear_bit and the read 613 * The mb is necessary to enforce ordering between the clear_bit and the read
614 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). 614 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
615 */ 615 */
616 void unlock_page(struct page *page) 616 void unlock_page(struct page *page)
617 { 617 {
618 VM_BUG_ON(!PageLocked(page)); 618 VM_BUG_ON(!PageLocked(page));
619 clear_bit_unlock(PG_locked, &page->flags); 619 clear_bit_unlock(PG_locked, &page->flags);
620 smp_mb__after_clear_bit(); 620 smp_mb__after_clear_bit();
621 wake_up_page(page, PG_locked); 621 wake_up_page(page, PG_locked);
622 } 622 }
623 EXPORT_SYMBOL(unlock_page); 623 EXPORT_SYMBOL(unlock_page);
624 624
625 /** 625 /**
626 * end_page_writeback - end writeback against a page 626 * end_page_writeback - end writeback against a page
627 * @page: the page 627 * @page: the page
628 */ 628 */
629 void end_page_writeback(struct page *page) 629 void end_page_writeback(struct page *page)
630 { 630 {
631 if (TestClearPageReclaim(page)) 631 if (TestClearPageReclaim(page))
632 rotate_reclaimable_page(page); 632 rotate_reclaimable_page(page);
633 633
634 if (!test_clear_page_writeback(page)) 634 if (!test_clear_page_writeback(page))
635 BUG(); 635 BUG();
636 636
637 smp_mb__after_clear_bit(); 637 smp_mb__after_clear_bit();
638 wake_up_page(page, PG_writeback); 638 wake_up_page(page, PG_writeback);
639 } 639 }
640 EXPORT_SYMBOL(end_page_writeback); 640 EXPORT_SYMBOL(end_page_writeback);
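/*
 * Editor's sketch of the write-completion side (hypothetical, loosely
 * modelled on buffer/bio end_io handlers; not part of this file): record
 * any error where filemap_fdatawait_range() above will find it, then wake
 * waiters in wait_on_page_writeback() via end_page_writeback().
 */
static void example_end_page_write(struct page *page, int uptodate)
{
	if (!uptodate) {
		SetPageError(page);
		if (page->mapping)
			mapping_set_error(page->mapping, -EIO);
	}
	end_page_writeback(page);
}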
641 641
642 /** 642 /**
643 * __lock_page - get a lock on the page, assuming we need to sleep to get it 643 * __lock_page - get a lock on the page, assuming we need to sleep to get it
644 * @page: the page to lock 644 * @page: the page to lock
645 */ 645 */
646 void __lock_page(struct page *page) 646 void __lock_page(struct page *page)
647 { 647 {
648 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 648 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
649 649
650 __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, 650 __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
651 TASK_UNINTERRUPTIBLE); 651 TASK_UNINTERRUPTIBLE);
652 } 652 }
653 EXPORT_SYMBOL(__lock_page); 653 EXPORT_SYMBOL(__lock_page);
654 654
655 int __lock_page_killable(struct page *page) 655 int __lock_page_killable(struct page *page)
656 { 656 {
657 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 657 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
658 658
659 return __wait_on_bit_lock(page_waitqueue(page), &wait, 659 return __wait_on_bit_lock(page_waitqueue(page), &wait,
660 sleep_on_page_killable, TASK_KILLABLE); 660 sleep_on_page_killable, TASK_KILLABLE);
661 } 661 }
662 EXPORT_SYMBOL_GPL(__lock_page_killable); 662 EXPORT_SYMBOL_GPL(__lock_page_killable);
663 663
664 int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 664 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
665 unsigned int flags) 665 unsigned int flags)
666 { 666 {
667 if (flags & FAULT_FLAG_ALLOW_RETRY) { 667 if (flags & FAULT_FLAG_ALLOW_RETRY) {
668 /* 668 /*
669 * CAUTION! In this case, mmap_sem is not released 669 * CAUTION! In this case, mmap_sem is not released
670 * even though we return 0. 670 * even though we return 0.
671 */ 671 */
672 if (flags & FAULT_FLAG_RETRY_NOWAIT) 672 if (flags & FAULT_FLAG_RETRY_NOWAIT)
673 return 0; 673 return 0;
674 674
675 up_read(&mm->mmap_sem); 675 up_read(&mm->mmap_sem);
676 if (flags & FAULT_FLAG_KILLABLE) 676 if (flags & FAULT_FLAG_KILLABLE)
677 wait_on_page_locked_killable(page); 677 wait_on_page_locked_killable(page);
678 else 678 else
679 wait_on_page_locked(page); 679 wait_on_page_locked(page);
680 return 0; 680 return 0;
681 } else { 681 } else {
682 if (flags & FAULT_FLAG_KILLABLE) { 682 if (flags & FAULT_FLAG_KILLABLE) {
683 int ret; 683 int ret;
684 684
685 ret = __lock_page_killable(page); 685 ret = __lock_page_killable(page);
686 if (ret) { 686 if (ret) {
687 up_read(&mm->mmap_sem); 687 up_read(&mm->mmap_sem);
688 return 0; 688 return 0;
689 } 689 }
690 } else 690 } else
691 __lock_page(page); 691 __lock_page(page);
692 return 1; 692 return 1;
693 } 693 }
694 } 694 }
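/*
 * Editor's sketch of the caller's contract (the helper below is
 * hypothetical, cf. the page fault path): a zero return means the page lock
 * was NOT taken and - unless FAULT_FLAG_RETRY_NOWAIT was set - mmap_sem has
 * been dropped, so the fault has to be retried from the top.
 */
static int example_fault_lock(struct page *page, struct mm_struct *mm,
			      unsigned int flags)
{
	if (!__lock_page_or_retry(page, mm, flags)) {
		page_cache_release(page);	/* drop the lookup reference */
		return VM_FAULT_RETRY;
	}
	/* page is locked here; use it, then unlock_page() */
	return 0;
}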
695 695
696 /** 696 /**
697 * find_get_page - find and get a page reference 697 * find_get_page - find and get a page reference
698 * @mapping: the address_space to search 698 * @mapping: the address_space to search
699 * @offset: the page index 699 * @offset: the page index
700 * 700 *
701 * Is there a pagecache struct page at the given (mapping, offset) tuple? 701 * Is there a pagecache struct page at the given (mapping, offset) tuple?
702 * If yes, increment its refcount and return it; if no, return NULL. 702 * If yes, increment its refcount and return it; if no, return NULL.
703 */ 703 */
704 struct page *find_get_page(struct address_space *mapping, pgoff_t offset) 704 struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
705 { 705 {
706 void **pagep; 706 void **pagep;
707 struct page *page; 707 struct page *page;
708 708
709 rcu_read_lock(); 709 rcu_read_lock();
710 repeat: 710 repeat:
711 page = NULL; 711 page = NULL;
712 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); 712 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
713 if (pagep) { 713 if (pagep) {
714 page = radix_tree_deref_slot(pagep); 714 page = radix_tree_deref_slot(pagep);
715 if (unlikely(!page)) 715 if (unlikely(!page))
716 goto out; 716 goto out;
717 if (radix_tree_deref_retry(page)) 717 if (radix_tree_deref_retry(page))
718 goto repeat; 718 goto repeat;
719 719
720 if (!page_cache_get_speculative(page)) 720 if (!page_cache_get_speculative(page))
721 goto repeat; 721 goto repeat;
722 722
723 /* 723 /*
724 * Has the page moved? 724 * Has the page moved?
725 * This is part of the lockless pagecache protocol. See 725 * This is part of the lockless pagecache protocol. See
726 * include/linux/pagemap.h for details. 726 * include/linux/pagemap.h for details.
727 */ 727 */
728 if (unlikely(page != *pagep)) { 728 if (unlikely(page != *pagep)) {
729 page_cache_release(page); 729 page_cache_release(page);
730 goto repeat; 730 goto repeat;
731 } 731 }
732 } 732 }
733 out: 733 out:
734 rcu_read_unlock(); 734 rcu_read_unlock();
735 735
736 return page; 736 return page;
737 } 737 }
738 EXPORT_SYMBOL(find_get_page); 738 EXPORT_SYMBOL(find_get_page);
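/*
 * Editor's sketch (hypothetical helper, not part of this file): probe the
 * cache at @offset.  The reference find_get_page() takes must be dropped
 * with page_cache_release() once we are done looking at the page.
 */
static bool example_page_is_cached_uptodate(struct address_space *mapping,
					    pgoff_t offset)
{
	struct page *page = find_get_page(mapping, offset);
	bool ret = false;

	if (page) {
		ret = PageUptodate(page);
		page_cache_release(page);
	}
	return ret;
}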
739 739
740 /** 740 /**
741 * find_lock_page - locate, pin and lock a pagecache page 741 * find_lock_page - locate, pin and lock a pagecache page
742 * @mapping: the address_space to search 742 * @mapping: the address_space to search
743 * @offset: the page index 743 * @offset: the page index
744 * 744 *
745 * Locates the desired pagecache page, locks it, increments its reference 745 * Locates the desired pagecache page, locks it, increments its reference
746 * count and returns its address. 746 * count and returns its address.
747 * 747 *
748 * Returns NULL if the page was not present. find_lock_page() may sleep. 748 * Returns NULL if the page was not present. find_lock_page() may sleep.
749 */ 749 */
750 struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) 750 struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
751 { 751 {
752 struct page *page; 752 struct page *page;
753 753
754 repeat: 754 repeat:
755 page = find_get_page(mapping, offset); 755 page = find_get_page(mapping, offset);
756 if (page) { 756 if (page) {
757 lock_page(page); 757 lock_page(page);
758 /* Has the page been truncated? */ 758 /* Has the page been truncated? */
759 if (unlikely(page->mapping != mapping)) { 759 if (unlikely(page->mapping != mapping)) {
760 unlock_page(page); 760 unlock_page(page);
761 page_cache_release(page); 761 page_cache_release(page);
762 goto repeat; 762 goto repeat;
763 } 763 }
764 VM_BUG_ON(page->index != offset); 764 VM_BUG_ON(page->index != offset);
765 } 765 }
766 return page; 766 return page;
767 } 767 }
768 EXPORT_SYMBOL(find_lock_page); 768 EXPORT_SYMBOL(find_lock_page);
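/*
 * Editor's sketch (hypothetical helper, not part of this file): inspect a
 * cached page with the lock held.  find_lock_page() has already re-checked
 * page->mapping against truncation, so the caller only needs to unlock and
 * release when done.
 */
static bool example_page_has_private(struct address_space *mapping,
				     pgoff_t offset)
{
	struct page *page = find_lock_page(mapping, offset);
	bool ret = false;

	if (page) {
		ret = PagePrivate(page);
		unlock_page(page);
		page_cache_release(page);
	}
	return ret;
}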
769 769
770 /** 770 /**
771 * find_or_create_page - locate or add a pagecache page 771 * find_or_create_page - locate or add a pagecache page
772 * @mapping: the page's address_space 772 * @mapping: the page's address_space
773 * @index: the page's index into the mapping 773 * @index: the page's index into the mapping
774 * @gfp_mask: page allocation mode 774 * @gfp_mask: page allocation mode
775 * 775 *
776 * Locates a page in the pagecache. If the page is not present, a new page 776 * Locates a page in the pagecache. If the page is not present, a new page
777 * is allocated using @gfp_mask and is added to the pagecache and to the VM's 777 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
778 * LRU list. The returned page is locked and has its reference count 778 * LRU list. The returned page is locked and has its reference count
779 * incremented. 779 * incremented.
780 * 780 *
781 * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic 781 * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
782 * allocation! 782 * allocation!
783 * 783 *
784 * find_or_create_page() returns the desired page's address, or NULL on 784 * find_or_create_page() returns the desired page's address, or NULL on
785 * memory exhaustion. 785 * memory exhaustion.
786 */ 786 */
787 struct page *find_or_create_page(struct address_space *mapping, 787 struct page *find_or_create_page(struct address_space *mapping,
788 pgoff_t index, gfp_t gfp_mask) 788 pgoff_t index, gfp_t gfp_mask)
789 { 789 {
790 struct page *page; 790 struct page *page;
791 int err; 791 int err;
792 repeat: 792 repeat:
793 page = find_lock_page(mapping, index); 793 page = find_lock_page(mapping, index);
794 if (!page) { 794 if (!page) {
795 page = __page_cache_alloc(gfp_mask); 795 page = __page_cache_alloc(gfp_mask);
796 if (!page) 796 if (!page)
797 return NULL; 797 return NULL;
798 /* 798 /*
799 * We want a regular kernel memory (not highmem or DMA etc) 799 * We want a regular kernel memory (not highmem or DMA etc)
800 * allocation for the radix tree nodes, but we need to honour 800 * allocation for the radix tree nodes, but we need to honour
801 * the context-specific requirements the caller has asked for. 801 * the context-specific requirements the caller has asked for.
802 * GFP_RECLAIM_MASK collects those requirements. 802 * GFP_RECLAIM_MASK collects those requirements.
803 */ 803 */
804 err = add_to_page_cache_lru(page, mapping, index, 804 err = add_to_page_cache_lru(page, mapping, index,
805 (gfp_mask & GFP_RECLAIM_MASK)); 805 (gfp_mask & GFP_RECLAIM_MASK));
806 if (unlikely(err)) { 806 if (unlikely(err)) {
807 page_cache_release(page); 807 page_cache_release(page);
808 page = NULL; 808 page = NULL;
809 if (err == -EEXIST) 809 if (err == -EEXIST)
810 goto repeat; 810 goto repeat;
811 } 811 }
812 } 812 }
813 return page; 813 return page;
814 } 814 }
815 EXPORT_SYMBOL(find_or_create_page); 815 EXPORT_SYMBOL(find_or_create_page);
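/*
 * Editor's sketch (hypothetical helper, not part of this file):
 * grab_cache_page() in pagemap.h is essentially this call with
 * mapping_gfp_mask().  The page comes back locked with an elevated refcount
 * whether it was found or newly created.
 */
static int example_modify_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
	if (!page)
		return -ENOMEM;
	/* ... bring the page uptodate and modify it here ... */
	set_page_dirty(page);
	unlock_page(page);
	page_cache_release(page);
	return 0;
}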
816 816
817 /** 817 /**
818 * find_get_pages - gang pagecache lookup 818 * find_get_pages - gang pagecache lookup
819 * @mapping: The address_space to search 819 * @mapping: The address_space to search
820 * @start: The starting page index 820 * @start: The starting page index
821 * @nr_pages: The maximum number of pages 821 * @nr_pages: The maximum number of pages
822 * @pages: Where the resulting pages are placed 822 * @pages: Where the resulting pages are placed
823 * 823 *
824 * find_get_pages() will search for and return a group of up to 824 * find_get_pages() will search for and return a group of up to
825 * @nr_pages pages in the mapping. The pages are placed at @pages. 825 * @nr_pages pages in the mapping. The pages are placed at @pages.
826 * find_get_pages() takes a reference against the returned pages. 826 * find_get_pages() takes a reference against the returned pages.
827 * 827 *
828 * The search returns a group of mapping-contiguous pages with ascending 828 * The search returns a group of mapping-contiguous pages with ascending
829 * indexes. There may be holes in the indices due to not-present pages. 829 * indexes. There may be holes in the indices due to not-present pages.
830 * 830 *
831 * find_get_pages() returns the number of pages which were found. 831 * find_get_pages() returns the number of pages which were found.
832 */ 832 */
833 unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 833 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
834 unsigned int nr_pages, struct page **pages) 834 unsigned int nr_pages, struct page **pages)
835 { 835 {
836 unsigned int i; 836 unsigned int i;
837 unsigned int ret; 837 unsigned int ret;
838 unsigned int nr_found; 838 unsigned int nr_found;
839 839
840 rcu_read_lock(); 840 rcu_read_lock();
841 restart: 841 restart:
842 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 842 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
843 (void ***)pages, start, nr_pages); 843 (void ***)pages, start, nr_pages);
844 ret = 0; 844 ret = 0;
845 for (i = 0; i < nr_found; i++) { 845 for (i = 0; i < nr_found; i++) {
846 struct page *page; 846 struct page *page;
847 repeat: 847 repeat:
848 page = radix_tree_deref_slot((void **)pages[i]); 848 page = radix_tree_deref_slot((void **)pages[i]);
849 if (unlikely(!page)) 849 if (unlikely(!page))
850 continue; 850 continue;
851 851
852 /* 852 /*
853 * This can only trigger when the entry at index 0 moves out 853 * This can only trigger when the entry at index 0 moves out
854 * of or back to the root: none yet gotten, safe to restart. 854 * of or back to the root: none yet gotten, safe to restart.
855 */ 855 */
856 if (radix_tree_deref_retry(page)) { 856 if (radix_tree_deref_retry(page)) {
857 WARN_ON(start | i); 857 WARN_ON(start | i);
858 goto restart; 858 goto restart;
859 } 859 }
860 860
861 if (!page_cache_get_speculative(page)) 861 if (!page_cache_get_speculative(page))
862 goto repeat; 862 goto repeat;
863 863
864 /* Has the page moved? */ 864 /* Has the page moved? */
865 if (unlikely(page != *((void **)pages[i]))) { 865 if (unlikely(page != *((void **)pages[i]))) {
866 page_cache_release(page); 866 page_cache_release(page);
867 goto repeat; 867 goto repeat;
868 } 868 }
869 869
870 pages[ret] = page; 870 pages[ret] = page;
871 ret++; 871 ret++;
872 } 872 }
873 873
874 /* 874 /*
875 * If all entries were removed before we could secure them, 875 * If all entries were removed before we could secure them,
876 * try again, because callers stop trying once 0 is returned. 876 * try again, because callers stop trying once 0 is returned.
877 */ 877 */
878 if (unlikely(!ret && nr_found)) 878 if (unlikely(!ret && nr_found))
879 goto restart; 879 goto restart;
880 rcu_read_unlock(); 880 rcu_read_unlock();
881 return ret; 881 return ret;
882 } 882 }
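/*
 * Editor's sketch (hypothetical, not part of this file): the usual
 * gang-lookup walk that pagevec_lookup() wraps.  Every returned page
 * carries a reference the caller must drop, and the indexes may have
 * holes, so advance past the last page actually returned.
 */
static void example_walk_mapping(struct address_space *mapping)
{
	struct page *pages[16];
	pgoff_t index = 0;
	unsigned int nr, i;

	while ((nr = find_get_pages(mapping, index, 16, pages)) != 0) {
		index = pages[nr - 1]->index + 1;
		for (i = 0; i < nr; i++) {
			/* ... look at pages[i] ... */
			page_cache_release(pages[i]);
		}
	}
}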
883 883
884 /** 884 /**
885 * find_get_pages_contig - gang contiguous pagecache lookup 885 * find_get_pages_contig - gang contiguous pagecache lookup
886 * @mapping: The address_space to search 886 * @mapping: The address_space to search
887 * @index: The starting page index 887 * @index: The starting page index
888 * @nr_pages: The maximum number of pages 888 * @nr_pages: The maximum number of pages
889 * @pages: Where the resulting pages are placed 889 * @pages: Where the resulting pages are placed
890 * 890 *
891 * find_get_pages_contig() works exactly like find_get_pages(), except 891 * find_get_pages_contig() works exactly like find_get_pages(), except
892 * that the returned number of pages are guaranteed to be contiguous. 892 * that the returned number of pages are guaranteed to be contiguous.
893 * 893 *
894 * find_get_pages_contig() returns the number of pages which were found. 894 * find_get_pages_contig() returns the number of pages which were found.
895 */ 895 */
896 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, 896 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
897 unsigned int nr_pages, struct page **pages) 897 unsigned int nr_pages, struct page **pages)
898 { 898 {
899 unsigned int i; 899 unsigned int i;
900 unsigned int ret; 900 unsigned int ret;
901 unsigned int nr_found; 901 unsigned int nr_found;
902 902
903 rcu_read_lock(); 903 rcu_read_lock();
904 restart: 904 restart:
905 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 905 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
906 (void ***)pages, index, nr_pages); 906 (void ***)pages, index, nr_pages);
907 ret = 0; 907 ret = 0;
908 for (i = 0; i < nr_found; i++) { 908 for (i = 0; i < nr_found; i++) {
909 struct page *page; 909 struct page *page;
910 repeat: 910 repeat:
911 page = radix_tree_deref_slot((void **)pages[i]); 911 page = radix_tree_deref_slot((void **)pages[i]);
912 if (unlikely(!page)) 912 if (unlikely(!page))
913 continue; 913 continue;
914 914
915 /* 915 /*
916 * This can only trigger when the entry at index 0 moves out 916 * This can only trigger when the entry at index 0 moves out
917 * of or back to the root: none yet gotten, safe to restart. 917 * of or back to the root: none yet gotten, safe to restart.
918 */ 918 */
919 if (radix_tree_deref_retry(page)) 919 if (radix_tree_deref_retry(page))
920 goto restart; 920 goto restart;
921 921
922 if (!page_cache_get_speculative(page)) 922 if (!page_cache_get_speculative(page))
923 goto repeat; 923 goto repeat;
924 924
925 /* Has the page moved? */ 925 /* Has the page moved? */
926 if (unlikely(page != *((void **)pages[i]))) { 926 if (unlikely(page != *((void **)pages[i]))) {
927 page_cache_release(page); 927 page_cache_release(page);
928 goto repeat; 928 goto repeat;
929 } 929 }
930 930
931 /* 931 /*
932 * must check mapping and index after taking the ref. 932 * must check mapping and index after taking the ref.
933 * otherwise we can get both false positives and false 933 * otherwise we can get both false positives and false
934 * negatives, which is just confusing to the caller. 934 * negatives, which is just confusing to the caller.
935 */ 935 */
936 if (page->mapping == NULL || page->index != index) { 936 if (page->mapping == NULL || page->index != index) {
937 page_cache_release(page); 937 page_cache_release(page);
938 break; 938 break;
939 } 939 }
940 940
941 pages[ret] = page; 941 pages[ret] = page;
942 ret++; 942 ret++;
943 index++; 943 index++;
944 } 944 }
945 rcu_read_unlock(); 945 rcu_read_unlock();
946 return ret; 946 return ret;
947 } 947 }
948 EXPORT_SYMBOL(find_get_pages_contig); 948 EXPORT_SYMBOL(find_get_pages_contig);
949 949
950 /** 950 /**
951 * find_get_pages_tag - find and return pages that match @tag 951 * find_get_pages_tag - find and return pages that match @tag
952 * @mapping: the address_space to search 952 * @mapping: the address_space to search
953 * @index: the starting page index 953 * @index: the starting page index
954 * @tag: the tag index 954 * @tag: the tag index
955 * @nr_pages: the maximum number of pages 955 * @nr_pages: the maximum number of pages
956 * @pages: where the resulting pages are placed 956 * @pages: where the resulting pages are placed
957 * 957 *
958 * Like find_get_pages, except we only return pages which are tagged with 958 * Like find_get_pages, except we only return pages which are tagged with
959 * @tag. We update @index to index the next page for the traversal. 959 * @tag. We update @index to index the next page for the traversal.
960 */ 960 */
961 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 961 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
962 int tag, unsigned int nr_pages, struct page **pages) 962 int tag, unsigned int nr_pages, struct page **pages)
963 { 963 {
964 unsigned int i; 964 unsigned int i;
965 unsigned int ret; 965 unsigned int ret;
966 unsigned int nr_found; 966 unsigned int nr_found;
967 967
968 rcu_read_lock(); 968 rcu_read_lock();
969 restart: 969 restart:
970 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, 970 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
971 (void ***)pages, *index, nr_pages, tag); 971 (void ***)pages, *index, nr_pages, tag);
972 ret = 0; 972 ret = 0;
973 for (i = 0; i < nr_found; i++) { 973 for (i = 0; i < nr_found; i++) {
974 struct page *page; 974 struct page *page;
975 repeat: 975 repeat:
976 page = radix_tree_deref_slot((void **)pages[i]); 976 page = radix_tree_deref_slot((void **)pages[i]);
977 if (unlikely(!page)) 977 if (unlikely(!page))
978 continue; 978 continue;
979 979
980 /* 980 /*
981 * This can only trigger when the entry at index 0 moves out 981 * This can only trigger when the entry at index 0 moves out
982 * of or back to the root: none yet gotten, safe to restart. 982 * of or back to the root: none yet gotten, safe to restart.
983 */ 983 */
984 if (radix_tree_deref_retry(page)) 984 if (radix_tree_deref_retry(page))
985 goto restart; 985 goto restart;
986 986
987 if (!page_cache_get_speculative(page)) 987 if (!page_cache_get_speculative(page))
988 goto repeat; 988 goto repeat;
989 989
990 /* Has the page moved? */ 990 /* Has the page moved? */
991 if (unlikely(page != *((void **)pages[i]))) { 991 if (unlikely(page != *((void **)pages[i]))) {
992 page_cache_release(page); 992 page_cache_release(page);
993 goto repeat; 993 goto repeat;
994 } 994 }
995 995
996 pages[ret] = page; 996 pages[ret] = page;
997 ret++; 997 ret++;
998 } 998 }
999 999
1000 /* 1000 /*
1001 * If all entries were removed before we could secure them, 1001 * If all entries were removed before we could secure them,
1002 * try again, because callers stop trying once 0 is returned. 1002 * try again, because callers stop trying once 0 is returned.
1003 */ 1003 */
1004 if (unlikely(!ret && nr_found)) 1004 if (unlikely(!ret && nr_found))
1005 goto restart; 1005 goto restart;
1006 rcu_read_unlock(); 1006 rcu_read_unlock();
1007 1007
1008 if (ret) 1008 if (ret)
1009 *index = pages[ret - 1]->index + 1; 1009 *index = pages[ret - 1]->index + 1;
1010 1010
1011 return ret; 1011 return ret;
1012 } 1012 }
1013 EXPORT_SYMBOL(find_get_pages_tag); 1013 EXPORT_SYMBOL(find_get_pages_tag);
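/*
 * Editor's sketch (hypothetical helper, not part of this file): walk a
 * mapping by radix-tree tag, here counting dirty pages.  Note that
 * find_get_pages_tag() advances *index itself; pagevec_lookup_tag() (used
 * by filemap_fdatawait_range() above) is the usual wrapper around it.
 */
static unsigned long example_count_dirty_pages(struct address_space *mapping)
{
	struct page *pages[PAGEVEC_SIZE];
	pgoff_t index = 0;
	unsigned long count = 0;
	unsigned int nr, i;

	while ((nr = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
					PAGEVEC_SIZE, pages)) != 0) {
		for (i = 0; i < nr; i++) {
			if (PageDirty(pages[i]))
				count++;
			page_cache_release(pages[i]);
		}
	}
	return count;
}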
1014 1014
1015 /** 1015 /**
1016 * grab_cache_page_nowait - returns locked page at given index in given cache 1016 * grab_cache_page_nowait - returns locked page at given index in given cache
1017 * @mapping: target address_space 1017 * @mapping: target address_space
1018 * @index: the page index 1018 * @index: the page index
1019 * 1019 *
1020 * Same as grab_cache_page(), but do not wait if the page is unavailable. 1020 * Same as grab_cache_page(), but do not wait if the page is unavailable.
1021 * This is intended for speculative data generators, where the data can 1021 * This is intended for speculative data generators, where the data can
1022 * be regenerated if the page couldn't be grabbed. This routine should 1022 * be regenerated if the page couldn't be grabbed. This routine should
1023 * be safe to call while holding the lock for another page. 1023 * be safe to call while holding the lock for another page.
1024 * 1024 *
1025 * Clear __GFP_FS when allocating the page to avoid recursion into the fs 1025 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
1026 * and deadlock against the caller's locked page. 1026 * and deadlock against the caller's locked page.
1027 */ 1027 */
1028 struct page * 1028 struct page *
1029 grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) 1029 grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
1030 { 1030 {
1031 struct page *page = find_get_page(mapping, index); 1031 struct page *page = find_get_page(mapping, index);
1032 1032
1033 if (page) { 1033 if (page) {
1034 if (trylock_page(page)) 1034 if (trylock_page(page))
1035 return page; 1035 return page;
1036 page_cache_release(page); 1036 page_cache_release(page);
1037 return NULL; 1037 return NULL;
1038 } 1038 }
1039 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); 1039 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
1040 if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { 1040 if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
1041 page_cache_release(page); 1041 page_cache_release(page);
1042 page = NULL; 1042 page = NULL;
1043 } 1043 }
1044 return page; 1044 return page;
1045 } 1045 }
1046 EXPORT_SYMBOL(grab_cache_page_nowait); 1046 EXPORT_SYMBOL(grab_cache_page_nowait);
1047 1047
1048 /* 1048 /*
1049 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 1049 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
1050 * a _large_ part of the i/o request. Imagine the worst scenario: 1050 * a _large_ part of the i/o request. Imagine the worst scenario:
1051 * 1051 *
1052 * ---R__________________________________________B__________ 1052 * ---R__________________________________________B__________
1053 * ^ reading here ^ bad block (assume 4k) 1053 * ^ reading here ^ bad block (assume 4k)
1054 * 1054 *
1055 * read(R) => miss => readahead(R...B) => media error => frustrating retries 1055 * read(R) => miss => readahead(R...B) => media error => frustrating retries
1056 * => failing the whole request => read(R) => read(R+1) => 1056 * => failing the whole request => read(R) => read(R+1) =>
1057 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => 1057 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
1058 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => 1058 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
1059 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... 1059 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
1060 * 1060 *
1061 * It is going insane. Fix it by quickly scaling down the readahead size. 1061 * It is going insane. Fix it by quickly scaling down the readahead size.
1062 */ 1062 */
1063 static void shrink_readahead_size_eio(struct file *filp, 1063 static void shrink_readahead_size_eio(struct file *filp,
1064 struct file_ra_state *ra) 1064 struct file_ra_state *ra)
1065 { 1065 {
1066 ra->ra_pages /= 4; 1066 ra->ra_pages /= 4;
1067 } 1067 }
1068 1068
1069 /** 1069 /**
1070 * do_generic_file_read - generic file read routine 1070 * do_generic_file_read - generic file read routine
1071 * @filp: the file to read 1071 * @filp: the file to read
1072 * @ppos: current file position 1072 * @ppos: current file position
1073 * @desc: read_descriptor 1073 * @desc: read_descriptor
1074 * @actor: read method 1074 * @actor: read method
1075 * 1075 *
1076 * This is a generic file read routine, and uses the 1076 * This is a generic file read routine, and uses the
1077 * mapping->a_ops->readpage() function for the actual low-level stuff. 1077 * mapping->a_ops->readpage() function for the actual low-level stuff.
1078 * 1078 *
1079 * This is really ugly. But the goto's actually try to clarify some 1079 * This is really ugly. But the goto's actually try to clarify some
1080 * of the logic when it comes to error handling etc. 1080 * of the logic when it comes to error handling etc.
1081 */ 1081 */
1082 static void do_generic_file_read(struct file *filp, loff_t *ppos, 1082 static void do_generic_file_read(struct file *filp, loff_t *ppos,
1083 read_descriptor_t *desc, read_actor_t actor) 1083 read_descriptor_t *desc, read_actor_t actor)
1084 { 1084 {
1085 struct address_space *mapping = filp->f_mapping; 1085 struct address_space *mapping = filp->f_mapping;
1086 struct inode *inode = mapping->host; 1086 struct inode *inode = mapping->host;
1087 struct file_ra_state *ra = &filp->f_ra; 1087 struct file_ra_state *ra = &filp->f_ra;
1088 pgoff_t index; 1088 pgoff_t index;
1089 pgoff_t last_index; 1089 pgoff_t last_index;
1090 pgoff_t prev_index; 1090 pgoff_t prev_index;
1091 unsigned long offset; /* offset into pagecache page */ 1091 unsigned long offset; /* offset into pagecache page */
1092 unsigned int prev_offset; 1092 unsigned int prev_offset;
1093 int error; 1093 int error;
1094 1094
1095 index = *ppos >> PAGE_CACHE_SHIFT; 1095 index = *ppos >> PAGE_CACHE_SHIFT;
1096 prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; 1096 prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
1097 prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); 1097 prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
1098 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 1098 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
1099 offset = *ppos & ~PAGE_CACHE_MASK; 1099 offset = *ppos & ~PAGE_CACHE_MASK;
1100 1100
1101 for (;;) { 1101 for (;;) {
1102 struct page *page; 1102 struct page *page;
1103 pgoff_t end_index; 1103 pgoff_t end_index;
1104 loff_t isize; 1104 loff_t isize;
1105 unsigned long nr, ret; 1105 unsigned long nr, ret;
1106 1106
1107 cond_resched(); 1107 cond_resched();
1108 find_page: 1108 find_page:
1109 page = find_get_page(mapping, index); 1109 page = find_get_page(mapping, index);
1110 if (!page) { 1110 if (!page) {
1111 page_cache_sync_readahead(mapping, 1111 page_cache_sync_readahead(mapping,
1112 ra, filp, 1112 ra, filp,
1113 index, last_index - index); 1113 index, last_index - index);
1114 page = find_get_page(mapping, index); 1114 page = find_get_page(mapping, index);
1115 if (unlikely(page == NULL)) 1115 if (unlikely(page == NULL))
1116 goto no_cached_page; 1116 goto no_cached_page;
1117 } 1117 }
1118 if (PageReadahead(page)) { 1118 if (PageReadahead(page)) {
1119 page_cache_async_readahead(mapping, 1119 page_cache_async_readahead(mapping,
1120 ra, filp, page, 1120 ra, filp, page,
1121 index, last_index - index); 1121 index, last_index - index);
1122 } 1122 }
1123 if (!PageUptodate(page)) { 1123 if (!PageUptodate(page)) {
1124 if (inode->i_blkbits == PAGE_CACHE_SHIFT || 1124 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1125 !mapping->a_ops->is_partially_uptodate) 1125 !mapping->a_ops->is_partially_uptodate)
1126 goto page_not_up_to_date; 1126 goto page_not_up_to_date;
1127 if (!trylock_page(page)) 1127 if (!trylock_page(page))
1128 goto page_not_up_to_date; 1128 goto page_not_up_to_date;
1129 /* Did it get truncated before we got the lock? */ 1129 /* Did it get truncated before we got the lock? */
1130 if (!page->mapping) 1130 if (!page->mapping)
1131 goto page_not_up_to_date_locked; 1131 goto page_not_up_to_date_locked;
1132 if (!mapping->a_ops->is_partially_uptodate(page, 1132 if (!mapping->a_ops->is_partially_uptodate(page,
1133 desc, offset)) 1133 desc, offset))
1134 goto page_not_up_to_date_locked; 1134 goto page_not_up_to_date_locked;
1135 unlock_page(page); 1135 unlock_page(page);
1136 } 1136 }
1137 page_ok: 1137 page_ok:
1138 /* 1138 /*
1139 * i_size must be checked after we know the page is Uptodate. 1139 * i_size must be checked after we know the page is Uptodate.
1140 * 1140 *
1141 * Checking i_size after the PageUptodate check allows us to calculate 1141 * Checking i_size after the PageUptodate check allows us to calculate
1142 * the correct value for "nr", which means the zero-filled 1142 * the correct value for "nr", which means the zero-filled
1143 * part of the page is not copied back to userspace (unless 1143 * part of the page is not copied back to userspace (unless
1144 * another truncate extends the file - this is desired though). 1144 * another truncate extends the file - this is desired though).
1145 */ 1145 */
1146 1146
1147 isize = i_size_read(inode); 1147 isize = i_size_read(inode);
1148 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 1148 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1149 if (unlikely(!isize || index > end_index)) { 1149 if (unlikely(!isize || index > end_index)) {
1150 page_cache_release(page); 1150 page_cache_release(page);
1151 goto out; 1151 goto out;
1152 } 1152 }
1153 1153
1154 /* nr is the maximum number of bytes to copy from this page */ 1154 /* nr is the maximum number of bytes to copy from this page */
1155 nr = PAGE_CACHE_SIZE; 1155 nr = PAGE_CACHE_SIZE;
1156 if (index == end_index) { 1156 if (index == end_index) {
1157 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; 1157 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1158 if (nr <= offset) { 1158 if (nr <= offset) {
1159 page_cache_release(page); 1159 page_cache_release(page);
1160 goto out; 1160 goto out;
1161 } 1161 }
1162 } 1162 }
1163 nr = nr - offset; 1163 nr = nr - offset;
1164 1164
1165 /* If users can be writing to this page using arbitrary 1165 /* If users can be writing to this page using arbitrary
1166 * virtual addresses, take care about potential aliasing 1166 * virtual addresses, take care about potential aliasing
1167 * before reading the page on the kernel side. 1167 * before reading the page on the kernel side.
1168 */ 1168 */
1169 if (mapping_writably_mapped(mapping)) 1169 if (mapping_writably_mapped(mapping))
1170 flush_dcache_page(page); 1170 flush_dcache_page(page);
1171 1171
1172 /* 1172 /*
1173 * When a sequential read accesses a page several times, 1173 * When a sequential read accesses a page several times,
1174 * only mark it as accessed the first time. 1174 * only mark it as accessed the first time.
1175 */ 1175 */
1176 if (prev_index != index || offset != prev_offset) 1176 if (prev_index != index || offset != prev_offset)
1177 mark_page_accessed(page); 1177 mark_page_accessed(page);
1178 prev_index = index; 1178 prev_index = index;
1179 1179
1180 /* 1180 /*
1181 * Ok, we have the page, and it's up-to-date, so 1181 * Ok, we have the page, and it's up-to-date, so
1182 * now we can copy it to user space... 1182 * now we can copy it to user space...
1183 * 1183 *
1184 * The actor routine returns how many bytes were actually used.. 1184 * The actor routine returns how many bytes were actually used..
1185 * NOTE! This may not be the same as how much of a user buffer 1185 * NOTE! This may not be the same as how much of a user buffer
1186 * we filled up (we may be padding etc), so we can only update 1186 * we filled up (we may be padding etc), so we can only update
1187 * "pos" here (the actor routine has to update the user buffer 1187 * "pos" here (the actor routine has to update the user buffer
1188 * pointers and the remaining count). 1188 * pointers and the remaining count).
1189 */ 1189 */
1190 ret = actor(desc, page, offset, nr); 1190 ret = actor(desc, page, offset, nr);
1191 offset += ret; 1191 offset += ret;
1192 index += offset >> PAGE_CACHE_SHIFT; 1192 index += offset >> PAGE_CACHE_SHIFT;
1193 offset &= ~PAGE_CACHE_MASK; 1193 offset &= ~PAGE_CACHE_MASK;
1194 prev_offset = offset; 1194 prev_offset = offset;
1195 1195
1196 page_cache_release(page); 1196 page_cache_release(page);
1197 if (ret == nr && desc->count) 1197 if (ret == nr && desc->count)
1198 continue; 1198 continue;
1199 goto out; 1199 goto out;
1200 1200
1201 page_not_up_to_date: 1201 page_not_up_to_date:
1202 /* Get exclusive access to the page ... */ 1202 /* Get exclusive access to the page ... */
1203 error = lock_page_killable(page); 1203 error = lock_page_killable(page);
1204 if (unlikely(error)) 1204 if (unlikely(error))
1205 goto readpage_error; 1205 goto readpage_error;
1206 1206
1207 page_not_up_to_date_locked: 1207 page_not_up_to_date_locked:
1208 /* Did it get truncated before we got the lock? */ 1208 /* Did it get truncated before we got the lock? */
1209 if (!page->mapping) { 1209 if (!page->mapping) {
1210 unlock_page(page); 1210 unlock_page(page);
1211 page_cache_release(page); 1211 page_cache_release(page);
1212 continue; 1212 continue;
1213 } 1213 }
1214 1214
1215 /* Did somebody else fill it already? */ 1215 /* Did somebody else fill it already? */
1216 if (PageUptodate(page)) { 1216 if (PageUptodate(page)) {
1217 unlock_page(page); 1217 unlock_page(page);
1218 goto page_ok; 1218 goto page_ok;
1219 } 1219 }
1220 1220
1221 readpage: 1221 readpage:
1222 /* 1222 /*
1223 * A previous I/O error may have been due to temporary 1223 * A previous I/O error may have been due to temporary
1224 * failures, e.g. multipath errors. 1224 * failures, e.g. multipath errors.
1225 * PG_error will be set again if readpage fails. 1225 * PG_error will be set again if readpage fails.
1226 */ 1226 */
1227 ClearPageError(page); 1227 ClearPageError(page);
1228 /* Start the actual read. The read will unlock the page. */ 1228 /* Start the actual read. The read will unlock the page. */
1229 error = mapping->a_ops->readpage(filp, page); 1229 error = mapping->a_ops->readpage(filp, page);
1230 1230
1231 if (unlikely(error)) { 1231 if (unlikely(error)) {
1232 if (error == AOP_TRUNCATED_PAGE) { 1232 if (error == AOP_TRUNCATED_PAGE) {
1233 page_cache_release(page); 1233 page_cache_release(page);
1234 goto find_page; 1234 goto find_page;
1235 } 1235 }
1236 goto readpage_error; 1236 goto readpage_error;
1237 } 1237 }
1238 1238
1239 if (!PageUptodate(page)) { 1239 if (!PageUptodate(page)) {
1240 error = lock_page_killable(page); 1240 error = lock_page_killable(page);
1241 if (unlikely(error)) 1241 if (unlikely(error))
1242 goto readpage_error; 1242 goto readpage_error;
1243 if (!PageUptodate(page)) { 1243 if (!PageUptodate(page)) {
1244 if (page->mapping == NULL) { 1244 if (page->mapping == NULL) {
1245 /* 1245 /*
1246 * invalidate_mapping_pages got it 1246 * invalidate_mapping_pages got it
1247 */ 1247 */
1248 unlock_page(page); 1248 unlock_page(page);
1249 page_cache_release(page); 1249 page_cache_release(page);
1250 goto find_page; 1250 goto find_page;
1251 } 1251 }
1252 unlock_page(page); 1252 unlock_page(page);
1253 shrink_readahead_size_eio(filp, ra); 1253 shrink_readahead_size_eio(filp, ra);
1254 error = -EIO; 1254 error = -EIO;
1255 goto readpage_error; 1255 goto readpage_error;
1256 } 1256 }
1257 unlock_page(page); 1257 unlock_page(page);
1258 } 1258 }
1259 1259
1260 goto page_ok; 1260 goto page_ok;
1261 1261
1262 readpage_error: 1262 readpage_error:
1263 /* UHHUH! A synchronous read error occurred. Report it */ 1263 /* UHHUH! A synchronous read error occurred. Report it */
1264 desc->error = error; 1264 desc->error = error;
1265 page_cache_release(page); 1265 page_cache_release(page);
1266 goto out; 1266 goto out;
1267 1267
1268 no_cached_page: 1268 no_cached_page:
1269 /* 1269 /*
1270 * Ok, it wasn't cached, so we need to create a new 1270 * Ok, it wasn't cached, so we need to create a new
1271 * page.. 1271 * page..
1272 */ 1272 */
1273 page = page_cache_alloc_cold(mapping); 1273 page = page_cache_alloc_cold(mapping);
1274 if (!page) { 1274 if (!page) {
1275 desc->error = -ENOMEM; 1275 desc->error = -ENOMEM;
1276 goto out; 1276 goto out;
1277 } 1277 }
1278 error = add_to_page_cache_lru(page, mapping, 1278 error = add_to_page_cache_lru(page, mapping,
1279 index, GFP_KERNEL); 1279 index, GFP_KERNEL);
1280 if (error) { 1280 if (error) {
1281 page_cache_release(page); 1281 page_cache_release(page);
1282 if (error == -EEXIST) 1282 if (error == -EEXIST)
1283 goto find_page; 1283 goto find_page;
1284 desc->error = error; 1284 desc->error = error;
1285 goto out; 1285 goto out;
1286 } 1286 }
1287 goto readpage; 1287 goto readpage;
1288 } 1288 }
1289 1289
1290 out: 1290 out:
1291 ra->prev_pos = prev_index; 1291 ra->prev_pos = prev_index;
1292 ra->prev_pos <<= PAGE_CACHE_SHIFT; 1292 ra->prev_pos <<= PAGE_CACHE_SHIFT;
1293 ra->prev_pos |= prev_offset; 1293 ra->prev_pos |= prev_offset;
1294 1294
1295 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; 1295 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1296 file_accessed(filp); 1296 file_accessed(filp);
1297 } 1297 }
1298 1298
1299 int file_read_actor(read_descriptor_t *desc, struct page *page, 1299 int file_read_actor(read_descriptor_t *desc, struct page *page,
1300 unsigned long offset, unsigned long size) 1300 unsigned long offset, unsigned long size)
1301 { 1301 {
1302 char *kaddr; 1302 char *kaddr;
1303 unsigned long left, count = desc->count; 1303 unsigned long left, count = desc->count;
1304 1304
1305 if (size > count) 1305 if (size > count)
1306 size = count; 1306 size = count;
1307 1307
1308 /* 1308 /*
1309 * Faults on the destination of a read are common, so do it before 1309 * Faults on the destination of a read are common, so do it before
1310 * taking the kmap. 1310 * taking the kmap.
1311 */ 1311 */
1312 if (!fault_in_pages_writeable(desc->arg.buf, size)) { 1312 if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1313 kaddr = kmap_atomic(page, KM_USER0); 1313 kaddr = kmap_atomic(page, KM_USER0);
1314 left = __copy_to_user_inatomic(desc->arg.buf, 1314 left = __copy_to_user_inatomic(desc->arg.buf,
1315 kaddr + offset, size); 1315 kaddr + offset, size);
1316 kunmap_atomic(kaddr, KM_USER0); 1316 kunmap_atomic(kaddr, KM_USER0);
1317 if (left == 0) 1317 if (left == 0)
1318 goto success; 1318 goto success;
1319 } 1319 }
1320 1320
1321 /* Do it the slow way */ 1321 /* Do it the slow way */
1322 kaddr = kmap(page); 1322 kaddr = kmap(page);
1323 left = __copy_to_user(desc->arg.buf, kaddr + offset, size); 1323 left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1324 kunmap(page); 1324 kunmap(page);
1325 1325
1326 if (left) { 1326 if (left) {
1327 size -= left; 1327 size -= left;
1328 desc->error = -EFAULT; 1328 desc->error = -EFAULT;
1329 } 1329 }
1330 success: 1330 success:
1331 desc->count = count - size; 1331 desc->count = count - size;
1332 desc->written += size; 1332 desc->written += size;
1333 desc->arg.buf += size; 1333 desc->arg.buf += size;
1334 return size; 1334 return size;
1335 } 1335 }
1336 1336
1337 /* 1337 /*
1338 * Performs necessary checks before doing a read or write 1338 * Performs necessary checks before doing a read or write
1339 * @iov: io vector request 1339 * @iov: io vector request
1340 * @nr_segs: number of segments in the iovec 1340 * @nr_segs: number of segments in the iovec
1341 * @count: returns the total number of bytes that can be transferred 1341 * @count: returns the total number of bytes that can be transferred
1342 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE 1342 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
1343 * 1343 *
1344 * Adjusts the number of segments and the number of bytes to transfer (nr_segs 1344 * Adjusts the number of segments and the number of bytes to transfer (nr_segs
1345 * must be properly initialized first). Returns an appropriate error code that 1345 * must be properly initialized first). Returns an appropriate error code that
1346 * the caller should return, or zero if the access is allowed. 1346 * the caller should return, or zero if the access is allowed.
1347 */ 1347 */
1348 int generic_segment_checks(const struct iovec *iov, 1348 int generic_segment_checks(const struct iovec *iov,
1349 unsigned long *nr_segs, size_t *count, int access_flags) 1349 unsigned long *nr_segs, size_t *count, int access_flags)
1350 { 1350 {
1351 unsigned long seg; 1351 unsigned long seg;
1352 size_t cnt = 0; 1352 size_t cnt = 0;
1353 for (seg = 0; seg < *nr_segs; seg++) { 1353 for (seg = 0; seg < *nr_segs; seg++) {
1354 const struct iovec *iv = &iov[seg]; 1354 const struct iovec *iv = &iov[seg];
1355 1355
1356 /* 1356 /*
1357 * If any segment has a negative length, or the cumulative 1357 * If any segment has a negative length, or the cumulative
1358 * length ever wraps negative then return -EINVAL. 1358 * length ever wraps negative then return -EINVAL.
1359 */ 1359 */
1360 cnt += iv->iov_len; 1360 cnt += iv->iov_len;
1361 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) 1361 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1362 return -EINVAL; 1362 return -EINVAL;
1363 if (access_ok(access_flags, iv->iov_base, iv->iov_len)) 1363 if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1364 continue; 1364 continue;
1365 if (seg == 0) 1365 if (seg == 0)
1366 return -EFAULT; 1366 return -EFAULT;
1367 *nr_segs = seg; 1367 *nr_segs = seg;
1368 cnt -= iv->iov_len; /* This segment is no good */ 1368 cnt -= iv->iov_len; /* This segment is no good */
1369 break; 1369 break;
1370 } 1370 }
1371 *count = cnt; 1371 *count = cnt;
1372 return 0; 1372 return 0;
1373 } 1373 }
1374 EXPORT_SYMBOL(generic_segment_checks); 1374 EXPORT_SYMBOL(generic_segment_checks);
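/*
 * [Editorial example, not part of this file] A minimal sketch of how a
 * filesystem's ->aio_read() typically uses generic_segment_checks() to
 * validate the user iovec before doing any work.  The "examplefs_" name is
 * hypothetical; the helper simply falls through to generic_file_aio_read().
 */
static ssize_t examplefs_file_aio_read(struct kiocb *iocb,
				       const struct iovec *iov,
				       unsigned long nr_segs, loff_t pos)
{
	size_t count;
	ssize_t ret;

	/* The read destination will be written to, hence VERIFY_WRITE. */
	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (ret)
		return ret;

	/* Filesystem-specific locking or ordering would go here. */
	return generic_file_aio_read(iocb, iov, nr_segs, pos);
}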
1375 1375
1376 /** 1376 /**
1377 * generic_file_aio_read - generic filesystem read routine 1377 * generic_file_aio_read - generic filesystem read routine
1378 * @iocb: kernel I/O control block 1378 * @iocb: kernel I/O control block
1379 * @iov: io vector request 1379 * @iov: io vector request
1380 * @nr_segs: number of segments in the iovec 1380 * @nr_segs: number of segments in the iovec
1381 * @pos: current file position 1381 * @pos: current file position
1382 * 1382 *
1383 * This is the "read()" routine for all filesystems 1383 * This is the "read()" routine for all filesystems
1384 * that can use the page cache directly. 1384 * that can use the page cache directly.
1385 */ 1385 */
1386 ssize_t 1386 ssize_t
1387 generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, 1387 generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1388 unsigned long nr_segs, loff_t pos) 1388 unsigned long nr_segs, loff_t pos)
1389 { 1389 {
1390 struct file *filp = iocb->ki_filp; 1390 struct file *filp = iocb->ki_filp;
1391 ssize_t retval; 1391 ssize_t retval;
1392 unsigned long seg = 0; 1392 unsigned long seg = 0;
1393 size_t count; 1393 size_t count;
1394 loff_t *ppos = &iocb->ki_pos; 1394 loff_t *ppos = &iocb->ki_pos;
1395 struct blk_plug plug; 1395 struct blk_plug plug;
1396 1396
1397 count = 0; 1397 count = 0;
1398 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1398 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1399 if (retval) 1399 if (retval)
1400 return retval; 1400 return retval;
1401 1401
1402 blk_start_plug(&plug); 1402 blk_start_plug(&plug);
1403 1403
1404 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1404 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1405 if (filp->f_flags & O_DIRECT) { 1405 if (filp->f_flags & O_DIRECT) {
1406 loff_t size; 1406 loff_t size;
1407 struct address_space *mapping; 1407 struct address_space *mapping;
1408 struct inode *inode; 1408 struct inode *inode;
1409 1409
1410 mapping = filp->f_mapping; 1410 mapping = filp->f_mapping;
1411 inode = mapping->host; 1411 inode = mapping->host;
1412 if (!count) 1412 if (!count)
1413 goto out; /* skip atime */ 1413 goto out; /* skip atime */
1414 size = i_size_read(inode); 1414 size = i_size_read(inode);
1415 if (pos < size) { 1415 if (pos < size) {
1416 retval = filemap_write_and_wait_range(mapping, pos, 1416 retval = filemap_write_and_wait_range(mapping, pos,
1417 pos + iov_length(iov, nr_segs) - 1); 1417 pos + iov_length(iov, nr_segs) - 1);
1418 if (!retval) { 1418 if (!retval) {
1419 retval = mapping->a_ops->direct_IO(READ, iocb, 1419 retval = mapping->a_ops->direct_IO(READ, iocb,
1420 iov, pos, nr_segs); 1420 iov, pos, nr_segs);
1421 } 1421 }
1422 if (retval > 0) { 1422 if (retval > 0) {
1423 *ppos = pos + retval; 1423 *ppos = pos + retval;
1424 count -= retval; 1424 count -= retval;
1425 } 1425 }
1426 1426
1427 /* 1427 /*
1428 * Btrfs can have a short DIO read if we encounter 1428 * Btrfs can have a short DIO read if we encounter
1429 * compressed extents, so if there was an error, or if 1429 * compressed extents, so if there was an error, or if
1430 * we've already read everything we wanted to, or if 1430 * we've already read everything we wanted to, or if
1431 * there was a short read because we hit EOF, go ahead 1431 * there was a short read because we hit EOF, go ahead
1432 * and return. Otherwise fallthrough to buffered io for 1432 * and return. Otherwise fallthrough to buffered io for
1433 * the rest of the read. 1433 * the rest of the read.
1434 */ 1434 */
1435 if (retval < 0 || !count || *ppos >= size) { 1435 if (retval < 0 || !count || *ppos >= size) {
1436 file_accessed(filp); 1436 file_accessed(filp);
1437 goto out; 1437 goto out;
1438 } 1438 }
1439 } 1439 }
1440 } 1440 }
1441 1441
1442 count = retval; 1442 count = retval;
1443 for (seg = 0; seg < nr_segs; seg++) { 1443 for (seg = 0; seg < nr_segs; seg++) {
1444 read_descriptor_t desc; 1444 read_descriptor_t desc;
1445 loff_t offset = 0; 1445 loff_t offset = 0;
1446 1446
1447 /* 1447 /*
1448 * If we did a short DIO read we need to skip the section of the 1448 * If we did a short DIO read we need to skip the section of the
1449 * iov that we've already read data into. 1449 * iov that we've already read data into.
1450 */ 1450 */
1451 if (count) { 1451 if (count) {
1452 if (count > iov[seg].iov_len) { 1452 if (count > iov[seg].iov_len) {
1453 count -= iov[seg].iov_len; 1453 count -= iov[seg].iov_len;
1454 continue; 1454 continue;
1455 } 1455 }
1456 offset = count; 1456 offset = count;
1457 count = 0; 1457 count = 0;
1458 } 1458 }
1459 1459
1460 desc.written = 0; 1460 desc.written = 0;
1461 desc.arg.buf = iov[seg].iov_base + offset; 1461 desc.arg.buf = iov[seg].iov_base + offset;
1462 desc.count = iov[seg].iov_len - offset; 1462 desc.count = iov[seg].iov_len - offset;
1463 if (desc.count == 0) 1463 if (desc.count == 0)
1464 continue; 1464 continue;
1465 desc.error = 0; 1465 desc.error = 0;
1466 do_generic_file_read(filp, ppos, &desc, file_read_actor); 1466 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1467 retval += desc.written; 1467 retval += desc.written;
1468 if (desc.error) { 1468 if (desc.error) {
1469 retval = retval ?: desc.error; 1469 retval = retval ?: desc.error;
1470 break; 1470 break;
1471 } 1471 }
1472 if (desc.count > 0) 1472 if (desc.count > 0)
1473 break; 1473 break;
1474 } 1474 }
1475 out: 1475 out:
1476 blk_finish_plug(&plug); 1476 blk_finish_plug(&plug);
1477 return retval; 1477 return retval;
1478 } 1478 }
1479 EXPORT_SYMBOL(generic_file_aio_read); 1479 EXPORT_SYMBOL(generic_file_aio_read);
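/*
 * [Editorial example, not part of this file] A sketch of how
 * generic_file_aio_read() is normally wired up: a filesystem that can use
 * the page cache directly just points its file_operations at the generic
 * helpers.  "examplefs_" is hypothetical; the referenced helpers are
 * standard VFS exports of this era.
 */
static const struct file_operations examplefs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,		/* synchronous wrapper */
	.aio_read	= generic_file_aio_read,
	.write		= do_sync_write,
	.aio_write	= generic_file_aio_write,
	.mmap		= generic_file_mmap,
	.fsync		= generic_file_fsync,
};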
1480 1480
1481 static ssize_t 1481 static ssize_t
1482 do_readahead(struct address_space *mapping, struct file *filp, 1482 do_readahead(struct address_space *mapping, struct file *filp,
1483 pgoff_t index, unsigned long nr) 1483 pgoff_t index, unsigned long nr)
1484 { 1484 {
1485 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1485 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1486 return -EINVAL; 1486 return -EINVAL;
1487 1487
1488 force_page_cache_readahead(mapping, filp, index, nr); 1488 force_page_cache_readahead(mapping, filp, index, nr);
1489 return 0; 1489 return 0;
1490 } 1490 }
1491 1491
1492 SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) 1492 SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1493 { 1493 {
1494 ssize_t ret; 1494 ssize_t ret;
1495 struct file *file; 1495 struct file *file;
1496 1496
1497 ret = -EBADF; 1497 ret = -EBADF;
1498 file = fget(fd); 1498 file = fget(fd);
1499 if (file) { 1499 if (file) {
1500 if (file->f_mode & FMODE_READ) { 1500 if (file->f_mode & FMODE_READ) {
1501 struct address_space *mapping = file->f_mapping; 1501 struct address_space *mapping = file->f_mapping;
1502 pgoff_t start = offset >> PAGE_CACHE_SHIFT; 1502 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1503 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; 1503 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1504 unsigned long len = end - start + 1; 1504 unsigned long len = end - start + 1;
1505 ret = do_readahead(mapping, file, start, len); 1505 ret = do_readahead(mapping, file, start, len);
1506 } 1506 }
1507 fput(file); 1507 fput(file);
1508 } 1508 }
1509 return ret; 1509 return ret;
1510 } 1510 }
1511 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 1511 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1512 asmlinkage long SyS_readahead(long fd, loff_t offset, long count) 1512 asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1513 { 1513 {
1514 return SYSC_readahead((int) fd, offset, (size_t) count); 1514 return SYSC_readahead((int) fd, offset, (size_t) count);
1515 } 1515 }
1516 SYSCALL_ALIAS(sys_readahead, SyS_readahead); 1516 SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1517 #endif 1517 #endif
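/*
 * [Editorial example, not part of this file] The syscall defined above is
 * exposed to userspace by glibc as readahead(2) when _GNU_SOURCE is defined.
 * A hedged userspace sketch (the file name and size are illustrative):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int prefetch_file(const char *path)
 *	{
 *		int fd = open(path, O_RDONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		// Ask the kernel to pull the first 16MB into the page cache.
 *		readahead(fd, 0, 16 << 20);
 *		close(fd);
 *		return 0;
 *	}
 */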
1518 1518
1519 #ifdef CONFIG_MMU 1519 #ifdef CONFIG_MMU
1520 /** 1520 /**
1521 * page_cache_read - adds requested page to the page cache if not already there 1521 * page_cache_read - adds requested page to the page cache if not already there
1522 * @file: file to read 1522 * @file: file to read
1523 * @offset: page index 1523 * @offset: page index
1524 * 1524 *
1525 * This adds the requested page to the page cache if it isn't already there, 1525 * This adds the requested page to the page cache if it isn't already there,
1526 * and schedules an I/O to read in its contents from disk. 1526 * and schedules an I/O to read in its contents from disk.
1527 */ 1527 */
1528 static int page_cache_read(struct file *file, pgoff_t offset) 1528 static int page_cache_read(struct file *file, pgoff_t offset)
1529 { 1529 {
1530 struct address_space *mapping = file->f_mapping; 1530 struct address_space *mapping = file->f_mapping;
1531 struct page *page; 1531 struct page *page;
1532 int ret; 1532 int ret;
1533 1533
1534 do { 1534 do {
1535 page = page_cache_alloc_cold(mapping); 1535 page = page_cache_alloc_cold(mapping);
1536 if (!page) 1536 if (!page)
1537 return -ENOMEM; 1537 return -ENOMEM;
1538 1538
1539 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); 1539 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1540 if (ret == 0) 1540 if (ret == 0)
1541 ret = mapping->a_ops->readpage(file, page); 1541 ret = mapping->a_ops->readpage(file, page);
1542 else if (ret == -EEXIST) 1542 else if (ret == -EEXIST)
1543 ret = 0; /* losing race to add is OK */ 1543 ret = 0; /* losing race to add is OK */
1544 1544
1545 page_cache_release(page); 1545 page_cache_release(page);
1546 1546
1547 } while (ret == AOP_TRUNCATED_PAGE); 1547 } while (ret == AOP_TRUNCATED_PAGE);
1548 1548
1549 return ret; 1549 return ret;
1550 } 1550 }
1551 1551
1552 #define MMAP_LOTSAMISS (100) 1552 #define MMAP_LOTSAMISS (100)
1553 1553
1554 /* 1554 /*
1555 * Synchronous readahead happens when we don't even find 1555 * Synchronous readahead happens when we don't even find
1556 * a page in the page cache at all. 1556 * a page in the page cache at all.
1557 */ 1557 */
1558 static void do_sync_mmap_readahead(struct vm_area_struct *vma, 1558 static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1559 struct file_ra_state *ra, 1559 struct file_ra_state *ra,
1560 struct file *file, 1560 struct file *file,
1561 pgoff_t offset) 1561 pgoff_t offset)
1562 { 1562 {
1563 unsigned long ra_pages; 1563 unsigned long ra_pages;
1564 struct address_space *mapping = file->f_mapping; 1564 struct address_space *mapping = file->f_mapping;
1565 1565
1566 /* If we don't want any read-ahead, don't bother */ 1566 /* If we don't want any read-ahead, don't bother */
1567 if (VM_RandomReadHint(vma)) 1567 if (VM_RandomReadHint(vma))
1568 return; 1568 return;
1569 if (!ra->ra_pages) 1569 if (!ra->ra_pages)
1570 return; 1570 return;
1571 1571
1572 if (VM_SequentialReadHint(vma)) { 1572 if (VM_SequentialReadHint(vma)) {
1573 page_cache_sync_readahead(mapping, ra, file, offset, 1573 page_cache_sync_readahead(mapping, ra, file, offset,
1574 ra->ra_pages); 1574 ra->ra_pages);
1575 return; 1575 return;
1576 } 1576 }
1577 1577
1578 /* Avoid banging the cache line if not needed */ 1578 /* Avoid banging the cache line if not needed */
1579 if (ra->mmap_miss < MMAP_LOTSAMISS * 10) 1579 if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
1580 ra->mmap_miss++; 1580 ra->mmap_miss++;
1581 1581
1582 /* 1582 /*
1583 * Do we miss much more than hit in this file? If so, 1583 * Do we miss much more than hit in this file? If so,
1584 * stop bothering with read-ahead. It will only hurt. 1584 * stop bothering with read-ahead. It will only hurt.
1585 */ 1585 */
1586 if (ra->mmap_miss > MMAP_LOTSAMISS) 1586 if (ra->mmap_miss > MMAP_LOTSAMISS)
1587 return; 1587 return;
1588 1588
1589 /* 1589 /*
1590 * mmap read-around 1590 * mmap read-around
1591 */ 1591 */
1592 ra_pages = max_sane_readahead(ra->ra_pages); 1592 ra_pages = max_sane_readahead(ra->ra_pages);
1593 ra->start = max_t(long, 0, offset - ra_pages / 2); 1593 ra->start = max_t(long, 0, offset - ra_pages / 2);
1594 ra->size = ra_pages; 1594 ra->size = ra_pages;
1595 ra->async_size = ra_pages / 4; 1595 ra->async_size = ra_pages / 4;
1596 ra_submit(ra, mapping, file); 1596 ra_submit(ra, mapping, file);
1597 } 1597 }
1598 1598
1599 /* 1599 /*
1600 * Asynchronous readahead happens when we find the page with PG_readahead set, 1600 * Asynchronous readahead happens when we find the page with PG_readahead set,
1601 * so we may want to extend the readahead further. 1601 * so we may want to extend the readahead further.
1602 */ 1602 */
1603 static void do_async_mmap_readahead(struct vm_area_struct *vma, 1603 static void do_async_mmap_readahead(struct vm_area_struct *vma,
1604 struct file_ra_state *ra, 1604 struct file_ra_state *ra,
1605 struct file *file, 1605 struct file *file,
1606 struct page *page, 1606 struct page *page,
1607 pgoff_t offset) 1607 pgoff_t offset)
1608 { 1608 {
1609 struct address_space *mapping = file->f_mapping; 1609 struct address_space *mapping = file->f_mapping;
1610 1610
1611 /* If we don't want any read-ahead, don't bother */ 1611 /* If we don't want any read-ahead, don't bother */
1612 if (VM_RandomReadHint(vma)) 1612 if (VM_RandomReadHint(vma))
1613 return; 1613 return;
1614 if (ra->mmap_miss > 0) 1614 if (ra->mmap_miss > 0)
1615 ra->mmap_miss--; 1615 ra->mmap_miss--;
1616 if (PageReadahead(page)) 1616 if (PageReadahead(page))
1617 page_cache_async_readahead(mapping, ra, file, 1617 page_cache_async_readahead(mapping, ra, file,
1618 page, offset, ra->ra_pages); 1618 page, offset, ra->ra_pages);
1619 } 1619 }
1620 1620
1621 /** 1621 /**
1622 * filemap_fault - read in file data for page fault handling 1622 * filemap_fault - read in file data for page fault handling
1623 * @vma: vma in which the fault was taken 1623 * @vma: vma in which the fault was taken
1624 * @vmf: struct vm_fault containing details of the fault 1624 * @vmf: struct vm_fault containing details of the fault
1625 * 1625 *
1626 * filemap_fault() is invoked via the vma operations vector for a 1626 * filemap_fault() is invoked via the vma operations vector for a
1627 * mapped memory region to read in file data during a page fault. 1627 * mapped memory region to read in file data during a page fault.
1628 * 1628 *
1629 * The goto's are kind of ugly, but this streamlines the normal case of having 1629 * The goto's are kind of ugly, but this streamlines the normal case of having
1630 * it in the page cache, and handles the special cases reasonably without 1630 * it in the page cache, and handles the special cases reasonably without
1631 * having a lot of duplicated code. 1631 * having a lot of duplicated code.
1632 */ 1632 */
1633 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1633 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1634 { 1634 {
1635 int error; 1635 int error;
1636 struct file *file = vma->vm_file; 1636 struct file *file = vma->vm_file;
1637 struct address_space *mapping = file->f_mapping; 1637 struct address_space *mapping = file->f_mapping;
1638 struct file_ra_state *ra = &file->f_ra; 1638 struct file_ra_state *ra = &file->f_ra;
1639 struct inode *inode = mapping->host; 1639 struct inode *inode = mapping->host;
1640 pgoff_t offset = vmf->pgoff; 1640 pgoff_t offset = vmf->pgoff;
1641 struct page *page; 1641 struct page *page;
1642 pgoff_t size; 1642 pgoff_t size;
1643 int ret = 0; 1643 int ret = 0;
1644 1644
1645 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1645 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1646 if (offset >= size) 1646 if (offset >= size)
1647 return VM_FAULT_SIGBUS; 1647 return VM_FAULT_SIGBUS;
1648 1648
1649 /* 1649 /*
1650 * Do we have something in the page cache already? 1650 * Do we have something in the page cache already?
1651 */ 1651 */
1652 page = find_get_page(mapping, offset); 1652 page = find_get_page(mapping, offset);
1653 if (likely(page)) { 1653 if (likely(page)) {
1654 /* 1654 /*
1655 * We found the page, so try async readahead before 1655 * We found the page, so try async readahead before
1656 * waiting for the lock. 1656 * waiting for the lock.
1657 */ 1657 */
1658 do_async_mmap_readahead(vma, ra, file, page, offset); 1658 do_async_mmap_readahead(vma, ra, file, page, offset);
1659 } else { 1659 } else {
1660 /* No page in the page cache at all */ 1660 /* No page in the page cache at all */
1661 do_sync_mmap_readahead(vma, ra, file, offset); 1661 do_sync_mmap_readahead(vma, ra, file, offset);
1662 count_vm_event(PGMAJFAULT); 1662 count_vm_event(PGMAJFAULT);
1663 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1663 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1664 ret = VM_FAULT_MAJOR; 1664 ret = VM_FAULT_MAJOR;
1665 retry_find: 1665 retry_find:
1666 page = find_get_page(mapping, offset); 1666 page = find_get_page(mapping, offset);
1667 if (!page) 1667 if (!page)
1668 goto no_cached_page; 1668 goto no_cached_page;
1669 } 1669 }
1670 1670
1671 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { 1671 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
1672 page_cache_release(page); 1672 page_cache_release(page);
1673 return ret | VM_FAULT_RETRY; 1673 return ret | VM_FAULT_RETRY;
1674 } 1674 }
1675 1675
1676 /* Did it get truncated? */ 1676 /* Did it get truncated? */
1677 if (unlikely(page->mapping != mapping)) { 1677 if (unlikely(page->mapping != mapping)) {
1678 unlock_page(page); 1678 unlock_page(page);
1679 put_page(page); 1679 put_page(page);
1680 goto retry_find; 1680 goto retry_find;
1681 } 1681 }
1682 VM_BUG_ON(page->index != offset); 1682 VM_BUG_ON(page->index != offset);
1683 1683
1684 /* 1684 /*
1685 * We have a locked page in the page cache, now we need to check 1685 * We have a locked page in the page cache, now we need to check
1686 * that it's up-to-date. If not, it is going to be due to an error. 1686 * that it's up-to-date. If not, it is going to be due to an error.
1687 */ 1687 */
1688 if (unlikely(!PageUptodate(page))) 1688 if (unlikely(!PageUptodate(page)))
1689 goto page_not_uptodate; 1689 goto page_not_uptodate;
1690 1690
1691 /* 1691 /*
1692 * Found the page and have a reference on it. 1692 * Found the page and have a reference on it.
1693 * We must recheck i_size under page lock. 1693 * We must recheck i_size under page lock.
1694 */ 1694 */
1695 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1695 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1696 if (unlikely(offset >= size)) { 1696 if (unlikely(offset >= size)) {
1697 unlock_page(page); 1697 unlock_page(page);
1698 page_cache_release(page); 1698 page_cache_release(page);
1699 return VM_FAULT_SIGBUS; 1699 return VM_FAULT_SIGBUS;
1700 } 1700 }
1701 1701
1702 vmf->page = page; 1702 vmf->page = page;
1703 return ret | VM_FAULT_LOCKED; 1703 return ret | VM_FAULT_LOCKED;
1704 1704
1705 no_cached_page: 1705 no_cached_page:
1706 /* 1706 /*
1707 * We're only likely to ever get here if MADV_RANDOM is in 1707 * We're only likely to ever get here if MADV_RANDOM is in
1708 * effect. 1708 * effect.
1709 */ 1709 */
1710 error = page_cache_read(file, offset); 1710 error = page_cache_read(file, offset);
1711 1711
1712 /* 1712 /*
1713 * The page we want has now been added to the page cache. 1713 * The page we want has now been added to the page cache.
1714 * In the unlikely event that someone removed it in the 1714 * In the unlikely event that someone removed it in the
1715 * meantime, we'll just come back here and read it again. 1715 * meantime, we'll just come back here and read it again.
1716 */ 1716 */
1717 if (error >= 0) 1717 if (error >= 0)
1718 goto retry_find; 1718 goto retry_find;
1719 1719
1720 /* 1720 /*
1721 * An error return from page_cache_read can result if the 1721 * An error return from page_cache_read can result if the
1722 * system is low on memory, or a problem occurs while trying 1722 * system is low on memory, or a problem occurs while trying
1723 * to schedule I/O. 1723 * to schedule I/O.
1724 */ 1724 */
1725 if (error == -ENOMEM) 1725 if (error == -ENOMEM)
1726 return VM_FAULT_OOM; 1726 return VM_FAULT_OOM;
1727 return VM_FAULT_SIGBUS; 1727 return VM_FAULT_SIGBUS;
1728 1728
1729 page_not_uptodate: 1729 page_not_uptodate:
1730 /* 1730 /*
1731 * Umm, take care of errors if the page isn't up-to-date. 1731 * Umm, take care of errors if the page isn't up-to-date.
1732 * Try to re-read it _once_. We do this synchronously, 1732 * Try to re-read it _once_. We do this synchronously,
1733 * because there really aren't any performance issues here 1733 * because there really aren't any performance issues here
1734 * and we need to check for errors. 1734 * and we need to check for errors.
1735 */ 1735 */
1736 ClearPageError(page); 1736 ClearPageError(page);
1737 error = mapping->a_ops->readpage(file, page); 1737 error = mapping->a_ops->readpage(file, page);
1738 if (!error) { 1738 if (!error) {
1739 wait_on_page_locked(page); 1739 wait_on_page_locked(page);
1740 if (!PageUptodate(page)) 1740 if (!PageUptodate(page))
1741 error = -EIO; 1741 error = -EIO;
1742 } 1742 }
1743 page_cache_release(page); 1743 page_cache_release(page);
1744 1744
1745 if (!error || error == AOP_TRUNCATED_PAGE) 1745 if (!error || error == AOP_TRUNCATED_PAGE)
1746 goto retry_find; 1746 goto retry_find;
1747 1747
1748 /* Things didn't work out. Return zero to tell the mm layer so. */ 1748 /* Things didn't work out. Return zero to tell the mm layer so. */
1749 shrink_readahead_size_eio(file, ra); 1749 shrink_readahead_size_eio(file, ra);
1750 return VM_FAULT_SIGBUS; 1750 return VM_FAULT_SIGBUS;
1751 } 1751 }
1752 EXPORT_SYMBOL(filemap_fault); 1752 EXPORT_SYMBOL(filemap_fault);
1753 1753
1754 const struct vm_operations_struct generic_file_vm_ops = { 1754 const struct vm_operations_struct generic_file_vm_ops = {
1755 .fault = filemap_fault, 1755 .fault = filemap_fault,
1756 }; 1756 };
1757 1757
1758 /* This is used for a general mmap of a disk file */ 1758 /* This is used for a general mmap of a disk file */
1759 1759
1760 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 1760 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1761 { 1761 {
1762 struct address_space *mapping = file->f_mapping; 1762 struct address_space *mapping = file->f_mapping;
1763 1763
1764 if (!mapping->a_ops->readpage) 1764 if (!mapping->a_ops->readpage)
1765 return -ENOEXEC; 1765 return -ENOEXEC;
1766 file_accessed(file); 1766 file_accessed(file);
1767 vma->vm_ops = &generic_file_vm_ops; 1767 vma->vm_ops = &generic_file_vm_ops;
1768 vma->vm_flags |= VM_CAN_NONLINEAR; 1768 vma->vm_flags |= VM_CAN_NONLINEAR;
1769 return 0; 1769 return 0;
1770 } 1770 }
1771 1771
1772 /* 1772 /*
1773 * This is for filesystems which do not implement ->writepage. 1773 * This is for filesystems which do not implement ->writepage.
1774 */ 1774 */
1775 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) 1775 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1776 { 1776 {
1777 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 1777 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1778 return -EINVAL; 1778 return -EINVAL;
1779 return generic_file_mmap(file, vma); 1779 return generic_file_mmap(file, vma);
1780 } 1780 }
1781 #else 1781 #else
1782 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 1782 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1783 { 1783 {
1784 return -ENOSYS; 1784 return -ENOSYS;
1785 } 1785 }
1786 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) 1786 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1787 { 1787 {
1788 return -ENOSYS; 1788 return -ENOSYS;
1789 } 1789 }
1790 #endif /* CONFIG_MMU */ 1790 #endif /* CONFIG_MMU */
1791 1791
1792 EXPORT_SYMBOL(generic_file_mmap); 1792 EXPORT_SYMBOL(generic_file_mmap);
1793 EXPORT_SYMBOL(generic_file_readonly_mmap); 1793 EXPORT_SYMBOL(generic_file_readonly_mmap);
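/*
 * [Editorial example, not part of this file] A sketch of the common pattern
 * for filesystems that want filemap_fault() for read faults but need their
 * own write-fault handling: install a private vm_operations_struct from
 * ->mmap.  "examplefs_" names are hypothetical; examplefs_page_mkwrite() is
 * assumed to be defined elsewhere by the filesystem.
 */
static int examplefs_page_mkwrite(struct vm_area_struct *vma,
				  struct vm_fault *vmf);	/* hypothetical */

static const struct vm_operations_struct examplefs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= examplefs_page_mkwrite,
};

static int examplefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!file->f_mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &examplefs_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;
	return 0;
}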
1794 1794
1795 static struct page *__read_cache_page(struct address_space *mapping, 1795 static struct page *__read_cache_page(struct address_space *mapping,
1796 pgoff_t index, 1796 pgoff_t index,
1797 int (*filler)(void *, struct page *), 1797 int (*filler)(void *, struct page *),
1798 void *data, 1798 void *data,
1799 gfp_t gfp) 1799 gfp_t gfp)
1800 { 1800 {
1801 struct page *page; 1801 struct page *page;
1802 int err; 1802 int err;
1803 repeat: 1803 repeat:
1804 page = find_get_page(mapping, index); 1804 page = find_get_page(mapping, index);
1805 if (!page) { 1805 if (!page) {
1806 page = __page_cache_alloc(gfp | __GFP_COLD); 1806 page = __page_cache_alloc(gfp | __GFP_COLD);
1807 if (!page) 1807 if (!page)
1808 return ERR_PTR(-ENOMEM); 1808 return ERR_PTR(-ENOMEM);
1809 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1809 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1810 if (unlikely(err)) { 1810 if (unlikely(err)) {
1811 page_cache_release(page); 1811 page_cache_release(page);
1812 if (err == -EEXIST) 1812 if (err == -EEXIST)
1813 goto repeat; 1813 goto repeat;
1814 /* Presumably ENOMEM for radix tree node */ 1814 /* Presumably ENOMEM for radix tree node */
1815 return ERR_PTR(err); 1815 return ERR_PTR(err);
1816 } 1816 }
1817 err = filler(data, page); 1817 err = filler(data, page);
1818 if (err < 0) { 1818 if (err < 0) {
1819 page_cache_release(page); 1819 page_cache_release(page);
1820 page = ERR_PTR(err); 1820 page = ERR_PTR(err);
1821 } 1821 }
1822 } 1822 }
1823 return page; 1823 return page;
1824 } 1824 }
1825 1825
1826 static struct page *do_read_cache_page(struct address_space *mapping, 1826 static struct page *do_read_cache_page(struct address_space *mapping,
1827 pgoff_t index, 1827 pgoff_t index,
1828 int (*filler)(void *, struct page *), 1828 int (*filler)(void *, struct page *),
1829 void *data, 1829 void *data,
1830 gfp_t gfp) 1830 gfp_t gfp)
1831 1831
1832 { 1832 {
1833 struct page *page; 1833 struct page *page;
1834 int err; 1834 int err;
1835 1835
1836 retry: 1836 retry:
1837 page = __read_cache_page(mapping, index, filler, data, gfp); 1837 page = __read_cache_page(mapping, index, filler, data, gfp);
1838 if (IS_ERR(page)) 1838 if (IS_ERR(page))
1839 return page; 1839 return page;
1840 if (PageUptodate(page)) 1840 if (PageUptodate(page))
1841 goto out; 1841 goto out;
1842 1842
1843 lock_page(page); 1843 lock_page(page);
1844 if (!page->mapping) { 1844 if (!page->mapping) {
1845 unlock_page(page); 1845 unlock_page(page);
1846 page_cache_release(page); 1846 page_cache_release(page);
1847 goto retry; 1847 goto retry;
1848 } 1848 }
1849 if (PageUptodate(page)) { 1849 if (PageUptodate(page)) {
1850 unlock_page(page); 1850 unlock_page(page);
1851 goto out; 1851 goto out;
1852 } 1852 }
1853 err = filler(data, page); 1853 err = filler(data, page);
1854 if (err < 0) { 1854 if (err < 0) {
1855 page_cache_release(page); 1855 page_cache_release(page);
1856 return ERR_PTR(err); 1856 return ERR_PTR(err);
1857 } 1857 }
1858 out: 1858 out:
1859 mark_page_accessed(page); 1859 mark_page_accessed(page);
1860 return page; 1860 return page;
1861 } 1861 }
1862 1862
1863 /** 1863 /**
1864 * read_cache_page_async - read into page cache, fill it if needed 1864 * read_cache_page_async - read into page cache, fill it if needed
1865 * @mapping: the page's address_space 1865 * @mapping: the page's address_space
1866 * @index: the page index 1866 * @index: the page index
1867 * @filler: function to perform the read 1867 * @filler: function to perform the read
1868 * @data: first arg to filler(data, page) function, often left as NULL 1868 * @data: first arg to filler(data, page) function, often left as NULL
1869 * 1869 *
1870 * Same as read_cache_page, but don't wait for page to become unlocked 1870 * Same as read_cache_page, but don't wait for page to become unlocked
1871 * after submitting it to the filler. 1871 * after submitting it to the filler.
1872 * 1872 *
1873 * Read into the page cache. If a page already exists, and PageUptodate() is 1873 * Read into the page cache. If a page already exists, and PageUptodate() is
1874 * not set, try to fill the page but don't wait for it to become unlocked. 1874 * not set, try to fill the page but don't wait for it to become unlocked.
1875 * 1875 *
1876 * If the page does not get brought uptodate, return -EIO. 1876 * If the page does not get brought uptodate, return -EIO.
1877 */ 1877 */
1878 struct page *read_cache_page_async(struct address_space *mapping, 1878 struct page *read_cache_page_async(struct address_space *mapping,
1879 pgoff_t index, 1879 pgoff_t index,
1880 int (*filler)(void *, struct page *), 1880 int (*filler)(void *, struct page *),
1881 void *data) 1881 void *data)
1882 { 1882 {
1883 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 1883 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
1884 } 1884 }
1885 EXPORT_SYMBOL(read_cache_page_async); 1885 EXPORT_SYMBOL(read_cache_page_async);
1886 1886
1887 static struct page *wait_on_page_read(struct page *page) 1887 static struct page *wait_on_page_read(struct page *page)
1888 { 1888 {
1889 if (!IS_ERR(page)) { 1889 if (!IS_ERR(page)) {
1890 wait_on_page_locked(page); 1890 wait_on_page_locked(page);
1891 if (!PageUptodate(page)) { 1891 if (!PageUptodate(page)) {
1892 page_cache_release(page); 1892 page_cache_release(page);
1893 page = ERR_PTR(-EIO); 1893 page = ERR_PTR(-EIO);
1894 } 1894 }
1895 } 1895 }
1896 return page; 1896 return page;
1897 } 1897 }
1898 1898
1899 /** 1899 /**
1900 * read_cache_page_gfp - read into page cache, using specified page allocation flags. 1900 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
1901 * @mapping: the page's address_space 1901 * @mapping: the page's address_space
1902 * @index: the page index 1902 * @index: the page index
1903 * @gfp: the page allocator flags to use if allocating 1903 * @gfp: the page allocator flags to use if allocating
1904 * 1904 *
1905 * This is the same as "read_mapping_page(mapping, index, NULL)", but with 1905 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1906 * any new page allocations done using the specified allocation flags. Note 1906 * any new page allocations done using the specified allocation flags. Note
1907 * that the radix tree operations will still use GFP_KERNEL, so you can't 1907 * that the radix tree operations will still use GFP_KERNEL, so you can't
1908 * expect to do this atomically or anything like that - but you can pass in 1908 * expect to do this atomically or anything like that - but you can pass in
1909 * other page requirements. 1909 * other page requirements.
1910 * 1910 *
1911 * If the page does not get brought uptodate, return -EIO. 1911 * If the page does not get brought uptodate, return -EIO.
1912 */ 1912 */
1913 struct page *read_cache_page_gfp(struct address_space *mapping, 1913 struct page *read_cache_page_gfp(struct address_space *mapping,
1914 pgoff_t index, 1914 pgoff_t index,
1915 gfp_t gfp) 1915 gfp_t gfp)
1916 { 1916 {
1917 filler_t *filler = (filler_t *)mapping->a_ops->readpage; 1917 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
1918 1918
1919 return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); 1919 return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
1920 } 1920 }
1921 EXPORT_SYMBOL(read_cache_page_gfp); 1921 EXPORT_SYMBOL(read_cache_page_gfp);
1922 1922
1923 /** 1923 /**
1924 * read_cache_page - read into page cache, fill it if needed 1924 * read_cache_page - read into page cache, fill it if needed
1925 * @mapping: the page's address_space 1925 * @mapping: the page's address_space
1926 * @index: the page index 1926 * @index: the page index
1927 * @filler: function to perform the read 1927 * @filler: function to perform the read
1928 * @data: first arg to filler(data, page) function, often left as NULL 1928 * @data: first arg to filler(data, page) function, often left as NULL
1929 * 1929 *
1930 * Read into the page cache. If a page already exists, and PageUptodate() is 1930 * Read into the page cache. If a page already exists, and PageUptodate() is
1931 * not set, try to fill the page then wait for it to become unlocked. 1931 * not set, try to fill the page then wait for it to become unlocked.
1932 * 1932 *
1933 * If the page does not get brought uptodate, return -EIO. 1933 * If the page does not get brought uptodate, return -EIO.
1934 */ 1934 */
1935 struct page *read_cache_page(struct address_space *mapping, 1935 struct page *read_cache_page(struct address_space *mapping,
1936 pgoff_t index, 1936 pgoff_t index,
1937 int (*filler)(void *, struct page *), 1937 int (*filler)(void *, struct page *),
1938 void *data) 1938 void *data)
1939 { 1939 {
1940 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); 1940 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
1941 } 1941 }
1942 EXPORT_SYMBOL(read_cache_page); 1942 EXPORT_SYMBOL(read_cache_page);
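/*
 * [Editorial example, not part of this file] A sketch of typical
 * read_cache_page() usage via the read_mapping_page() wrapper (which passes
 * the mapping's ->readpage as the filler).  "examplefs_" is hypothetical;
 * the caller is expected to kunmap() and page_cache_release() the page.
 */
static struct page *examplefs_get_meta_page(struct address_space *mapping,
					    pgoff_t index)
{
	struct page *page = read_mapping_page(mapping, index, NULL);

	if (!IS_ERR(page))
		kmap(page);	/* page is uptodate and unlocked here */
	return page;		/* ERR_PTR(-EIO) or ERR_PTR(-ENOMEM) on failure */
}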
1943 1943
1944 /* 1944 /*
1945 * The logic we want is 1945 * The logic we want is
1946 * 1946 *
1947 * if suid or (sgid and xgrp) 1947 * if suid or (sgid and xgrp)
1948 * remove privs 1948 * remove privs
1949 */ 1949 */
1950 int should_remove_suid(struct dentry *dentry) 1950 int should_remove_suid(struct dentry *dentry)
1951 { 1951 {
1952 mode_t mode = dentry->d_inode->i_mode; 1952 mode_t mode = dentry->d_inode->i_mode;
1953 int kill = 0; 1953 int kill = 0;
1954 1954
1955 /* suid always must be killed */ 1955 /* suid always must be killed */
1956 if (unlikely(mode & S_ISUID)) 1956 if (unlikely(mode & S_ISUID))
1957 kill = ATTR_KILL_SUID; 1957 kill = ATTR_KILL_SUID;
1958 1958
1959 /* 1959 /*
1960 * sgid without any exec bits is just a mandatory locking mark; leave 1960 * sgid without any exec bits is just a mandatory locking mark; leave
1961 * it alone. If some exec bits are set, it's a real sgid; kill it. 1961 * it alone. If some exec bits are set, it's a real sgid; kill it.
1962 */ 1962 */
1963 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 1963 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1964 kill |= ATTR_KILL_SGID; 1964 kill |= ATTR_KILL_SGID;
1965 1965
1966 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) 1966 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1967 return kill; 1967 return kill;
1968 1968
1969 return 0; 1969 return 0;
1970 } 1970 }
1971 EXPORT_SYMBOL(should_remove_suid); 1971 EXPORT_SYMBOL(should_remove_suid);
1972 1972
1973 static int __remove_suid(struct dentry *dentry, int kill) 1973 static int __remove_suid(struct dentry *dentry, int kill)
1974 { 1974 {
1975 struct iattr newattrs; 1975 struct iattr newattrs;
1976 1976
1977 newattrs.ia_valid = ATTR_FORCE | kill; 1977 newattrs.ia_valid = ATTR_FORCE | kill;
1978 return notify_change(dentry, &newattrs); 1978 return notify_change(dentry, &newattrs);
1979 } 1979 }
1980 1980
1981 int file_remove_suid(struct file *file) 1981 int file_remove_suid(struct file *file)
1982 { 1982 {
1983 struct dentry *dentry = file->f_path.dentry; 1983 struct dentry *dentry = file->f_path.dentry;
1984 struct inode *inode = dentry->d_inode; 1984 struct inode *inode = dentry->d_inode;
1985 int killsuid; 1985 int killsuid;
1986 int killpriv; 1986 int killpriv;
1987 int error = 0; 1987 int error = 0;
1988 1988
1989 /* Fast path for nothing security related */ 1989 /* Fast path for nothing security related */
1990 if (IS_NOSEC(inode)) 1990 if (IS_NOSEC(inode))
1991 return 0; 1991 return 0;
1992 1992
1993 killsuid = should_remove_suid(dentry); 1993 killsuid = should_remove_suid(dentry);
1994 killpriv = security_inode_need_killpriv(dentry); 1994 killpriv = security_inode_need_killpriv(dentry);
1995 1995
1996 if (killpriv < 0) 1996 if (killpriv < 0)
1997 return killpriv; 1997 return killpriv;
1998 if (killpriv) 1998 if (killpriv)
1999 error = security_inode_killpriv(dentry); 1999 error = security_inode_killpriv(dentry);
2000 if (!error && killsuid) 2000 if (!error && killsuid)
2001 error = __remove_suid(dentry, killsuid); 2001 error = __remove_suid(dentry, killsuid);
2002 if (!error && (inode->i_sb->s_flags & MS_NOSEC)) 2002 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
2003 inode->i_flags |= S_NOSEC; 2003 inode->i_flags |= S_NOSEC;
2004 2004
2005 return error; 2005 return error;
2006 } 2006 }
2007 EXPORT_SYMBOL(file_remove_suid); 2007 EXPORT_SYMBOL(file_remove_suid);
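/*
 * [Editorial example, not part of this file] A sketch of where
 * file_remove_suid() sits in a write path: it is called with i_mutex held,
 * before any data is copied, so a write by an unprivileged user drops the
 * setuid/setgid bits.  "examplefs_" is hypothetical and the actual data copy
 * is elided.
 */
static ssize_t examplefs_write_prologue(struct file *file)
{
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	mutex_lock(&inode->i_mutex);
	ret = file_remove_suid(file);
	if (!ret)
		file_update_time(file);
	/* ... the actual copy into the page cache would follow here ... */
	mutex_unlock(&inode->i_mutex);
	return ret;
}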
2008 2008
2009 static size_t __iovec_copy_from_user_inatomic(char *vaddr, 2009 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
2010 const struct iovec *iov, size_t base, size_t bytes) 2010 const struct iovec *iov, size_t base, size_t bytes)
2011 { 2011 {
2012 size_t copied = 0, left = 0; 2012 size_t copied = 0, left = 0;
2013 2013
2014 while (bytes) { 2014 while (bytes) {
2015 char __user *buf = iov->iov_base + base; 2015 char __user *buf = iov->iov_base + base;
2016 int copy = min(bytes, iov->iov_len - base); 2016 int copy = min(bytes, iov->iov_len - base);
2017 2017
2018 base = 0; 2018 base = 0;
2019 left = __copy_from_user_inatomic(vaddr, buf, copy); 2019 left = __copy_from_user_inatomic(vaddr, buf, copy);
2020 copied += copy; 2020 copied += copy;
2021 bytes -= copy; 2021 bytes -= copy;
2022 vaddr += copy; 2022 vaddr += copy;
2023 iov++; 2023 iov++;
2024 2024
2025 if (unlikely(left)) 2025 if (unlikely(left))
2026 break; 2026 break;
2027 } 2027 }
2028 return copied - left; 2028 return copied - left;
2029 } 2029 }
2030 2030
2031 /* 2031 /*
2032 * Copy as much as we can into the page and return the number of bytes which 2032 * Copy as much as we can into the page and return the number of bytes which
2033 * were successfully copied. If a fault is encountered then return the number of 2033 * were successfully copied. If a fault is encountered then return the number of
2034 * bytes which were copied before the fault. 2034 * bytes which were copied before the fault.
2035 */ 2035 */
2036 size_t iov_iter_copy_from_user_atomic(struct page *page, 2036 size_t iov_iter_copy_from_user_atomic(struct page *page,
2037 struct iov_iter *i, unsigned long offset, size_t bytes) 2037 struct iov_iter *i, unsigned long offset, size_t bytes)
2038 { 2038 {
2039 char *kaddr; 2039 char *kaddr;
2040 size_t copied; 2040 size_t copied;
2041 2041
2042 BUG_ON(!in_atomic()); 2042 BUG_ON(!in_atomic());
2043 kaddr = kmap_atomic(page, KM_USER0); 2043 kaddr = kmap_atomic(page, KM_USER0);
2044 if (likely(i->nr_segs == 1)) { 2044 if (likely(i->nr_segs == 1)) {
2045 int left; 2045 int left;
2046 char __user *buf = i->iov->iov_base + i->iov_offset; 2046 char __user *buf = i->iov->iov_base + i->iov_offset;
2047 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); 2047 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
2048 copied = bytes - left; 2048 copied = bytes - left;
2049 } else { 2049 } else {
2050 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 2050 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
2051 i->iov, i->iov_offset, bytes); 2051 i->iov, i->iov_offset, bytes);
2052 } 2052 }
2053 kunmap_atomic(kaddr, KM_USER0); 2053 kunmap_atomic(kaddr, KM_USER0);
2054 2054
2055 return copied; 2055 return copied;
2056 } 2056 }
2057 EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); 2057 EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
2058 2058
2059 /* 2059 /*
2060 * This has the same sideeffects and return value as 2060 * This has the same sideeffects and return value as
2061 * iov_iter_copy_from_user_atomic(). 2061 * iov_iter_copy_from_user_atomic().
2062 * The difference is that it attempts to resolve faults. 2062 * The difference is that it attempts to resolve faults.
2063 * Page must not be locked. 2063 * Page must not be locked.
2064 */ 2064 */
2065 size_t iov_iter_copy_from_user(struct page *page, 2065 size_t iov_iter_copy_from_user(struct page *page,
2066 struct iov_iter *i, unsigned long offset, size_t bytes) 2066 struct iov_iter *i, unsigned long offset, size_t bytes)
2067 { 2067 {
2068 char *kaddr; 2068 char *kaddr;
2069 size_t copied; 2069 size_t copied;
2070 2070
2071 kaddr = kmap(page); 2071 kaddr = kmap(page);
2072 if (likely(i->nr_segs == 1)) { 2072 if (likely(i->nr_segs == 1)) {
2073 int left; 2073 int left;
2074 char __user *buf = i->iov->iov_base + i->iov_offset; 2074 char __user *buf = i->iov->iov_base + i->iov_offset;
2075 left = __copy_from_user(kaddr + offset, buf, bytes); 2075 left = __copy_from_user(kaddr + offset, buf, bytes);
2076 copied = bytes - left; 2076 copied = bytes - left;
2077 } else { 2077 } else {
2078 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 2078 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
2079 i->iov, i->iov_offset, bytes); 2079 i->iov, i->iov_offset, bytes);
2080 } 2080 }
2081 kunmap(page); 2081 kunmap(page);
2082 return copied; 2082 return copied;
2083 } 2083 }
2084 EXPORT_SYMBOL(iov_iter_copy_from_user); 2084 EXPORT_SYMBOL(iov_iter_copy_from_user);
2085 2085
2086 void iov_iter_advance(struct iov_iter *i, size_t bytes) 2086 void iov_iter_advance(struct iov_iter *i, size_t bytes)
2087 { 2087 {
2088 BUG_ON(i->count < bytes); 2088 BUG_ON(i->count < bytes);
2089 2089
2090 if (likely(i->nr_segs == 1)) { 2090 if (likely(i->nr_segs == 1)) {
2091 i->iov_offset += bytes; 2091 i->iov_offset += bytes;
2092 i->count -= bytes; 2092 i->count -= bytes;
2093 } else { 2093 } else {
2094 const struct iovec *iov = i->iov; 2094 const struct iovec *iov = i->iov;
2095 size_t base = i->iov_offset; 2095 size_t base = i->iov_offset;
2096 2096
2097 /* 2097 /*
2098 * The !iov->iov_len check ensures we skip over unlikely 2098 * The !iov->iov_len check ensures we skip over unlikely
2099 * zero-length segments (without overrunning the iovec). 2099 * zero-length segments (without overrunning the iovec).
2100 */ 2100 */
2101 while (bytes || unlikely(i->count && !iov->iov_len)) { 2101 while (bytes || unlikely(i->count && !iov->iov_len)) {
2102 int copy; 2102 int copy;
2103 2103
2104 copy = min(bytes, iov->iov_len - base); 2104 copy = min(bytes, iov->iov_len - base);
2105 BUG_ON(!i->count || i->count < copy); 2105 BUG_ON(!i->count || i->count < copy);
2106 i->count -= copy; 2106 i->count -= copy;
2107 bytes -= copy; 2107 bytes -= copy;
2108 base += copy; 2108 base += copy;
2109 if (iov->iov_len == base) { 2109 if (iov->iov_len == base) {
2110 iov++; 2110 iov++;
2111 base = 0; 2111 base = 0;
2112 } 2112 }
2113 } 2113 }
2114 i->iov = iov; 2114 i->iov = iov;
2115 i->iov_offset = base; 2115 i->iov_offset = base;
2116 } 2116 }
2117 } 2117 }
2118 EXPORT_SYMBOL(iov_iter_advance); 2118 EXPORT_SYMBOL(iov_iter_advance);
2119 2119
2120 /* 2120 /*
2121 * Fault in the first iovec of the given iov_iter, to a maximum length 2121 * Fault in the first iovec of the given iov_iter, to a maximum length
2122 * of bytes. Returns 0 on success, or non-zero if the memory could not be 2122 * of bytes. Returns 0 on success, or non-zero if the memory could not be
2123 * accessed (i.e. because it is an invalid address). 2123 * accessed (i.e. because it is an invalid address).
2124 * 2124 *
2125 * writev-intensive code may want this to prefault several iovecs -- that 2125 * writev-intensive code may want this to prefault several iovecs -- that
2126 * would be possible (callers must not rely on the fact that _only_ the 2126 * would be possible (callers must not rely on the fact that _only_ the
2127 * first iovec will be faulted with the current implementation). 2127 * first iovec will be faulted with the current implementation).
2128 */ 2128 */
2129 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) 2129 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
2130 { 2130 {
2131 char __user *buf = i->iov->iov_base + i->iov_offset; 2131 char __user *buf = i->iov->iov_base + i->iov_offset;
2132 bytes = min(bytes, i->iov->iov_len - i->iov_offset); 2132 bytes = min(bytes, i->iov->iov_len - i->iov_offset);
2133 return fault_in_pages_readable(buf, bytes); 2133 return fault_in_pages_readable(buf, bytes);
2134 } 2134 }
2135 EXPORT_SYMBOL(iov_iter_fault_in_readable); 2135 EXPORT_SYMBOL(iov_iter_fault_in_readable);
2136 2136
2137 /* 2137 /*
2138 * Return the count of just the current iov_iter segment. 2138 * Return the count of just the current iov_iter segment.
2139 */ 2139 */
2140 size_t iov_iter_single_seg_count(struct iov_iter *i) 2140 size_t iov_iter_single_seg_count(struct iov_iter *i)
2141 { 2141 {
2142 const struct iovec *iov = i->iov; 2142 const struct iovec *iov = i->iov;
2143 if (i->nr_segs == 1) 2143 if (i->nr_segs == 1)
2144 return i->count; 2144 return i->count;
2145 else 2145 else
2146 return min(i->count, iov->iov_len - i->iov_offset); 2146 return min(i->count, iov->iov_len - i->iov_offset);
2147 } 2147 }
2148 EXPORT_SYMBOL(iov_iter_single_seg_count); 2148 EXPORT_SYMBOL(iov_iter_single_seg_count);
2149 2149
2150 /* 2150 /*
2151 * Performs necessary checks before doing a write 2151 * Performs necessary checks before doing a write
2152 * 2152 *
2153 * Can adjust writing position or amount of bytes to write. 2153 * Can adjust writing position or amount of bytes to write.
2154 * Returns an appropriate error code that the caller should return, or 2154 * Returns an appropriate error code that the caller should return, or
2155 * zero if the write should be allowed. 2155 * zero if the write should be allowed.
2156 */ 2156 */
2157 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) 2157 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
2158 { 2158 {
2159 struct inode *inode = file->f_mapping->host; 2159 struct inode *inode = file->f_mapping->host;
2160 unsigned long limit = rlimit(RLIMIT_FSIZE); 2160 unsigned long limit = rlimit(RLIMIT_FSIZE);
2161 2161
2162 if (unlikely(*pos < 0)) 2162 if (unlikely(*pos < 0))
2163 return -EINVAL; 2163 return -EINVAL;
2164 2164
2165 if (!isblk) { 2165 if (!isblk) {
2166 /* FIXME: this is for backwards compatibility with 2.4 */ 2166 /* FIXME: this is for backwards compatibility with 2.4 */
2167 if (file->f_flags & O_APPEND) 2167 if (file->f_flags & O_APPEND)
2168 *pos = i_size_read(inode); 2168 *pos = i_size_read(inode);
2169 2169
2170 if (limit != RLIM_INFINITY) { 2170 if (limit != RLIM_INFINITY) {
2171 if (*pos >= limit) { 2171 if (*pos >= limit) {
2172 send_sig(SIGXFSZ, current, 0); 2172 send_sig(SIGXFSZ, current, 0);
2173 return -EFBIG; 2173 return -EFBIG;
2174 } 2174 }
2175 if (*count > limit - (typeof(limit))*pos) { 2175 if (*count > limit - (typeof(limit))*pos) {
2176 *count = limit - (typeof(limit))*pos; 2176 *count = limit - (typeof(limit))*pos;
2177 } 2177 }
2178 } 2178 }
2179 } 2179 }
2180 2180
2181 /* 2181 /*
2182 * LFS rule 2182 * LFS rule
2183 */ 2183 */
2184 if (unlikely(*pos + *count > MAX_NON_LFS && 2184 if (unlikely(*pos + *count > MAX_NON_LFS &&
2185 !(file->f_flags & O_LARGEFILE))) { 2185 !(file->f_flags & O_LARGEFILE))) {
2186 if (*pos >= MAX_NON_LFS) { 2186 if (*pos >= MAX_NON_LFS) {
2187 return -EFBIG; 2187 return -EFBIG;
2188 } 2188 }
2189 if (*count > MAX_NON_LFS - (unsigned long)*pos) { 2189 if (*count > MAX_NON_LFS - (unsigned long)*pos) {
2190 *count = MAX_NON_LFS - (unsigned long)*pos; 2190 *count = MAX_NON_LFS - (unsigned long)*pos;
2191 } 2191 }
2192 } 2192 }
2193 2193
2194 /* 2194 /*
2195 * Are we about to exceed the fs block limit ? 2195 * Are we about to exceed the fs block limit ?
2196 * 2196 *
2197 * If we have written data it becomes a short write. If we have 2197 * If we have written data it becomes a short write. If we have
2198 * exceeded the limit without writing data, we send a signal and return -EFBIG. 2198 * exceeded the limit without writing data, we send a signal and return -EFBIG.
2199 * Linus' frestrict idea will clean these up nicely.. 2199 * Linus' frestrict idea will clean these up nicely..
2200 */ 2200 */
2201 if (likely(!isblk)) { 2201 if (likely(!isblk)) {
2202 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { 2202 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
2203 if (*count || *pos > inode->i_sb->s_maxbytes) { 2203 if (*count || *pos > inode->i_sb->s_maxbytes) {
2204 return -EFBIG; 2204 return -EFBIG;
2205 } 2205 }
2206 /* zero-length writes at ->s_maxbytes are OK */ 2206 /* zero-length writes at ->s_maxbytes are OK */
2207 } 2207 }
2208 2208
2209 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) 2209 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
2210 *count = inode->i_sb->s_maxbytes - *pos; 2210 *count = inode->i_sb->s_maxbytes - *pos;
2211 } else { 2211 } else {
2212 #ifdef CONFIG_BLOCK 2212 #ifdef CONFIG_BLOCK
2213 loff_t isize; 2213 loff_t isize;
2214 if (bdev_read_only(I_BDEV(inode))) 2214 if (bdev_read_only(I_BDEV(inode)))
2215 return -EPERM; 2215 return -EPERM;
2216 isize = i_size_read(inode); 2216 isize = i_size_read(inode);
2217 if (*pos >= isize) { 2217 if (*pos >= isize) {
2218 if (*count || *pos > isize) 2218 if (*count || *pos > isize)
2219 return -ENOSPC; 2219 return -ENOSPC;
2220 } 2220 }
2221 2221
2222 if (*pos + *count > isize) 2222 if (*pos + *count > isize)
2223 *count = isize - *pos; 2223 *count = isize - *pos;
2224 #else 2224 #else
2225 return -EPERM; 2225 return -EPERM;
2226 #endif 2226 #endif
2227 } 2227 }
2228 return 0; 2228 return 0;
2229 } 2229 }
2230 EXPORT_SYMBOL(generic_write_checks); 2230 EXPORT_SYMBOL(generic_write_checks);
2231 2231
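To make the clamping above concrete: with RLIMIT_FSIZE set to 1 MiB, a 64 KiB write to a regular file at pos = 1 MiB - 4 KiB is shortened to 4 KiB (count becomes limit - pos and later turns into a short write), while a write whose starting position is at or beyond 1 MiB raises SIGXFSZ and fails with -EFBIG. The same shape of check is then repeated against MAX_NON_LFS for files opened without O_LARGEFILE, and against the filesystem's s_maxbytes.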
2232 int pagecache_write_begin(struct file *file, struct address_space *mapping, 2232 int pagecache_write_begin(struct file *file, struct address_space *mapping,
2233 loff_t pos, unsigned len, unsigned flags, 2233 loff_t pos, unsigned len, unsigned flags,
2234 struct page **pagep, void **fsdata) 2234 struct page **pagep, void **fsdata)
2235 { 2235 {
2236 const struct address_space_operations *aops = mapping->a_ops; 2236 const struct address_space_operations *aops = mapping->a_ops;
2237 2237
2238 return aops->write_begin(file, mapping, pos, len, flags, 2238 return aops->write_begin(file, mapping, pos, len, flags,
2239 pagep, fsdata); 2239 pagep, fsdata);
2240 } 2240 }
2241 EXPORT_SYMBOL(pagecache_write_begin); 2241 EXPORT_SYMBOL(pagecache_write_begin);
2242 2242
2243 int pagecache_write_end(struct file *file, struct address_space *mapping, 2243 int pagecache_write_end(struct file *file, struct address_space *mapping,
2244 loff_t pos, unsigned len, unsigned copied, 2244 loff_t pos, unsigned len, unsigned copied,
2245 struct page *page, void *fsdata) 2245 struct page *page, void *fsdata)
2246 { 2246 {
2247 const struct address_space_operations *aops = mapping->a_ops; 2247 const struct address_space_operations *aops = mapping->a_ops;
2248 2248
2249 mark_page_accessed(page); 2249 mark_page_accessed(page);
2250 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2250 return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
2251 } 2251 }
2252 EXPORT_SYMBOL(pagecache_write_end); 2252 EXPORT_SYMBOL(pagecache_write_end);
2253 2253
2254 ssize_t 2254 ssize_t
2255 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 2255 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2256 unsigned long *nr_segs, loff_t pos, loff_t *ppos, 2256 unsigned long *nr_segs, loff_t pos, loff_t *ppos,
2257 size_t count, size_t ocount) 2257 size_t count, size_t ocount)
2258 { 2258 {
2259 struct file *file = iocb->ki_filp; 2259 struct file *file = iocb->ki_filp;
2260 struct address_space *mapping = file->f_mapping; 2260 struct address_space *mapping = file->f_mapping;
2261 struct inode *inode = mapping->host; 2261 struct inode *inode = mapping->host;
2262 ssize_t written; 2262 ssize_t written;
2263 size_t write_len; 2263 size_t write_len;
2264 pgoff_t end; 2264 pgoff_t end;
2265 2265
2266 if (count != ocount) 2266 if (count != ocount)
2267 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2267 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2268 2268
2269 write_len = iov_length(iov, *nr_segs); 2269 write_len = iov_length(iov, *nr_segs);
2270 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; 2270 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2271 2271
2272 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); 2272 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
2273 if (written) 2273 if (written)
2274 goto out; 2274 goto out;
2275 2275
2276 /* 2276 /*
2277 * After a write we want buffered reads to be sure to go to disk to get 2277 * After a write we want buffered reads to be sure to go to disk to get
2278 * the new data. We invalidate clean cached pages from the region we're 2278 * the new data. We invalidate clean cached pages from the region we're
2279 * about to write. We do this *before* the write so that we can return 2279 * about to write. We do this *before* the write so that we can return
2280 * without clobbering -EIOCBQUEUED from ->direct_IO(). 2280 * without clobbering -EIOCBQUEUED from ->direct_IO().
2281 */ 2281 */
2282 if (mapping->nrpages) { 2282 if (mapping->nrpages) {
2283 written = invalidate_inode_pages2_range(mapping, 2283 written = invalidate_inode_pages2_range(mapping,
2284 pos >> PAGE_CACHE_SHIFT, end); 2284 pos >> PAGE_CACHE_SHIFT, end);
2285 /* 2285 /*
2286 * If a page can not be invalidated, return 0 to fall back 2286 * If a page can not be invalidated, return 0 to fall back
2287 * to buffered write. 2287 * to buffered write.
2288 */ 2288 */
2289 if (written) { 2289 if (written) {
2290 if (written == -EBUSY) 2290 if (written == -EBUSY)
2291 return 0; 2291 return 0;
2292 goto out; 2292 goto out;
2293 } 2293 }
2294 } 2294 }
2295 2295
2296 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2296 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2297 2297
2298 /* 2298 /*
2299 * Finally, try again to invalidate clean pages which might have been 2299 * Finally, try again to invalidate clean pages which might have been
2300 * cached by non-direct readahead, or faulted in by get_user_pages() 2300 * cached by non-direct readahead, or faulted in by get_user_pages()
2301 * if the source of the write was an mmap'ed region of the file 2301 * if the source of the write was an mmap'ed region of the file
2302 * we're writing. Either one is a pretty crazy thing to do, 2302 * we're writing. Either one is a pretty crazy thing to do,
2303 * so we don't support it 100%. If this invalidation 2303 * so we don't support it 100%. If this invalidation
2304 * fails, tough, the write still worked... 2304 * fails, tough, the write still worked...
2305 */ 2305 */
2306 if (mapping->nrpages) { 2306 if (mapping->nrpages) {
2307 invalidate_inode_pages2_range(mapping, 2307 invalidate_inode_pages2_range(mapping,
2308 pos >> PAGE_CACHE_SHIFT, end); 2308 pos >> PAGE_CACHE_SHIFT, end);
2309 } 2309 }
2310 2310
2311 if (written > 0) { 2311 if (written > 0) {
2312 pos += written; 2312 pos += written;
2313 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2313 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2314 i_size_write(inode, pos); 2314 i_size_write(inode, pos);
2315 mark_inode_dirty(inode); 2315 mark_inode_dirty(inode);
2316 } 2316 }
2317 *ppos = pos; 2317 *ppos = pos;
2318 } 2318 }
2319 out: 2319 out:
2320 return written; 2320 return written;
2321 } 2321 }
2322 EXPORT_SYMBOL(generic_file_direct_write); 2322 EXPORT_SYMBOL(generic_file_direct_write);
2323 2323
2324 /* 2324 /*
2325 * Find or create a page at the given pagecache position. Return the locked 2325 * Find or create a page at the given pagecache position. Return the locked
2326 * page. This function is specifically for buffered writes. 2326 * page. This function is specifically for buffered writes.
2327 */ 2327 */
2328 struct page *grab_cache_page_write_begin(struct address_space *mapping, 2328 struct page *grab_cache_page_write_begin(struct address_space *mapping,
2329 pgoff_t index, unsigned flags) 2329 pgoff_t index, unsigned flags)
2330 { 2330 {
2331 int status; 2331 int status;
2332 struct page *page; 2332 struct page *page;
2333 gfp_t gfp_notmask = 0; 2333 gfp_t gfp_notmask = 0;
2334 if (flags & AOP_FLAG_NOFS) 2334 if (flags & AOP_FLAG_NOFS)
2335 gfp_notmask = __GFP_FS; 2335 gfp_notmask = __GFP_FS;
2336 repeat: 2336 repeat:
2337 page = find_lock_page(mapping, index); 2337 page = find_lock_page(mapping, index);
2338 if (page) 2338 if (page)
2339 goto found; 2339 goto found;
2340 2340
2341 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); 2341 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
2342 if (!page) 2342 if (!page)
2343 return NULL; 2343 return NULL;
2344 status = add_to_page_cache_lru(page, mapping, index, 2344 status = add_to_page_cache_lru(page, mapping, index,
2345 GFP_KERNEL & ~gfp_notmask); 2345 GFP_KERNEL & ~gfp_notmask);
2346 if (unlikely(status)) { 2346 if (unlikely(status)) {
2347 page_cache_release(page); 2347 page_cache_release(page);
2348 if (status == -EEXIST) 2348 if (status == -EEXIST)
2349 goto repeat; 2349 goto repeat;
2350 return NULL; 2350 return NULL;
2351 } 2351 }
2352 found: 2352 found:
2353 wait_on_page_writeback(page); 2353 wait_on_page_writeback(page);
2354 return page; 2354 return page;
2355 } 2355 }
2356 EXPORT_SYMBOL(grab_cache_page_write_begin); 2356 EXPORT_SYMBOL(grab_cache_page_write_begin);
2357 2357
2358 static ssize_t generic_perform_write(struct file *file, 2358 static ssize_t generic_perform_write(struct file *file,
2359 struct iov_iter *i, loff_t pos) 2359 struct iov_iter *i, loff_t pos)
2360 { 2360 {
2361 struct address_space *mapping = file->f_mapping; 2361 struct address_space *mapping = file->f_mapping;
2362 const struct address_space_operations *a_ops = mapping->a_ops; 2362 const struct address_space_operations *a_ops = mapping->a_ops;
2363 long status = 0; 2363 long status = 0;
2364 ssize_t written = 0; 2364 ssize_t written = 0;
2365 unsigned int flags = 0; 2365 unsigned int flags = 0;
2366 2366
2367 /* 2367 /*
2368 * Copies from kernel address space cannot fail (NFSD is a big user). 2368 * Copies from kernel address space cannot fail (NFSD is a big user).
2369 */ 2369 */
2370 if (segment_eq(get_fs(), KERNEL_DS)) 2370 if (segment_eq(get_fs(), KERNEL_DS))
2371 flags |= AOP_FLAG_UNINTERRUPTIBLE; 2371 flags |= AOP_FLAG_UNINTERRUPTIBLE;
2372 2372
2373 do { 2373 do {
2374 struct page *page; 2374 struct page *page;
2375 unsigned long offset; /* Offset into pagecache page */ 2375 unsigned long offset; /* Offset into pagecache page */
2376 unsigned long bytes; /* Bytes to write to page */ 2376 unsigned long bytes; /* Bytes to write to page */
2377 size_t copied; /* Bytes copied from user */ 2377 size_t copied; /* Bytes copied from user */
2378 void *fsdata; 2378 void *fsdata;
2379 2379
2380 offset = (pos & (PAGE_CACHE_SIZE - 1)); 2380 offset = (pos & (PAGE_CACHE_SIZE - 1));
2381 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2381 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2382 iov_iter_count(i)); 2382 iov_iter_count(i));
2383 2383
2384 again: 2384 again:
2385 2385
2386 /* 2386 /*
2387 * Bring in the user page that we will copy from _first_. 2387 * Bring in the user page that we will copy from _first_.
2388 * Otherwise there's a nasty deadlock on copying from the 2388 * Otherwise there's a nasty deadlock on copying from the
2389 * same page as we're writing to, without it being marked 2389 * same page as we're writing to, without it being marked
2390 * up-to-date. 2390 * up-to-date.
2391 * 2391 *
2392 * Not only is this an optimisation, but it is also required 2392 * Not only is this an optimisation, but it is also required
2393 * to check that the address is actually valid, when atomic 2393 * to check that the address is actually valid, when atomic
2394 * usercopies are used, below. 2394 * usercopies are used, below.
2395 */ 2395 */
2396 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 2396 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2397 status = -EFAULT; 2397 status = -EFAULT;
2398 break; 2398 break;
2399 } 2399 }
2400 2400
2401 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2401 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2402 &page, &fsdata); 2402 &page, &fsdata);
2403 if (unlikely(status)) 2403 if (unlikely(status))
2404 break; 2404 break;
2405 2405
2406 if (mapping_writably_mapped(mapping)) 2406 if (mapping_writably_mapped(mapping))
2407 flush_dcache_page(page); 2407 flush_dcache_page(page);
2408 2408
2409 pagefault_disable(); 2409 pagefault_disable();
2410 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2410 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2411 pagefault_enable(); 2411 pagefault_enable();
2412 flush_dcache_page(page); 2412 flush_dcache_page(page);
2413 2413
2414 mark_page_accessed(page); 2414 mark_page_accessed(page);
2415 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2415 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2416 page, fsdata); 2416 page, fsdata);
2417 if (unlikely(status < 0)) 2417 if (unlikely(status < 0))
2418 break; 2418 break;
2419 copied = status; 2419 copied = status;
2420 2420
2421 cond_resched(); 2421 cond_resched();
2422 2422
2423 iov_iter_advance(i, copied); 2423 iov_iter_advance(i, copied);
2424 if (unlikely(copied == 0)) { 2424 if (unlikely(copied == 0)) {
2425 /* 2425 /*
2426 * If we were unable to copy any data at all, we must 2426 * If we were unable to copy any data at all, we must
2427 * fall back to a single segment length write. 2427 * fall back to a single segment length write.
2428 * 2428 *
2429 * If we didn't fallback here, we could livelock 2429 * If we didn't fallback here, we could livelock
2430 * because not all segments in the iov can be copied at 2430 * because not all segments in the iov can be copied at
2431 * once without a pagefault. 2431 * once without a pagefault.
2432 */ 2432 */
2433 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2433 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2434 iov_iter_single_seg_count(i)); 2434 iov_iter_single_seg_count(i));
2435 goto again; 2435 goto again;
2436 } 2436 }
2437 pos += copied; 2437 pos += copied;
2438 written += copied; 2438 written += copied;
2439 2439
2440 balance_dirty_pages_ratelimited(mapping); 2440 balance_dirty_pages_ratelimited(mapping);
2441 2441
2442 } while (iov_iter_count(i)); 2442 } while (iov_iter_count(i));
2443 2443
2444 return written ? written : status; 2444 return written ? written : status;
2445 } 2445 }
2446 2446
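Note how the two explanatory comments inside the loop work together: iov_iter_fault_in_readable() at the top of a pass only faults in the first segment, so the atomic usercopy can still come up short when the chunk spans further segments. A short copy simply advances the iterator and goes around again; only a copy of zero bytes shrinks the chunk to a single segment's worth before jumping back to again:, where that one segment gets prefaulted, and that fallback is what rules out the livelock the second comment warns about.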
2447 ssize_t 2447 ssize_t
2448 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, 2448 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2449 unsigned long nr_segs, loff_t pos, loff_t *ppos, 2449 unsigned long nr_segs, loff_t pos, loff_t *ppos,
2450 size_t count, ssize_t written) 2450 size_t count, ssize_t written)
2451 { 2451 {
2452 struct file *file = iocb->ki_filp; 2452 struct file *file = iocb->ki_filp;
2453 ssize_t status; 2453 ssize_t status;
2454 struct iov_iter i; 2454 struct iov_iter i;
2455 2455
2456 iov_iter_init(&i, iov, nr_segs, count, written); 2456 iov_iter_init(&i, iov, nr_segs, count, written);
2457 status = generic_perform_write(file, &i, pos); 2457 status = generic_perform_write(file, &i, pos);
2458 2458
2459 if (likely(status >= 0)) { 2459 if (likely(status >= 0)) {
2460 written += status; 2460 written += status;
2461 *ppos = pos + status; 2461 *ppos = pos + status;
2462 } 2462 }
2463 2463
2464 return written ? written : status; 2464 return written ? written : status;
2465 } 2465 }
2466 EXPORT_SYMBOL(generic_file_buffered_write); 2466 EXPORT_SYMBOL(generic_file_buffered_write);
2467 2467
2468 /** 2468 /**
2469 * __generic_file_aio_write - write data to a file 2469 * __generic_file_aio_write - write data to a file
2470 * @iocb: IO state structure (file, offset, etc.) 2470 * @iocb: IO state structure (file, offset, etc.)
2471 * @iov: vector with data to write 2471 * @iov: vector with data to write
2472 * @nr_segs: number of segments in the vector 2472 * @nr_segs: number of segments in the vector
2473 * @ppos: position where to write 2473 * @ppos: position where to write
2474 * 2474 *
2475 * This function does all the work needed for actually writing data to a 2475 * This function does all the work needed for actually writing data to a
2476 * file. It does all basic checks, removes SUID from the file, updates 2476 * file. It does all basic checks, removes SUID from the file, updates
2477 * modification times and calls proper subroutines depending on whether we 2477 * modification times and calls proper subroutines depending on whether we
2478 * do direct IO or a standard buffered write. 2478 * do direct IO or a standard buffered write.
2479 * 2479 *
2480 * It expects i_mutex to be grabbed unless we work on a block device or similar 2480 * It expects i_mutex to be grabbed unless we work on a block device or similar
2481 * object which does not need locking at all. 2481 * object which does not need locking at all.
2482 * 2482 *
2483 * This function does *not* take care of syncing data in case of O_SYNC write. 2483 * This function does *not* take care of syncing data in case of O_SYNC write.
2484 * A caller has to handle it. This is mainly due to the fact that we want to 2484 * A caller has to handle it. This is mainly due to the fact that we want to
2485 * avoid syncing under i_mutex. 2485 * avoid syncing under i_mutex.
2486 */ 2486 */
2487 ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2487 ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2488 unsigned long nr_segs, loff_t *ppos) 2488 unsigned long nr_segs, loff_t *ppos)
2489 { 2489 {
2490 struct file *file = iocb->ki_filp; 2490 struct file *file = iocb->ki_filp;
2491 struct address_space * mapping = file->f_mapping; 2491 struct address_space * mapping = file->f_mapping;
2492 size_t ocount; /* original count */ 2492 size_t ocount; /* original count */
2493 size_t count; /* after file limit checks */ 2493 size_t count; /* after file limit checks */
2494 struct inode *inode = mapping->host; 2494 struct inode *inode = mapping->host;
2495 loff_t pos; 2495 loff_t pos;
2496 ssize_t written; 2496 ssize_t written;
2497 ssize_t err; 2497 ssize_t err;
2498 2498
2499 ocount = 0; 2499 ocount = 0;
2500 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 2500 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
2501 if (err) 2501 if (err)
2502 return err; 2502 return err;
2503 2503
2504 count = ocount; 2504 count = ocount;
2505 pos = *ppos; 2505 pos = *ppos;
2506 2506
2507 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2507 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2508 2508
2509 /* We can write back this queue in page reclaim */ 2509 /* We can write back this queue in page reclaim */
2510 current->backing_dev_info = mapping->backing_dev_info; 2510 current->backing_dev_info = mapping->backing_dev_info;
2511 written = 0; 2511 written = 0;
2512 2512
2513 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 2513 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2514 if (err) 2514 if (err)
2515 goto out; 2515 goto out;
2516 2516
2517 if (count == 0) 2517 if (count == 0)
2518 goto out; 2518 goto out;
2519 2519
2520 err = file_remove_suid(file); 2520 err = file_remove_suid(file);
2521 if (err) 2521 if (err)
2522 goto out; 2522 goto out;
2523 2523
2524 file_update_time(file); 2524 file_update_time(file);
2525 2525
2526 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2526 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2527 if (unlikely(file->f_flags & O_DIRECT)) { 2527 if (unlikely(file->f_flags & O_DIRECT)) {
2528 loff_t endbyte; 2528 loff_t endbyte;
2529 ssize_t written_buffered; 2529 ssize_t written_buffered;
2530 2530
2531 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, 2531 written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
2532 ppos, count, ocount); 2532 ppos, count, ocount);
2533 if (written < 0 || written == count) 2533 if (written < 0 || written == count)
2534 goto out; 2534 goto out;
2535 /* 2535 /*
2536 * direct-io write to a hole: fall through to buffered I/O 2536 * direct-io write to a hole: fall through to buffered I/O
2537 * for completing the rest of the request. 2537 * for completing the rest of the request.
2538 */ 2538 */
2539 pos += written; 2539 pos += written;
2540 count -= written; 2540 count -= written;
2541 written_buffered = generic_file_buffered_write(iocb, iov, 2541 written_buffered = generic_file_buffered_write(iocb, iov,
2542 nr_segs, pos, ppos, count, 2542 nr_segs, pos, ppos, count,
2543 written); 2543 written);
2544 /* 2544 /*
2545 * If generic_file_buffered_write() returned a synchronous error 2545 * If generic_file_buffered_write() returned a synchronous error
2546 * then we want to return the number of bytes which were 2546 * then we want to return the number of bytes which were
2547 * direct-written, or the error code if that was zero. Note 2547 * direct-written, or the error code if that was zero. Note
2548 * that this differs from normal direct-io semantics, which 2548 * that this differs from normal direct-io semantics, which
2549 * will return -EFOO even if some bytes were written. 2549 * will return -EFOO even if some bytes were written.
2550 */ 2550 */
2551 if (written_buffered < 0) { 2551 if (written_buffered < 0) {
2552 err = written_buffered; 2552 err = written_buffered;
2553 goto out; 2553 goto out;
2554 } 2554 }
2555 2555
2556 /* 2556 /*
2557 * We need to ensure that the page cache pages are written to 2557 * We need to ensure that the page cache pages are written to
2558 * disk and invalidated to preserve the expected O_DIRECT 2558 * disk and invalidated to preserve the expected O_DIRECT
2559 * semantics. 2559 * semantics.
2560 */ 2560 */
2561 endbyte = pos + written_buffered - written - 1; 2561 endbyte = pos + written_buffered - written - 1;
2562 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 2562 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
2563 if (err == 0) { 2563 if (err == 0) {
2564 written = written_buffered; 2564 written = written_buffered;
2565 invalidate_mapping_pages(mapping, 2565 invalidate_mapping_pages(mapping,
2566 pos >> PAGE_CACHE_SHIFT, 2566 pos >> PAGE_CACHE_SHIFT,
2567 endbyte >> PAGE_CACHE_SHIFT); 2567 endbyte >> PAGE_CACHE_SHIFT);
2568 } else { 2568 } else {
2569 /* 2569 /*
2570 * We don't know how much we wrote, so just return 2570 * We don't know how much we wrote, so just return
2571 * the number of bytes which were direct-written 2571 * the number of bytes which were direct-written
2572 */ 2572 */
2573 } 2573 }
2574 } else { 2574 } else {
2575 written = generic_file_buffered_write(iocb, iov, nr_segs, 2575 written = generic_file_buffered_write(iocb, iov, nr_segs,
2576 pos, ppos, count, written); 2576 pos, ppos, count, written);
2577 } 2577 }
2578 out: 2578 out:
2579 current->backing_dev_info = NULL; 2579 current->backing_dev_info = NULL;
2580 return written ? written : err; 2580 return written ? written : err;
2581 } 2581 }
2582 EXPORT_SYMBOL(__generic_file_aio_write); 2582 EXPORT_SYMBOL(__generic_file_aio_write);
2583 2583
2584 /** 2584 /**
2585 * generic_file_aio_write - write data to a file 2585 * generic_file_aio_write - write data to a file
2586 * @iocb: IO state structure 2586 * @iocb: IO state structure
2587 * @iov: vector with data to write 2587 * @iov: vector with data to write
2588 * @nr_segs: number of segments in the vector 2588 * @nr_segs: number of segments in the vector
2589 * @pos: position in file where to write 2589 * @pos: position in file where to write
2590 * 2590 *
2591 * This is a wrapper around __generic_file_aio_write() to be used by most 2591 * This is a wrapper around __generic_file_aio_write() to be used by most
2592 * filesystems. It takes care of syncing the file in case of an O_SYNC write 2592 * filesystems. It takes care of syncing the file in case of an O_SYNC write
2593 * and acquires i_mutex as needed. 2593 * and acquires i_mutex as needed.
2594 */ 2594 */
2595 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2595 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2596 unsigned long nr_segs, loff_t pos) 2596 unsigned long nr_segs, loff_t pos)
2597 { 2597 {
2598 struct file *file = iocb->ki_filp; 2598 struct file *file = iocb->ki_filp;
2599 struct inode *inode = file->f_mapping->host; 2599 struct inode *inode = file->f_mapping->host;
2600 struct blk_plug plug; 2600 struct blk_plug plug;
2601 ssize_t ret; 2601 ssize_t ret;
2602 2602
2603 BUG_ON(iocb->ki_pos != pos); 2603 BUG_ON(iocb->ki_pos != pos);
2604 2604
2605 mutex_lock(&inode->i_mutex); 2605 mutex_lock(&inode->i_mutex);
2606 blk_start_plug(&plug); 2606 blk_start_plug(&plug);
2607 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2607 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2608 mutex_unlock(&inode->i_mutex); 2608 mutex_unlock(&inode->i_mutex);
2609 2609
2610 if (ret > 0 || ret == -EIOCBQUEUED) { 2610 if (ret > 0 || ret == -EIOCBQUEUED) {
2611 ssize_t err; 2611 ssize_t err;
2612 2612
2613 err = generic_write_sync(file, pos, ret); 2613 err = generic_write_sync(file, pos, ret);
2614 if (err < 0 && ret > 0) 2614 if (err < 0 && ret > 0)
2615 ret = err; 2615 ret = err;
2616 } 2616 }
2617 blk_finish_plug(&plug); 2617 blk_finish_plug(&plug);
2618 return ret; 2618 return ret;
2619 } 2619 }
2620 EXPORT_SYMBOL(generic_file_aio_write); 2620 EXPORT_SYMBOL(generic_file_aio_write);
2621 2621
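As the kernel-doc above says, this is the entry point most filesystems plug in directly. A minimal sketch of the usual wiring for a simple filesystem of this era (the examplefs name is invented; the generic helpers referenced are the ones declared in <linux/fs.h>):

#include <linux/fs.h>

/*
 * Illustrative only: a simple filesystem lets the generic helpers do the
 * work -- generic_file_aio_write() then performs the checks, the
 * O_DIRECT/buffered split and the O_SYNC handling shown above.
 */
static const struct file_operations examplefs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = generic_file_aio_read,
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
        .splice_read    = generic_file_splice_read,
};

Filesystems that need their own locking or journalling around the write call __generic_file_aio_write() instead and take care of i_mutex and syncing themselves, as its kernel-doc above spells out.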
2622 /** 2622 /**
2623 * try_to_release_page() - release old fs-specific metadata on a page 2623 * try_to_release_page() - release old fs-specific metadata on a page
2624 * 2624 *
2625 * @page: the page which the kernel is trying to free 2625 * @page: the page which the kernel is trying to free
2626 * @gfp_mask: memory allocation flags (and I/O mode) 2626 * @gfp_mask: memory allocation flags (and I/O mode)
2627 * 2627 *
2628 * The address_space is asked to try to release any data against the page 2628 * The address_space is asked to try to release any data against the page
2629 * (presumably at page->private). If the release was successful, return `1'. 2629 * (presumably at page->private). If the release was successful, return `1'.
2630 * Otherwise return zero. 2630 * Otherwise return zero.
2631 * 2631 *
2632 * This may also be called if PG_fscache is set on a page, indicating that the 2632 * This may also be called if PG_fscache is set on a page, indicating that the
2633 * page is known to the local caching routines. 2633 * page is known to the local caching routines.
2634 * 2634 *
2635 * The @gfp_mask argument specifies whether I/O may be performed to release 2635 * The @gfp_mask argument specifies whether I/O may be performed to release
2636 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 2636 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2637 * 2637 *
2638 */ 2638 */
2639 int try_to_release_page(struct page *page, gfp_t gfp_mask) 2639 int try_to_release_page(struct page *page, gfp_t gfp_mask)
2640 { 2640 {
2641 struct address_space * const mapping = page->mapping; 2641 struct address_space * const mapping = page->mapping;
2642 2642
2643 BUG_ON(!PageLocked(page)); 2643 BUG_ON(!PageLocked(page));
2644 if (PageWriteback(page)) 2644 if (PageWriteback(page))
2645 return 0; 2645 return 0;
2646 2646
2647 if (mapping && mapping->a_ops->releasepage) 2647 if (mapping && mapping->a_ops->releasepage)
2648 return mapping->a_ops->releasepage(page, gfp_mask); 2648 return mapping->a_ops->releasepage(page, gfp_mask);
2649 return try_to_free_buffers(page); 2649 return try_to_free_buffers(page);
2650 } 2650 }
2651 2651
2652 EXPORT_SYMBOL(try_to_release_page); 2652 EXPORT_SYMBOL(try_to_release_page);
2653 2653
1 /* 1 /*
2 * mm/page-writeback.c 2 * mm/page-writeback.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 5 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
6 * 6 *
7 * Contains functions related to writing back dirty pages at the 7 * Contains functions related to writing back dirty pages at the
8 * address_space level. 8 * address_space level.
9 * 9 *
10 * 10Apr2002 Andrew Morton 10 * 10Apr2002 Andrew Morton
11 * Initial version 11 * Initial version
12 */ 12 */
13 13
14 #include <linux/kernel.h> 14 #include <linux/kernel.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/spinlock.h> 16 #include <linux/spinlock.h>
17 #include <linux/fs.h> 17 #include <linux/fs.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/slab.h> 20 #include <linux/slab.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/writeback.h> 22 #include <linux/writeback.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/backing-dev.h> 24 #include <linux/backing-dev.h>
25 #include <linux/task_io_accounting_ops.h> 25 #include <linux/task_io_accounting_ops.h>
26 #include <linux/blkdev.h> 26 #include <linux/blkdev.h>
27 #include <linux/mpage.h> 27 #include <linux/mpage.h>
28 #include <linux/rmap.h> 28 #include <linux/rmap.h>
29 #include <linux/percpu.h> 29 #include <linux/percpu.h>
30 #include <linux/notifier.h> 30 #include <linux/notifier.h>
31 #include <linux/smp.h> 31 #include <linux/smp.h>
32 #include <linux/sysctl.h> 32 #include <linux/sysctl.h>
33 #include <linux/cpu.h> 33 #include <linux/cpu.h>
34 #include <linux/syscalls.h> 34 #include <linux/syscalls.h>
35 #include <linux/buffer_head.h> 35 #include <linux/buffer_head.h>
36 #include <linux/pagevec.h> 36 #include <linux/pagevec.h>
37 #include <trace/events/writeback.h> 37 #include <trace/events/writeback.h>
38 38
39 /* 39 /*
40 * Sleep at most 200ms at a time in balance_dirty_pages().
41 */
42 #define MAX_PAUSE max(HZ/5, 1)
43
44 /*
45 * Estimate write bandwidth at 200ms intervals.
46 */
47 #define BANDWIDTH_INTERVAL max(HZ/5, 1)
48
49 /*
40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
41 * will look to see if it needs to force writeback or throttling. 51 * will look to see if it needs to force writeback or throttling.
42 */ 52 */
43 static long ratelimit_pages = 32; 53 static long ratelimit_pages = 32;
44 54
45 /* 55 /*
46 * When balance_dirty_pages decides that the caller needs to perform some 56 * When balance_dirty_pages decides that the caller needs to perform some
47 * non-background writeback, this is how many pages it will attempt to write. 57 * non-background writeback, this is how many pages it will attempt to write.
48 * It should be somewhat larger than dirtied pages to ensure that reasonably 58 * It should be somewhat larger than dirtied pages to ensure that reasonably
49 * large amounts of I/O are submitted. 59 * large amounts of I/O are submitted.
50 */ 60 */
51 static inline long sync_writeback_pages(unsigned long dirtied) 61 static inline long sync_writeback_pages(unsigned long dirtied)
52 { 62 {
53 if (dirtied < ratelimit_pages) 63 if (dirtied < ratelimit_pages)
54 dirtied = ratelimit_pages; 64 dirtied = ratelimit_pages;
55 65
56 return dirtied + dirtied / 2; 66 return dirtied + dirtied / 2;
57 } 67 }
58 68
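For example, with ratelimit_pages at its initial value of 32, a caller that dirtied only 8 pages is still asked to write 32 + 32/2 = 48 pages, while one that dirtied 1000 pages is asked for 1000 + 500 = 1500.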
59 /* The following parameters are exported via /proc/sys/vm */ 69 /* The following parameters are exported via /proc/sys/vm */
60 70
61 /* 71 /*
62 * Start background writeback (via writeback threads) at this percentage 72 * Start background writeback (via writeback threads) at this percentage
63 */ 73 */
64 int dirty_background_ratio = 10; 74 int dirty_background_ratio = 10;
65 75
66 /* 76 /*
67 * dirty_background_bytes starts at 0 (disabled) so that it is a function of 77 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
68 * dirty_background_ratio * the amount of dirtyable memory 78 * dirty_background_ratio * the amount of dirtyable memory
69 */ 79 */
70 unsigned long dirty_background_bytes; 80 unsigned long dirty_background_bytes;
71 81
72 /* 82 /*
73 * free highmem will not be subtracted from the total free memory 83 * free highmem will not be subtracted from the total free memory
74 * for calculating free ratios if vm_highmem_is_dirtyable is true 84 * for calculating free ratios if vm_highmem_is_dirtyable is true
75 */ 85 */
76 int vm_highmem_is_dirtyable; 86 int vm_highmem_is_dirtyable;
77 87
78 /* 88 /*
79 * The generator of dirty data starts writeback at this percentage 89 * The generator of dirty data starts writeback at this percentage
80 */ 90 */
81 int vm_dirty_ratio = 20; 91 int vm_dirty_ratio = 20;
82 92
83 /* 93 /*
84 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of 94 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
85 * vm_dirty_ratio * the amount of dirtyable memory 95 * vm_dirty_ratio * the amount of dirtyable memory
86 */ 96 */
87 unsigned long vm_dirty_bytes; 97 unsigned long vm_dirty_bytes;
88 98
89 /* 99 /*
90 * The interval between `kupdate'-style writebacks 100 * The interval between `kupdate'-style writebacks
91 */ 101 */
92 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ 102 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
93 103
94 /* 104 /*
95 * The longest time for which data is allowed to remain dirty 105 * The longest time for which data is allowed to remain dirty
96 */ 106 */
97 unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ 107 unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
98 108
99 /* 109 /*
100 * Flag that makes the machine dump writes/reads and block dirtyings. 110 * Flag that makes the machine dump writes/reads and block dirtyings.
101 */ 111 */
102 int block_dump; 112 int block_dump;
103 113
104 /* 114 /*
105 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: 115 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
106 * a full sync is triggered after this time elapses without any disk activity. 116 * a full sync is triggered after this time elapses without any disk activity.
107 */ 117 */
108 int laptop_mode; 118 int laptop_mode;
109 119
110 EXPORT_SYMBOL(laptop_mode); 120 EXPORT_SYMBOL(laptop_mode);
111 121
112 /* End of sysctl-exported parameters */ 122 /* End of sysctl-exported parameters */
113 123
124 unsigned long global_dirty_limit;
114 125
115 /* 126 /*
116 * Scale the writeback cache size proportional to the relative writeout speeds. 127 * Scale the writeback cache size proportional to the relative writeout speeds.
117 * 128 *
118 * We do this by keeping a floating proportion between BDIs, based on page 129 * We do this by keeping a floating proportion between BDIs, based on page
119 * writeback completions [end_page_writeback()]. Those devices that write out 130 * writeback completions [end_page_writeback()]. Those devices that write out
120 * pages fastest will get the larger share, while the slower will get a smaller 131 * pages fastest will get the larger share, while the slower will get a smaller
121 * share. 132 * share.
122 * 133 *
123 * We use page writeout completions because we are interested in getting rid of 134 * We use page writeout completions because we are interested in getting rid of
124 * dirty pages. Having them written out is the primary goal. 135 * dirty pages. Having them written out is the primary goal.
125 * 136 *
126 * We introduce a concept of time, a period over which we measure these events, 137 * We introduce a concept of time, a period over which we measure these events,
127 * because demand can/will vary over time. The length of this period itself is 138 * because demand can/will vary over time. The length of this period itself is
128 * measured in page writeback completions. 139 * measured in page writeback completions.
129 * 140 *
130 */ 141 */
131 static struct prop_descriptor vm_completions; 142 static struct prop_descriptor vm_completions;
132 static struct prop_descriptor vm_dirties; 143 static struct prop_descriptor vm_dirties;
133 144
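Concretely, if over the most recent measurement period one bdi completed, say, three times as many page writebacks as the only other bdi, the proportion code credits it with roughly three quarters of the total share, and the split keeps drifting as the relative writeout speeds change.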
134 /* 145 /*
135 * couple the period to the dirty_ratio: 146 * couple the period to the dirty_ratio:
136 * 147 *
137 * period/2 ~ roundup_pow_of_two(dirty limit) 148 * period/2 ~ roundup_pow_of_two(dirty limit)
138 */ 149 */
139 static int calc_period_shift(void) 150 static int calc_period_shift(void)
140 { 151 {
141 unsigned long dirty_total; 152 unsigned long dirty_total;
142 153
143 if (vm_dirty_bytes) 154 if (vm_dirty_bytes)
144 dirty_total = vm_dirty_bytes / PAGE_SIZE; 155 dirty_total = vm_dirty_bytes / PAGE_SIZE;
145 else 156 else
146 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 157 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
147 100; 158 100;
148 return 2 + ilog2(dirty_total - 1); 159 return 2 + ilog2(dirty_total - 1);
149 } 160 }
150 161
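As a worked example: with vm_dirty_ratio = 20 and 4 GiB of dirtyable memory (2^20 4 KiB pages), dirty_total is 209715 pages, ilog2(209714) is 17, so the shift comes out as 19; the period is then 2^19 completions and period/2 = 2^18 = 262144, which is exactly roundup_pow_of_two(209715), as the comment promises.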
151 /* 162 /*
152 * update the period when the dirty threshold changes. 163 * update the period when the dirty threshold changes.
153 */ 164 */
154 static void update_completion_period(void) 165 static void update_completion_period(void)
155 { 166 {
156 int shift = calc_period_shift(); 167 int shift = calc_period_shift();
157 prop_change_shift(&vm_completions, shift); 168 prop_change_shift(&vm_completions, shift);
158 prop_change_shift(&vm_dirties, shift); 169 prop_change_shift(&vm_dirties, shift);
159 } 170 }
160 171
161 int dirty_background_ratio_handler(struct ctl_table *table, int write, 172 int dirty_background_ratio_handler(struct ctl_table *table, int write,
162 void __user *buffer, size_t *lenp, 173 void __user *buffer, size_t *lenp,
163 loff_t *ppos) 174 loff_t *ppos)
164 { 175 {
165 int ret; 176 int ret;
166 177
167 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 178 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
168 if (ret == 0 && write) 179 if (ret == 0 && write)
169 dirty_background_bytes = 0; 180 dirty_background_bytes = 0;
170 return ret; 181 return ret;
171 } 182 }
172 183
173 int dirty_background_bytes_handler(struct ctl_table *table, int write, 184 int dirty_background_bytes_handler(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, 185 void __user *buffer, size_t *lenp,
175 loff_t *ppos) 186 loff_t *ppos)
176 { 187 {
177 int ret; 188 int ret;
178 189
179 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 190 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
180 if (ret == 0 && write) 191 if (ret == 0 && write)
181 dirty_background_ratio = 0; 192 dirty_background_ratio = 0;
182 return ret; 193 return ret;
183 } 194 }
184 195
185 int dirty_ratio_handler(struct ctl_table *table, int write, 196 int dirty_ratio_handler(struct ctl_table *table, int write,
186 void __user *buffer, size_t *lenp, 197 void __user *buffer, size_t *lenp,
187 loff_t *ppos) 198 loff_t *ppos)
188 { 199 {
189 int old_ratio = vm_dirty_ratio; 200 int old_ratio = vm_dirty_ratio;
190 int ret; 201 int ret;
191 202
192 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 203 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
193 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 204 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
194 update_completion_period(); 205 update_completion_period();
195 vm_dirty_bytes = 0; 206 vm_dirty_bytes = 0;
196 } 207 }
197 return ret; 208 return ret;
198 } 209 }
199 210
200 211
201 int dirty_bytes_handler(struct ctl_table *table, int write, 212 int dirty_bytes_handler(struct ctl_table *table, int write,
202 void __user *buffer, size_t *lenp, 213 void __user *buffer, size_t *lenp,
203 loff_t *ppos) 214 loff_t *ppos)
204 { 215 {
205 unsigned long old_bytes = vm_dirty_bytes; 216 unsigned long old_bytes = vm_dirty_bytes;
206 int ret; 217 int ret;
207 218
208 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 219 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
209 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 220 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
210 update_completion_period(); 221 update_completion_period();
211 vm_dirty_ratio = 0; 222 vm_dirty_ratio = 0;
212 } 223 }
213 return ret; 224 return ret;
214 } 225 }
215 226
216 /* 227 /*
217 * Increment the BDI's writeout completion count and the global writeout 228 * Increment the BDI's writeout completion count and the global writeout
218 * completion count. Called from test_clear_page_writeback(). 229 * completion count. Called from test_clear_page_writeback().
219 */ 230 */
220 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 231 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221 { 232 {
233 __inc_bdi_stat(bdi, BDI_WRITTEN);
222 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 234 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 bdi->max_prop_frac); 235 bdi->max_prop_frac);
224 } 236 }
225 237
226 void bdi_writeout_inc(struct backing_dev_info *bdi) 238 void bdi_writeout_inc(struct backing_dev_info *bdi)
227 { 239 {
228 unsigned long flags; 240 unsigned long flags;
229 241
230 local_irq_save(flags); 242 local_irq_save(flags);
231 __bdi_writeout_inc(bdi); 243 __bdi_writeout_inc(bdi);
232 local_irq_restore(flags); 244 local_irq_restore(flags);
233 } 245 }
234 EXPORT_SYMBOL_GPL(bdi_writeout_inc); 246 EXPORT_SYMBOL_GPL(bdi_writeout_inc);
235 247
236 void task_dirty_inc(struct task_struct *tsk) 248 void task_dirty_inc(struct task_struct *tsk)
237 { 249 {
238 prop_inc_single(&vm_dirties, &tsk->dirties); 250 prop_inc_single(&vm_dirties, &tsk->dirties);
239 } 251 }
240 252
241 /* 253 /*
242 * Obtain an accurate fraction of the BDI's portion. 254 * Obtain an accurate fraction of the BDI's portion.
243 */ 255 */
244 static void bdi_writeout_fraction(struct backing_dev_info *bdi, 256 static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 long *numerator, long *denominator) 257 long *numerator, long *denominator)
246 { 258 {
247 if (bdi_cap_writeback_dirty(bdi)) { 259 prop_fraction_percpu(&vm_completions, &bdi->completions,
248 prop_fraction_percpu(&vm_completions, &bdi->completions,
249 numerator, denominator); 260 numerator, denominator);
250 } else {
251 *numerator = 0;
252 *denominator = 1;
253 }
254 } 261 }
255 262
256 static inline void task_dirties_fraction(struct task_struct *tsk, 263 static inline void task_dirties_fraction(struct task_struct *tsk,
257 long *numerator, long *denominator) 264 long *numerator, long *denominator)
258 { 265 {
259 prop_fraction_single(&vm_dirties, &tsk->dirties, 266 prop_fraction_single(&vm_dirties, &tsk->dirties,
260 numerator, denominator); 267 numerator, denominator);
261 } 268 }
262 269
263 /* 270 /*
264 * task_dirty_limit - scale down dirty throttling threshold for one task 271 * task_dirty_limit - scale down dirty throttling threshold for one task
265 * 272 *
266 * task specific dirty limit: 273 * task specific dirty limit:
267 * 274 *
268 * dirty -= (dirty/8) * p_{t} 275 * dirty -= (dirty/8) * p_{t}
269 * 276 *
270 * To protect light/slow dirtying tasks from heavier/fast ones, we start 277 * To protect light/slow dirtying tasks from heavier/fast ones, we start
271 * throttling individual tasks before reaching the bdi dirty limit. 278 * throttling individual tasks before reaching the bdi dirty limit.
272 * Relatively low thresholds will be allocated to heavy dirtiers. So when 279 * Relatively low thresholds will be allocated to heavy dirtiers. So when
273 * dirty pages grow large, heavy dirtiers will be throttled first, which will 280 * dirty pages grow large, heavy dirtiers will be throttled first, which will
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough 281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled. 282 * dirty threshold may never get throttled.
276 */ 283 */
284 #define TASK_LIMIT_FRACTION 8
277 static unsigned long task_dirty_limit(struct task_struct *tsk, 285 static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty) 286 unsigned long bdi_dirty)
279 { 287 {
280 long numerator, denominator; 288 long numerator, denominator;
281 unsigned long dirty = bdi_dirty; 289 unsigned long dirty = bdi_dirty;
282 u64 inv = dirty >> 3; 290 u64 inv = dirty / TASK_LIMIT_FRACTION;
283 291
284 task_dirties_fraction(tsk, &numerator, &denominator); 292 task_dirties_fraction(tsk, &numerator, &denominator);
285 inv *= numerator; 293 inv *= numerator;
286 do_div(inv, denominator); 294 do_div(inv, denominator);
287 295
288 dirty -= inv; 296 dirty -= inv;
289 297
290 return max(dirty, bdi_dirty/2); 298 return max(dirty, bdi_dirty/2);
291 } 299 }
292 300
301 /* Minimum limit for any task */
302 static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
303 {
304 return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
305 }
306
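To see the task formula in plain numbers, here is the same arithmetic as a standalone userspace sketch (task_limit() is a made-up name; numerator/denominator stand in for the fraction that task_dirties_fraction() reports):

#include <stdio.h>

#define TASK_LIMIT_FRACTION 8

/* Same arithmetic as task_dirty_limit() above, in plain userspace C. */
static unsigned long task_limit(unsigned long bdi_dirty,
                                long numerator, long denominator)
{
        unsigned long dirty = bdi_dirty;
        unsigned long long inv = dirty / TASK_LIMIT_FRACTION;

        inv = inv * numerator / denominator;    /* (dirty/8) * p_task */
        dirty -= inv;

        return dirty > bdi_dirty / 2 ? dirty : bdi_dirty / 2;
}

int main(void)
{
        /* bdi limit of 800 pages: a task doing half of the recent dirtying
         * is throttled at 750 pages, one doing all of it at 700. */
        printf("%lu %lu\n", task_limit(800, 1, 2), task_limit(800, 1, 1));
        return 0;
}

Running it prints 750 700; a task responsible for all recent dirtying is pushed down to exactly task_min_dirty_limit(800).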
293 /* 307 /*
294 * 308 *
295 */ 309 */
296 static unsigned int bdi_min_ratio; 310 static unsigned int bdi_min_ratio;
297 311
298 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 312 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
299 { 313 {
300 int ret = 0; 314 int ret = 0;
301 315
302 spin_lock_bh(&bdi_lock); 316 spin_lock_bh(&bdi_lock);
303 if (min_ratio > bdi->max_ratio) { 317 if (min_ratio > bdi->max_ratio) {
304 ret = -EINVAL; 318 ret = -EINVAL;
305 } else { 319 } else {
306 min_ratio -= bdi->min_ratio; 320 min_ratio -= bdi->min_ratio;
307 if (bdi_min_ratio + min_ratio < 100) { 321 if (bdi_min_ratio + min_ratio < 100) {
308 bdi_min_ratio += min_ratio; 322 bdi_min_ratio += min_ratio;
309 bdi->min_ratio += min_ratio; 323 bdi->min_ratio += min_ratio;
310 } else { 324 } else {
311 ret = -EINVAL; 325 ret = -EINVAL;
312 } 326 }
313 } 327 }
314 spin_unlock_bh(&bdi_lock); 328 spin_unlock_bh(&bdi_lock);
315 329
316 return ret; 330 return ret;
317 } 331 }
318 332
319 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 333 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
320 { 334 {
321 int ret = 0; 335 int ret = 0;
322 336
323 if (max_ratio > 100) 337 if (max_ratio > 100)
324 return -EINVAL; 338 return -EINVAL;
325 339
326 spin_lock_bh(&bdi_lock); 340 spin_lock_bh(&bdi_lock);
327 if (bdi->min_ratio > max_ratio) { 341 if (bdi->min_ratio > max_ratio) {
328 ret = -EINVAL; 342 ret = -EINVAL;
329 } else { 343 } else {
330 bdi->max_ratio = max_ratio; 344 bdi->max_ratio = max_ratio;
331 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 345 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
332 } 346 }
333 spin_unlock_bh(&bdi_lock); 347 spin_unlock_bh(&bdi_lock);
334 348
335 return ret; 349 return ret;
336 } 350 }
337 EXPORT_SYMBOL(bdi_set_max_ratio); 351 EXPORT_SYMBOL(bdi_set_max_ratio);
338 352
339 /* 353 /*
340 * Work out the current dirty-memory clamping and background writeout 354 * Work out the current dirty-memory clamping and background writeout
341 * thresholds. 355 * thresholds.
342 * 356 *
343 * The main aim here is to lower them aggressively if there is a lot of mapped 357 * The main aim here is to lower them aggressively if there is a lot of mapped
344 * memory around, to avoid stressing page reclaim with lots of unreclaimable 358 * memory around, to avoid stressing page reclaim with lots of unreclaimable
345 * pages. It is better to clamp down on writers than to start swapping and 359 * pages. It is better to clamp down on writers than to start swapping and
346 * performing lots of scanning. 360 * performing lots of scanning.
347 * 361 *
348 * We only allow 1/2 of the currently-unmapped memory to be dirtied. 362 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
349 * 363 *
350 * We don't permit the clamping level to fall below 5% - that is getting rather 364 * We don't permit the clamping level to fall below 5% - that is getting rather
351 * excessive. 365 * excessive.
352 * 366 *
353 * We make sure that the background writeout level is below the adjusted 367 * We make sure that the background writeout level is below the adjusted
354 * clamping level. 368 * clamping level.
355 */ 369 */
356 370
357 static unsigned long highmem_dirtyable_memory(unsigned long total) 371 static unsigned long highmem_dirtyable_memory(unsigned long total)
358 { 372 {
359 #ifdef CONFIG_HIGHMEM 373 #ifdef CONFIG_HIGHMEM
360 int node; 374 int node;
361 unsigned long x = 0; 375 unsigned long x = 0;
362 376
363 for_each_node_state(node, N_HIGH_MEMORY) { 377 for_each_node_state(node, N_HIGH_MEMORY) {
364 struct zone *z = 378 struct zone *z =
365 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 379 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
366 380
367 x += zone_page_state(z, NR_FREE_PAGES) + 381 x += zone_page_state(z, NR_FREE_PAGES) +
368 zone_reclaimable_pages(z); 382 zone_reclaimable_pages(z);
369 } 383 }
370 /* 384 /*
371 * Make sure that the number of highmem pages is never larger 385 * Make sure that the number of highmem pages is never larger
372 * than the total amount of dirtyable memory. This can only 386 * than the total amount of dirtyable memory. This can only
373 * occur in very strange VM situations but we want to make sure 387 * occur in very strange VM situations but we want to make sure
374 * that this does not occur. 388 * that this does not occur.
375 */ 389 */
376 return min(x, total); 390 return min(x, total);
377 #else 391 #else
378 return 0; 392 return 0;
379 #endif 393 #endif
380 } 394 }
381 395
382 /** 396 /**
383 * determine_dirtyable_memory - amount of memory that may be used 397 * determine_dirtyable_memory - amount of memory that may be used
384 * 398 *
385 * Returns the number of pages that can currently be freed and used 399 * Returns the number of pages that can currently be freed and used
386 * by the kernel for direct mappings. 400 * by the kernel for direct mappings.
387 */ 401 */
388 unsigned long determine_dirtyable_memory(void) 402 unsigned long determine_dirtyable_memory(void)
389 { 403 {
390 unsigned long x; 404 unsigned long x;
391 405
392 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); 406 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
393 407
394 if (!vm_highmem_is_dirtyable) 408 if (!vm_highmem_is_dirtyable)
395 x -= highmem_dirtyable_memory(x); 409 x -= highmem_dirtyable_memory(x);
396 410
397 return x + 1; /* Ensure that we never return 0 */ 411 return x + 1; /* Ensure that we never return 0 */
398 } 412 }
399 413
414 static unsigned long hard_dirty_limit(unsigned long thresh)
415 {
416 return max(thresh, global_dirty_limit);
417 }
418
400 /* 419 /*
401 * global_dirty_limits - background-writeback and dirty-throttling thresholds 420 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 * 421 *
403 * Calculate the dirty thresholds based on sysctl parameters 422 * Calculate the dirty thresholds based on sysctl parameters
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes 423 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes 424 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and 425 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
407 * real-time tasks. 426 * real-time tasks.
408 */ 427 */
409 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) 428 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
410 { 429 {
411 unsigned long background; 430 unsigned long background;
412 unsigned long dirty; 431 unsigned long dirty;
413 unsigned long uninitialized_var(available_memory); 432 unsigned long uninitialized_var(available_memory);
414 struct task_struct *tsk; 433 struct task_struct *tsk;
415 434
416 if (!vm_dirty_bytes || !dirty_background_bytes) 435 if (!vm_dirty_bytes || !dirty_background_bytes)
417 available_memory = determine_dirtyable_memory(); 436 available_memory = determine_dirtyable_memory();
418 437
419 if (vm_dirty_bytes) 438 if (vm_dirty_bytes)
420 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 439 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
421 else 440 else
422 dirty = (vm_dirty_ratio * available_memory) / 100; 441 dirty = (vm_dirty_ratio * available_memory) / 100;
423 442
424 if (dirty_background_bytes) 443 if (dirty_background_bytes)
425 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); 444 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
426 else 445 else
427 background = (dirty_background_ratio * available_memory) / 100; 446 background = (dirty_background_ratio * available_memory) / 100;
428 447
429 if (background >= dirty) 448 if (background >= dirty)
430 background = dirty / 2; 449 background = dirty / 2;
431 tsk = current; 450 tsk = current;
432 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { 451 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
433 background += background / 4; 452 background += background / 4;
434 dirty += dirty / 4; 453 dirty += dirty / 4;
435 } 454 }
436 *pbackground = background; 455 *pbackground = background;
437 *pdirty = dirty; 456 *pdirty = dirty;
457 trace_global_dirty_state(background, dirty);
438 } 458 }
439 459
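
For reference, a minimal userspace sketch of the threshold arithmetic in global_dirty_limits() above, assuming hypothetical sysctl values and a made-up amount of dirtyable memory; the less_throttle flag stands in for PF_LESS_THROTTLE / rt_task():

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Hypothetical sysctl settings; zero *_bytes means "use the ratio". */
static unsigned long vm_dirty_bytes;
static int vm_dirty_ratio = 20;
static unsigned long dirty_background_bytes;
static int dirty_background_ratio = 10;

static void sketch_global_dirty_limits(unsigned long available_memory,
                                       int less_throttle,
                                       unsigned long *pbackground,
                                       unsigned long *pdirty)
{
        unsigned long dirty, background;

        if (vm_dirty_bytes)
                dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
        else
                dirty = vm_dirty_ratio * available_memory / 100;

        if (dirty_background_bytes)
                background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
        else
                background = dirty_background_ratio * available_memory / 100;

        if (background >= dirty)
                background = dirty / 2;
        if (less_throttle) {            /* PF_LESS_THROTTLE or rt task */
                background += background / 4;
                dirty += dirty / 4;
        }
        *pbackground = background;
        *pdirty = dirty;
}

int main(void)
{
        unsigned long bg, dirty;

        /* e.g. 1 GiB of dirtyable memory = 262144 pages */
        sketch_global_dirty_limits(262144, 0, &bg, &dirty);
        printf("background = %lu pages, dirty = %lu pages\n", bg, dirty);
        return 0;
}
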
440 /* 460 /**
441 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 461 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
462 * @bdi: the backing_dev_info to query
463 * @dirty: global dirty limit in pages
442 * 464 *
443 * Allocate high/low dirty limits to fast/slow devices, in order to prevent 465 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
466 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
467 * The "limit" in the name is not treated as a strict hard limit by
468 * balance_dirty_pages().
469 *
470 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 * - starving fast devices 471 * - starving fast devices
445 * - piling up dirty pages (that will take long time to sync) on slow devices 472 * - piling up dirty pages (that will take long time to sync) on slow devices
446 * 473 *
447 * The bdi's share of dirty limit will be adapting to its throughput and 474 * The bdi's share of dirty limit will be adapting to its throughput and
448 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. 475 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
449 */ 476 */
450 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) 477 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
451 { 478 {
452 u64 bdi_dirty; 479 u64 bdi_dirty;
453 long numerator, denominator; 480 long numerator, denominator;
454 481
455 /* 482 /*
456 * Calculate this BDI's share of the dirty ratio. 483 * Calculate this BDI's share of the dirty ratio.
457 */ 484 */
458 bdi_writeout_fraction(bdi, &numerator, &denominator); 485 bdi_writeout_fraction(bdi, &numerator, &denominator);
459 486
460 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; 487 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
461 bdi_dirty *= numerator; 488 bdi_dirty *= numerator;
462 do_div(bdi_dirty, denominator); 489 do_div(bdi_dirty, denominator);
463 490
464 bdi_dirty += (dirty * bdi->min_ratio) / 100; 491 bdi_dirty += (dirty * bdi->min_ratio) / 100;
465 if (bdi_dirty > (dirty * bdi->max_ratio) / 100) 492 if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
466 bdi_dirty = dirty * bdi->max_ratio / 100; 493 bdi_dirty = dirty * bdi->max_ratio / 100;
467 494
468 return bdi_dirty; 495 return bdi_dirty;
469 } 496 }
470 497
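
A userspace sketch of the share computation in bdi_dirty_limit(), with a made-up writeout fraction; bdi_min_ratio_total stands in for the global sum of registered min_ratio values, and min_ratio/max_ratio for the per-bdi bounds (all numbers hypothetical):

#include <stdio.h>

static unsigned long sketch_bdi_dirty_limit(unsigned long dirty,
                                            long numerator, long denominator,
                                            unsigned int bdi_min_ratio_total,
                                            unsigned int min_ratio,
                                            unsigned int max_ratio)
{
        unsigned long long bdi_dirty;

        /* share of the portion not reserved via min_ratio ... */
        bdi_dirty = dirty * (100 - bdi_min_ratio_total) / 100;
        bdi_dirty = bdi_dirty * numerator / denominator;
        /* ... plus this bdi's own reserved minimum, bounded by max_ratio */
        bdi_dirty += dirty * min_ratio / 100;
        if (bdi_dirty > dirty * max_ratio / 100)
                bdi_dirty = dirty * max_ratio / 100;
        return (unsigned long)bdi_dirty;
}

int main(void)
{
        /* global limit 100000 pages, this bdi did 3 of 10 recent writeouts */
        printf("bdi limit = %lu pages\n",
               sketch_bdi_dirty_limit(100000, 3, 10, 0, 0, 100));
        return 0;
}
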
498 static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
499 unsigned long elapsed,
500 unsigned long written)
501 {
502 const unsigned long period = roundup_pow_of_two(3 * HZ);
503 unsigned long avg = bdi->avg_write_bandwidth;
504 unsigned long old = bdi->write_bandwidth;
505 u64 bw;
506
507 /*
508 * bw = written * HZ / elapsed
509 *
510 * bw * elapsed + write_bandwidth * (period - elapsed)
511 * write_bandwidth = ---------------------------------------------------
512 * period
513 */
514 bw = written - bdi->written_stamp;
515 bw *= HZ;
516 if (unlikely(elapsed > period)) {
517 do_div(bw, elapsed);
518 avg = bw;
519 goto out;
520 }
521 bw += (u64)bdi->write_bandwidth * (period - elapsed);
522 bw >>= ilog2(period);
523
524 /*
525 * one more level of smoothing, for filtering out sudden spikes
526 */
527 if (avg > old && old >= (unsigned long)bw)
528 avg -= (avg - old) >> 3;
529
530 if (avg < old && old <= (unsigned long)bw)
531 avg += (old - avg) >> 3;
532
533 out:
534 bdi->write_bandwidth = bw;
535 bdi->avg_write_bandwidth = avg;
536 }
537
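
A worked example of the moving-average formula quoted in bdi_update_write_bandwidth() above, as a self-contained userspace sketch; the page counts, HZ and the previous estimate are illustrative only, and the right shift by ilog2(period) is written as a division by the power-of-two period:

#include <stdio.h>

#define HZ 1000UL

int main(void)
{
        const unsigned long period = 4096;      /* ~ roundup_pow_of_two(3 * HZ) */
        unsigned long write_bandwidth = 25000;  /* previous estimate, pages/s   */
        unsigned long elapsed = 200;            /* jiffies since last update    */
        unsigned long written = 6000;           /* pages completed meanwhile    */
        unsigned long long bw;

        /* bw * elapsed == written * HZ, per the formula in the comment */
        bw = (unsigned long long)written * HZ;
        bw += (unsigned long long)write_bandwidth * (period - elapsed);
        bw /= period;

        printf("instantaneous %lu pages/s, smoothed %llu pages/s\n",
               written * HZ / elapsed, bw);
        return 0;
}
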
471 /* 538 /*
539 * The global dirtyable memory and dirty threshold could be suddenly knocked
540 * down by a large amount (eg. on the startup of KVM in a swapless system).
541 * This may throw the system into deep dirty exceeded state and throttle
542 * heavy/light dirtiers alike. To retain good responsiveness, make
543 * global_dirty_limit track the knocked-down dirty threshold slowly
544 * instead of dropping to it in one step.
545 */
546 static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
547 {
548 unsigned long limit = global_dirty_limit;
549
550 /*
551 * Follow up in one step.
552 */
553 if (limit < thresh) {
554 limit = thresh;
555 goto update;
556 }
557
558 /*
559 * Follow down slowly. Use the higher one as the target, because thresh
560 * may drop below dirty. This is exactly the reason to introduce
561 * global_dirty_limit which is guaranteed to lie above the dirty pages.
562 */
563 thresh = max(thresh, dirty);
564 if (limit > thresh) {
565 limit -= (limit - thresh) >> 5;
566 goto update;
567 }
568 return;
569 update:
570 global_dirty_limit = limit;
571 }
572
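
The follow-up/follow-down behaviour of update_dirty_limit() can be seen in a small userspace sketch (illustrative numbers only): when the threshold is knocked down from 100000 to 20000 pages, the tracked limit decays towards it by roughly 1/32 per update instead of jumping:

#include <stdio.h>

static unsigned long limit = 100000;    /* stands in for global_dirty_limit */

static void sketch_update_dirty_limit(unsigned long thresh, unsigned long dirty)
{
        if (limit < thresh) {
                limit = thresh;                 /* follow up in one step */
                return;
        }
        thresh = thresh > dirty ? thresh : dirty;
        if (limit > thresh)
                limit -= (limit - thresh) >> 5; /* follow down slowly */
}

int main(void)
{
        int i;

        for (i = 0; i < 5; i++) {
                sketch_update_dirty_limit(20000, 15000);
                printf("after update %d: limit = %lu\n", i + 1, limit);
        }
        return 0;
}
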
573 static void global_update_bandwidth(unsigned long thresh,
574 unsigned long dirty,
575 unsigned long now)
576 {
577 static DEFINE_SPINLOCK(dirty_lock);
578 static unsigned long update_time;
579
580 /*
581 * check locklessly first to avoid taking the lock most of the time
582 */
583 if (time_before(now, update_time + BANDWIDTH_INTERVAL))
584 return;
585
586 spin_lock(&dirty_lock);
587 if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
588 update_dirty_limit(thresh, dirty);
589 update_time = now;
590 }
591 spin_unlock(&dirty_lock);
592 }
593
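
global_update_bandwidth() uses a common check-locklessly-then-recheck-under-lock pattern. A minimal userspace analogue, with a pthread mutex instead of a spinlock and wall-clock seconds instead of jiffies (compile with -pthread; purely illustrative):

#include <stdio.h>
#include <pthread.h>
#include <time.h>

static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;
static time_t update_time;
#define BANDWIDTH_INTERVAL 1    /* one second, instead of 200ms of jiffies */

static void sketch_global_update(void)
{
        time_t now = time(NULL);

        /* cheap lockless test first: most callers bail out here */
        if (now < update_time + BANDWIDTH_INTERVAL)
                return;

        pthread_mutex_lock(&dirty_lock);
        /* recheck: another thread may have updated while we waited */
        if (now >= update_time + BANDWIDTH_INTERVAL) {
                /* update_dirty_limit(...) would go here */
                update_time = now;
                puts("updated dirty limit");
        }
        pthread_mutex_unlock(&dirty_lock);
}

int main(void)
{
        sketch_global_update();
        sketch_global_update();         /* second call is rate-limited */
        return 0;
}
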
594 void __bdi_update_bandwidth(struct backing_dev_info *bdi,
595 unsigned long thresh,
596 unsigned long dirty,
597 unsigned long bdi_thresh,
598 unsigned long bdi_dirty,
599 unsigned long start_time)
600 {
601 unsigned long now = jiffies;
602 unsigned long elapsed = now - bdi->bw_time_stamp;
603 unsigned long written;
604
605 /*
606 * rate-limit, only update once every 200ms.
607 */
608 if (elapsed < BANDWIDTH_INTERVAL)
609 return;
610
611 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
612
613 /*
614 * Skip quiet periods when disk bandwidth is under-utilized.
615 * (at least 1s idle time between two flusher runs)
616 */
617 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
618 goto snapshot;
619
620 if (thresh)
621 global_update_bandwidth(thresh, dirty, now);
622
623 bdi_update_write_bandwidth(bdi, elapsed, written);
624
625 snapshot:
626 bdi->written_stamp = written;
627 bdi->bw_time_stamp = now;
628 }
629
630 static void bdi_update_bandwidth(struct backing_dev_info *bdi,
631 unsigned long thresh,
632 unsigned long dirty,
633 unsigned long bdi_thresh,
634 unsigned long bdi_dirty,
635 unsigned long start_time)
636 {
637 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
638 return;
639 spin_lock(&bdi->wb.list_lock);
640 __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
641 start_time);
642 spin_unlock(&bdi->wb.list_lock);
643 }
644
645 /*
472 * balance_dirty_pages() must be called by processes which are generating dirty 646 * balance_dirty_pages() must be called by processes which are generating dirty
473 * data. It looks at the number of dirty pages in the machine and will force 647 * data. It looks at the number of dirty pages in the machine and will force
474 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 648 * the caller to perform writeback if the system is over `vm_dirty_ratio'.
475 * If we're over `background_thresh' then the writeback threads are woken to 649 * If we're over `background_thresh' then the writeback threads are woken to
476 * perform some writeout. 650 * perform some writeout.
477 */ 651 */
478 static void balance_dirty_pages(struct address_space *mapping, 652 static void balance_dirty_pages(struct address_space *mapping,
479 unsigned long write_chunk) 653 unsigned long write_chunk)
480 { 654 {
481 long nr_reclaimable, bdi_nr_reclaimable; 655 unsigned long nr_reclaimable, bdi_nr_reclaimable;
482 long nr_writeback, bdi_nr_writeback; 656 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
657 unsigned long bdi_dirty;
483 unsigned long background_thresh; 658 unsigned long background_thresh;
484 unsigned long dirty_thresh; 659 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 660 unsigned long bdi_thresh;
661 unsigned long task_bdi_thresh;
662 unsigned long min_task_bdi_thresh;
486 unsigned long pages_written = 0; 663 unsigned long pages_written = 0;
487 unsigned long pause = 1; 664 unsigned long pause = 1;
488 bool dirty_exceeded = false; 665 bool dirty_exceeded = false;
666 bool clear_dirty_exceeded = true;
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 667 struct backing_dev_info *bdi = mapping->backing_dev_info;
668 unsigned long start_time = jiffies;
490 669
491 for (;;) { 670 for (;;) {
492 struct writeback_control wbc = {
493 .sync_mode = WB_SYNC_NONE,
494 .older_than_this = NULL,
495 .nr_to_write = write_chunk,
496 .range_cyclic = 1,
497 };
498
499 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 global_page_state(NR_UNSTABLE_NFS); 672 global_page_state(NR_UNSTABLE_NFS);
501 nr_writeback = global_page_state(NR_WRITEBACK); 673 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 674
503 global_dirty_limits(&background_thresh, &dirty_thresh); 675 global_dirty_limits(&background_thresh, &dirty_thresh);
504 676
505 /* 677 /*
506 * Throttle it only when the background writeback cannot 678 * Throttle it only when the background writeback cannot
507 * catch-up. This avoids (excessively) small writeouts 679 * catch-up. This avoids (excessively) small writeouts
508 * when the bdi limits are ramping up. 680 * when the bdi limits are ramping up.
509 */ 681 */
510 if (nr_reclaimable + nr_writeback <= 682 if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
511 (background_thresh + dirty_thresh) / 2)
512 break; 683 break;
513 684
514 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515 bdi_thresh = task_dirty_limit(current, bdi_thresh); 686 min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
687 task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 688
517 /* 689 /*
518 * In order to avoid the stacked BDI deadlock we need 690 * In order to avoid the stacked BDI deadlock we need
519 * to ensure we accurately count the 'dirty' pages when 691 * to ensure we accurately count the 'dirty' pages when
520 * the threshold is low. 692 * the threshold is low.
521 * 693 *
522 * Otherwise it would be possible to get thresh+n pages 694 * Otherwise it would be possible to get thresh+n pages
523 * reported dirty, even though there are thresh-m pages 695 * reported dirty, even though there are thresh-m pages
524 * actually dirty; with m+n sitting in the percpu 696 * actually dirty; with m+n sitting in the percpu
525 * deltas. 697 * deltas.
526 */ 698 */
527 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 699 if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 701 bdi_dirty = bdi_nr_reclaimable +
702 bdi_stat_sum(bdi, BDI_WRITEBACK);
530 } else { 703 } else {
531 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 705 bdi_dirty = bdi_nr_reclaimable +
706 bdi_stat(bdi, BDI_WRITEBACK);
533 } 707 }
534 708
535 /* 709 /*
536 * The bdi thresh is a somewhat "soft" limit derived from the 710 * The bdi thresh is a somewhat "soft" limit derived from the
537 * global "hard" limit. The former helps prevent a heavy-IO 711 * global "hard" limit. The former helps prevent a heavy-IO
538 * bdi or process from holding back light ones; the latter is 712 * bdi or process from holding back light ones; the latter is
539 * the last-resort safeguard. 713 * the last-resort safeguard.
540 */ 714 */
541 dirty_exceeded = 715 dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) 716 (nr_dirty > dirty_thresh);
543 || (nr_reclaimable + nr_writeback > dirty_thresh); 717 clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
718 (nr_dirty <= dirty_thresh);
544 719
545 if (!dirty_exceeded) 720 if (!dirty_exceeded)
546 break; 721 break;
547 722
548 if (!bdi->dirty_exceeded) 723 if (!bdi->dirty_exceeded)
549 bdi->dirty_exceeded = 1; 724 bdi->dirty_exceeded = 1;
550 725
726 bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
727 bdi_thresh, bdi_dirty, start_time);
728
551 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
552 * Unstable writes are a feature of certain networked 730 * Unstable writes are a feature of certain networked
553 * filesystems (i.e. NFS) in which data may have been 731 * filesystems (i.e. NFS) in which data may have been
554 * written to the server's write cache, but has not yet 732 * written to the server's write cache, but has not yet
555 * been flushed to permanent storage. 733 * been flushed to permanent storage.
556 * Only move pages to writeback if this bdi is over its 734 * Only move pages to writeback if this bdi is over its
557 * threshold otherwise wait until the disk writes catch 735 * threshold otherwise wait until the disk writes catch
558 * up. 736 * up.
559 */ 737 */
560 trace_wbc_balance_dirty_start(&wbc, bdi); 738 trace_balance_dirty_start(bdi);
561 if (bdi_nr_reclaimable > bdi_thresh) { 739 if (bdi_nr_reclaimable > task_bdi_thresh) {
562 writeback_inodes_wb(&bdi->wb, &wbc); 740 pages_written += writeback_inodes_wb(&bdi->wb,
563 pages_written += write_chunk - wbc.nr_to_write; 741 write_chunk);
564 trace_wbc_balance_dirty_written(&wbc, bdi); 742 trace_balance_dirty_written(bdi, pages_written);
565 if (pages_written >= write_chunk) 743 if (pages_written >= write_chunk)
566 break; /* We've done our duty */ 744 break; /* We've done our duty */
567 } 745 }
568 trace_wbc_balance_dirty_wait(&wbc, bdi);
569 __set_current_state(TASK_UNINTERRUPTIBLE); 746 __set_current_state(TASK_UNINTERRUPTIBLE);
570 io_schedule_timeout(pause); 747 io_schedule_timeout(pause);
748 trace_balance_dirty_wait(bdi);
571 749
750 dirty_thresh = hard_dirty_limit(dirty_thresh);
572 /* 751 /*
752 * max-pause area. If dirty exceeded but still within this
753 * area, no need to sleep for more than 200ms: (a) 8 pages per
754 * 200ms is typically more than enough to curb heavy dirtiers;
755 * (b) the pause time limit makes the dirtiers more responsive.
756 */
757 if (nr_dirty < dirty_thresh +
758 dirty_thresh / DIRTY_MAXPAUSE_AREA &&
759 time_after(jiffies, start_time + MAX_PAUSE))
760 break;
761 /*
762 * pass-good area. When some bdi gets blocked (eg. NFS server
763 * not responding), or write bandwidth dropped dramatically due
764 * to concurrent reads, or dirty threshold suddenly dropped and
765 * the dirty pages cannot be brought down anytime soon (eg. on
766 * slow USB stick), at least let go of the good bdi's.
767 */
768 if (nr_dirty < dirty_thresh +
769 dirty_thresh / DIRTY_PASSGOOD_AREA &&
770 bdi_dirty < bdi_thresh)
771 break;
772
773 /*
573 * Increase the delay for each loop, up to our previous 774 * Increase the delay for each loop, up to our previous
574 * default of taking a 100ms nap. 775 * default of taking a 100ms nap.
575 */ 776 */
576 pause <<= 1; 777 pause <<= 1;
577 if (pause > HZ / 10) 778 if (pause > HZ / 10)
578 pause = HZ / 10; 779 pause = HZ / 10;
579 } 780 }
580 781
581 if (!dirty_exceeded && bdi->dirty_exceeded) 782 /* Clear dirty_exceeded flag only when no task can exceed the limit */
783 if (clear_dirty_exceeded && bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 784 bdi->dirty_exceeded = 0;
583 785
584 if (writeback_in_progress(bdi)) 786 if (writeback_in_progress(bdi))
585 return; 787 return;
586 788
587 /* 789 /*
588 * In laptop mode, we wait until hitting the higher threshold before 790 * In laptop mode, we wait until hitting the higher threshold before
589 * starting background writeout, and then write out all the way down 791 * starting background writeout, and then write out all the way down
590 * to the lower threshold. So slow writers cause minimal disk activity. 792 * to the lower threshold. So slow writers cause minimal disk activity.
591 * 793 *
592 * In normal mode, we start background writeout at the lower 794 * In normal mode, we start background writeout at the lower
593 * background_thresh, to keep the amount of dirty memory low. 795 * background_thresh, to keep the amount of dirty memory low.
594 */ 796 */
595 if ((laptop_mode && pages_written) || 797 if ((laptop_mode && pages_written) ||
596 (!laptop_mode && (nr_reclaimable > background_thresh))) 798 (!laptop_mode && (nr_reclaimable > background_thresh)))
597 bdi_start_background_writeback(bdi); 799 bdi_start_background_writeback(bdi);
598 } 800 }
599 801
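
The throttling loop above sleeps with an exponentially growing pause, starting at one jiffy and capped at HZ/10; the max-pause and pass-good break conditions are omitted here. A minimal userspace sketch of just the backoff:

#include <stdio.h>

#define HZ 1000UL

int main(void)
{
        unsigned long pause = 1;
        int loop;

        for (loop = 0; loop < 10; loop++) {
                printf("loop %d: sleep %lu jiffies\n", loop, pause);
                /* io_schedule_timeout(pause) in the kernel */
                pause <<= 1;
                if (pause > HZ / 10)
                        pause = HZ / 10;
        }
        return 0;
}
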
600 void set_page_dirty_balance(struct page *page, int page_mkwrite) 802 void set_page_dirty_balance(struct page *page, int page_mkwrite)
601 { 803 {
602 if (set_page_dirty(page) || page_mkwrite) { 804 if (set_page_dirty(page) || page_mkwrite) {
603 struct address_space *mapping = page_mapping(page); 805 struct address_space *mapping = page_mapping(page);
604 806
605 if (mapping) 807 if (mapping)
606 balance_dirty_pages_ratelimited(mapping); 808 balance_dirty_pages_ratelimited(mapping);
607 } 809 }
608 } 810 }
609 811
610 static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; 812 static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
611 813
612 /** 814 /**
613 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 815 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
614 * @mapping: address_space which was dirtied 816 * @mapping: address_space which was dirtied
615 * @nr_pages_dirtied: number of pages which the caller has just dirtied 817 * @nr_pages_dirtied: number of pages which the caller has just dirtied
616 * 818 *
617 * Processes which are dirtying memory should call in here once for each page 819 * Processes which are dirtying memory should call in here once for each page
618 * which was newly dirtied. The function will periodically check the system's 820 * which was newly dirtied. The function will periodically check the system's
619 * dirty state and will initiate writeback if needed. 821 * dirty state and will initiate writeback if needed.
620 * 822 *
621 * On really big machines, get_writeback_state is expensive, so try to avoid 823 * On really big machines, get_writeback_state is expensive, so try to avoid
622 * calling it too often (ratelimiting). But once we're over the dirty memory 824 * calling it too often (ratelimiting). But once we're over the dirty memory
623 * limit we decrease the ratelimiting by a lot, to prevent individual processes 825 * limit we decrease the ratelimiting by a lot, to prevent individual processes
624 * from overshooting the limit by (ratelimit_pages) each. 826 * from overshooting the limit by (ratelimit_pages) each.
625 */ 827 */
626 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 828 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 unsigned long nr_pages_dirtied) 829 unsigned long nr_pages_dirtied)
628 { 830 {
831 struct backing_dev_info *bdi = mapping->backing_dev_info;
629 unsigned long ratelimit; 832 unsigned long ratelimit;
630 unsigned long *p; 833 unsigned long *p;
631 834
835 if (!bdi_cap_account_dirty(bdi))
836 return;
837
632 ratelimit = ratelimit_pages; 838 ratelimit = ratelimit_pages;
633 if (mapping->backing_dev_info->dirty_exceeded) 839 if (mapping->backing_dev_info->dirty_exceeded)
634 ratelimit = 8; 840 ratelimit = 8;
635 841
636 /* 842 /*
637 * Check the rate limiting. Also, we do not want to throttle real-time 843 * Check the rate limiting. Also, we do not want to throttle real-time
638 * tasks in balance_dirty_pages(). Period. 844 * tasks in balance_dirty_pages(). Period.
639 */ 845 */
640 preempt_disable(); 846 preempt_disable();
641 p = &__get_cpu_var(bdp_ratelimits); 847 p = &__get_cpu_var(bdp_ratelimits);
642 *p += nr_pages_dirtied; 848 *p += nr_pages_dirtied;
643 if (unlikely(*p >= ratelimit)) { 849 if (unlikely(*p >= ratelimit)) {
644 ratelimit = sync_writeback_pages(*p); 850 ratelimit = sync_writeback_pages(*p);
645 *p = 0; 851 *p = 0;
646 preempt_enable(); 852 preempt_enable();
647 balance_dirty_pages(mapping, ratelimit); 853 balance_dirty_pages(mapping, ratelimit);
648 return; 854 return;
649 } 855 }
650 preempt_enable(); 856 preempt_enable();
651 } 857 }
652 EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 858 EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
653 859
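
The ratelimiting idea above can be sketched in userspace with a single counter standing in for the per-CPU bdp_ratelimits (names and numbers are hypothetical; the real kernel derives the write chunk via sync_writeback_pages() rather than using the raw count):

#include <stdio.h>

static unsigned long bdp_counter;
static unsigned long ratelimit_pages = 1024;

static void sketch_balance(unsigned long write_chunk)
{
        printf("balance_dirty_pages(chunk=%lu)\n", write_chunk);
}

static void sketch_ratelimited(unsigned long nr_pages_dirtied, int dirty_exceeded)
{
        /* over the limit: drop the ratelimit so offenders are caught early */
        unsigned long ratelimit = dirty_exceeded ? 8 : ratelimit_pages;

        bdp_counter += nr_pages_dirtied;
        if (bdp_counter >= ratelimit) {
                unsigned long chunk = bdp_counter;

                bdp_counter = 0;
                sketch_balance(chunk);
        }
}

int main(void)
{
        int i;

        for (i = 0; i < 5000; i++)
                sketch_ratelimited(1, 0);       /* dirty one page at a time */
        return 0;
}
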
654 void throttle_vm_writeout(gfp_t gfp_mask) 860 void throttle_vm_writeout(gfp_t gfp_mask)
655 { 861 {
656 unsigned long background_thresh; 862 unsigned long background_thresh;
657 unsigned long dirty_thresh; 863 unsigned long dirty_thresh;
658 864
659 for ( ; ; ) { 865 for ( ; ; ) {
660 global_dirty_limits(&background_thresh, &dirty_thresh); 866 global_dirty_limits(&background_thresh, &dirty_thresh);
661 867
662 /* 868 /*
663 * Boost the allowable dirty threshold a bit for page 869 * Boost the allowable dirty threshold a bit for page
664 * allocators so they don't get DoS'ed by heavy writers 870 * allocators so they don't get DoS'ed by heavy writers
665 */ 871 */
666 dirty_thresh += dirty_thresh / 10; /* wheeee... */ 872 dirty_thresh += dirty_thresh / 10; /* wheeee... */
667 873
668 if (global_page_state(NR_UNSTABLE_NFS) + 874 if (global_page_state(NR_UNSTABLE_NFS) +
669 global_page_state(NR_WRITEBACK) <= dirty_thresh) 875 global_page_state(NR_WRITEBACK) <= dirty_thresh)
670 break; 876 break;
671 congestion_wait(BLK_RW_ASYNC, HZ/10); 877 congestion_wait(BLK_RW_ASYNC, HZ/10);
672 878
673 /* 879 /*
674 * The caller might hold locks which can prevent IO completion 880 * The caller might hold locks which can prevent IO completion
675 * or progress in the filesystem. So we cannot just sit here 881 * or progress in the filesystem. So we cannot just sit here
676 * waiting for IO to complete. 882 * waiting for IO to complete.
677 */ 883 */
678 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) 884 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
679 break; 885 break;
680 } 886 }
681 } 887 }
682 888
683 /* 889 /*
684 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 890 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
685 */ 891 */
686 int dirty_writeback_centisecs_handler(ctl_table *table, int write, 892 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
687 void __user *buffer, size_t *length, loff_t *ppos) 893 void __user *buffer, size_t *length, loff_t *ppos)
688 { 894 {
689 proc_dointvec(table, write, buffer, length, ppos); 895 proc_dointvec(table, write, buffer, length, ppos);
690 bdi_arm_supers_timer(); 896 bdi_arm_supers_timer();
691 return 0; 897 return 0;
692 } 898 }
693 899
694 #ifdef CONFIG_BLOCK 900 #ifdef CONFIG_BLOCK
695 void laptop_mode_timer_fn(unsigned long data) 901 void laptop_mode_timer_fn(unsigned long data)
696 { 902 {
697 struct request_queue *q = (struct request_queue *)data; 903 struct request_queue *q = (struct request_queue *)data;
698 int nr_pages = global_page_state(NR_FILE_DIRTY) + 904 int nr_pages = global_page_state(NR_FILE_DIRTY) +
699 global_page_state(NR_UNSTABLE_NFS); 905 global_page_state(NR_UNSTABLE_NFS);
700 906
701 /* 907 /*
702 * We want to write everything out, not just down to the dirty 908 * We want to write everything out, not just down to the dirty
703 * threshold 909 * threshold
704 */ 910 */
705 if (bdi_has_dirty_io(&q->backing_dev_info)) 911 if (bdi_has_dirty_io(&q->backing_dev_info))
706 bdi_start_writeback(&q->backing_dev_info, nr_pages); 912 bdi_start_writeback(&q->backing_dev_info, nr_pages);
707 } 913 }
708 914
709 /* 915 /*
710 * We've spun up the disk and we're in laptop mode: schedule writeback 916 * We've spun up the disk and we're in laptop mode: schedule writeback
711 * of all dirty data a few seconds from now. If the flush is already scheduled 917 * of all dirty data a few seconds from now. If the flush is already scheduled
712 * then push it back - the user is still using the disk. 918 * then push it back - the user is still using the disk.
713 */ 919 */
714 void laptop_io_completion(struct backing_dev_info *info) 920 void laptop_io_completion(struct backing_dev_info *info)
715 { 921 {
716 mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); 922 mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
717 } 923 }
718 924
719 /* 925 /*
720 * We're in laptop mode and we've just synced. The sync's writes will have 926 * We're in laptop mode and we've just synced. The sync's writes will have
721 * caused another writeback to be scheduled by laptop_io_completion. 927 * caused another writeback to be scheduled by laptop_io_completion.
722 * Nothing needs to be written back anymore, so we unschedule the writeback. 928 * Nothing needs to be written back anymore, so we unschedule the writeback.
723 */ 929 */
724 void laptop_sync_completion(void) 930 void laptop_sync_completion(void)
725 { 931 {
726 struct backing_dev_info *bdi; 932 struct backing_dev_info *bdi;
727 933
728 rcu_read_lock(); 934 rcu_read_lock();
729 935
730 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) 936 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
731 del_timer(&bdi->laptop_mode_wb_timer); 937 del_timer(&bdi->laptop_mode_wb_timer);
732 938
733 rcu_read_unlock(); 939 rcu_read_unlock();
734 } 940 }
735 #endif 941 #endif
736 942
737 /* 943 /*
738 * If ratelimit_pages is too high then we can get into dirty-data overload 944 * If ratelimit_pages is too high then we can get into dirty-data overload
739 * if a large number of processes all perform writes at the same time. 945 * if a large number of processes all perform writes at the same time.
740 * If it is too low then SMP machines will call the (expensive) 946 * If it is too low then SMP machines will call the (expensive)
741 * get_writeback_state too often. 947 * get_writeback_state too often.
742 * 948 *
743 * Here we set ratelimit_pages to a level which ensures that when all CPUs are 949 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
744 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory 950 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
745 * thresholds before writeback cuts in. 951 * thresholds before writeback cuts in.
746 * 952 *
747 * But the limit should not be set too high. Because it also controls the 953 * But the limit should not be set too high. Because it also controls the
748 * amount of memory which the balance_dirty_pages() caller has to write back. 954 * amount of memory which the balance_dirty_pages() caller has to write back.
749 * If this is too large then the caller will block on the IO queue all the 955 * If this is too large then the caller will block on the IO queue all the
750 * time. So limit it to four megabytes - the balance_dirty_pages() caller 956 * time. So limit it to four megabytes - the balance_dirty_pages() caller
751 * will write six megabyte chunks, max. 957 * will write six megabyte chunks, max.
752 */ 958 */
753 959
754 void writeback_set_ratelimit(void) 960 void writeback_set_ratelimit(void)
755 { 961 {
756 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); 962 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
757 if (ratelimit_pages < 16) 963 if (ratelimit_pages < 16)
758 ratelimit_pages = 16; 964 ratelimit_pages = 16;
759 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) 965 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
760 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; 966 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
761 } 967 }
762 968
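
For a concrete feel of the sizing above, a userspace sketch of writeback_set_ratelimit() for a hypothetical 4 GiB / 8-CPU machine, with the same [16 pages, 4 MB] clamp:

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL

static unsigned long sketch_ratelimit(unsigned long vm_total_pages,
                                      unsigned int num_online_cpus)
{
        unsigned long ratelimit_pages;

        ratelimit_pages = vm_total_pages / (num_online_cpus * 32);
        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
        if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
                ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
        return ratelimit_pages;
}

int main(void)
{
        /* 4 GiB of RAM (1048576 pages), 8 CPUs -> clamped to 1024 pages */
        printf("ratelimit_pages = %lu\n", sketch_ratelimit(1048576, 8));
        return 0;
}
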
763 static int __cpuinit 969 static int __cpuinit
764 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) 970 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
765 { 971 {
766 writeback_set_ratelimit(); 972 writeback_set_ratelimit();
767 return NOTIFY_DONE; 973 return NOTIFY_DONE;
768 } 974 }
769 975
770 static struct notifier_block __cpuinitdata ratelimit_nb = { 976 static struct notifier_block __cpuinitdata ratelimit_nb = {
771 .notifier_call = ratelimit_handler, 977 .notifier_call = ratelimit_handler,
772 .next = NULL, 978 .next = NULL,
773 }; 979 };
774 980
775 /* 981 /*
776 * Called early on to tune the page writeback dirty limits. 982 * Called early on to tune the page writeback dirty limits.
777 * 983 *
778 * We used to scale dirty pages according to how total memory 984 * We used to scale dirty pages according to how total memory
779 * related to pages that could be allocated for buffers (by 985 * related to pages that could be allocated for buffers (by
780 * comparing nr_free_buffer_pages() to vm_total_pages). 986 * comparing nr_free_buffer_pages() to vm_total_pages).
781 * 987 *
782 * However, that was when we used "dirty_ratio" to scale with 988 * However, that was when we used "dirty_ratio" to scale with
783 * all memory, and we don't do that any more. "dirty_ratio" 989 * all memory, and we don't do that any more. "dirty_ratio"
784 * is now applied to total non-HIGHPAGE memory (by subtracting 990 * is now applied to total non-HIGHPAGE memory (by subtracting
785 * totalhigh_pages from vm_total_pages), and as such we can't 991 * totalhigh_pages from vm_total_pages), and as such we can't
786 * get into the old insane situation any more where we had 992 * get into the old insane situation any more where we had
787 * large amounts of dirty pages compared to a small amount of 993 * large amounts of dirty pages compared to a small amount of
788 * non-HIGHMEM memory. 994 * non-HIGHMEM memory.
789 * 995 *
790 * But we might still want to scale the dirty_ratio by how 996 * But we might still want to scale the dirty_ratio by how
791 * much memory the box has.. 997 * much memory the box has..
792 */ 998 */
793 void __init page_writeback_init(void) 999 void __init page_writeback_init(void)
794 { 1000 {
795 int shift; 1001 int shift;
796 1002
797 writeback_set_ratelimit(); 1003 writeback_set_ratelimit();
798 register_cpu_notifier(&ratelimit_nb); 1004 register_cpu_notifier(&ratelimit_nb);
799 1005
800 shift = calc_period_shift(); 1006 shift = calc_period_shift();
801 prop_descriptor_init(&vm_completions, shift); 1007 prop_descriptor_init(&vm_completions, shift);
802 prop_descriptor_init(&vm_dirties, shift); 1008 prop_descriptor_init(&vm_dirties, shift);
803 } 1009 }
804 1010
805 /** 1011 /**
806 * tag_pages_for_writeback - tag pages to be written by write_cache_pages 1012 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
807 * @mapping: address space structure to write 1013 * @mapping: address space structure to write
808 * @start: starting page index 1014 * @start: starting page index
809 * @end: ending page index (inclusive) 1015 * @end: ending page index (inclusive)
810 * 1016 *
811 * This function scans the page range from @start to @end (inclusive) and tags 1017 * This function scans the page range from @start to @end (inclusive) and tags
812 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is 1018 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
813 * that write_cache_pages (or whoever calls this function) will then use 1019 * that write_cache_pages (or whoever calls this function) will then use
814 * TOWRITE tag to identify pages eligible for writeback. This mechanism is 1020 * TOWRITE tag to identify pages eligible for writeback. This mechanism is
815 * used to avoid livelocking of writeback by a process steadily creating new 1021 * used to avoid livelocking of writeback by a process steadily creating new
816 * dirty pages in the file (thus it is important for this function to be quick 1022 * dirty pages in the file (thus it is important for this function to be quick
817 * so that it can tag pages faster than a dirtying process can create them). 1023 * so that it can tag pages faster than a dirtying process can create them).
818 */ 1024 */
819 /* 1025 /*
820 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. 1026 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
821 */ 1027 */
822 void tag_pages_for_writeback(struct address_space *mapping, 1028 void tag_pages_for_writeback(struct address_space *mapping,
823 pgoff_t start, pgoff_t end) 1029 pgoff_t start, pgoff_t end)
824 { 1030 {
825 #define WRITEBACK_TAG_BATCH 4096 1031 #define WRITEBACK_TAG_BATCH 4096
826 unsigned long tagged; 1032 unsigned long tagged;
827 1033
828 do { 1034 do {
829 spin_lock_irq(&mapping->tree_lock); 1035 spin_lock_irq(&mapping->tree_lock);
830 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, 1036 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
831 &start, end, WRITEBACK_TAG_BATCH, 1037 &start, end, WRITEBACK_TAG_BATCH,
832 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); 1038 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
833 spin_unlock_irq(&mapping->tree_lock); 1039 spin_unlock_irq(&mapping->tree_lock);
834 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); 1040 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
835 cond_resched(); 1041 cond_resched();
836 /* We check 'start' to handle wrapping when end == ~0UL */ 1042 /* We check 'start' to handle wrapping when end == ~0UL */
837 } while (tagged >= WRITEBACK_TAG_BATCH && start); 1043 } while (tagged >= WRITEBACK_TAG_BATCH && start);
838 } 1044 }
839 EXPORT_SYMBOL(tag_pages_for_writeback); 1045 EXPORT_SYMBOL(tag_pages_for_writeback);
840 1046
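
The batching pattern in tag_pages_for_writeback(), shown as a userspace sketch that simply counts items instead of tagging radix-tree entries; the lock/unlock and cond_resched() points are marked as comments, and the range is made up:

#include <stdio.h>

#define BATCH 4096UL

int main(void)
{
        unsigned long start = 0, end = 10000;
        unsigned long tagged;

        do {
                unsigned long n = end - start + 1;

                /* spin_lock_irq(&mapping->tree_lock) in the kernel */
                tagged = n < BATCH ? n : BATCH;
                start += tagged;
                /* spin_unlock_irq(...); cond_resched(); */
                printf("tagged %lu items, next start %lu\n", tagged, start);
        } while (tagged >= BATCH && start && start <= end);

        return 0;
}
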
841 /** 1047 /**
842 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 1048 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
843 * @mapping: address space structure to write 1049 * @mapping: address space structure to write
844 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 1050 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
845 * @writepage: function called for each page 1051 * @writepage: function called for each page
846 * @data: data passed to writepage function 1052 * @data: data passed to writepage function
847 * 1053 *
848 * If a page is already under I/O, write_cache_pages() skips it, even 1054 * If a page is already under I/O, write_cache_pages() skips it, even
849 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 1055 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
850 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 1056 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
851 * and msync() need to guarantee that all the data which was dirty at the time 1057 * and msync() need to guarantee that all the data which was dirty at the time
852 * the call was made get new I/O started against them. If wbc->sync_mode is 1058 * the call was made get new I/O started against them. If wbc->sync_mode is
853 * WB_SYNC_ALL then we were called for data integrity and we must wait for 1059 * WB_SYNC_ALL then we were called for data integrity and we must wait for
854 * existing IO to complete. 1060 * existing IO to complete.
855 * 1061 *
856 * To avoid livelocks (when other process dirties new pages), we first tag 1062 * To avoid livelocks (when other process dirties new pages), we first tag
857 * pages which should be written back with TOWRITE tag and only then start 1063 * pages which should be written back with TOWRITE tag and only then start
858 * writing them. For data-integrity sync we have to be careful so that we do 1064 * writing them. For data-integrity sync we have to be careful so that we do
859 * not miss some pages (e.g., because some other process has cleared TOWRITE 1065 * not miss some pages (e.g., because some other process has cleared TOWRITE
860 * tag we set). The rule we follow is that TOWRITE tag can be cleared only 1066 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
861 * by the process clearing the DIRTY tag (and submitting the page for IO). 1067 * by the process clearing the DIRTY tag (and submitting the page for IO).
862 */ 1068 */
863 int write_cache_pages(struct address_space *mapping, 1069 int write_cache_pages(struct address_space *mapping,
864 struct writeback_control *wbc, writepage_t writepage, 1070 struct writeback_control *wbc, writepage_t writepage,
865 void *data) 1071 void *data)
866 { 1072 {
867 int ret = 0; 1073 int ret = 0;
868 int done = 0; 1074 int done = 0;
869 struct pagevec pvec; 1075 struct pagevec pvec;
870 int nr_pages; 1076 int nr_pages;
871 pgoff_t uninitialized_var(writeback_index); 1077 pgoff_t uninitialized_var(writeback_index);
872 pgoff_t index; 1078 pgoff_t index;
873 pgoff_t end; /* Inclusive */ 1079 pgoff_t end; /* Inclusive */
874 pgoff_t done_index; 1080 pgoff_t done_index;
875 int cycled; 1081 int cycled;
876 int range_whole = 0; 1082 int range_whole = 0;
877 int tag; 1083 int tag;
878 1084
879 pagevec_init(&pvec, 0); 1085 pagevec_init(&pvec, 0);
880 if (wbc->range_cyclic) { 1086 if (wbc->range_cyclic) {
881 writeback_index = mapping->writeback_index; /* prev offset */ 1087 writeback_index = mapping->writeback_index; /* prev offset */
882 index = writeback_index; 1088 index = writeback_index;
883 if (index == 0) 1089 if (index == 0)
884 cycled = 1; 1090 cycled = 1;
885 else 1091 else
886 cycled = 0; 1092 cycled = 0;
887 end = -1; 1093 end = -1;
888 } else { 1094 } else {
889 index = wbc->range_start >> PAGE_CACHE_SHIFT; 1095 index = wbc->range_start >> PAGE_CACHE_SHIFT;
890 end = wbc->range_end >> PAGE_CACHE_SHIFT; 1096 end = wbc->range_end >> PAGE_CACHE_SHIFT;
891 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 1097 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
892 range_whole = 1; 1098 range_whole = 1;
893 cycled = 1; /* ignore range_cyclic tests */ 1099 cycled = 1; /* ignore range_cyclic tests */
894 } 1100 }
895 if (wbc->sync_mode == WB_SYNC_ALL) 1101 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 tag = PAGECACHE_TAG_TOWRITE; 1102 tag = PAGECACHE_TAG_TOWRITE;
897 else 1103 else
898 tag = PAGECACHE_TAG_DIRTY; 1104 tag = PAGECACHE_TAG_DIRTY;
899 retry: 1105 retry:
900 if (wbc->sync_mode == WB_SYNC_ALL) 1106 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 tag_pages_for_writeback(mapping, index, end); 1107 tag_pages_for_writeback(mapping, index, end);
902 done_index = index; 1108 done_index = index;
903 while (!done && (index <= end)) { 1109 while (!done && (index <= end)) {
904 int i; 1110 int i;
905 1111
906 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 1112 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
907 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 1113 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
908 if (nr_pages == 0) 1114 if (nr_pages == 0)
909 break; 1115 break;
910 1116
911 for (i = 0; i < nr_pages; i++) { 1117 for (i = 0; i < nr_pages; i++) {
912 struct page *page = pvec.pages[i]; 1118 struct page *page = pvec.pages[i];
913 1119
914 /* 1120 /*
915 * At this point, the page may be truncated or 1121 * At this point, the page may be truncated or
916 * invalidated (changing page->mapping to NULL), or 1122 * invalidated (changing page->mapping to NULL), or
917 * even swizzled back from swapper_space to tmpfs file 1123 * even swizzled back from swapper_space to tmpfs file
918 * mapping. However, page->index will not change 1124 * mapping. However, page->index will not change
919 * because we have a reference on the page. 1125 * because we have a reference on the page.
920 */ 1126 */
921 if (page->index > end) { 1127 if (page->index > end) {
922 /* 1128 /*
923 * can't be range_cyclic (1st pass) because 1129 * can't be range_cyclic (1st pass) because
924 * end == -1 in that case. 1130 * end == -1 in that case.
925 */ 1131 */
926 done = 1; 1132 done = 1;
927 break; 1133 break;
928 } 1134 }
929 1135
930 done_index = page->index; 1136 done_index = page->index;
931 1137
932 lock_page(page); 1138 lock_page(page);
933 1139
934 /* 1140 /*
935 * Page truncated or invalidated. We can freely skip it 1141 * Page truncated or invalidated. We can freely skip it
936 * then, even for data integrity operations: the page 1142 * then, even for data integrity operations: the page
937 * has disappeared concurrently, so there could be no 1143 * has disappeared concurrently, so there could be no
938 * real expectation of this data integrity operation 1144 * real expectation of this data integrity operation
939 * even if there is now a new, dirty page at the same 1145 * even if there is now a new, dirty page at the same
940 * pagecache address. 1146 * pagecache address.
941 */ 1147 */
942 if (unlikely(page->mapping != mapping)) { 1148 if (unlikely(page->mapping != mapping)) {
943 continue_unlock: 1149 continue_unlock:
944 unlock_page(page); 1150 unlock_page(page);
945 continue; 1151 continue;
946 } 1152 }
947 1153
948 if (!PageDirty(page)) { 1154 if (!PageDirty(page)) {
949 /* someone wrote it for us */ 1155 /* someone wrote it for us */
950 goto continue_unlock; 1156 goto continue_unlock;
951 } 1157 }
952 1158
953 if (PageWriteback(page)) { 1159 if (PageWriteback(page)) {
954 if (wbc->sync_mode != WB_SYNC_NONE) 1160 if (wbc->sync_mode != WB_SYNC_NONE)
955 wait_on_page_writeback(page); 1161 wait_on_page_writeback(page);
956 else 1162 else
957 goto continue_unlock; 1163 goto continue_unlock;
958 } 1164 }
959 1165
960 BUG_ON(PageWriteback(page)); 1166 BUG_ON(PageWriteback(page));
961 if (!clear_page_dirty_for_io(page)) 1167 if (!clear_page_dirty_for_io(page))
962 goto continue_unlock; 1168 goto continue_unlock;
963 1169
964 trace_wbc_writepage(wbc, mapping->backing_dev_info); 1170 trace_wbc_writepage(wbc, mapping->backing_dev_info);
965 ret = (*writepage)(page, wbc, data); 1171 ret = (*writepage)(page, wbc, data);
966 if (unlikely(ret)) { 1172 if (unlikely(ret)) {
967 if (ret == AOP_WRITEPAGE_ACTIVATE) { 1173 if (ret == AOP_WRITEPAGE_ACTIVATE) {
968 unlock_page(page); 1174 unlock_page(page);
969 ret = 0; 1175 ret = 0;
970 } else { 1176 } else {
971 /* 1177 /*
972 * done_index is set past this page, 1178 * done_index is set past this page,
973 * so media errors will not choke 1179 * so media errors will not choke
974 * background writeout for the entire 1180 * background writeout for the entire
975 * file. This has consequences for 1181 * file. This has consequences for
976 * range_cyclic semantics (ie. it may 1182 * range_cyclic semantics (ie. it may
977 * not be suitable for data integrity 1183 * not be suitable for data integrity
978 * writeout). 1184 * writeout).
979 */ 1185 */
980 done_index = page->index + 1; 1186 done_index = page->index + 1;
981 done = 1; 1187 done = 1;
982 break; 1188 break;
983 } 1189 }
984 } 1190 }
985 1191
986 /* 1192 /*
987 * We stop writing back only if we are not doing 1193 * We stop writing back only if we are not doing
988 * integrity sync. In case of integrity sync we have to 1194 * integrity sync. In case of integrity sync we have to
989 * keep going until we have written all the pages 1195 * keep going until we have written all the pages
990 * we tagged for writeback prior to entering this loop. 1196 * we tagged for writeback prior to entering this loop.
991 */ 1197 */
992 if (--wbc->nr_to_write <= 0 && 1198 if (--wbc->nr_to_write <= 0 &&
993 wbc->sync_mode == WB_SYNC_NONE) { 1199 wbc->sync_mode == WB_SYNC_NONE) {
994 done = 1; 1200 done = 1;
995 break; 1201 break;
996 } 1202 }
997 } 1203 }
998 pagevec_release(&pvec); 1204 pagevec_release(&pvec);
999 cond_resched(); 1205 cond_resched();
1000 } 1206 }
1001 if (!cycled && !done) { 1207 if (!cycled && !done) {
1002 /* 1208 /*
1003 * range_cyclic: 1209 * range_cyclic:
1004 * We hit the last page and there is more work to be done: wrap 1210 * We hit the last page and there is more work to be done: wrap
1005 * back to the start of the file 1211 * back to the start of the file
1006 */ 1212 */
1007 cycled = 1; 1213 cycled = 1;
1008 index = 0; 1214 index = 0;
1009 end = writeback_index - 1; 1215 end = writeback_index - 1;
1010 goto retry; 1216 goto retry;
1011 } 1217 }
1012 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 1218 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
1013 mapping->writeback_index = done_index; 1219 mapping->writeback_index = done_index;
1014 1220
1015 return ret; 1221 return ret;
1016 } 1222 }
1017 EXPORT_SYMBOL(write_cache_pages); 1223 EXPORT_SYMBOL(write_cache_pages);
1018 1224
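
A userspace sketch of the range_cyclic handling in write_cache_pages(): writeback resumes at the remembered writeback_index, runs to the end of the file, then wraps around once to cover the skipped head of the file (page indices and the file size are made up):

#include <stdio.h>

static void writeback_range(unsigned long index, unsigned long end)
{
        printf("write pages %lu..%lu\n", index, end);
}

int main(void)
{
        unsigned long writeback_index = 700;    /* previous stopping point */
        unsigned long nr_pages = 1000;          /* hypothetical file size   */
        unsigned long index = writeback_index;
        unsigned long end = nr_pages - 1;
        int cycled = (index == 0);

        writeback_range(index, end);
        if (!cycled) {
                /* wrap back to the start of the file */
                writeback_range(0, writeback_index - 1);
        }
        return 0;
}
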
1019 /* 1225 /*
1020 * Function used by generic_writepages to call the real writepage 1226 * Function used by generic_writepages to call the real writepage
1021 * function and set the mapping flags on error 1227 * function and set the mapping flags on error
1022 */ 1228 */
1023 static int __writepage(struct page *page, struct writeback_control *wbc, 1229 static int __writepage(struct page *page, struct writeback_control *wbc,
1024 void *data) 1230 void *data)
1025 { 1231 {
1026 struct address_space *mapping = data; 1232 struct address_space *mapping = data;
1027 int ret = mapping->a_ops->writepage(page, wbc); 1233 int ret = mapping->a_ops->writepage(page, wbc);
1028 mapping_set_error(mapping, ret); 1234 mapping_set_error(mapping, ret);
1029 return ret; 1235 return ret;
1030 } 1236 }
1031 1237
1032 /** 1238 /**
1033 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. 1239 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
1034 * @mapping: address space structure to write 1240 * @mapping: address space structure to write
1035 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 1241 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1036 * 1242 *
1037 * This is a library function, which implements the writepages() 1243 * This is a library function, which implements the writepages()
1038 * address_space_operation. 1244 * address_space_operation.
1039 */ 1245 */
1040 int generic_writepages(struct address_space *mapping, 1246 int generic_writepages(struct address_space *mapping,
1041 struct writeback_control *wbc) 1247 struct writeback_control *wbc)
1042 { 1248 {
1043 struct blk_plug plug; 1249 struct blk_plug plug;
1044 int ret; 1250 int ret;
1045 1251
1046 /* deal with chardevs and other special file */ 1252 /* deal with chardevs and other special file */
1047 if (!mapping->a_ops->writepage) 1253 if (!mapping->a_ops->writepage)
1048 return 0; 1254 return 0;
1049 1255
1050 blk_start_plug(&plug); 1256 blk_start_plug(&plug);
1051 ret = write_cache_pages(mapping, wbc, __writepage, mapping); 1257 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
1052 blk_finish_plug(&plug); 1258 blk_finish_plug(&plug);
1053 return ret; 1259 return ret;
1054 } 1260 }
1055 1261
1056 EXPORT_SYMBOL(generic_writepages); 1262 EXPORT_SYMBOL(generic_writepages);
1057 1263
1058 int do_writepages(struct address_space *mapping, struct writeback_control *wbc) 1264 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
1059 { 1265 {
1060 int ret; 1266 int ret;
1061 1267
1062 if (wbc->nr_to_write <= 0) 1268 if (wbc->nr_to_write <= 0)
1063 return 0; 1269 return 0;
1064 if (mapping->a_ops->writepages) 1270 if (mapping->a_ops->writepages)
1065 ret = mapping->a_ops->writepages(mapping, wbc); 1271 ret = mapping->a_ops->writepages(mapping, wbc);
1066 else 1272 else
1067 ret = generic_writepages(mapping, wbc); 1273 ret = generic_writepages(mapping, wbc);
1068 return ret; 1274 return ret;
1069 } 1275 }
1070 1276
1071 /** 1277 /**
1072 * write_one_page - write out a single page and optionally wait on I/O 1278 * write_one_page - write out a single page and optionally wait on I/O
1073 * @page: the page to write 1279 * @page: the page to write
1074 * @wait: if true, wait on writeout 1280 * @wait: if true, wait on writeout
1075 * 1281 *
1076 * The page must be locked by the caller and will be unlocked upon return. 1282 * The page must be locked by the caller and will be unlocked upon return.
1077 * 1283 *
1078 * write_one_page() returns a negative error code if I/O failed. 1284 * write_one_page() returns a negative error code if I/O failed.
1079 */ 1285 */
1080 int write_one_page(struct page *page, int wait) 1286 int write_one_page(struct page *page, int wait)
1081 { 1287 {
1082 struct address_space *mapping = page->mapping; 1288 struct address_space *mapping = page->mapping;
1083 int ret = 0; 1289 int ret = 0;
1084 struct writeback_control wbc = { 1290 struct writeback_control wbc = {
1085 .sync_mode = WB_SYNC_ALL, 1291 .sync_mode = WB_SYNC_ALL,
1086 .nr_to_write = 1, 1292 .nr_to_write = 1,
1087 }; 1293 };
1088 1294
1089 BUG_ON(!PageLocked(page)); 1295 BUG_ON(!PageLocked(page));
1090 1296
1091 if (wait) 1297 if (wait)
1092 wait_on_page_writeback(page); 1298 wait_on_page_writeback(page);
1093 1299
1094 if (clear_page_dirty_for_io(page)) { 1300 if (clear_page_dirty_for_io(page)) {
1095 page_cache_get(page); 1301 page_cache_get(page);
1096 ret = mapping->a_ops->writepage(page, &wbc); 1302 ret = mapping->a_ops->writepage(page, &wbc);
1097 if (ret == 0 && wait) { 1303 if (ret == 0 && wait) {
1098 wait_on_page_writeback(page); 1304 wait_on_page_writeback(page);
1099 if (PageError(page)) 1305 if (PageError(page))
1100 ret = -EIO; 1306 ret = -EIO;
1101 } 1307 }
1102 page_cache_release(page); 1308 page_cache_release(page);
1103 } else { 1309 } else {
1104 unlock_page(page); 1310 unlock_page(page);
1105 } 1311 }
1106 return ret; 1312 return ret;
1107 } 1313 }
1108 EXPORT_SYMBOL(write_one_page); 1314 EXPORT_SYMBOL(write_one_page);
1109 1315
1110 /* 1316 /*
1111 * For address_spaces which do not use buffers nor write back. 1317 * For address_spaces which do not use buffers nor write back.
1112 */ 1318 */
1113 int __set_page_dirty_no_writeback(struct page *page) 1319 int __set_page_dirty_no_writeback(struct page *page)
1114 { 1320 {
1115 if (!PageDirty(page)) 1321 if (!PageDirty(page))
1116 return !TestSetPageDirty(page); 1322 return !TestSetPageDirty(page);
1117 return 0; 1323 return 0;
1118 } 1324 }
1119 1325
1120 /* 1326 /*
1121 * Helper function for set_page_dirty family. 1327 * Helper function for set_page_dirty family.
1122 * NOTE: This relies on being atomic wrt interrupts. 1328 * NOTE: This relies on being atomic wrt interrupts.
1123 */ 1329 */
1124 void account_page_dirtied(struct page *page, struct address_space *mapping) 1330 void account_page_dirtied(struct page *page, struct address_space *mapping)
1125 { 1331 {
1126 if (mapping_cap_account_dirty(mapping)) { 1332 if (mapping_cap_account_dirty(mapping)) {
1127 __inc_zone_page_state(page, NR_FILE_DIRTY); 1333 __inc_zone_page_state(page, NR_FILE_DIRTY);
1128 __inc_zone_page_state(page, NR_DIRTIED); 1334 __inc_zone_page_state(page, NR_DIRTIED);
1129 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1335 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1130 task_dirty_inc(current); 1336 task_dirty_inc(current);
1131 task_io_account_write(PAGE_CACHE_SIZE); 1337 task_io_account_write(PAGE_CACHE_SIZE);
1132 } 1338 }
1133 } 1339 }
1134 EXPORT_SYMBOL(account_page_dirtied); 1340 EXPORT_SYMBOL(account_page_dirtied);
1135 1341
1136 /* 1342 /*
1137 * Helper function for set_page_writeback family. 1343 * Helper function for set_page_writeback family.
1138 * NOTE: Unlike account_page_dirtied this does not rely on being atomic 1344 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
1139 * wrt interrupts. 1345 * wrt interrupts.
1140 */ 1346 */
1141 void account_page_writeback(struct page *page) 1347 void account_page_writeback(struct page *page)
1142 { 1348 {
1143 inc_zone_page_state(page, NR_WRITEBACK); 1349 inc_zone_page_state(page, NR_WRITEBACK);
1144 } 1350 }
1145 EXPORT_SYMBOL(account_page_writeback); 1351 EXPORT_SYMBOL(account_page_writeback);
1146 1352
1147 /* 1353 /*
1148 * For address_spaces which do not use buffers. Just tag the page as dirty in 1354 * For address_spaces which do not use buffers. Just tag the page as dirty in
1149 * its radix tree. 1355 * its radix tree.
1150 * 1356 *
1151 * This is also used when a single buffer is being dirtied: we want to set the 1357 * This is also used when a single buffer is being dirtied: we want to set the
1152 * page dirty in that case, but not all the buffers. This is a "bottom-up" 1358 * page dirty in that case, but not all the buffers. This is a "bottom-up"
1153 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. 1359 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
1154 * 1360 *
1155 * Most callers have locked the page, which pins the address_space in memory. 1361 * Most callers have locked the page, which pins the address_space in memory.
1156 * But zap_pte_range() does not lock the page, however in that case the 1362 * But zap_pte_range() does not lock the page, however in that case the
1157 * mapping is pinned by the vma's ->vm_file reference. 1363 * mapping is pinned by the vma's ->vm_file reference.
1158 * 1364 *
1159 * We take care to handle the case where the page was truncated from the 1365 * We take care to handle the case where the page was truncated from the
1160 * mapping by re-checking page_mapping() inside tree_lock. 1366 * mapping by re-checking page_mapping() inside tree_lock.
1161 */ 1367 */
1162 int __set_page_dirty_nobuffers(struct page *page) 1368 int __set_page_dirty_nobuffers(struct page *page)
1163 { 1369 {
1164 if (!TestSetPageDirty(page)) { 1370 if (!TestSetPageDirty(page)) {
1165 struct address_space *mapping = page_mapping(page); 1371 struct address_space *mapping = page_mapping(page);
1166 struct address_space *mapping2; 1372 struct address_space *mapping2;
1167 1373
1168 if (!mapping) 1374 if (!mapping)
1169 return 1; 1375 return 1;
1170 1376
1171 spin_lock_irq(&mapping->tree_lock); 1377 spin_lock_irq(&mapping->tree_lock);
1172 mapping2 = page_mapping(page); 1378 mapping2 = page_mapping(page);
1173 if (mapping2) { /* Race with truncate? */ 1379 if (mapping2) { /* Race with truncate? */
1174 BUG_ON(mapping2 != mapping); 1380 BUG_ON(mapping2 != mapping);
1175 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 1381 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
1176 account_page_dirtied(page, mapping); 1382 account_page_dirtied(page, mapping);
1177 radix_tree_tag_set(&mapping->page_tree, 1383 radix_tree_tag_set(&mapping->page_tree,
1178 page_index(page), PAGECACHE_TAG_DIRTY); 1384 page_index(page), PAGECACHE_TAG_DIRTY);
1179 } 1385 }
1180 spin_unlock_irq(&mapping->tree_lock); 1386 spin_unlock_irq(&mapping->tree_lock);
1181 if (mapping->host) { 1387 if (mapping->host) {
1182 /* !PageAnon && !swapper_space */ 1388 /* !PageAnon && !swapper_space */
1183 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1389 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1184 } 1390 }
1185 return 1; 1391 return 1;
1186 } 1392 }
1187 return 0; 1393 return 0;
1188 } 1394 }
1189 EXPORT_SYMBOL(__set_page_dirty_nobuffers); 1395 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
1190 1396
1191 /* 1397 /*
1192 * When a writepage implementation decides that it doesn't want to write this 1398 * When a writepage implementation decides that it doesn't want to write this
1193 * page for some reason, it should redirty the locked page via 1399 * page for some reason, it should redirty the locked page via
1194 * redirty_page_for_writepage() and it should then unlock the page and return 0 1400 * redirty_page_for_writepage() and it should then unlock the page and return 0
1195 */ 1401 */
1196 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) 1402 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
1197 { 1403 {
1198 wbc->pages_skipped++; 1404 wbc->pages_skipped++;
1199 return __set_page_dirty_nobuffers(page); 1405 return __set_page_dirty_nobuffers(page);
1200 } 1406 }
1201 EXPORT_SYMBOL(redirty_page_for_writepage); 1407 EXPORT_SYMBOL(redirty_page_for_writepage);
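
The comment above spells out the calling convention; here is a minimal ->writepage sketch that follows it, assuming a hypothetical examplefs_writepage and using sync_mode only as a stand-in for a real "cannot write this page right now" test:

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/writeback.h>

    /* Hypothetical ->writepage that declines to write and redirties instead. */
    static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
    {
            if (wbc->sync_mode == WB_SYNC_NONE) {
                    /* keep the page dirty and record that it was skipped */
                    redirty_page_for_writepage(wbc, page);
                    unlock_page(page);
                    return 0;
            }
            /* ... real writeout omitted ... */
            unlock_page(page);
            return 0;
    }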
1202 1408
1203 /* 1409 /*
1204 * Dirty a page. 1410 * Dirty a page.
1205 * 1411 *
1206 * For pages with a mapping this should be done under the page lock 1412 * For pages with a mapping this should be done under the page lock
1207 * for the benefit of asynchronous memory-error handling, which prefers a 1413 * for the benefit of asynchronous memory-error handling, which prefers a
1208 * consistent dirty state. This rule can be broken in some special cases, 1414 * consistent dirty state. This rule can be broken in some special cases,
1209 * but it is better not to. 1415 * but it is better not to.
1210 * 1416 *
1211 * If the mapping doesn't provide a set_page_dirty a_op, then 1417 * If the mapping doesn't provide a set_page_dirty a_op, then
1212 * just fall through and assume that it wants buffer_heads. 1418 * just fall through and assume that it wants buffer_heads.
1213 */ 1419 */
1214 int set_page_dirty(struct page *page) 1420 int set_page_dirty(struct page *page)
1215 { 1421 {
1216 struct address_space *mapping = page_mapping(page); 1422 struct address_space *mapping = page_mapping(page);
1217 1423
1218 if (likely(mapping)) { 1424 if (likely(mapping)) {
1219 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; 1425 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
1220 /* 1426 /*
1221 * readahead or lru_deactivate_page can leave PG_readahead/PG_reclaim 1427 * readahead or lru_deactivate_page can leave PG_readahead/PG_reclaim
1222 * set due to a race with end_page_writeback. 1428 * set due to a race with end_page_writeback.
1223 * For readahead: if the page is written, the flag is reset, so there 1429 * For readahead: if the page is written, the flag is reset, so there
1224 * is no problem. 1430 * is no problem.
1225 * For lru_deactivate_page: if the page is redirtied, the flag is 1431 * For lru_deactivate_page: if the page is redirtied, the flag is
1226 * reset, so there is no problem; but if the page is then used by 1432 * reset, so there is no problem; but if the page is then used by
1227 * readahead, the stale flag confuses readahead and makes it restart 1433 * readahead, the stale flag confuses readahead and makes it restart
1228 * its size ramp-up. That is only a minor problem. 1434 * its size ramp-up. That is only a minor problem.
1229 */ 1435 */
1230 ClearPageReclaim(page); 1436 ClearPageReclaim(page);
1231 #ifdef CONFIG_BLOCK 1437 #ifdef CONFIG_BLOCK
1232 if (!spd) 1438 if (!spd)
1233 spd = __set_page_dirty_buffers; 1439 spd = __set_page_dirty_buffers;
1234 #endif 1440 #endif
1235 return (*spd)(page); 1441 return (*spd)(page);
1236 } 1442 }
1237 if (!PageDirty(page)) { 1443 if (!PageDirty(page)) {
1238 if (!TestSetPageDirty(page)) 1444 if (!TestSetPageDirty(page))
1239 return 1; 1445 return 1;
1240 } 1446 }
1241 return 0; 1447 return 0;
1242 } 1448 }
1243 EXPORT_SYMBOL(set_page_dirty); 1449 EXPORT_SYMBOL(set_page_dirty);
1244 1450
1245 /* 1451 /*
1246 * set_page_dirty() is racy if the caller has no reference against 1452 * set_page_dirty() is racy if the caller has no reference against
1247 * page->mapping->host, and if the page is unlocked. This is because another 1453 * page->mapping->host, and if the page is unlocked. This is because another
1248 * CPU could truncate the page off the mapping and then free the mapping. 1454 * CPU could truncate the page off the mapping and then free the mapping.
1249 * 1455 *
1250 * Usually, the page _is_ locked, or the caller is a user-space process which 1456 * Usually, the page _is_ locked, or the caller is a user-space process which
1251 * holds a reference on the inode by having an open file. 1457 * holds a reference on the inode by having an open file.
1252 * 1458 *
1253 * In other cases, the page should be locked before running set_page_dirty(). 1459 * In other cases, the page should be locked before running set_page_dirty().
1254 */ 1460 */
1255 int set_page_dirty_lock(struct page *page) 1461 int set_page_dirty_lock(struct page *page)
1256 { 1462 {
1257 int ret; 1463 int ret;
1258 1464
1259 lock_page(page); 1465 lock_page(page);
1260 ret = set_page_dirty(page); 1466 ret = set_page_dirty(page);
1261 unlock_page(page); 1467 unlock_page(page);
1262 return ret; 1468 return ret;
1263 } 1469 }
1264 EXPORT_SYMBOL(set_page_dirty_lock); 1470 EXPORT_SYMBOL(set_page_dirty_lock);
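
set_page_dirty_lock() is the variant for callers that dirtied pages they do not hold locked, e.g. pages pinned with get_user_pages() and then written by DMA or by the kernel. A sketch of that release pattern (the helper name is invented, error handling omitted):

    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /*
     * Hypothetical cleanup helper: pages were pinned with get_user_pages()
     * and possibly written to; they are not locked here, so
     * set_page_dirty_lock() takes and drops the page lock itself.
     */
    static void example_release_user_pages(struct page **pages, int nr, int dirtied)
    {
            int i;

            for (i = 0; i < nr; i++) {
                    if (dirtied)
                            set_page_dirty_lock(pages[i]);
                    page_cache_release(pages[i]);
            }
    }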
1265 1471
1266 /* 1472 /*
1267 * Clear a page's dirty flag, while caring for dirty memory accounting. 1473 * Clear a page's dirty flag, while caring for dirty memory accounting.
1268 * Returns true if the page was previously dirty. 1474 * Returns true if the page was previously dirty.
1269 * 1475 *
1270 * This is for preparing to put the page under writeout. We leave the page 1476 * This is for preparing to put the page under writeout. We leave the page
1271 * tagged as dirty in the radix tree so that a concurrent write-for-sync 1477 * tagged as dirty in the radix tree so that a concurrent write-for-sync
1272 * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage 1478 * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
1273 * implementation will run either set_page_writeback() or set_page_dirty(), 1479 * implementation will run either set_page_writeback() or set_page_dirty(),
1274 * at which stage we bring the page's dirty flag and radix-tree dirty tag 1480 * at which stage we bring the page's dirty flag and radix-tree dirty tag
1275 * back into sync. 1481 * back into sync.
1276 * 1482 *
1277 * This incoherency between the page's dirty flag and radix-tree tag is 1483 * This incoherency between the page's dirty flag and radix-tree tag is
1278 * unfortunate, but it only exists while the page is locked. 1484 * unfortunate, but it only exists while the page is locked.
1279 */ 1485 */
1280 int clear_page_dirty_for_io(struct page *page) 1486 int clear_page_dirty_for_io(struct page *page)
1281 { 1487 {
1282 struct address_space *mapping = page_mapping(page); 1488 struct address_space *mapping = page_mapping(page);
1283 1489
1284 BUG_ON(!PageLocked(page)); 1490 BUG_ON(!PageLocked(page));
1285 1491
1286 if (mapping && mapping_cap_account_dirty(mapping)) { 1492 if (mapping && mapping_cap_account_dirty(mapping)) {
1287 /* 1493 /*
1288 * Yes, Virginia, this is indeed insane. 1494 * Yes, Virginia, this is indeed insane.
1289 * 1495 *
1290 * We use this sequence to make sure that 1496 * We use this sequence to make sure that
1291 * (a) we account for dirty stats properly 1497 * (a) we account for dirty stats properly
1292 * (b) we tell the low-level filesystem to 1498 * (b) we tell the low-level filesystem to
1293 * mark the whole page dirty if it was 1499 * mark the whole page dirty if it was
1294 * dirty in a pagetable. Only to then 1500 * dirty in a pagetable. Only to then
1295 * (c) clean the page again and return 1 to 1501 * (c) clean the page again and return 1 to
1296 * cause the writeback. 1502 * cause the writeback.
1297 * 1503 *
1298 * This way we avoid all nasty races with the 1504 * This way we avoid all nasty races with the
1299 * dirty bit in multiple places and clearing 1505 * dirty bit in multiple places and clearing
1300 * them concurrently from different threads. 1506 * them concurrently from different threads.
1301 * 1507 *
1302 * Note! Normally the "set_page_dirty(page)" 1508 * Note! Normally the "set_page_dirty(page)"
1303 * has no effect on the actual dirty bit - since 1509 * has no effect on the actual dirty bit - since
1304 * that will already usually be set. But we 1510 * that will already usually be set. But we
1305 * need the side effects, and it can help us 1511 * need the side effects, and it can help us
1306 * avoid races. 1512 * avoid races.
1307 * 1513 *
1308 * We basically use the page "master dirty bit" 1514 * We basically use the page "master dirty bit"
1309 * as a serialization point for all the different 1515 * as a serialization point for all the different
1310 * threads doing their things. 1516 * threads doing their things.
1311 */ 1517 */
1312 if (page_mkclean(page)) 1518 if (page_mkclean(page))
1313 set_page_dirty(page); 1519 set_page_dirty(page);
1314 /* 1520 /*
1315 * We carefully synchronise fault handlers against 1521 * We carefully synchronise fault handlers against
1316 * installing a dirty pte and marking the page dirty 1522 * installing a dirty pte and marking the page dirty
1317 * at this point. We do this by having them hold the 1523 * at this point. We do this by having them hold the
1318 * page lock at some point after installing their 1524 * page lock at some point after installing their
1319 * pte, but before marking the page dirty. 1525 * pte, but before marking the page dirty.
1320 * Pages are always locked coming in here, so we get 1526 * Pages are always locked coming in here, so we get
1321 * the desired exclusion. See mm/memory.c:do_wp_page() 1527 * the desired exclusion. See mm/memory.c:do_wp_page()
1322 * for more comments. 1528 * for more comments.
1323 */ 1529 */
1324 if (TestClearPageDirty(page)) { 1530 if (TestClearPageDirty(page)) {
1325 dec_zone_page_state(page, NR_FILE_DIRTY); 1531 dec_zone_page_state(page, NR_FILE_DIRTY);
1326 dec_bdi_stat(mapping->backing_dev_info, 1532 dec_bdi_stat(mapping->backing_dev_info,
1327 BDI_RECLAIMABLE); 1533 BDI_RECLAIMABLE);
1328 return 1; 1534 return 1;
1329 } 1535 }
1330 return 0; 1536 return 0;
1331 } 1537 }
1332 return TestClearPageDirty(page); 1538 return TestClearPageDirty(page);
1333 } 1539 }
1334 EXPORT_SYMBOL(clear_page_dirty_for_io); 1540 EXPORT_SYMBOL(clear_page_dirty_for_io);
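
For context, the prepare-for-writeout sequence described above looks roughly like this from a writeback caller's point of view. This is an editor's simplification of the write_cache_pages()/write_one_page() pattern, not a drop-in implementation:

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/writeback.h>

    /* Sketch: push one locked page to ->writepage if it is still dirty. */
    static int example_write_one(struct address_space *mapping,
                                 struct page *page,
                                 struct writeback_control *wbc)
    {
            int ret = 0;

            BUG_ON(!PageLocked(page));

            if (clear_page_dirty_for_io(page))
                    /* a typical ->writepage() sets PG_writeback and unlocks the page */
                    ret = mapping->a_ops->writepage(page, wbc);
            else
                    unlock_page(page);

            return ret;
    }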
1335 1541
1336 int test_clear_page_writeback(struct page *page) 1542 int test_clear_page_writeback(struct page *page)
1337 { 1543 {
1338 struct address_space *mapping = page_mapping(page); 1544 struct address_space *mapping = page_mapping(page);
1339 int ret; 1545 int ret;
1340 1546
1341 if (mapping) { 1547 if (mapping) {
1342 struct backing_dev_info *bdi = mapping->backing_dev_info; 1548 struct backing_dev_info *bdi = mapping->backing_dev_info;
1343 unsigned long flags; 1549 unsigned long flags;
1344 1550
1345 spin_lock_irqsave(&mapping->tree_lock, flags); 1551 spin_lock_irqsave(&mapping->tree_lock, flags);
1346 ret = TestClearPageWriteback(page); 1552 ret = TestClearPageWriteback(page);
1347 if (ret) { 1553 if (ret) {
1348 radix_tree_tag_clear(&mapping->page_tree, 1554 radix_tree_tag_clear(&mapping->page_tree,
1349 page_index(page), 1555 page_index(page),
1350 PAGECACHE_TAG_WRITEBACK); 1556 PAGECACHE_TAG_WRITEBACK);
1351 if (bdi_cap_account_writeback(bdi)) { 1557 if (bdi_cap_account_writeback(bdi)) {
1352 __dec_bdi_stat(bdi, BDI_WRITEBACK); 1558 __dec_bdi_stat(bdi, BDI_WRITEBACK);
1353 __bdi_writeout_inc(bdi); 1559 __bdi_writeout_inc(bdi);
1354 } 1560 }
1355 } 1561 }
1356 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1562 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1357 } else { 1563 } else {
1358 ret = TestClearPageWriteback(page); 1564 ret = TestClearPageWriteback(page);
1359 } 1565 }
1360 if (ret) { 1566 if (ret) {
1361 dec_zone_page_state(page, NR_WRITEBACK); 1567 dec_zone_page_state(page, NR_WRITEBACK);
1362 inc_zone_page_state(page, NR_WRITTEN); 1568 inc_zone_page_state(page, NR_WRITTEN);
1363 } 1569 }
1364 return ret; 1570 return ret;
1365 } 1571 }
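
test_clear_page_writeback() is normally reached through end_page_writeback() on I/O completion; the write side normally goes through set_page_writeback(), which ends up in test_set_page_writeback() below. A compressed sketch of that window around one page (example_submit_io() is a hypothetical stand-in for real bio submission):

    #include <linux/mm.h>
    #include <linux/page-flags.h>
    #include <linux/pagemap.h>

    /* Hypothetical stand-in for asynchronous I/O submission. */
    static void example_submit_io(struct page *page) { }

    static void example_start_writeout(struct page *page)
    {
            set_page_writeback(page);       /* tags the radix tree, see below */
            unlock_page(page);
            example_submit_io(page);
    }

    static void example_io_done(struct page *page)
    {
            /* clears the tag, adjusts NR_WRITEBACK/NR_WRITTEN, wakes waiters */
            end_page_writeback(page);
    }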
1366 1572
1367 int test_set_page_writeback(struct page *page) 1573 int test_set_page_writeback(struct page *page)
1368 { 1574 {
1369 struct address_space *mapping = page_mapping(page); 1575 struct address_space *mapping = page_mapping(page);
1370 int ret; 1576 int ret;
1371 1577
1372 if (mapping) { 1578 if (mapping) {
1373 struct backing_dev_info *bdi = mapping->backing_dev_info; 1579 struct backing_dev_info *bdi = mapping->backing_dev_info;
1374 unsigned long flags; 1580 unsigned long flags;
1375 1581
1376 spin_lock_irqsave(&mapping->tree_lock, flags); 1582 spin_lock_irqsave(&mapping->tree_lock, flags);
1377 ret = TestSetPageWriteback(page); 1583 ret = TestSetPageWriteback(page);
1378 if (!ret) { 1584 if (!ret) {
1379 radix_tree_tag_set(&mapping->page_tree, 1585 radix_tree_tag_set(&mapping->page_tree,
1380 page_index(page), 1586 page_index(page),
1381 PAGECACHE_TAG_WRITEBACK); 1587 PAGECACHE_TAG_WRITEBACK);
1382 if (bdi_cap_account_writeback(bdi)) 1588 if (bdi_cap_account_writeback(bdi))
1383 __inc_bdi_stat(bdi, BDI_WRITEBACK); 1589 __inc_bdi_stat(bdi, BDI_WRITEBACK);
1384 } 1590 }
1385 if (!PageDirty(page)) 1591 if (!PageDirty(page))
1386 radix_tree_tag_clear(&mapping->page_tree, 1592 radix_tree_tag_clear(&mapping->page_tree,
1387 page_index(page), 1593 page_index(page),
1388 PAGECACHE_TAG_DIRTY); 1594 PAGECACHE_TAG_DIRTY);
1389 radix_tree_tag_clear(&mapping->page_tree, 1595 radix_tree_tag_clear(&mapping->page_tree,
1390 page_index(page), 1596 page_index(page),
1391 PAGECACHE_TAG_TOWRITE); 1597 PAGECACHE_TAG_TOWRITE);
1392 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1598 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1393 } else { 1599 } else {
1394 ret = TestSetPageWriteback(page); 1600 ret = TestSetPageWriteback(page);
1395 } 1601 }
1396 if (!ret) 1602 if (!ret)
1397 account_page_writeback(page); 1603 account_page_writeback(page);
1398 return ret; 1604 return ret;
1 /* 1 /*
2 * mm/rmap.c - physical to virtual reverse mappings 2 * mm/rmap.c - physical to virtual reverse mappings
3 * 3 *
4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br> 4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5 * Released under the General Public License (GPL). 5 * Released under the General Public License (GPL).
6 * 6 *
7 * Simple, low overhead reverse mapping scheme. 7 * Simple, low overhead reverse mapping scheme.
8 * Please try to keep this thing as modular as possible. 8 * Please try to keep this thing as modular as possible.
9 * 9 *
10 * Provides methods for unmapping each kind of mapped page: 10 * Provides methods for unmapping each kind of mapped page:
11 * the anon methods track anonymous pages, and 11 * the anon methods track anonymous pages, and
12 * the file methods track pages belonging to an inode. 12 * the file methods track pages belonging to an inode.
13 * 13 *
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001 14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins 2003, 2004 17 * Contributions by Hugh Dickins 2003, 2004
18 */ 18 */
19 19
20 /* 20 /*
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
27 * anon_vma->mutex 27 * anon_vma->mutex
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
30 * swap_lock (in swap_duplicate, swap_info_get) 30 * swap_lock (in swap_duplicate, swap_info_get)
31 * mmlist_lock (in mmput, drain_mmlist and others) 31 * mmlist_lock (in mmput, drain_mmlist and others)
32 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mapping->private_lock (in __set_page_dirty_buffers)
33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
34 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) 34 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
35 * sb_lock (within inode_lock in fs/fs-writeback.c) 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_wb_list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
39 * 39 *
40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 41 * ->tasklist_lock
42 * pte map lock 42 * pte map lock
43 */ 43 */
44 44
45 #include <linux/mm.h> 45 #include <linux/mm.h>
46 #include <linux/pagemap.h> 46 #include <linux/pagemap.h>
47 #include <linux/swap.h> 47 #include <linux/swap.h>
48 #include <linux/swapops.h> 48 #include <linux/swapops.h>
49 #include <linux/slab.h> 49 #include <linux/slab.h>
50 #include <linux/init.h> 50 #include <linux/init.h>
51 #include <linux/ksm.h> 51 #include <linux/ksm.h>
52 #include <linux/rmap.h> 52 #include <linux/rmap.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/module.h> 54 #include <linux/module.h>
55 #include <linux/memcontrol.h> 55 #include <linux/memcontrol.h>
56 #include <linux/mmu_notifier.h> 56 #include <linux/mmu_notifier.h>
57 #include <linux/migrate.h> 57 #include <linux/migrate.h>
58 #include <linux/hugetlb.h> 58 #include <linux/hugetlb.h>
59 59
60 #include <asm/tlbflush.h> 60 #include <asm/tlbflush.h>
61 61
62 #include "internal.h" 62 #include "internal.h"
63 63
64 static struct kmem_cache *anon_vma_cachep; 64 static struct kmem_cache *anon_vma_cachep;
65 static struct kmem_cache *anon_vma_chain_cachep; 65 static struct kmem_cache *anon_vma_chain_cachep;
66 66
67 static inline struct anon_vma *anon_vma_alloc(void) 67 static inline struct anon_vma *anon_vma_alloc(void)
68 { 68 {
69 struct anon_vma *anon_vma; 69 struct anon_vma *anon_vma;
70 70
71 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 71 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
72 if (anon_vma) { 72 if (anon_vma) {
73 atomic_set(&anon_vma->refcount, 1); 73 atomic_set(&anon_vma->refcount, 1);
74 /* 74 /*
75 * Initialise the anon_vma root to point to itself. If called 75 * Initialise the anon_vma root to point to itself. If called
76 * from fork, the root will be reset to the parent's anon_vma. 76 * from fork, the root will be reset to the parent's anon_vma.
77 */ 77 */
78 anon_vma->root = anon_vma; 78 anon_vma->root = anon_vma;
79 } 79 }
80 80
81 return anon_vma; 81 return anon_vma;
82 } 82 }
83 83
84 static inline void anon_vma_free(struct anon_vma *anon_vma) 84 static inline void anon_vma_free(struct anon_vma *anon_vma)
85 { 85 {
86 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 86 VM_BUG_ON(atomic_read(&anon_vma->refcount));
87 87
88 /* 88 /*
89 * Synchronize against page_lock_anon_vma() such that 89 * Synchronize against page_lock_anon_vma() such that
90 * we can safely hold the lock without the anon_vma getting 90 * we can safely hold the lock without the anon_vma getting
91 * freed. 91 * freed.
92 * 92 *
93 * Relies on the full mb implied by the atomic_dec_and_test() from 93 * Relies on the full mb implied by the atomic_dec_and_test() from
94 * put_anon_vma() against the acquire barrier implied by 94 * put_anon_vma() against the acquire barrier implied by
95 * mutex_trylock() from page_lock_anon_vma(). This orders: 95 * mutex_trylock() from page_lock_anon_vma(). This orders:
96 * 96 *
97 * page_lock_anon_vma() VS put_anon_vma() 97 * page_lock_anon_vma() VS put_anon_vma()
98 * mutex_trylock() atomic_dec_and_test() 98 * mutex_trylock() atomic_dec_and_test()
99 * LOCK MB 99 * LOCK MB
100 * atomic_read() mutex_is_locked() 100 * atomic_read() mutex_is_locked()
101 * 101 *
102 * LOCK should suffice since the actual taking of the lock must 102 * LOCK should suffice since the actual taking of the lock must
103 * happen _before_ what follows. 103 * happen _before_ what follows.
104 */ 104 */
105 if (mutex_is_locked(&anon_vma->root->mutex)) { 105 if (mutex_is_locked(&anon_vma->root->mutex)) {
106 anon_vma_lock(anon_vma); 106 anon_vma_lock(anon_vma);
107 anon_vma_unlock(anon_vma); 107 anon_vma_unlock(anon_vma);
108 } 108 }
109 109
110 kmem_cache_free(anon_vma_cachep, anon_vma); 110 kmem_cache_free(anon_vma_cachep, anon_vma);
111 } 111 }
112 112
113 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) 113 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
114 { 114 {
115 return kmem_cache_alloc(anon_vma_chain_cachep, gfp); 115 return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
116 } 116 }
117 117
118 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 118 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
119 { 119 {
120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
121 } 121 }
122 122
123 /** 123 /**
124 * anon_vma_prepare - attach an anon_vma to a memory region 124 * anon_vma_prepare - attach an anon_vma to a memory region
125 * @vma: the memory region in question 125 * @vma: the memory region in question
126 * 126 *
127 * This makes sure the memory mapping described by 'vma' has 127 * This makes sure the memory mapping described by 'vma' has
128 * an 'anon_vma' attached to it, so that we can associate the 128 * an 'anon_vma' attached to it, so that we can associate the
129 * anonymous pages mapped into it with that anon_vma. 129 * anonymous pages mapped into it with that anon_vma.
130 * 130 *
131 * The common case will be that we already have one, but if 131 * The common case will be that we already have one, but if
132 * not we either need to find an adjacent mapping that we 132 * not we either need to find an adjacent mapping that we
133 * can re-use the anon_vma from (very common when the only 133 * can re-use the anon_vma from (very common when the only
134 * reason for splitting a vma has been mprotect()), or we 134 * reason for splitting a vma has been mprotect()), or we
135 * allocate a new one. 135 * allocate a new one.
136 * 136 *
137 * Anon-vma allocations are very subtle, because we may have 137 * Anon-vma allocations are very subtle, because we may have
138 * optimistically looked up an anon_vma in page_lock_anon_vma() 138 * optimistically looked up an anon_vma in page_lock_anon_vma()
139 * and that may actually touch the spinlock even in the newly 139 * and that may actually touch the spinlock even in the newly
140 * allocated vma (it depends on RCU to make sure that the 140 * allocated vma (it depends on RCU to make sure that the
141 * anon_vma isn't actually destroyed). 141 * anon_vma isn't actually destroyed).
142 * 142 *
143 * As a result, we need to do proper anon_vma locking even 143 * As a result, we need to do proper anon_vma locking even
144 * for the new allocation. At the same time, we do not want 144 * for the new allocation. At the same time, we do not want
145 * to do any locking for the common case of already having 145 * to do any locking for the common case of already having
146 * an anon_vma. 146 * an anon_vma.
147 * 147 *
148 * This must be called with the mmap_sem held for reading. 148 * This must be called with the mmap_sem held for reading.
149 */ 149 */
150 int anon_vma_prepare(struct vm_area_struct *vma) 150 int anon_vma_prepare(struct vm_area_struct *vma)
151 { 151 {
152 struct anon_vma *anon_vma = vma->anon_vma; 152 struct anon_vma *anon_vma = vma->anon_vma;
153 struct anon_vma_chain *avc; 153 struct anon_vma_chain *avc;
154 154
155 might_sleep(); 155 might_sleep();
156 if (unlikely(!anon_vma)) { 156 if (unlikely(!anon_vma)) {
157 struct mm_struct *mm = vma->vm_mm; 157 struct mm_struct *mm = vma->vm_mm;
158 struct anon_vma *allocated; 158 struct anon_vma *allocated;
159 159
160 avc = anon_vma_chain_alloc(GFP_KERNEL); 160 avc = anon_vma_chain_alloc(GFP_KERNEL);
161 if (!avc) 161 if (!avc)
162 goto out_enomem; 162 goto out_enomem;
163 163
164 anon_vma = find_mergeable_anon_vma(vma); 164 anon_vma = find_mergeable_anon_vma(vma);
165 allocated = NULL; 165 allocated = NULL;
166 if (!anon_vma) { 166 if (!anon_vma) {
167 anon_vma = anon_vma_alloc(); 167 anon_vma = anon_vma_alloc();
168 if (unlikely(!anon_vma)) 168 if (unlikely(!anon_vma))
169 goto out_enomem_free_avc; 169 goto out_enomem_free_avc;
170 allocated = anon_vma; 170 allocated = anon_vma;
171 } 171 }
172 172
173 anon_vma_lock(anon_vma); 173 anon_vma_lock(anon_vma);
174 /* page_table_lock to protect against threads */ 174 /* page_table_lock to protect against threads */
175 spin_lock(&mm->page_table_lock); 175 spin_lock(&mm->page_table_lock);
176 if (likely(!vma->anon_vma)) { 176 if (likely(!vma->anon_vma)) {
177 vma->anon_vma = anon_vma; 177 vma->anon_vma = anon_vma;
178 avc->anon_vma = anon_vma; 178 avc->anon_vma = anon_vma;
179 avc->vma = vma; 179 avc->vma = vma;
180 list_add(&avc->same_vma, &vma->anon_vma_chain); 180 list_add(&avc->same_vma, &vma->anon_vma_chain);
181 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 181 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
182 allocated = NULL; 182 allocated = NULL;
183 avc = NULL; 183 avc = NULL;
184 } 184 }
185 spin_unlock(&mm->page_table_lock); 185 spin_unlock(&mm->page_table_lock);
186 anon_vma_unlock(anon_vma); 186 anon_vma_unlock(anon_vma);
187 187
188 if (unlikely(allocated)) 188 if (unlikely(allocated))
189 put_anon_vma(allocated); 189 put_anon_vma(allocated);
190 if (unlikely(avc)) 190 if (unlikely(avc))
191 anon_vma_chain_free(avc); 191 anon_vma_chain_free(avc);
192 } 192 }
193 return 0; 193 return 0;
194 194
195 out_enomem_free_avc: 195 out_enomem_free_avc:
196 anon_vma_chain_free(avc); 196 anon_vma_chain_free(avc);
197 out_enomem: 197 out_enomem:
198 return -ENOMEM; 198 return -ENOMEM;
199 } 199 }
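
anon_vma_prepare() is meant to run in the fault path before the first anonymous page is installed in a vma, with mmap_sem held for reading as the comment requires. A minimal sketch of such a call site (the helper is hypothetical and the rest of the fault handling is elided):

    #include <linux/mm.h>
    #include <linux/rmap.h>

    /* Hypothetical excerpt of an anonymous-fault path. */
    static int example_anon_fault(struct vm_area_struct *vma, unsigned long address)
    {
            if (unlikely(anon_vma_prepare(vma)))
                    return VM_FAULT_OOM;

            /* ... allocate the page, set the pte, page_add_new_anon_rmap() ... */
            return 0;
    }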
200 200
201 /* 201 /*
202 * This is a useful helper function for locking the anon_vma root as 202 * This is a useful helper function for locking the anon_vma root as
203 * we traverse the vma->anon_vma_chain, looping over anon_vma's that 203 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
204 * have the same vma. 204 * have the same vma.
205 * 205 *
206 * Such anon_vma's should have the same root, so you'd expect to see 206 * Such anon_vma's should have the same root, so you'd expect to see
207 * just a single mutex_lock for the whole traversal. 207 * just a single mutex_lock for the whole traversal.
208 */ 208 */
209 static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) 209 static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
210 { 210 {
211 struct anon_vma *new_root = anon_vma->root; 211 struct anon_vma *new_root = anon_vma->root;
212 if (new_root != root) { 212 if (new_root != root) {
213 if (WARN_ON_ONCE(root)) 213 if (WARN_ON_ONCE(root))
214 mutex_unlock(&root->mutex); 214 mutex_unlock(&root->mutex);
215 root = new_root; 215 root = new_root;
216 mutex_lock(&root->mutex); 216 mutex_lock(&root->mutex);
217 } 217 }
218 return root; 218 return root;
219 } 219 }
220 220
221 static inline void unlock_anon_vma_root(struct anon_vma *root) 221 static inline void unlock_anon_vma_root(struct anon_vma *root)
222 { 222 {
223 if (root) 223 if (root)
224 mutex_unlock(&root->mutex); 224 mutex_unlock(&root->mutex);
225 } 225 }
226 226
227 static void anon_vma_chain_link(struct vm_area_struct *vma, 227 static void anon_vma_chain_link(struct vm_area_struct *vma,
228 struct anon_vma_chain *avc, 228 struct anon_vma_chain *avc,
229 struct anon_vma *anon_vma) 229 struct anon_vma *anon_vma)
230 { 230 {
231 avc->vma = vma; 231 avc->vma = vma;
232 avc->anon_vma = anon_vma; 232 avc->anon_vma = anon_vma;
233 list_add(&avc->same_vma, &vma->anon_vma_chain); 233 list_add(&avc->same_vma, &vma->anon_vma_chain);
234 234
235 /* 235 /*
236 * It's critical to add new vmas to the tail of the anon_vma, 236 * It's critical to add new vmas to the tail of the anon_vma,
237 * see comment in huge_memory.c:__split_huge_page(). 237 * see comment in huge_memory.c:__split_huge_page().
238 */ 238 */
239 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 239 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
240 } 240 }
241 241
242 /* 242 /*
243 * Attach the anon_vmas from src to dst. 243 * Attach the anon_vmas from src to dst.
244 * Returns 0 on success, -ENOMEM on failure. 244 * Returns 0 on success, -ENOMEM on failure.
245 */ 245 */
246 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 246 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
247 { 247 {
248 struct anon_vma_chain *avc, *pavc; 248 struct anon_vma_chain *avc, *pavc;
249 struct anon_vma *root = NULL; 249 struct anon_vma *root = NULL;
250 250
251 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { 251 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
252 struct anon_vma *anon_vma; 252 struct anon_vma *anon_vma;
253 253
254 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); 254 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
255 if (unlikely(!avc)) { 255 if (unlikely(!avc)) {
256 unlock_anon_vma_root(root); 256 unlock_anon_vma_root(root);
257 root = NULL; 257 root = NULL;
258 avc = anon_vma_chain_alloc(GFP_KERNEL); 258 avc = anon_vma_chain_alloc(GFP_KERNEL);
259 if (!avc) 259 if (!avc)
260 goto enomem_failure; 260 goto enomem_failure;
261 } 261 }
262 anon_vma = pavc->anon_vma; 262 anon_vma = pavc->anon_vma;
263 root = lock_anon_vma_root(root, anon_vma); 263 root = lock_anon_vma_root(root, anon_vma);
264 anon_vma_chain_link(dst, avc, anon_vma); 264 anon_vma_chain_link(dst, avc, anon_vma);
265 } 265 }
266 unlock_anon_vma_root(root); 266 unlock_anon_vma_root(root);
267 return 0; 267 return 0;
268 268
269 enomem_failure: 269 enomem_failure:
270 unlink_anon_vmas(dst); 270 unlink_anon_vmas(dst);
271 return -ENOMEM; 271 return -ENOMEM;
272 } 272 }
273 273
274 /* 274 /*
275 * Attach vma to its own anon_vma, as well as to the anon_vmas that 275 * Attach vma to its own anon_vma, as well as to the anon_vmas that
276 * the corresponding VMA in the parent process is attached to. 276 * the corresponding VMA in the parent process is attached to.
277 * Returns 0 on success, non-zero on failure. 277 * Returns 0 on success, non-zero on failure.
278 */ 278 */
279 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) 279 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
280 { 280 {
281 struct anon_vma_chain *avc; 281 struct anon_vma_chain *avc;
282 struct anon_vma *anon_vma; 282 struct anon_vma *anon_vma;
283 283
284 /* Don't bother if the parent process has no anon_vma here. */ 284 /* Don't bother if the parent process has no anon_vma here. */
285 if (!pvma->anon_vma) 285 if (!pvma->anon_vma)
286 return 0; 286 return 0;
287 287
288 /* 288 /*
289 * First, attach the new VMA to the parent VMA's anon_vmas, 289 * First, attach the new VMA to the parent VMA's anon_vmas,
290 * so rmap can find non-COWed pages in child processes. 290 * so rmap can find non-COWed pages in child processes.
291 */ 291 */
292 if (anon_vma_clone(vma, pvma)) 292 if (anon_vma_clone(vma, pvma))
293 return -ENOMEM; 293 return -ENOMEM;
294 294
295 /* Then add our own anon_vma. */ 295 /* Then add our own anon_vma. */
296 anon_vma = anon_vma_alloc(); 296 anon_vma = anon_vma_alloc();
297 if (!anon_vma) 297 if (!anon_vma)
298 goto out_error; 298 goto out_error;
299 avc = anon_vma_chain_alloc(GFP_KERNEL); 299 avc = anon_vma_chain_alloc(GFP_KERNEL);
300 if (!avc) 300 if (!avc)
301 goto out_error_free_anon_vma; 301 goto out_error_free_anon_vma;
302 302
303 /* 303 /*
304 * The root anon_vma's spinlock is the lock actually used when we 304 * The root anon_vma's spinlock is the lock actually used when we
305 * lock any of the anon_vmas in this anon_vma tree. 305 * lock any of the anon_vmas in this anon_vma tree.
306 */ 306 */
307 anon_vma->root = pvma->anon_vma->root; 307 anon_vma->root = pvma->anon_vma->root;
308 /* 308 /*
309 * With refcounts, an anon_vma can stay around longer than the 309 * With refcounts, an anon_vma can stay around longer than the
310 * process it belongs to. The root anon_vma needs to be pinned until 310 * process it belongs to. The root anon_vma needs to be pinned until
311 * this anon_vma is freed, because the lock lives in the root. 311 * this anon_vma is freed, because the lock lives in the root.
312 */ 312 */
313 get_anon_vma(anon_vma->root); 313 get_anon_vma(anon_vma->root);
314 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 314 /* Mark this anon_vma as the one where our new (COWed) pages go. */
315 vma->anon_vma = anon_vma; 315 vma->anon_vma = anon_vma;
316 anon_vma_lock(anon_vma); 316 anon_vma_lock(anon_vma);
317 anon_vma_chain_link(vma, avc, anon_vma); 317 anon_vma_chain_link(vma, avc, anon_vma);
318 anon_vma_unlock(anon_vma); 318 anon_vma_unlock(anon_vma);
319 319
320 return 0; 320 return 0;
321 321
322 out_error_free_anon_vma: 322 out_error_free_anon_vma:
323 put_anon_vma(anon_vma); 323 put_anon_vma(anon_vma);
324 out_error: 324 out_error:
325 unlink_anon_vmas(vma); 325 unlink_anon_vmas(vma);
326 return -ENOMEM; 326 return -ENOMEM;
327 } 327 }
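
anon_vma_fork() is called once per vma while a child address space is being populated. A heavily simplified, hypothetical version of that per-vma step (the real loop lives in dup_mmap() and does much more):

    #include <linux/mm.h>
    #include <linux/rmap.h>

    /*
     * Hypothetical, simplified per-vma step of address-space duplication:
     * attach the child vma to the parent's anon_vmas and give it one of
     * its own for future COWed pages.
     */
    static int example_dup_one_vma(struct vm_area_struct *child,
                                   struct vm_area_struct *parent)
    {
            if (anon_vma_fork(child, parent))
                    return -ENOMEM;
            /* ... copy page tables and link the vma into the child mm ... */
            return 0;
    }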
328 328
329 void unlink_anon_vmas(struct vm_area_struct *vma) 329 void unlink_anon_vmas(struct vm_area_struct *vma)
330 { 330 {
331 struct anon_vma_chain *avc, *next; 331 struct anon_vma_chain *avc, *next;
332 struct anon_vma *root = NULL; 332 struct anon_vma *root = NULL;
333 333
334 /* 334 /*
335 * Unlink each anon_vma chained to the VMA. This list is ordered 335 * Unlink each anon_vma chained to the VMA. This list is ordered
336 * from newest to oldest, ensuring the root anon_vma gets freed last. 336 * from newest to oldest, ensuring the root anon_vma gets freed last.
337 */ 337 */
338 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 338 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
339 struct anon_vma *anon_vma = avc->anon_vma; 339 struct anon_vma *anon_vma = avc->anon_vma;
340 340
341 root = lock_anon_vma_root(root, anon_vma); 341 root = lock_anon_vma_root(root, anon_vma);
342 list_del(&avc->same_anon_vma); 342 list_del(&avc->same_anon_vma);
343 343
344 /* 344 /*
345 * Leave empty anon_vmas on the list - we'll need 345 * Leave empty anon_vmas on the list - we'll need
346 * to free them outside the lock. 346 * to free them outside the lock.
347 */ 347 */
348 if (list_empty(&anon_vma->head)) 348 if (list_empty(&anon_vma->head))
349 continue; 349 continue;
350 350
351 list_del(&avc->same_vma); 351 list_del(&avc->same_vma);
352 anon_vma_chain_free(avc); 352 anon_vma_chain_free(avc);
353 } 353 }
354 unlock_anon_vma_root(root); 354 unlock_anon_vma_root(root);
355 355
356 /* 356 /*
357 * Iterate the list once more; it now contains only empty and unlinked 357 * Iterate the list once more; it now contains only empty and unlinked
358 * anon_vmas. Destroy them. We could not do this earlier because 358 * anon_vmas. Destroy them. We could not do this earlier because
359 * __put_anon_vma() needs to acquire the anon_vma->root->mutex. 359 * __put_anon_vma() needs to acquire the anon_vma->root->mutex.
360 */ 360 */
361 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 361 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
362 struct anon_vma *anon_vma = avc->anon_vma; 362 struct anon_vma *anon_vma = avc->anon_vma;
363 363
364 put_anon_vma(anon_vma); 364 put_anon_vma(anon_vma);
365 365
366 list_del(&avc->same_vma); 366 list_del(&avc->same_vma);
367 anon_vma_chain_free(avc); 367 anon_vma_chain_free(avc);
368 } 368 }
369 } 369 }
370 370
371 static void anon_vma_ctor(void *data) 371 static void anon_vma_ctor(void *data)
372 { 372 {
373 struct anon_vma *anon_vma = data; 373 struct anon_vma *anon_vma = data;
374 374
375 mutex_init(&anon_vma->mutex); 375 mutex_init(&anon_vma->mutex);
376 atomic_set(&anon_vma->refcount, 0); 376 atomic_set(&anon_vma->refcount, 0);
377 INIT_LIST_HEAD(&anon_vma->head); 377 INIT_LIST_HEAD(&anon_vma->head);
378 } 378 }
379 379
380 void __init anon_vma_init(void) 380 void __init anon_vma_init(void)
381 { 381 {
382 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 382 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
383 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); 383 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
384 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); 384 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
385 } 385 }
386 386
387 /* 387 /*
388 * Getting a lock on a stable anon_vma from a page off the LRU is tricky! 388 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
389 * 389 *
390 * Since there is no serialization whatsoever against page_remove_rmap() 390 * Since there is no serialization whatsoever against page_remove_rmap()
391 * the best this function can do is return a locked anon_vma that might 391 * the best this function can do is return a locked anon_vma that might
392 * have been relevant to this page. 392 * have been relevant to this page.
393 * 393 *
394 * The page might have been remapped to a different anon_vma or the anon_vma 394 * The page might have been remapped to a different anon_vma or the anon_vma
395 * returned may already be freed (and even reused). 395 * returned may already be freed (and even reused).
396 * 396 *
397 * In case it was remapped to a different anon_vma, the new anon_vma will be a 397 * In case it was remapped to a different anon_vma, the new anon_vma will be a
398 * child of the old anon_vma, and the anon_vma lifetime rules will therefore 398 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
399 * ensure that any anon_vma obtained from the page will still be valid for as 399 * ensure that any anon_vma obtained from the page will still be valid for as
400 * long as we observe page_mapped() [ hence all those page_mapped() tests ]. 400 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
401 * 401 *
402 * All users of this function must be very careful when walking the anon_vma 402 * All users of this function must be very careful when walking the anon_vma
403 * chain and verify that the page in question is indeed mapped in it 403 * chain and verify that the page in question is indeed mapped in it
404 * [ something equivalent to page_mapped_in_vma() ]. 404 * [ something equivalent to page_mapped_in_vma() ].
405 * 405 *
406 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() 406 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
407 * that the anon_vma pointer from page->mapping is valid if there is a 407 * that the anon_vma pointer from page->mapping is valid if there is a
408 * mapcount, we can dereference the anon_vma after observing those. 408 * mapcount, we can dereference the anon_vma after observing those.
409 */ 409 */
410 struct anon_vma *page_get_anon_vma(struct page *page) 410 struct anon_vma *page_get_anon_vma(struct page *page)
411 { 411 {
412 struct anon_vma *anon_vma = NULL; 412 struct anon_vma *anon_vma = NULL;
413 unsigned long anon_mapping; 413 unsigned long anon_mapping;
414 414
415 rcu_read_lock(); 415 rcu_read_lock();
416 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 416 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
417 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 417 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
418 goto out; 418 goto out;
419 if (!page_mapped(page)) 419 if (!page_mapped(page))
420 goto out; 420 goto out;
421 421
422 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 422 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
423 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 423 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
424 anon_vma = NULL; 424 anon_vma = NULL;
425 goto out; 425 goto out;
426 } 426 }
427 427
428 /* 428 /*
429 * If this page is still mapped, then its anon_vma cannot have been 429 * If this page is still mapped, then its anon_vma cannot have been
430 * freed. But if it has been unmapped, we have no security against the 430 * freed. But if it has been unmapped, we have no security against the
431 * anon_vma structure being freed and reused (for another anon_vma: 431 * anon_vma structure being freed and reused (for another anon_vma:
432 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() 432 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
433 * above cannot corrupt). 433 * above cannot corrupt).
434 */ 434 */
435 if (!page_mapped(page)) { 435 if (!page_mapped(page)) {
436 put_anon_vma(anon_vma); 436 put_anon_vma(anon_vma);
437 anon_vma = NULL; 437 anon_vma = NULL;
438 } 438 }
439 out: 439 out:
440 rcu_read_unlock(); 440 rcu_read_unlock();
441 441
442 return anon_vma; 442 return anon_vma;
443 } 443 }
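
A caller that only needs the anon_vma to stay allocated, not locked, uses the reference returned here and drops it with put_anon_vma(). A minimal sketch (the actual work is a placeholder):

    #include <linux/mm.h>
    #include <linux/rmap.h>

    /* Hypothetical caller that pins the anon_vma across some unlocked work. */
    static void example_inspect_anon_page(struct page *page)
    {
            struct anon_vma *anon_vma = page_get_anon_vma(page);

            if (!anon_vma)
                    return;         /* not anonymous, or no longer mapped */

            /* ... use anon_vma; the refcount keeps it from being freed ... */

            put_anon_vma(anon_vma);
    }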
444 444
445 /* 445 /*
446 * Similar to page_get_anon_vma() except it locks the anon_vma. 446 * Similar to page_get_anon_vma() except it locks the anon_vma.
447 * 447 *
448 * It's a little more complex as it tries to keep the fast path to a single 448 * It's a little more complex as it tries to keep the fast path to a single
449 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 449 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
450 * reference like with page_get_anon_vma() and then block on the mutex. 450 * reference like with page_get_anon_vma() and then block on the mutex.
451 */ 451 */
452 struct anon_vma *page_lock_anon_vma(struct page *page) 452 struct anon_vma *page_lock_anon_vma(struct page *page)
453 { 453 {
454 struct anon_vma *anon_vma = NULL; 454 struct anon_vma *anon_vma = NULL;
455 struct anon_vma *root_anon_vma; 455 struct anon_vma *root_anon_vma;
456 unsigned long anon_mapping; 456 unsigned long anon_mapping;
457 457
458 rcu_read_lock(); 458 rcu_read_lock();
459 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 459 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
460 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 460 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
461 goto out; 461 goto out;
462 if (!page_mapped(page)) 462 if (!page_mapped(page))
463 goto out; 463 goto out;
464 464
465 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 465 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
466 root_anon_vma = ACCESS_ONCE(anon_vma->root); 466 root_anon_vma = ACCESS_ONCE(anon_vma->root);
467 if (mutex_trylock(&root_anon_vma->mutex)) { 467 if (mutex_trylock(&root_anon_vma->mutex)) {
468 /* 468 /*
469 * If the page is still mapped, then this anon_vma is still 469 * If the page is still mapped, then this anon_vma is still
470 * its anon_vma, and holding the mutex ensures that it will 470 * its anon_vma, and holding the mutex ensures that it will
471 * not go away, see anon_vma_free(). 471 * not go away, see anon_vma_free().
472 */ 472 */
473 if (!page_mapped(page)) { 473 if (!page_mapped(page)) {
474 mutex_unlock(&root_anon_vma->mutex); 474 mutex_unlock(&root_anon_vma->mutex);
475 anon_vma = NULL; 475 anon_vma = NULL;
476 } 476 }
477 goto out; 477 goto out;
478 } 478 }
479 479
480 /* trylock failed, we have to sleep */ 480 /* trylock failed, we have to sleep */
481 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 481 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
482 anon_vma = NULL; 482 anon_vma = NULL;
483 goto out; 483 goto out;
484 } 484 }
485 485
486 if (!page_mapped(page)) { 486 if (!page_mapped(page)) {
487 put_anon_vma(anon_vma); 487 put_anon_vma(anon_vma);
488 anon_vma = NULL; 488 anon_vma = NULL;
489 goto out; 489 goto out;
490 } 490 }
491 491
492 /* we pinned the anon_vma, it's safe to sleep */ 492 /* we pinned the anon_vma, it's safe to sleep */
493 rcu_read_unlock(); 493 rcu_read_unlock();
494 anon_vma_lock(anon_vma); 494 anon_vma_lock(anon_vma);
495 495
496 if (atomic_dec_and_test(&anon_vma->refcount)) { 496 if (atomic_dec_and_test(&anon_vma->refcount)) {
497 /* 497 /*
498 * Oops, we held the last refcount, release the lock 498 * Oops, we held the last refcount, release the lock
499 * and bail -- can't simply use put_anon_vma() because 499 * and bail -- can't simply use put_anon_vma() because
500 * we'll deadlock on the anon_vma_lock() recursion. 500 * we'll deadlock on the anon_vma_lock() recursion.
501 */ 501 */
502 anon_vma_unlock(anon_vma); 502 anon_vma_unlock(anon_vma);
503 __put_anon_vma(anon_vma); 503 __put_anon_vma(anon_vma);
504 anon_vma = NULL; 504 anon_vma = NULL;
505 } 505 }
506 506
507 return anon_vma; 507 return anon_vma;
508 508
509 out: 509 out:
510 rcu_read_unlock(); 510 rcu_read_unlock();
511 return anon_vma; 511 return anon_vma;
512 } 512 }
513 513
514 void page_unlock_anon_vma(struct anon_vma *anon_vma) 514 void page_unlock_anon_vma(struct anon_vma *anon_vma)
515 { 515 {
516 anon_vma_unlock(anon_vma); 516 anon_vma_unlock(anon_vma);
517 } 517 }
518 518
519 /* 519 /*
520 * At what user virtual address is page expected in @vma? 520 * At what user virtual address is page expected in @vma?
521 * Returns virtual address or -EFAULT if page's index/offset is not 521 * Returns virtual address or -EFAULT if page's index/offset is not
522 * within the range mapped by the @vma. 522 * within the range mapped by the @vma.
523 */ 523 */
524 inline unsigned long 524 inline unsigned long
525 vma_address(struct page *page, struct vm_area_struct *vma) 525 vma_address(struct page *page, struct vm_area_struct *vma)
526 { 526 {
527 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 527 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
528 unsigned long address; 528 unsigned long address;
529 529
530 if (unlikely(is_vm_hugetlb_page(vma))) 530 if (unlikely(is_vm_hugetlb_page(vma)))
531 pgoff = page->index << huge_page_order(page_hstate(page)); 531 pgoff = page->index << huge_page_order(page_hstate(page));
532 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 532 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
533 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 533 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
534 /* page should be within @vma mapping range */ 534 /* page should be within @vma mapping range */
535 return -EFAULT; 535 return -EFAULT;
536 } 536 }
537 return address; 537 return address;
538 } 538 }
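
A worked example of the computation above, with numbers chosen purely for illustration and PAGE_SHIFT assumed to be 12: for a vma with vm_start == 0x40000000 and vm_pgoff == 0x10, a page whose index is 0x12 lies two pages into the mapping, so vma_address() returns 0x40000000 + ((0x12 - 0x10) << 12) == 0x40002000; an index whose address would land outside [vm_start, vm_end) yields -EFAULT instead.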
539 539
540 /* 540 /*
541 * At what user virtual address is page expected in vma? 541 * At what user virtual address is page expected in vma?
542 * Caller should check the page is actually part of the vma. 542 * Caller should check the page is actually part of the vma.
543 */ 543 */
544 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 544 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
545 { 545 {
546 if (PageAnon(page)) { 546 if (PageAnon(page)) {
547 struct anon_vma *page__anon_vma = page_anon_vma(page); 547 struct anon_vma *page__anon_vma = page_anon_vma(page);
548 /* 548 /*
549 * Note: swapoff's unuse_vma() is more efficient with this 549 * Note: swapoff's unuse_vma() is more efficient with this
550 * check, and needs it to match anon_vma when KSM is active. 550 * check, and needs it to match anon_vma when KSM is active.
551 */ 551 */
552 if (!vma->anon_vma || !page__anon_vma || 552 if (!vma->anon_vma || !page__anon_vma ||
553 vma->anon_vma->root != page__anon_vma->root) 553 vma->anon_vma->root != page__anon_vma->root)
554 return -EFAULT; 554 return -EFAULT;
555 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 555 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
556 if (!vma->vm_file || 556 if (!vma->vm_file ||
557 vma->vm_file->f_mapping != page->mapping) 557 vma->vm_file->f_mapping != page->mapping)
558 return -EFAULT; 558 return -EFAULT;
559 } else 559 } else
560 return -EFAULT; 560 return -EFAULT;
561 return vma_address(page, vma); 561 return vma_address(page, vma);
562 } 562 }
563 563
564 /* 564 /*
565 * Check that @page is mapped at @address into @mm. 565 * Check that @page is mapped at @address into @mm.
566 * 566 *
567 * If @sync is false, page_check_address may perform a racy check to avoid 567 * If @sync is false, page_check_address may perform a racy check to avoid
568 * the page table lock when the pte is not present (helpful when reclaiming 568 * the page table lock when the pte is not present (helpful when reclaiming
569 * highly shared pages). 569 * highly shared pages).
570 * 570 *
571 * On success returns with pte mapped and locked. 571 * On success returns with pte mapped and locked.
572 */ 572 */
573 pte_t *__page_check_address(struct page *page, struct mm_struct *mm, 573 pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
574 unsigned long address, spinlock_t **ptlp, int sync) 574 unsigned long address, spinlock_t **ptlp, int sync)
575 { 575 {
576 pgd_t *pgd; 576 pgd_t *pgd;
577 pud_t *pud; 577 pud_t *pud;
578 pmd_t *pmd; 578 pmd_t *pmd;
579 pte_t *pte; 579 pte_t *pte;
580 spinlock_t *ptl; 580 spinlock_t *ptl;
581 581
582 if (unlikely(PageHuge(page))) { 582 if (unlikely(PageHuge(page))) {
583 pte = huge_pte_offset(mm, address); 583 pte = huge_pte_offset(mm, address);
584 ptl = &mm->page_table_lock; 584 ptl = &mm->page_table_lock;
585 goto check; 585 goto check;
586 } 586 }
587 587
588 pgd = pgd_offset(mm, address); 588 pgd = pgd_offset(mm, address);
589 if (!pgd_present(*pgd)) 589 if (!pgd_present(*pgd))
590 return NULL; 590 return NULL;
591 591
592 pud = pud_offset(pgd, address); 592 pud = pud_offset(pgd, address);
593 if (!pud_present(*pud)) 593 if (!pud_present(*pud))
594 return NULL; 594 return NULL;
595 595
596 pmd = pmd_offset(pud, address); 596 pmd = pmd_offset(pud, address);
597 if (!pmd_present(*pmd)) 597 if (!pmd_present(*pmd))
598 return NULL; 598 return NULL;
599 if (pmd_trans_huge(*pmd)) 599 if (pmd_trans_huge(*pmd))
600 return NULL; 600 return NULL;
601 601
602 pte = pte_offset_map(pmd, address); 602 pte = pte_offset_map(pmd, address);
603 /* Make a quick check before getting the lock */ 603 /* Make a quick check before getting the lock */
604 if (!sync && !pte_present(*pte)) { 604 if (!sync && !pte_present(*pte)) {
605 pte_unmap(pte); 605 pte_unmap(pte);
606 return NULL; 606 return NULL;
607 } 607 }
608 608
609 ptl = pte_lockptr(mm, pmd); 609 ptl = pte_lockptr(mm, pmd);
610 check: 610 check:
611 spin_lock(ptl); 611 spin_lock(ptl);
612 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 612 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
613 *ptlp = ptl; 613 *ptlp = ptl;
614 return pte; 614 return pte;
615 } 615 }
616 pte_unmap_unlock(pte, ptl); 616 pte_unmap_unlock(pte, ptl);
617 return NULL; 617 return NULL;
618 } 618 }
619 619
620 /** 620 /**
621 * page_mapped_in_vma - check whether a page is really mapped in a VMA 621 * page_mapped_in_vma - check whether a page is really mapped in a VMA
622 * @page: the page to test 622 * @page: the page to test
623 * @vma: the VMA to test 623 * @vma: the VMA to test
624 * 624 *
625 * Returns 1 if the page is mapped into the page tables of the VMA, 0 625 * Returns 1 if the page is mapped into the page tables of the VMA, 0
626 * if the page is not mapped into the page tables of this VMA. Only 626 * if the page is not mapped into the page tables of this VMA. Only
627 * valid for normal file or anonymous VMAs. 627 * valid for normal file or anonymous VMAs.
628 */ 628 */
629 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) 629 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
630 { 630 {
631 unsigned long address; 631 unsigned long address;
632 pte_t *pte; 632 pte_t *pte;
633 spinlock_t *ptl; 633 spinlock_t *ptl;
634 634
635 address = vma_address(page, vma); 635 address = vma_address(page, vma);
636 if (address == -EFAULT) /* out of vma range */ 636 if (address == -EFAULT) /* out of vma range */
637 return 0; 637 return 0;
638 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); 638 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
639 if (!pte) /* the page is not in this mm */ 639 if (!pte) /* the page is not in this mm */
640 return 0; 640 return 0;
641 pte_unmap_unlock(pte, ptl); 641 pte_unmap_unlock(pte, ptl);
642 642
643 return 1; 643 return 1;
644 } 644 }
645 645
646 /* 646 /*
647 * Subfunctions of page_referenced: page_referenced_one called 647 * Subfunctions of page_referenced: page_referenced_one called
648 * repeatedly from either page_referenced_anon or page_referenced_file. 648 * repeatedly from either page_referenced_anon or page_referenced_file.
649 */ 649 */
650 int page_referenced_one(struct page *page, struct vm_area_struct *vma, 650 int page_referenced_one(struct page *page, struct vm_area_struct *vma,
651 unsigned long address, unsigned int *mapcount, 651 unsigned long address, unsigned int *mapcount,
652 unsigned long *vm_flags) 652 unsigned long *vm_flags)
653 { 653 {
654 struct mm_struct *mm = vma->vm_mm; 654 struct mm_struct *mm = vma->vm_mm;
655 int referenced = 0; 655 int referenced = 0;
656 656
657 if (unlikely(PageTransHuge(page))) { 657 if (unlikely(PageTransHuge(page))) {
658 pmd_t *pmd; 658 pmd_t *pmd;
659 659
660 spin_lock(&mm->page_table_lock); 660 spin_lock(&mm->page_table_lock);
661 /* 661 /*
662 * rmap might return false positives; we must filter 662 * rmap might return false positives; we must filter
663 * these out using page_check_address_pmd(). 663 * these out using page_check_address_pmd().
664 */ 664 */
665 pmd = page_check_address_pmd(page, mm, address, 665 pmd = page_check_address_pmd(page, mm, address,
666 PAGE_CHECK_ADDRESS_PMD_FLAG); 666 PAGE_CHECK_ADDRESS_PMD_FLAG);
667 if (!pmd) { 667 if (!pmd) {
668 spin_unlock(&mm->page_table_lock); 668 spin_unlock(&mm->page_table_lock);
669 goto out; 669 goto out;
670 } 670 }
671 671
672 if (vma->vm_flags & VM_LOCKED) { 672 if (vma->vm_flags & VM_LOCKED) {
673 spin_unlock(&mm->page_table_lock); 673 spin_unlock(&mm->page_table_lock);
674 *mapcount = 0; /* break early from loop */ 674 *mapcount = 0; /* break early from loop */
675 *vm_flags |= VM_LOCKED; 675 *vm_flags |= VM_LOCKED;
676 goto out; 676 goto out;
677 } 677 }
678 678
679 /* go ahead even if the pmd is pmd_trans_splitting() */ 679 /* go ahead even if the pmd is pmd_trans_splitting() */
680 if (pmdp_clear_flush_young_notify(vma, address, pmd)) 680 if (pmdp_clear_flush_young_notify(vma, address, pmd))
681 referenced++; 681 referenced++;
682 spin_unlock(&mm->page_table_lock); 682 spin_unlock(&mm->page_table_lock);
683 } else { 683 } else {
684 pte_t *pte; 684 pte_t *pte;
685 spinlock_t *ptl; 685 spinlock_t *ptl;
686 686
687 /* 687 /*
688 * rmap might return false positives; we must filter 688 * rmap might return false positives; we must filter
689 * these out using page_check_address(). 689 * these out using page_check_address().
690 */ 690 */
691 pte = page_check_address(page, mm, address, &ptl, 0); 691 pte = page_check_address(page, mm, address, &ptl, 0);
692 if (!pte) 692 if (!pte)
693 goto out; 693 goto out;
694 694
695 if (vma->vm_flags & VM_LOCKED) { 695 if (vma->vm_flags & VM_LOCKED) {
696 pte_unmap_unlock(pte, ptl); 696 pte_unmap_unlock(pte, ptl);
697 *mapcount = 0; /* break early from loop */ 697 *mapcount = 0; /* break early from loop */
698 *vm_flags |= VM_LOCKED; 698 *vm_flags |= VM_LOCKED;
699 goto out; 699 goto out;
700 } 700 }
701 701
702 if (ptep_clear_flush_young_notify(vma, address, pte)) { 702 if (ptep_clear_flush_young_notify(vma, address, pte)) {
703 /* 703 /*
704 * Don't treat a reference through a sequentially read 704 * Don't treat a reference through a sequentially read
705 * mapping as such. If the page has been used in 705 * mapping as such. If the page has been used in
706 * another mapping, we will catch it; if this other 706 * another mapping, we will catch it; if this other
707 * mapping is already gone, the unmap path will have 707 * mapping is already gone, the unmap path will have
708 * set PG_referenced or activated the page. 708 * set PG_referenced or activated the page.
709 */ 709 */
710 if (likely(!VM_SequentialReadHint(vma))) 710 if (likely(!VM_SequentialReadHint(vma)))
711 referenced++; 711 referenced++;
712 } 712 }
713 pte_unmap_unlock(pte, ptl); 713 pte_unmap_unlock(pte, ptl);
714 } 714 }
715 715
716 /* Pretend the page is referenced if the task has the 716 /* Pretend the page is referenced if the task has the
717 swap token and is in the middle of a page fault. */ 717 swap token and is in the middle of a page fault. */
718 if (mm != current->mm && has_swap_token(mm) && 718 if (mm != current->mm && has_swap_token(mm) &&
719 rwsem_is_locked(&mm->mmap_sem)) 719 rwsem_is_locked(&mm->mmap_sem))
720 referenced++; 720 referenced++;
721 721
722 (*mapcount)--; 722 (*mapcount)--;
723 723
724 if (referenced) 724 if (referenced)
725 *vm_flags |= vma->vm_flags; 725 *vm_flags |= vma->vm_flags;
726 out: 726 out:
727 return referenced; 727 return referenced;
728 } 728 }
729 729
730 static int page_referenced_anon(struct page *page, 730 static int page_referenced_anon(struct page *page,
731 struct mem_cgroup *mem_cont, 731 struct mem_cgroup *mem_cont,
732 unsigned long *vm_flags) 732 unsigned long *vm_flags)
733 { 733 {
734 unsigned int mapcount; 734 unsigned int mapcount;
735 struct anon_vma *anon_vma; 735 struct anon_vma *anon_vma;
736 struct anon_vma_chain *avc; 736 struct anon_vma_chain *avc;
737 int referenced = 0; 737 int referenced = 0;
738 738
739 anon_vma = page_lock_anon_vma(page); 739 anon_vma = page_lock_anon_vma(page);
740 if (!anon_vma) 740 if (!anon_vma)
741 return referenced; 741 return referenced;
742 742
743 mapcount = page_mapcount(page); 743 mapcount = page_mapcount(page);
744 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 744 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
745 struct vm_area_struct *vma = avc->vma; 745 struct vm_area_struct *vma = avc->vma;
746 unsigned long address = vma_address(page, vma); 746 unsigned long address = vma_address(page, vma);
747 if (address == -EFAULT) 747 if (address == -EFAULT)
748 continue; 748 continue;
749 /* 749 /*
750 * If we are reclaiming on behalf of a cgroup, skip 750 * If we are reclaiming on behalf of a cgroup, skip
751 * counting on behalf of references from different 751 * counting on behalf of references from different
752 * cgroups 752 * cgroups
753 */ 753 */
754 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 754 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
755 continue; 755 continue;
756 referenced += page_referenced_one(page, vma, address, 756 referenced += page_referenced_one(page, vma, address,
757 &mapcount, vm_flags); 757 &mapcount, vm_flags);
758 if (!mapcount) 758 if (!mapcount)
759 break; 759 break;
760 } 760 }
761 761
762 page_unlock_anon_vma(anon_vma); 762 page_unlock_anon_vma(anon_vma);
763 return referenced; 763 return referenced;
764 } 764 }
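Both page_referenced_anon() above and page_referenced_file() below take a page_mapcount() snapshot and let page_referenced_one() spend one unit of it per vma visited, so the walk can stop as soon as every known mapping has been accounted for. A minimal userspace sketch of that early-exit budget, with check_one_mapping() and the fake data as hypothetical stand-ins rather than kernel code:

    #include <stdio.h>

    /* Hypothetical stand-in for page_referenced_one(): reports whether one
     * mapping referenced the page and spends one unit of the mapcount budget. */
    static int check_one_mapping(int idx, unsigned int *mapcount)
    {
        int referenced = (idx % 2 == 0); /* pretend even-numbered vmas referenced it */
        (*mapcount)--;                   /* one known mapping accounted for */
        return referenced;
    }

    int main(void)
    {
        unsigned int mapcount = 3;       /* page_mapcount() snapshot: 3 known ptes */
        int referenced = 0;

        /* Walk up to 10 candidate vmas, but stop once the budget is spent. */
        for (int idx = 0; idx < 10; idx++) {
            referenced += check_one_mapping(idx, &mapcount);
            if (!mapcount)
                break;                   /* every known mapping visited */
        }
        printf("referenced=%d after visiting %u vmas\n", referenced, 3 - mapcount);
        return 0;
    }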
765 765
766 /** 766 /**
767 * page_referenced_file - referenced check for object-based rmap 767 * page_referenced_file - referenced check for object-based rmap
768 * @page: the page we're checking references on. 768 * @page: the page we're checking references on.
769 * @mem_cont: target memory controller 769 * @mem_cont: target memory controller
770 * @vm_flags: collect the vm_flags of vmas which actually referenced the page 770 * @vm_flags: collect the vm_flags of vmas which actually referenced the page
771 * 771 *
772 * For an object-based mapped page, find all the places it is mapped and 772 * For an object-based mapped page, find all the places it is mapped and
773 * check/clear the referenced flag. This is done by following the page->mapping 773 * check/clear the referenced flag. This is done by following the page->mapping
774 * pointer, then walking the chain of vmas it holds. It returns the number 774 * pointer, then walking the chain of vmas it holds. It returns the number
775 * of references it found. 775 * of references it found.
776 * 776 *
777 * This function is only called from page_referenced for object-based pages. 777 * This function is only called from page_referenced for object-based pages.
778 */ 778 */
779 static int page_referenced_file(struct page *page, 779 static int page_referenced_file(struct page *page,
780 struct mem_cgroup *mem_cont, 780 struct mem_cgroup *mem_cont,
781 unsigned long *vm_flags) 781 unsigned long *vm_flags)
782 { 782 {
783 unsigned int mapcount; 783 unsigned int mapcount;
784 struct address_space *mapping = page->mapping; 784 struct address_space *mapping = page->mapping;
785 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 785 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
786 struct vm_area_struct *vma; 786 struct vm_area_struct *vma;
787 struct prio_tree_iter iter; 787 struct prio_tree_iter iter;
788 int referenced = 0; 788 int referenced = 0;
789 789
790 /* 790 /*
791 * The caller's checks on page->mapping and !PageAnon have made 791 * The caller's checks on page->mapping and !PageAnon have made
792 * sure that this is a file page: the check for page->mapping 792 * sure that this is a file page: the check for page->mapping
793 * excludes the case just before it gets set on an anon page. 793 * excludes the case just before it gets set on an anon page.
794 */ 794 */
795 BUG_ON(PageAnon(page)); 795 BUG_ON(PageAnon(page));
796 796
797 /* 797 /*
798 * The page lock not only makes sure that page->mapping cannot 798 * The page lock not only makes sure that page->mapping cannot
799 * suddenly be NULLified by truncation, it makes sure that the 799 * suddenly be NULLified by truncation, it makes sure that the
800 * structure at mapping cannot be freed and reused yet, 800 * structure at mapping cannot be freed and reused yet,
801 * so we can safely take mapping->i_mmap_mutex. 801 * so we can safely take mapping->i_mmap_mutex.
802 */ 802 */
803 BUG_ON(!PageLocked(page)); 803 BUG_ON(!PageLocked(page));
804 804
805 mutex_lock(&mapping->i_mmap_mutex); 805 mutex_lock(&mapping->i_mmap_mutex);
806 806
807 /* 807 /*
808 * i_mmap_mutex does not stabilize mapcount at all, but mapcount 808 * i_mmap_mutex does not stabilize mapcount at all, but mapcount
809 * is more likely to be accurate if we note it after spinning. 809 * is more likely to be accurate if we note it after spinning.
810 */ 810 */
811 mapcount = page_mapcount(page); 811 mapcount = page_mapcount(page);
812 812
813 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 813 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
814 unsigned long address = vma_address(page, vma); 814 unsigned long address = vma_address(page, vma);
815 if (address == -EFAULT) 815 if (address == -EFAULT)
816 continue; 816 continue;
817 /* 817 /*
818 * If we are reclaiming on behalf of a cgroup, skip 818 * If we are reclaiming on behalf of a cgroup, skip
819 * counting on behalf of references from different 819 * counting on behalf of references from different
820 * cgroups 820 * cgroups
821 */ 821 */
822 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 822 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
823 continue; 823 continue;
824 referenced += page_referenced_one(page, vma, address, 824 referenced += page_referenced_one(page, vma, address,
825 &mapcount, vm_flags); 825 &mapcount, vm_flags);
826 if (!mapcount) 826 if (!mapcount)
827 break; 827 break;
828 } 828 }
829 829
830 mutex_unlock(&mapping->i_mmap_mutex); 830 mutex_unlock(&mapping->i_mmap_mutex);
831 return referenced; 831 return referenced;
832 } 832 }
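The pgoff computed above is the prio-tree lookup key; vma_address() then inverts linear_page_index(), turning that file offset back into a user virtual address and returning -EFAULT when the page falls outside the vma's window. A small userspace model of that arithmetic, assuming 4 KiB pages, with toy_* names standing in for the kernel's types:

    #include <stdio.h>

    #define PAGE_SHIFT 12UL                 /* assume 4 KiB pages for the model */

    struct toy_vma {
        unsigned long vm_start, vm_end;     /* user VA range of the mapping */
        unsigned long vm_pgoff;             /* file offset of vm_start, in pages */
    };

    /* Invert linear_page_index(): map a file page offset back to a user VA,
     * or report -EFAULT when the page lies outside this vma's window. */
    static unsigned long toy_vma_address(unsigned long pgoff, const struct toy_vma *vma)
    {
        unsigned long address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        if (address < vma->vm_start || address >= vma->vm_end)
            return (unsigned long)-14;      /* -EFAULT in the model */
        return address;
    }

    int main(void)
    {
        /* Maps file pages [16, 48) starting at 0x700000000000. */
        struct toy_vma vma = { 0x700000000000UL, 0x700000020000UL, 16 };

        printf("pgoff 20 -> %#lx\n", toy_vma_address(20, &vma)); /* inside the window */
        printf("pgoff 64 -> %#lx\n", toy_vma_address(64, &vma)); /* outside: -EFAULT */
        return 0;
    }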
833 833
834 /** 834 /**
835 * page_referenced - test if the page was referenced 835 * page_referenced - test if the page was referenced
836 * @page: the page to test 836 * @page: the page to test
837 * @is_locked: caller holds lock on the page 837 * @is_locked: caller holds lock on the page
838 * @mem_cont: target memory controller 838 * @mem_cont: target memory controller
839 * @vm_flags: collect the vm_flags of vmas which actually referenced the page 839 * @vm_flags: collect the vm_flags of vmas which actually referenced the page
840 * 840 *
841 * Quick test_and_clear_referenced for all mappings to a page, 841 * Quick test_and_clear_referenced for all mappings to a page,
842 * returns the number of ptes which referenced the page. 842 * returns the number of ptes which referenced the page.
843 */ 843 */
844 int page_referenced(struct page *page, 844 int page_referenced(struct page *page,
845 int is_locked, 845 int is_locked,
846 struct mem_cgroup *mem_cont, 846 struct mem_cgroup *mem_cont,
847 unsigned long *vm_flags) 847 unsigned long *vm_flags)
848 { 848 {
849 int referenced = 0; 849 int referenced = 0;
850 int we_locked = 0; 850 int we_locked = 0;
851 851
852 *vm_flags = 0; 852 *vm_flags = 0;
853 if (page_mapped(page) && page_rmapping(page)) { 853 if (page_mapped(page) && page_rmapping(page)) {
854 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 854 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
855 we_locked = trylock_page(page); 855 we_locked = trylock_page(page);
856 if (!we_locked) { 856 if (!we_locked) {
857 referenced++; 857 referenced++;
858 goto out; 858 goto out;
859 } 859 }
860 } 860 }
861 if (unlikely(PageKsm(page))) 861 if (unlikely(PageKsm(page)))
862 referenced += page_referenced_ksm(page, mem_cont, 862 referenced += page_referenced_ksm(page, mem_cont,
863 vm_flags); 863 vm_flags);
864 else if (PageAnon(page)) 864 else if (PageAnon(page))
865 referenced += page_referenced_anon(page, mem_cont, 865 referenced += page_referenced_anon(page, mem_cont,
866 vm_flags); 866 vm_flags);
867 else if (page->mapping) 867 else if (page->mapping)
868 referenced += page_referenced_file(page, mem_cont, 868 referenced += page_referenced_file(page, mem_cont,
869 vm_flags); 869 vm_flags);
870 if (we_locked) 870 if (we_locked)
871 unlock_page(page); 871 unlock_page(page);
872 872
873 if (page_test_and_clear_young(page_to_pfn(page))) 873 if (page_test_and_clear_young(page_to_pfn(page)))
874 referenced++; 874 referenced++;
875 } 875 }
876 out: 876 out:
877 return referenced; 877 return referenced;
878 } 878 }
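page_referenced() therefore reports how many mappings referenced the page and ORs the vm_flags of the referencing vmas into *vm_flags. The fragment below is a purely illustrative, hypothetical consumer of that pair; toy_page_referenced() is a stub and the policy shown is not the kernel's reclaim logic:

    #include <stdio.h>

    #define VM_EXEC 0x00000004UL    /* same value as the kernel's VM_EXEC flag */

    /* Stub for page_referenced(): pretend two ptes referenced the page and
     * one of the referencing mappings was executable. */
    static int toy_page_referenced(unsigned long *vm_flags)
    {
        *vm_flags = VM_EXEC;
        return 2;
    }

    int main(void)
    {
        unsigned long vm_flags = 0;
        int referenced = toy_page_referenced(&vm_flags);

        /* One plausible policy: keep recently used pages, and be especially
         * keen to keep pages backing executable mappings. */
        if (referenced == 0)
            printf("reclaim candidate\n");
        else if (vm_flags & VM_EXEC)
            printf("keep: referenced via an executable mapping\n");
        else
            printf("keep: referenced %d times\n", referenced);
        return 0;
    }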
879 879
880 static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, 880 static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
881 unsigned long address) 881 unsigned long address)
882 { 882 {
883 struct mm_struct *mm = vma->vm_mm; 883 struct mm_struct *mm = vma->vm_mm;
884 pte_t *pte; 884 pte_t *pte;
885 spinlock_t *ptl; 885 spinlock_t *ptl;
886 int ret = 0; 886 int ret = 0;
887 887
888 pte = page_check_address(page, mm, address, &ptl, 1); 888 pte = page_check_address(page, mm, address, &ptl, 1);
889 if (!pte) 889 if (!pte)
890 goto out; 890 goto out;
891 891
892 if (pte_dirty(*pte) || pte_write(*pte)) { 892 if (pte_dirty(*pte) || pte_write(*pte)) {
893 pte_t entry; 893 pte_t entry;
894 894
895 flush_cache_page(vma, address, pte_pfn(*pte)); 895 flush_cache_page(vma, address, pte_pfn(*pte));
896 entry = ptep_clear_flush_notify(vma, address, pte); 896 entry = ptep_clear_flush_notify(vma, address, pte);
897 entry = pte_wrprotect(entry); 897 entry = pte_wrprotect(entry);
898 entry = pte_mkclean(entry); 898 entry = pte_mkclean(entry);
899 set_pte_at(mm, address, pte, entry); 899 set_pte_at(mm, address, pte, entry);
900 ret = 1; 900 ret = 1;
901 } 901 }
902 902
903 pte_unmap_unlock(pte, ptl); 903 pte_unmap_unlock(pte, ptl);
904 out: 904 out:
905 return ret; 905 return ret;
906 } 906 }
907 907
908 static int page_mkclean_file(struct address_space *mapping, struct page *page) 908 static int page_mkclean_file(struct address_space *mapping, struct page *page)
909 { 909 {
910 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 910 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
911 struct vm_area_struct *vma; 911 struct vm_area_struct *vma;
912 struct prio_tree_iter iter; 912 struct prio_tree_iter iter;
913 int ret = 0; 913 int ret = 0;
914 914
915 BUG_ON(PageAnon(page)); 915 BUG_ON(PageAnon(page));
916 916
917 mutex_lock(&mapping->i_mmap_mutex); 917 mutex_lock(&mapping->i_mmap_mutex);
918 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 918 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
919 if (vma->vm_flags & VM_SHARED) { 919 if (vma->vm_flags & VM_SHARED) {
920 unsigned long address = vma_address(page, vma); 920 unsigned long address = vma_address(page, vma);
921 if (address == -EFAULT) 921 if (address == -EFAULT)
922 continue; 922 continue;
923 ret += page_mkclean_one(page, vma, address); 923 ret += page_mkclean_one(page, vma, address);
924 } 924 }
925 } 925 }
926 mutex_unlock(&mapping->i_mmap_mutex); 926 mutex_unlock(&mapping->i_mmap_mutex);
927 return ret; 927 return ret;
928 } 928 }
929 929
930 int page_mkclean(struct page *page) 930 int page_mkclean(struct page *page)
931 { 931 {
932 int ret = 0; 932 int ret = 0;
933 933
934 BUG_ON(!PageLocked(page)); 934 BUG_ON(!PageLocked(page));
935 935
936 if (page_mapped(page)) { 936 if (page_mapped(page)) {
937 struct address_space *mapping = page_mapping(page); 937 struct address_space *mapping = page_mapping(page);
938 if (mapping) { 938 if (mapping) {
939 ret = page_mkclean_file(mapping, page); 939 ret = page_mkclean_file(mapping, page);
940 if (page_test_and_clear_dirty(page_to_pfn(page), 1)) 940 if (page_test_and_clear_dirty(page_to_pfn(page), 1))
941 ret = 1; 941 ret = 1;
942 } 942 }
943 } 943 }
944 944
945 return ret; 945 return ret;
946 } 946 }
947 EXPORT_SYMBOL_GPL(page_mkclean); 947 EXPORT_SYMBOL_GPL(page_mkclean);
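page_mkclean_one() pairs pte_wrprotect() with pte_mkclean(): removing write permission guarantees that the next store faults and re-dirties the page, so clearing the dirty bit cannot lose a modification. Below is a userspace model of that pair of transforms on a fake pte word; the bit layout is invented for the model, real layouts are per-architecture:

    #include <stdio.h>
    #include <stdint.h>

    /* Invented bit positions, for the model only. */
    #define TOY_PTE_WRITE 0x2ULL
    #define TOY_PTE_DIRTY 0x40ULL

    static uint64_t toy_pte_wrprotect(uint64_t pte) { return pte & ~TOY_PTE_WRITE; }
    static uint64_t toy_pte_mkclean(uint64_t pte)   { return pte & ~TOY_PTE_DIRTY; }

    int main(void)
    {
        uint64_t pte = 0x1000 | TOY_PTE_WRITE | TOY_PTE_DIRTY;  /* writable, dirty */

        if (pte & (TOY_PTE_DIRTY | TOY_PTE_WRITE)) {
            /* Same transformation as page_mkclean_one(): drop write permission
             * and the dirty bit, so the next store re-faults and re-dirties. */
            pte = toy_pte_mkclean(toy_pte_wrprotect(pte));
        }
        printf("pte after mkclean: %#llx\n", (unsigned long long)pte);
        return 0;
    }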
948 948
949 /** 949 /**
950 * page_move_anon_rmap - move a page to our anon_vma 950 * page_move_anon_rmap - move a page to our anon_vma
951 * @page: the page to move to our anon_vma 951 * @page: the page to move to our anon_vma
952 * @vma: the vma the page belongs to 952 * @vma: the vma the page belongs to
953 * @address: the user virtual address mapped 953 * @address: the user virtual address mapped
954 * 954 *
955 * When a page belongs exclusively to one process after a COW event, 955 * When a page belongs exclusively to one process after a COW event,
956 * that page can be moved into the anon_vma that belongs to just that 956 * that page can be moved into the anon_vma that belongs to just that
957 * process, so the rmap code will not search the parent or sibling 957 * process, so the rmap code will not search the parent or sibling
958 * processes. 958 * processes.
959 */ 959 */
960 void page_move_anon_rmap(struct page *page, 960 void page_move_anon_rmap(struct page *page,
961 struct vm_area_struct *vma, unsigned long address) 961 struct vm_area_struct *vma, unsigned long address)
962 { 962 {
963 struct anon_vma *anon_vma = vma->anon_vma; 963 struct anon_vma *anon_vma = vma->anon_vma;
964 964
965 VM_BUG_ON(!PageLocked(page)); 965 VM_BUG_ON(!PageLocked(page));
966 VM_BUG_ON(!anon_vma); 966 VM_BUG_ON(!anon_vma);
967 VM_BUG_ON(page->index != linear_page_index(vma, address)); 967 VM_BUG_ON(page->index != linear_page_index(vma, address));
968 968
969 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 969 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
970 page->mapping = (struct address_space *) anon_vma; 970 page->mapping = (struct address_space *) anon_vma;
971 } 971 }
972 972
973 /** 973 /**
974 * __page_set_anon_rmap - set up new anonymous rmap 974 * __page_set_anon_rmap - set up new anonymous rmap
975 * @page: Page to add to rmap 975 * @page: Page to add to rmap
976 * @vma: VM area to add page to. 976 * @vma: VM area to add page to.
977 * @address: User virtual address of the mapping 977 * @address: User virtual address of the mapping
978 * @exclusive: the page is exclusively owned by the current process 978 * @exclusive: the page is exclusively owned by the current process
979 */ 979 */
980 static void __page_set_anon_rmap(struct page *page, 980 static void __page_set_anon_rmap(struct page *page,
981 struct vm_area_struct *vma, unsigned long address, int exclusive) 981 struct vm_area_struct *vma, unsigned long address, int exclusive)
982 { 982 {
983 struct anon_vma *anon_vma = vma->anon_vma; 983 struct anon_vma *anon_vma = vma->anon_vma;
984 984
985 BUG_ON(!anon_vma); 985 BUG_ON(!anon_vma);
986 986
987 if (PageAnon(page)) 987 if (PageAnon(page))
988 return; 988 return;
989 989
990 /* 990 /*
991 * If the page isn't exclusively mapped into this vma, 991 * If the page isn't exclusively mapped into this vma,
992 * we must use the _oldest_ possible anon_vma for the 992 * we must use the _oldest_ possible anon_vma for the
993 * page mapping! 993 * page mapping!
994 */ 994 */
995 if (!exclusive) 995 if (!exclusive)
996 anon_vma = anon_vma->root; 996 anon_vma = anon_vma->root;
997 997
998 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 998 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
999 page->mapping = (struct address_space *) anon_vma; 999 page->mapping = (struct address_space *) anon_vma;
1000 page->index = linear_page_index(vma, address); 1000 page->index = linear_page_index(vma, address);
1001 } 1001 }
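__page_set_anon_rmap() reuses page->mapping to hold an anon_vma pointer, marking it by setting the PAGE_MAPPING_ANON bit and relying on pointer alignment to keep that low bit free. A standalone sketch of the same low-bit tagging idiom, with toy_* names standing in for the kernel's:

    #include <stdio.h>
    #include <stdint.h>

    #define TOY_MAPPING_ANON 0x1UL   /* low pointer bit marks "this is an anon_vma" */

    struct toy_anon_vma { int dummy; };

    int main(void)
    {
        static struct toy_anon_vma av;   /* aligned, so bit 0 of its address is free */
        void *mapping;

        /* Store the anon_vma with the tag bit set, as __page_set_anon_rmap() does. */
        mapping = (void *)((uintptr_t)&av | TOY_MAPPING_ANON);

        /* A reader tests the tag first, then strips it to recover the pointer. */
        if ((uintptr_t)mapping & TOY_MAPPING_ANON) {
            struct toy_anon_vma *anon_vma =
                (struct toy_anon_vma *)((uintptr_t)mapping & ~TOY_MAPPING_ANON);
            printf("anon page, anon_vma recovered: %s\n",
                   anon_vma == &av ? "yes" : "no");
        }
        return 0;
    }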
1002 1002
1003 /** 1003 /**
1004 * __page_check_anon_rmap - sanity check anonymous rmap addition 1004 * __page_check_anon_rmap - sanity check anonymous rmap addition
1005 * @page: the page to add the mapping to 1005 * @page: the page to add the mapping to
1006 * @vma: the vm area in which the mapping is added 1006 * @vma: the vm area in which the mapping is added
1007 * @address: the user virtual address mapped 1007 * @address: the user virtual address mapped
1008 */ 1008 */
1009 static void __page_check_anon_rmap(struct page *page, 1009 static void __page_check_anon_rmap(struct page *page,
1010 struct vm_area_struct *vma, unsigned long address) 1010 struct vm_area_struct *vma, unsigned long address)
1011 { 1011 {
1012 #ifdef CONFIG_DEBUG_VM 1012 #ifdef CONFIG_DEBUG_VM
1013 /* 1013 /*
1014 * The page's anon-rmap details (mapping and index) are guaranteed to 1014 * The page's anon-rmap details (mapping and index) are guaranteed to
1015 * be set up correctly at this point. 1015 * be set up correctly at this point.
1016 * 1016 *
1017 * We have exclusion against page_add_anon_rmap because the caller 1017 * We have exclusion against page_add_anon_rmap because the caller
1018 * always holds the page locked, except if called from page_dup_rmap, 1018 * always holds the page locked, except if called from page_dup_rmap,
1019 * in which case the page is already known to be set up. 1019 * in which case the page is already known to be set up.
1020 * 1020 *
1021 * We have exclusion against page_add_new_anon_rmap because those pages 1021 * We have exclusion against page_add_new_anon_rmap because those pages
1022 * are initially only visible via the pagetables, and the pte is locked 1022 * are initially only visible via the pagetables, and the pte is locked
1023 * over the call to page_add_new_anon_rmap. 1023 * over the call to page_add_new_anon_rmap.
1024 */ 1024 */
1025 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); 1025 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
1026 BUG_ON(page->index != linear_page_index(vma, address)); 1026 BUG_ON(page->index != linear_page_index(vma, address));
1027 #endif 1027 #endif
1028 } 1028 }
1029 1029
1030 /** 1030 /**
1031 * page_add_anon_rmap - add pte mapping to an anonymous page 1031 * page_add_anon_rmap - add pte mapping to an anonymous page
1032 * @page: the page to add the mapping to 1032 * @page: the page to add the mapping to
1033 * @vma: the vm area in which the mapping is added 1033 * @vma: the vm area in which the mapping is added
1034 * @address: the user virtual address mapped 1034 * @address: the user virtual address mapped
1035 * 1035 *
1036 * The caller needs to hold the pte lock, and the page must be locked in 1036 * The caller needs to hold the pte lock, and the page must be locked in
1037 * the anon_vma case: to serialize mapping,index checking after setting, 1037 * the anon_vma case: to serialize mapping,index checking after setting,
1038 * and to ensure that PageAnon is not being upgraded racily to PageKsm 1038 * and to ensure that PageAnon is not being upgraded racily to PageKsm
1039 * (but PageKsm is never downgraded to PageAnon). 1039 * (but PageKsm is never downgraded to PageAnon).
1040 */ 1040 */
1041 void page_add_anon_rmap(struct page *page, 1041 void page_add_anon_rmap(struct page *page,
1042 struct vm_area_struct *vma, unsigned long address) 1042 struct vm_area_struct *vma, unsigned long address)
1043 { 1043 {
1044 do_page_add_anon_rmap(page, vma, address, 0); 1044 do_page_add_anon_rmap(page, vma, address, 0);
1045 } 1045 }
1046 1046
1047 /* 1047 /*
1048 * Special version of the above for do_swap_page, which often runs 1048 * Special version of the above for do_swap_page, which often runs
1049 * into pages that are exclusively owned by the current process. 1049 * into pages that are exclusively owned by the current process.
1050 * Everybody else should continue to use page_add_anon_rmap above. 1050 * Everybody else should continue to use page_add_anon_rmap above.
1051 */ 1051 */
1052 void do_page_add_anon_rmap(struct page *page, 1052 void do_page_add_anon_rmap(struct page *page,
1053 struct vm_area_struct *vma, unsigned long address, int exclusive) 1053 struct vm_area_struct *vma, unsigned long address, int exclusive)
1054 { 1054 {
1055 int first = atomic_inc_and_test(&page->_mapcount); 1055 int first = atomic_inc_and_test(&page->_mapcount);
1056 if (first) { 1056 if (first) {
1057 if (!PageTransHuge(page)) 1057 if (!PageTransHuge(page))
1058 __inc_zone_page_state(page, NR_ANON_PAGES); 1058 __inc_zone_page_state(page, NR_ANON_PAGES);
1059 else 1059 else
1060 __inc_zone_page_state(page, 1060 __inc_zone_page_state(page,
1061 NR_ANON_TRANSPARENT_HUGEPAGES); 1061 NR_ANON_TRANSPARENT_HUGEPAGES);
1062 } 1062 }
1063 if (unlikely(PageKsm(page))) 1063 if (unlikely(PageKsm(page)))
1064 return; 1064 return;
1065 1065
1066 VM_BUG_ON(!PageLocked(page)); 1066 VM_BUG_ON(!PageLocked(page));
1067 /* address might be in next vma when migration races vma_adjust */ 1067 /* address might be in next vma when migration races vma_adjust */
1068 if (first) 1068 if (first)
1069 __page_set_anon_rmap(page, vma, address, exclusive); 1069 __page_set_anon_rmap(page, vma, address, exclusive);
1070 else 1070 else
1071 __page_check_anon_rmap(page, vma, address); 1071 __page_check_anon_rmap(page, vma, address);
1072 } 1072 }
1073 1073
1074 /** 1074 /**
1075 * page_add_new_anon_rmap - add pte mapping to a new anonymous page 1075 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
1076 * @page: the page to add the mapping to 1076 * @page: the page to add the mapping to
1077 * @vma: the vm area in which the mapping is added 1077 * @vma: the vm area in which the mapping is added
1078 * @address: the user virtual address mapped 1078 * @address: the user virtual address mapped
1079 * 1079 *
1080 * Same as page_add_anon_rmap but must only be called on *new* pages. 1080 * Same as page_add_anon_rmap but must only be called on *new* pages.
1081 * This means the inc-and-test can be bypassed. 1081 * This means the inc-and-test can be bypassed.
1082 * Page does not have to be locked. 1082 * Page does not have to be locked.
1083 */ 1083 */
1084 void page_add_new_anon_rmap(struct page *page, 1084 void page_add_new_anon_rmap(struct page *page,
1085 struct vm_area_struct *vma, unsigned long address) 1085 struct vm_area_struct *vma, unsigned long address)
1086 { 1086 {
1087 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1087 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1088 SetPageSwapBacked(page); 1088 SetPageSwapBacked(page);
1089 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1089 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
1090 if (!PageTransHuge(page)) 1090 if (!PageTransHuge(page))
1091 __inc_zone_page_state(page, NR_ANON_PAGES); 1091 __inc_zone_page_state(page, NR_ANON_PAGES);
1092 else 1092 else
1093 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1093 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1094 __page_set_anon_rmap(page, vma, address, 1); 1094 __page_set_anon_rmap(page, vma, address, 1);
1095 if (page_evictable(page, vma)) 1095 if (page_evictable(page, vma))
1096 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1096 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
1097 else 1097 else
1098 add_page_to_unevictable_list(page); 1098 add_page_to_unevictable_list(page);
1099 } 1099 }
1100 1100
1101 /** 1101 /**
1102 * page_add_file_rmap - add pte mapping to a file page 1102 * page_add_file_rmap - add pte mapping to a file page
1103 * @page: the page to add the mapping to 1103 * @page: the page to add the mapping to
1104 * 1104 *
1105 * The caller needs to hold the pte lock. 1105 * The caller needs to hold the pte lock.
1106 */ 1106 */
1107 void page_add_file_rmap(struct page *page) 1107 void page_add_file_rmap(struct page *page)
1108 { 1108 {
1109 if (atomic_inc_and_test(&page->_mapcount)) { 1109 if (atomic_inc_and_test(&page->_mapcount)) {
1110 __inc_zone_page_state(page, NR_FILE_MAPPED); 1110 __inc_zone_page_state(page, NR_FILE_MAPPED);
1111 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); 1111 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
1112 } 1112 }
1113 } 1113 }
1114 1114
1115 /** 1115 /**
1116 * page_remove_rmap - take down pte mapping from a page 1116 * page_remove_rmap - take down pte mapping from a page
1117 * @page: page to remove mapping from 1117 * @page: page to remove mapping from
1118 * 1118 *
1119 * The caller needs to hold the pte lock. 1119 * The caller needs to hold the pte lock.
1120 */ 1120 */
1121 void page_remove_rmap(struct page *page) 1121 void page_remove_rmap(struct page *page)
1122 { 1122 {
1123 /* page still mapped by someone else? */ 1123 /* page still mapped by someone else? */
1124 if (!atomic_add_negative(-1, &page->_mapcount)) 1124 if (!atomic_add_negative(-1, &page->_mapcount))
1125 return; 1125 return;
1126 1126
1127 /* 1127 /*
1128 * Now that the last pte has gone, s390 must transfer dirty 1128 * Now that the last pte has gone, s390 must transfer dirty
1129 * flag from storage key to struct page. We can usually skip 1129 * flag from storage key to struct page. We can usually skip
1130 * this if the page is anon, so about to be freed; but perhaps 1130 * this if the page is anon, so about to be freed; but perhaps
1131 * not if it's in swapcache - there might be another pte slot 1131 * not if it's in swapcache - there might be another pte slot
1132 * containing the swap entry, but page not yet written to swap. 1132 * containing the swap entry, but page not yet written to swap.
1133 */ 1133 */
1134 if ((!PageAnon(page) || PageSwapCache(page)) && 1134 if ((!PageAnon(page) || PageSwapCache(page)) &&
1135 page_test_and_clear_dirty(page_to_pfn(page), 1)) 1135 page_test_and_clear_dirty(page_to_pfn(page), 1))
1136 set_page_dirty(page); 1136 set_page_dirty(page);
1137 /* 1137 /*
1138 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1138 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
1139 * and not charged by memcg for now. 1139 * and not charged by memcg for now.
1140 */ 1140 */
1141 if (unlikely(PageHuge(page))) 1141 if (unlikely(PageHuge(page)))
1142 return; 1142 return;
1143 if (PageAnon(page)) { 1143 if (PageAnon(page)) {
1144 mem_cgroup_uncharge_page(page); 1144 mem_cgroup_uncharge_page(page);
1145 if (!PageTransHuge(page)) 1145 if (!PageTransHuge(page))
1146 __dec_zone_page_state(page, NR_ANON_PAGES); 1146 __dec_zone_page_state(page, NR_ANON_PAGES);
1147 else 1147 else
1148 __dec_zone_page_state(page, 1148 __dec_zone_page_state(page,
1149 NR_ANON_TRANSPARENT_HUGEPAGES); 1149 NR_ANON_TRANSPARENT_HUGEPAGES);
1150 } else { 1150 } else {
1151 __dec_zone_page_state(page, NR_FILE_MAPPED); 1151 __dec_zone_page_state(page, NR_FILE_MAPPED);
1152 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); 1152 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
1153 } 1153 }
1154 /* 1154 /*
1155 * It would be tidy to reset the PageAnon mapping here, 1155 * It would be tidy to reset the PageAnon mapping here,
1156 * but that might overwrite a racing page_add_anon_rmap 1156 * but that might overwrite a racing page_add_anon_rmap
1157 * which increments mapcount after us but sets mapping 1157 * which increments mapcount after us but sets mapping
1158 * before us: so leave the reset to free_hot_cold_page, 1158 * before us: so leave the reset to free_hot_cold_page,
1159 * and remember that it's only reliable while mapped. 1159 * and remember that it's only reliable while mapped.
1160 * Leaving it set also helps swapoff to reinstate ptes 1160 * Leaving it set also helps swapoff to reinstate ptes
1161 * faster for those pages still in swapcache. 1161 * faster for those pages still in swapcache.
1162 */ 1162 */
1163 } 1163 }
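page->_mapcount starts at -1, so atomic_inc_and_test() fires exactly on the first mapping (-1 -> 0) and atomic_add_negative(-1, ...) fires exactly when the last mapping disappears (0 -> -1); that is why page_add_*_rmap() and page_remove_rmap() touch the zone counters only on those two transitions. A C11-atomics model of the same counter protocol, with toy_* helpers that belong to the model rather than the kernel:

    #include <stdio.h>
    #include <stdatomic.h>

    /* True on the -1 -> 0 transition, i.e. this caller added the first mapping. */
    static int toy_inc_and_test(atomic_int *v)
    {
        return atomic_fetch_add(v, 1) + 1 == 0;
    }

    /* True on the 0 -> -1 transition, i.e. this caller removed the last mapping. */
    static int toy_add_negative(int i, atomic_int *v)
    {
        return atomic_fetch_add(v, i) + i < 0;
    }

    int main(void)
    {
        atomic_int mapcount;
        atomic_init(&mapcount, -1);   /* like page->_mapcount: -1 means "unmapped" */

        printf("first map?  %d\n", toy_inc_and_test(&mapcount));     /* 1 */
        printf("first map?  %d\n", toy_inc_and_test(&mapcount));     /* 0: second pte */
        printf("last unmap? %d\n", toy_add_negative(-1, &mapcount)); /* 0: one pte left */
        printf("last unmap? %d\n", toy_add_negative(-1, &mapcount)); /* 1: back to -1 */
        return 0;
    }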
1164 1164
1165 /* 1165 /*
1166 * Subfunctions of try_to_unmap: try_to_unmap_one called 1166 * Subfunctions of try_to_unmap: try_to_unmap_one called
1167 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 1167 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
1168 */ 1168 */
1169 int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1169 int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1170 unsigned long address, enum ttu_flags flags) 1170 unsigned long address, enum ttu_flags flags)
1171 { 1171 {
1172 struct mm_struct *mm = vma->vm_mm; 1172 struct mm_struct *mm = vma->vm_mm;
1173 pte_t *pte; 1173 pte_t *pte;
1174 pte_t pteval; 1174 pte_t pteval;
1175 spinlock_t *ptl; 1175 spinlock_t *ptl;
1176 int ret = SWAP_AGAIN; 1176 int ret = SWAP_AGAIN;
1177 1177
1178 pte = page_check_address(page, mm, address, &ptl, 0); 1178 pte = page_check_address(page, mm, address, &ptl, 0);
1179 if (!pte) 1179 if (!pte)
1180 goto out; 1180 goto out;
1181 1181
1182 /* 1182 /*
1183 * If the page is mlock()d, we cannot swap it out. 1183 * If the page is mlock()d, we cannot swap it out.
1184 * If it's recently referenced (perhaps page_referenced 1184 * If it's recently referenced (perhaps page_referenced
1185 * skipped over this mm) then we should reactivate it. 1185 * skipped over this mm) then we should reactivate it.
1186 */ 1186 */
1187 if (!(flags & TTU_IGNORE_MLOCK)) { 1187 if (!(flags & TTU_IGNORE_MLOCK)) {
1188 if (vma->vm_flags & VM_LOCKED) 1188 if (vma->vm_flags & VM_LOCKED)
1189 goto out_mlock; 1189 goto out_mlock;
1190 1190
1191 if (TTU_ACTION(flags) == TTU_MUNLOCK) 1191 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1192 goto out_unmap; 1192 goto out_unmap;
1193 } 1193 }
1194 if (!(flags & TTU_IGNORE_ACCESS)) { 1194 if (!(flags & TTU_IGNORE_ACCESS)) {
1195 if (ptep_clear_flush_young_notify(vma, address, pte)) { 1195 if (ptep_clear_flush_young_notify(vma, address, pte)) {
1196 ret = SWAP_FAIL; 1196 ret = SWAP_FAIL;
1197 goto out_unmap; 1197 goto out_unmap;
1198 } 1198 }
1199 } 1199 }
1200 1200
1201 /* Nuke the page table entry. */ 1201 /* Nuke the page table entry. */
1202 flush_cache_page(vma, address, page_to_pfn(page)); 1202 flush_cache_page(vma, address, page_to_pfn(page));
1203 pteval = ptep_clear_flush_notify(vma, address, pte); 1203 pteval = ptep_clear_flush_notify(vma, address, pte);
1204 1204
1205 /* Move the dirty bit to the physical page now the pte is gone. */ 1205 /* Move the dirty bit to the physical page now the pte is gone. */
1206 if (pte_dirty(pteval)) 1206 if (pte_dirty(pteval))
1207 set_page_dirty(page); 1207 set_page_dirty(page);
1208 1208
1209 /* Update high watermark before we lower rss */ 1209 /* Update high watermark before we lower rss */
1210 update_hiwater_rss(mm); 1210 update_hiwater_rss(mm);
1211 1211
1212 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1212 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1213 if (PageAnon(page)) 1213 if (PageAnon(page))
1214 dec_mm_counter(mm, MM_ANONPAGES); 1214 dec_mm_counter(mm, MM_ANONPAGES);
1215 else 1215 else
1216 dec_mm_counter(mm, MM_FILEPAGES); 1216 dec_mm_counter(mm, MM_FILEPAGES);
1217 set_pte_at(mm, address, pte, 1217 set_pte_at(mm, address, pte,
1218 swp_entry_to_pte(make_hwpoison_entry(page))); 1218 swp_entry_to_pte(make_hwpoison_entry(page)));
1219 } else if (PageAnon(page)) { 1219 } else if (PageAnon(page)) {
1220 swp_entry_t entry = { .val = page_private(page) }; 1220 swp_entry_t entry = { .val = page_private(page) };
1221 1221
1222 if (PageSwapCache(page)) { 1222 if (PageSwapCache(page)) {
1223 /* 1223 /*
1224 * Store the swap location in the pte. 1224 * Store the swap location in the pte.
1225 * See handle_pte_fault() ... 1225 * See handle_pte_fault() ...
1226 */ 1226 */
1227 if (swap_duplicate(entry) < 0) { 1227 if (swap_duplicate(entry) < 0) {
1228 set_pte_at(mm, address, pte, pteval); 1228 set_pte_at(mm, address, pte, pteval);
1229 ret = SWAP_FAIL; 1229 ret = SWAP_FAIL;
1230 goto out_unmap; 1230 goto out_unmap;
1231 } 1231 }
1232 if (list_empty(&mm->mmlist)) { 1232 if (list_empty(&mm->mmlist)) {
1233 spin_lock(&mmlist_lock); 1233 spin_lock(&mmlist_lock);
1234 if (list_empty(&mm->mmlist)) 1234 if (list_empty(&mm->mmlist))
1235 list_add(&mm->mmlist, &init_mm.mmlist); 1235 list_add(&mm->mmlist, &init_mm.mmlist);
1236 spin_unlock(&mmlist_lock); 1236 spin_unlock(&mmlist_lock);
1237 } 1237 }
1238 dec_mm_counter(mm, MM_ANONPAGES); 1238 dec_mm_counter(mm, MM_ANONPAGES);
1239 inc_mm_counter(mm, MM_SWAPENTS); 1239 inc_mm_counter(mm, MM_SWAPENTS);
1240 } else if (PAGE_MIGRATION) { 1240 } else if (PAGE_MIGRATION) {
1241 /* 1241 /*
1242 * Store the pfn of the page in a special migration 1242 * Store the pfn of the page in a special migration
1243 * pte. do_swap_page() will wait until the migration 1243 * pte. do_swap_page() will wait until the migration
1244 * pte is removed and then restart fault handling. 1244 * pte is removed and then restart fault handling.
1245 */ 1245 */
1246 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); 1246 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
1247 entry = make_migration_entry(page, pte_write(pteval)); 1247 entry = make_migration_entry(page, pte_write(pteval));
1248 } 1248 }
1249 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1249 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
1250 BUG_ON(pte_file(*pte)); 1250 BUG_ON(pte_file(*pte));
1251 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { 1251 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
1252 /* Establish migration entry for a file page */ 1252 /* Establish migration entry for a file page */
1253 swp_entry_t entry; 1253 swp_entry_t entry;
1254 entry = make_migration_entry(page, pte_write(pteval)); 1254 entry = make_migration_entry(page, pte_write(pteval));
1255 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1255 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
1256 } else 1256 } else
1257 dec_mm_counter(mm, MM_FILEPAGES); 1257 dec_mm_counter(mm, MM_FILEPAGES);
1258 1258
1259 page_remove_rmap(page); 1259 page_remove_rmap(page);
1260 page_cache_release(page); 1260 page_cache_release(page);
1261 1261
1262 out_unmap: 1262 out_unmap:
1263 pte_unmap_unlock(pte, ptl); 1263 pte_unmap_unlock(pte, ptl);
1264 out: 1264 out:
1265 return ret; 1265 return ret;
1266 1266
1267 out_mlock: 1267 out_mlock:
1268 pte_unmap_unlock(pte, ptl); 1268 pte_unmap_unlock(pte, ptl);
1269 1269
1270 1270
1271 /* 1271 /*
1272 * We need mmap_sem locking. Otherwise the VM_LOCKED check gives an 1272 * We need mmap_sem locking. Otherwise the VM_LOCKED check gives an
1273 * unstable, racy result. Plus, we can't wait here because 1273 * unstable, racy result. Plus, we can't wait here because
1274 * we now hold anon_vma->mutex or mapping->i_mmap_mutex. 1274 * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
1275 * If the trylock fails, the page remains on the evictable lru and later 1275 * If the trylock fails, the page remains on the evictable lru and later
1276 * vmscan can retry moving it to the unevictable lru if the 1276 * vmscan can retry moving it to the unevictable lru if the
1277 * page is actually mlocked. 1277 * page is actually mlocked.
1278 */ 1278 */
1279 if (down_read_trylock(&vma->vm_mm->mmap_sem)) { 1279 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
1280 if (vma->vm_flags & VM_LOCKED) { 1280 if (vma->vm_flags & VM_LOCKED) {
1281 mlock_vma_page(page); 1281 mlock_vma_page(page);
1282 ret = SWAP_MLOCK; 1282 ret = SWAP_MLOCK;
1283 } 1283 }
1284 up_read(&vma->vm_mm->mmap_sem); 1284 up_read(&vma->vm_mm->mmap_sem);
1285 } 1285 }
1286 return ret; 1286 return ret;
1287 } 1287 }
1288 1288
1289 /* 1289 /*
1290 * objrmap doesn't work for nonlinear VMAs because the assumption that 1290 * objrmap doesn't work for nonlinear VMAs because the assumption that
1291 * offset-into-file correlates with offset-into-virtual-addresses does not hold. 1291 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
1292 * Consequently, given a particular page and its ->index, we cannot locate the 1292 * Consequently, given a particular page and its ->index, we cannot locate the
1293 * ptes which are mapping that page without an exhaustive linear search. 1293 * ptes which are mapping that page without an exhaustive linear search.
1294 * 1294 *
1295 * So what this code does is a mini "virtual scan" of each nonlinear VMA which 1295 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
1296 * maps the file to which the target page belongs. The ->vm_private_data field 1296 * maps the file to which the target page belongs. The ->vm_private_data field
1297 * holds the current cursor into that scan. Successive searches will circulate 1297 * holds the current cursor into that scan. Successive searches will circulate
1298 * around the vma's virtual address space. 1298 * around the vma's virtual address space.
1299 * 1299 *
1300 * So as more replacement pressure is applied to the pages in a nonlinear VMA, 1300 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
1301 * more scanning pressure is placed against them as well. Eventually pages 1301 * more scanning pressure is placed against them as well. Eventually pages
1302 * will become fully unmapped and are eligible for eviction. 1302 * will become fully unmapped and are eligible for eviction.
1303 * 1303 *
1304 * For very sparsely populated VMAs this is a little inefficient - chances are 1304 * For very sparsely populated VMAs this is a little inefficient - chances are
1305 * there won't be many ptes located within the scan cluster. In this case 1305 * there won't be many ptes located within the scan cluster. In this case
1306 * maybe we could scan further - to the end of the pte page, perhaps. 1306 * maybe we could scan further - to the end of the pte page, perhaps.
1307 * 1307 *
1308 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can 1308 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
1309 * acquire it without blocking. If vma locked, mlock the pages in the cluster, 1309 * acquire it without blocking. If vma locked, mlock the pages in the cluster,
1310 * rather than unmapping them. If we encounter the "check_page" that vmscan is 1310 * rather than unmapping them. If we encounter the "check_page" that vmscan is
1311 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. 1311 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
1312 */ 1312 */
1313 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 1313 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
1314 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 1314 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
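try_to_unmap_cluster() below rounds vm_start + cursor down to a CLUSTER_SIZE boundary and then clamps the window to the vma, so each call scans at most one aligned cluster of ptes. A quick userspace check of that arithmetic, assuming 4 KiB pages and 2 MiB PMDs (giving a 32-page cluster):

    #include <stdio.h>

    /* Model constants: 4 KiB pages, 2 MiB PMDs. */
    #define TOY_PAGE_SIZE    4096UL
    #define TOY_PMD_SIZE     (2UL << 20)
    #define TOY_CLUSTER_SIZE ((32 * TOY_PAGE_SIZE) < TOY_PMD_SIZE ? \
                              (32 * TOY_PAGE_SIZE) : TOY_PMD_SIZE)
    #define TOY_CLUSTER_MASK (~(TOY_CLUSTER_SIZE - 1))

    int main(void)
    {
        unsigned long vm_start = 0x700000001000UL;  /* deliberately unaligned start */
        unsigned long vm_end   = 0x700000041000UL;
        unsigned long cursor   = 0x23456UL;         /* scan position within the vma */

        /* Same arithmetic as try_to_unmap_cluster(): align down, then clamp. */
        unsigned long address = (vm_start + cursor) & TOY_CLUSTER_MASK;
        unsigned long end = address + TOY_CLUSTER_SIZE;

        if (address < vm_start)
            address = vm_start;
        if (end > vm_end)
            end = vm_end;

        printf("cluster window: [%#lx, %#lx), %lu pages\n",
               address, end, (end - address) / TOY_PAGE_SIZE);
        return 0;
    }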
1315 1315
1316 static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, 1316 static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1317 struct vm_area_struct *vma, struct page *check_page) 1317 struct vm_area_struct *vma, struct page *check_page)
1318 { 1318 {
1319 struct mm_struct *mm = vma->vm_mm; 1319 struct mm_struct *mm = vma->vm_mm;
1320 pgd_t *pgd; 1320 pgd_t *pgd;
1321 pud_t *pud; 1321 pud_t *pud;
1322 pmd_t *pmd; 1322 pmd_t *pmd;
1323 pte_t *pte; 1323 pte_t *pte;
1324 pte_t pteval; 1324 pte_t pteval;
1325 spinlock_t *ptl; 1325 spinlock_t *ptl;
1326 struct page *page; 1326 struct page *page;
1327 unsigned long address; 1327 unsigned long address;
1328 unsigned long end; 1328 unsigned long end;
1329 int ret = SWAP_AGAIN; 1329 int ret = SWAP_AGAIN;
1330 int locked_vma = 0; 1330 int locked_vma = 0;
1331 1331
1332 address = (vma->vm_start + cursor) & CLUSTER_MASK; 1332 address = (vma->vm_start + cursor) & CLUSTER_MASK;
1333 end = address + CLUSTER_SIZE; 1333 end = address + CLUSTER_SIZE;
1334 if (address < vma->vm_start) 1334 if (address < vma->vm_start)
1335 address = vma->vm_start; 1335 address = vma->vm_start;
1336 if (end > vma->vm_end) 1336 if (end > vma->vm_end)
1337 end = vma->vm_end; 1337 end = vma->vm_end;
1338 1338
1339 pgd = pgd_offset(mm, address); 1339 pgd = pgd_offset(mm, address);
1340 if (!pgd_present(*pgd)) 1340 if (!pgd_present(*pgd))
1341 return ret; 1341 return ret;
1342 1342
1343 pud = pud_offset(pgd, address); 1343 pud = pud_offset(pgd, address);
1344 if (!pud_present(*pud)) 1344 if (!pud_present(*pud))
1345 return ret; 1345 return ret;
1346 1346
1347 pmd = pmd_offset(pud, address); 1347 pmd = pmd_offset(pud, address);
1348 if (!pmd_present(*pmd)) 1348 if (!pmd_present(*pmd))
1349 return ret; 1349 return ret;
1350 1350
1351 /* 1351 /*
1352 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, 1352 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
1353 * keep the sem while scanning the cluster for mlocking pages. 1353 * keep the sem while scanning the cluster for mlocking pages.
1354 */ 1354 */
1355 if (down_read_trylock(&vma->vm_mm->mmap_sem)) { 1355 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
1356 locked_vma = (vma->vm_flags & VM_LOCKED); 1356 locked_vma = (vma->vm_flags & VM_LOCKED);
1357 if (!locked_vma) 1357 if (!locked_vma)
1358 up_read(&vma->vm_mm->mmap_sem); /* don't need it */ 1358 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
1359 } 1359 }
1360 1360
1361 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 1361 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1362 1362
1363 /* Update high watermark before we lower rss */ 1363 /* Update high watermark before we lower rss */
1364 update_hiwater_rss(mm); 1364 update_hiwater_rss(mm);
1365 1365
1366 for (; address < end; pte++, address += PAGE_SIZE) { 1366 for (; address < end; pte++, address += PAGE_SIZE) {
1367 if (!pte_present(*pte)) 1367 if (!pte_present(*pte))
1368 continue; 1368 continue;
1369 page = vm_normal_page(vma, address, *pte); 1369 page = vm_normal_page(vma, address, *pte);
1370 BUG_ON(!page || PageAnon(page)); 1370 BUG_ON(!page || PageAnon(page));
1371 1371
1372 if (locked_vma) { 1372 if (locked_vma) {
1373 mlock_vma_page(page); /* no-op if already mlocked */ 1373 mlock_vma_page(page); /* no-op if already mlocked */
1374 if (page == check_page) 1374 if (page == check_page)
1375 ret = SWAP_MLOCK; 1375 ret = SWAP_MLOCK;
1376 continue; /* don't unmap */ 1376 continue; /* don't unmap */
1377 } 1377 }
1378 1378
1379 if (ptep_clear_flush_young_notify(vma, address, pte)) 1379 if (ptep_clear_flush_young_notify(vma, address, pte))
1380 continue; 1380 continue;
1381 1381
1382 /* Nuke the page table entry. */ 1382 /* Nuke the page table entry. */
1383 flush_cache_page(vma, address, pte_pfn(*pte)); 1383 flush_cache_page(vma, address, pte_pfn(*pte));
1384 pteval = ptep_clear_flush_notify(vma, address, pte); 1384 pteval = ptep_clear_flush_notify(vma, address, pte);
1385 1385
1386 /* If nonlinear, store the file page offset in the pte. */ 1386 /* If nonlinear, store the file page offset in the pte. */
1387 if (page->index != linear_page_index(vma, address)) 1387 if (page->index != linear_page_index(vma, address))
1388 set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); 1388 set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
1389 1389
1390 /* Move the dirty bit to the physical page now the pte is gone. */ 1390 /* Move the dirty bit to the physical page now the pte is gone. */
1391 if (pte_dirty(pteval)) 1391 if (pte_dirty(pteval))
1392 set_page_dirty(page); 1392 set_page_dirty(page);
1393 1393
1394 page_remove_rmap(page); 1394 page_remove_rmap(page);
1395 page_cache_release(page); 1395 page_cache_release(page);
1396 dec_mm_counter(mm, MM_FILEPAGES); 1396 dec_mm_counter(mm, MM_FILEPAGES);
1397 (*mapcount)--; 1397 (*mapcount)--;
1398 } 1398 }
1399 pte_unmap_unlock(pte - 1, ptl); 1399 pte_unmap_unlock(pte - 1, ptl);
1400 if (locked_vma) 1400 if (locked_vma)
1401 up_read(&vma->vm_mm->mmap_sem); 1401 up_read(&vma->vm_mm->mmap_sem);
1402 return ret; 1402 return ret;
1403 } 1403 }
1404 1404
1405 bool is_vma_temporary_stack(struct vm_area_struct *vma) 1405 bool is_vma_temporary_stack(struct vm_area_struct *vma)
1406 { 1406 {
1407 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1407 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1408 1408
1409 if (!maybe_stack) 1409 if (!maybe_stack)
1410 return false; 1410 return false;
1411 1411
1412 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == 1412 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1413 VM_STACK_INCOMPLETE_SETUP) 1413 VM_STACK_INCOMPLETE_SETUP)
1414 return true; 1414 return true;
1415 1415
1416 return false; 1416 return false;
1417 } 1417 }
1418 1418
1419 /** 1419 /**
1420 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based 1420 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
1421 * rmap method 1421 * rmap method
1422 * @page: the page to unmap/unlock 1422 * @page: the page to unmap/unlock
1423 * @flags: action and flags 1423 * @flags: action and flags
1424 * 1424 *
1425 * Find all the mappings of a page using the mapping pointer and the vma chains 1425 * Find all the mappings of a page using the mapping pointer and the vma chains
1426 * contained in the anon_vma struct it points to. 1426 * contained in the anon_vma struct it points to.
1427 * 1427 *
1428 * This function is only called from try_to_unmap/try_to_munlock for 1428 * This function is only called from try_to_unmap/try_to_munlock for
1429 * anonymous pages. 1429 * anonymous pages.
1430 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1430 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1431 * where the page was found will be held for write. So, we won't recheck 1431 * where the page was found will be held for write. So, we won't recheck
1432 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1432 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1433 * VM_LOCKED. 1433 * VM_LOCKED.
1434 */ 1434 */
1435 static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1435 static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1436 { 1436 {
1437 struct anon_vma *anon_vma; 1437 struct anon_vma *anon_vma;
1438 struct anon_vma_chain *avc; 1438 struct anon_vma_chain *avc;
1439 int ret = SWAP_AGAIN; 1439 int ret = SWAP_AGAIN;
1440 1440
1441 anon_vma = page_lock_anon_vma(page); 1441 anon_vma = page_lock_anon_vma(page);
1442 if (!anon_vma) 1442 if (!anon_vma)
1443 return ret; 1443 return ret;
1444 1444
1445 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1445 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1446 struct vm_area_struct *vma = avc->vma; 1446 struct vm_area_struct *vma = avc->vma;
1447 unsigned long address; 1447 unsigned long address;
1448 1448
1449 /* 1449 /*
1450 * During exec, a temporary VMA is set up and later moved. 1450 * During exec, a temporary VMA is set up and later moved.
1451 * The VMA is moved under the anon_vma lock but not the 1451 * The VMA is moved under the anon_vma lock but not the
1452 * page tables leading to a race where migration cannot 1452 * page tables leading to a race where migration cannot
1453 * find the migration ptes. Rather than increasing the 1453 * find the migration ptes. Rather than increasing the
1454 * locking requirements of exec(), migration skips 1454 * locking requirements of exec(), migration skips
1455 * temporary VMAs until after exec() completes. 1455 * temporary VMAs until after exec() completes.
1456 */ 1456 */
1457 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && 1457 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
1458 is_vma_temporary_stack(vma)) 1458 is_vma_temporary_stack(vma))
1459 continue; 1459 continue;
1460 1460
1461 address = vma_address(page, vma); 1461 address = vma_address(page, vma);
1462 if (address == -EFAULT) 1462 if (address == -EFAULT)
1463 continue; 1463 continue;
1464 ret = try_to_unmap_one(page, vma, address, flags); 1464 ret = try_to_unmap_one(page, vma, address, flags);
1465 if (ret != SWAP_AGAIN || !page_mapped(page)) 1465 if (ret != SWAP_AGAIN || !page_mapped(page))
1466 break; 1466 break;
1467 } 1467 }
1468 1468
1469 page_unlock_anon_vma(anon_vma); 1469 page_unlock_anon_vma(anon_vma);
1470 return ret; 1470 return ret;
1471 } 1471 }
1472 1472
1473 /** 1473 /**
1474 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method 1474 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1475 * @page: the page to unmap/unlock 1475 * @page: the page to unmap/unlock
1476 * @flags: action and flags 1476 * @flags: action and flags
1477 * 1477 *
1478 * Find all the mappings of a page using the mapping pointer and the vma chains 1478 * Find all the mappings of a page using the mapping pointer and the vma chains
1479 * contained in the address_space struct it points to. 1479 * contained in the address_space struct it points to.
1480 * 1480 *
1481 * This function is only called from try_to_unmap/try_to_munlock for 1481 * This function is only called from try_to_unmap/try_to_munlock for
1482 * object-based pages. 1482 * object-based pages.
1483 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1483 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1484 * where the page was found will be held for write. So, we won't recheck 1484 * where the page was found will be held for write. So, we won't recheck
1485 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1485 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1486 * VM_LOCKED. 1486 * VM_LOCKED.
1487 */ 1487 */
1488 static int try_to_unmap_file(struct page *page, enum ttu_flags flags) 1488 static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1489 { 1489 {
1490 struct address_space *mapping = page->mapping; 1490 struct address_space *mapping = page->mapping;
1491 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1491 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1492 struct vm_area_struct *vma; 1492 struct vm_area_struct *vma;
1493 struct prio_tree_iter iter; 1493 struct prio_tree_iter iter;
1494 int ret = SWAP_AGAIN; 1494 int ret = SWAP_AGAIN;
1495 unsigned long cursor; 1495 unsigned long cursor;
1496 unsigned long max_nl_cursor = 0; 1496 unsigned long max_nl_cursor = 0;
1497 unsigned long max_nl_size = 0; 1497 unsigned long max_nl_size = 0;
1498 unsigned int mapcount; 1498 unsigned int mapcount;
1499 1499
1500 mutex_lock(&mapping->i_mmap_mutex); 1500 mutex_lock(&mapping->i_mmap_mutex);
1501 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1501 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1502 unsigned long address = vma_address(page, vma); 1502 unsigned long address = vma_address(page, vma);
1503 if (address == -EFAULT) 1503 if (address == -EFAULT)
1504 continue; 1504 continue;
1505 ret = try_to_unmap_one(page, vma, address, flags); 1505 ret = try_to_unmap_one(page, vma, address, flags);
1506 if (ret != SWAP_AGAIN || !page_mapped(page)) 1506 if (ret != SWAP_AGAIN || !page_mapped(page))
1507 goto out; 1507 goto out;
1508 } 1508 }
1509 1509
1510 if (list_empty(&mapping->i_mmap_nonlinear)) 1510 if (list_empty(&mapping->i_mmap_nonlinear))
1511 goto out; 1511 goto out;
1512 1512
1513 /* 1513 /*
1514 * We don't bother to try to find the munlocked page in nonlinears. 1514 * We don't bother to try to find the munlocked page in nonlinears.
1515 * It's costly. Instead, later, page reclaim logic may call 1515 * It's costly. Instead, later, page reclaim logic may call
1516 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. 1516 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
1517 */ 1517 */
1518 if (TTU_ACTION(flags) == TTU_MUNLOCK) 1518 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1519 goto out; 1519 goto out;
1520 1520
1521 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1521 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1522 shared.vm_set.list) { 1522 shared.vm_set.list) {
1523 cursor = (unsigned long) vma->vm_private_data; 1523 cursor = (unsigned long) vma->vm_private_data;
1524 if (cursor > max_nl_cursor) 1524 if (cursor > max_nl_cursor)
1525 max_nl_cursor = cursor; 1525 max_nl_cursor = cursor;
1526 cursor = vma->vm_end - vma->vm_start; 1526 cursor = vma->vm_end - vma->vm_start;
1527 if (cursor > max_nl_size) 1527 if (cursor > max_nl_size)
1528 max_nl_size = cursor; 1528 max_nl_size = cursor;
1529 } 1529 }
1530 1530
1531 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ 1531 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
1532 ret = SWAP_FAIL; 1532 ret = SWAP_FAIL;
1533 goto out; 1533 goto out;
1534 } 1534 }
1535 1535
1536 /* 1536 /*
1537 * We don't try to search for this page in the nonlinear vmas, 1537 * We don't try to search for this page in the nonlinear vmas,
1538 * and page_referenced wouldn't have found it anyway. Instead 1538 * and page_referenced wouldn't have found it anyway. Instead
1539 * just walk the nonlinear vmas trying to age and unmap some. 1539 * just walk the nonlinear vmas trying to age and unmap some.
1540 * The mapcount of the page we came in with is irrelevant, 1540 * The mapcount of the page we came in with is irrelevant,
1541 * but even so use it as a guide to how hard we should try? 1541 * but even so use it as a guide to how hard we should try?
1542 */ 1542 */
1543 mapcount = page_mapcount(page); 1543 mapcount = page_mapcount(page);
1544 if (!mapcount) 1544 if (!mapcount)
1545 goto out; 1545 goto out;
1546 cond_resched(); 1546 cond_resched();
1547 1547
1548 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1548 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
1549 if (max_nl_cursor == 0) 1549 if (max_nl_cursor == 0)
1550 max_nl_cursor = CLUSTER_SIZE; 1550 max_nl_cursor = CLUSTER_SIZE;
1551 1551
1552 do { 1552 do {
1553 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1553 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1554 shared.vm_set.list) { 1554 shared.vm_set.list) {
1555 cursor = (unsigned long) vma->vm_private_data; 1555 cursor = (unsigned long) vma->vm_private_data;
1556 while ( cursor < max_nl_cursor && 1556 while ( cursor < max_nl_cursor &&
1557 cursor < vma->vm_end - vma->vm_start) { 1557 cursor < vma->vm_end - vma->vm_start) {
1558 if (try_to_unmap_cluster(cursor, &mapcount, 1558 if (try_to_unmap_cluster(cursor, &mapcount,
1559 vma, page) == SWAP_MLOCK) 1559 vma, page) == SWAP_MLOCK)
1560 ret = SWAP_MLOCK; 1560 ret = SWAP_MLOCK;
1561 cursor += CLUSTER_SIZE; 1561 cursor += CLUSTER_SIZE;
1562 vma->vm_private_data = (void *) cursor; 1562 vma->vm_private_data = (void *) cursor;
1563 if ((int)mapcount <= 0) 1563 if ((int)mapcount <= 0)
1564 goto out; 1564 goto out;
1565 } 1565 }
1566 vma->vm_private_data = (void *) max_nl_cursor; 1566 vma->vm_private_data = (void *) max_nl_cursor;
1567 } 1567 }
1568 cond_resched(); 1568 cond_resched();
1569 max_nl_cursor += CLUSTER_SIZE; 1569 max_nl_cursor += CLUSTER_SIZE;
1570 } while (max_nl_cursor <= max_nl_size); 1570 } while (max_nl_cursor <= max_nl_size);
1571 1571
1572 /* 1572 /*
1573 * Don't loop forever (perhaps all the remaining pages are 1573 * Don't loop forever (perhaps all the remaining pages are
1574 * in locked vmas). Reset cursor on all unreserved nonlinear 1574 * in locked vmas). Reset cursor on all unreserved nonlinear
1575 * vmas, now forgetting on which ones it had fallen behind. 1575 * vmas, now forgetting on which ones it had fallen behind.
1576 */ 1576 */
1577 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1577 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1578 vma->vm_private_data = NULL; 1578 vma->vm_private_data = NULL;
1579 out: 1579 out:
1580 mutex_unlock(&mapping->i_mmap_mutex); 1580 mutex_unlock(&mapping->i_mmap_mutex);
1581 return ret; 1581 return ret;
1582 } 1582 }
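The nonlinear fallback above spreads pressure round-robin: each nonlinear vma keeps its scan cursor in vm_private_data, every outer pass raises max_nl_cursor by one cluster, and the loop ends once that cap exceeds the largest (cluster-rounded) vma. The model below replays that schedule with a printf standing in for try_to_unmap_cluster(); the struct and sizes are invented for illustration:

    #include <stdio.h>

    #define CLUSTER 0x20000UL   /* model value: 32 pages of 4 KiB */

    struct toy_nl_vma {
        unsigned long size;     /* vm_end - vm_start */
        unsigned long cursor;   /* plays the role of vm_private_data */
    };

    int main(void)
    {
        struct toy_nl_vma vmas[] = { { 3 * CLUSTER, 0 }, { 5 * CLUSTER, 0 } };
        unsigned long max_nl_size = 5 * CLUSTER;   /* largest vma, cluster rounded */
        unsigned long max_nl_cursor = CLUSTER;     /* as when all cursors start at 0 */

        do {
            for (unsigned int i = 0; i < 2; i++) {
                struct toy_nl_vma *vma = &vmas[i];
                unsigned long cursor = vma->cursor;

                while (cursor < max_nl_cursor && cursor < vma->size) {
                    /* Here the kernel would call try_to_unmap_cluster(). */
                    printf("pass cap %#lx: vma %u, cluster at %#lx\n",
                           max_nl_cursor, i, cursor);
                    cursor += CLUSTER;
                    vma->cursor = cursor;
                }
                vma->cursor = max_nl_cursor;
            }
            max_nl_cursor += CLUSTER;
        } while (max_nl_cursor <= max_nl_size);

        return 0;
    }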

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a mapping, try again later
 * SWAP_FAIL	- the page is unswappable
 * SWAP_MLOCK	- page is mlocked.
 */
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
	int ret;

	BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));

	if (unlikely(PageKsm(page)))
		ret = try_to_unmap_ksm(page, flags);
	else if (PageAnon(page))
		ret = try_to_unmap_anon(page, flags);
	else
		ret = try_to_unmap_file(page, flags);
	if (ret != SWAP_MLOCK && !page_mapped(page))
		ret = SWAP_SUCCESS;
	return ret;
}
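/*
 * Illustrative sketch, not part of the original file: a reclaim-style
 * caller typically holds the page lock and dispatches on the return
 * values documented above.  The helper name and the error codes used
 * as action values below are hypothetical, chosen only for the example.
 */
#if 0	/* example only, never compiled */
static int example_reclaim_unmap(struct page *page)
{
	BUG_ON(!PageLocked(page));	/* try_to_unmap() insists on this too */

	switch (try_to_unmap(page, TTU_UNMAP)) {
	case SWAP_FAIL:		/* unswappable: keep the page active */
		return -EBUSY;
	case SWAP_AGAIN:	/* missed a mapping: retry on a later pass */
		return -EAGAIN;
	case SWAP_MLOCK:	/* mlocked: belongs on the unevictable list */
		return -EPERM;
	default:		/* SWAP_SUCCESS: safe to try freeing the page */
		return 0;
	}
}
#endif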

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 *
 * Return values are:
 *
 * SWAP_AGAIN	- no vma is holding page mlocked, or,
 * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
 * SWAP_FAIL	- page cannot be located at present
 * SWAP_MLOCK	- page is now mlocked.
 */
int try_to_munlock(struct page *page)
{
	VM_BUG_ON(!PageLocked(page) || PageLRU(page));

	if (unlikely(PageKsm(page)))
		return try_to_unmap_ksm(page, TTU_MUNLOCK);
	else if (PageAnon(page))
		return try_to_unmap_anon(page, TTU_MUNLOCK);
	else
		return try_to_unmap_file(page, TTU_MUNLOCK);
}
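/*
 * Illustrative sketch, not part of the original file: the munlock path
 * asks try_to_munlock() whether any *other* vma still holds the page
 * mlocked before letting the page back onto a normal LRU list.  The
 * helper below is a hypothetical outline, not the mm/mlock.c call site.
 */
#if 0	/* example only, never compiled */
static void example_munlock_isolated_page(struct page *page)
{
	/*
	 * SWAP_MLOCK means some other vma re-mlocked the page, so it
	 * must stay unevictable; any other return value means no vma
	 * objected and the page may become evictable again.
	 */
	if (try_to_munlock(page) != SWAP_MLOCK)
		count_vm_event(UNEVICTABLE_PGMUNLOCKED);

	putback_lru_page(page);	/* rechecks mlock state via page_evictable() */
}
#endif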

void __put_anon_vma(struct anon_vma *anon_vma)
{
	struct anon_vma *root = anon_vma->root;

	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
		anon_vma_free(root);

	anon_vma_free(anon_vma);
}
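/*
 * Illustrative sketch, not part of the original file: __put_anon_vma()
 * is the slow path of dropping an anon_vma reference; the fast-path
 * wrapper looks roughly like this (see include/linux/rmap.h for the
 * real helper -- the name below is hypothetical).
 */
#if 0	/* example only, never compiled */
static inline void example_put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}
#endif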

#ifdef CONFIG_MIGRATION
/*
 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
 * Called by migrate.c to remove migration ptes, but might be used more later.
 */
static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct anon_vma *anon_vma;
	struct anon_vma_chain *avc;
	int ret = SWAP_AGAIN;

	/*
	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_sem. Users without mmap_sem are required to
	 * take a reference count to prevent the anon_vma disappearing
	 */
	anon_vma = page_anon_vma(page);
	if (!anon_vma)
		return ret;
	anon_vma_lock(anon_vma);
	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	anon_vma_unlock(anon_vma);
	return ret;
}

static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;

	if (!mapping)
		return ret;
	mutex_lock(&mapping->i_mmap_mutex);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	/*
	 * No nonlinear handling: being always shared, nonlinear vmas
	 * never contain migration ptes.  Decide what to do about this
	 * limitation to linear when we need rmap_walk() on nonlinear.
	 */
	mutex_unlock(&mapping->i_mmap_mutex);
	return ret;
}

int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	VM_BUG_ON(!PageLocked(page));

	if (unlikely(PageKsm(page)))
		return rmap_walk_ksm(page, rmap_one, arg);
	else if (PageAnon(page))
		return rmap_walk_anon(page, rmap_one, arg);
	else
		return rmap_walk_file(page, rmap_one, arg);
}
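/*
 * Illustrative sketch, not part of the original file: rmap_walk() hands
 * every (vma, address) mapping of the page to the rmap_one callback and
 * stops as soon as the callback returns anything other than SWAP_AGAIN.
 * The two functions below are hypothetical, showing only the calling
 * convention; the page must be locked, as the VM_BUG_ON above insists.
 */
#if 0	/* example only, never compiled */
static int example_count_one_mapping(struct page *page,
		struct vm_area_struct *vma, unsigned long address, void *arg)
{
	(*(int *)arg)++;	/* record one mapping */
	return SWAP_AGAIN;	/* keep walking */
}

static int example_count_mappings(struct page *page)
{
	int nr = 0;

	rmap_walk(page, example_count_one_mapping, &nr);
	return nr;
}
#endif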
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following three functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
static void __hugepage_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

void hugepage_add_anon_rmap(struct page *page,
			    struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int first;

	BUG_ON(!PageLocked(page));
	BUG_ON(!anon_vma);
	/* address might be in next vma when migration races vma_adjust */
	first = atomic_inc_and_test(&page->_mapcount);
	if (first)
		__hugepage_set_anon_rmap(page, vma, address, 0);
}

void hugepage_add_new_anon_rmap(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	atomic_set(&page->_mapcount, 0);
	__hugepage_set_anon_rmap(page, vma, address, 1);
}
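/*
 * Illustrative sketch, not part of the original file: a hugetlb fault
 * path would pick between the two helpers above depending on whether
 * the huge page was just allocated for this vma or is an existing
 * anonymous page gaining another mapping.  The helper name and the
 * new_page flag are hypothetical, not the exact mm/hugetlb.c call sites.
 */
#if 0	/* example only, never compiled */
static void example_hugetlb_map_anon(struct page *page, bool new_page,
		struct vm_area_struct *vma, unsigned long address)
{
	if (new_page)		/* freshly allocated, exclusively owned */
		hugepage_add_new_anon_rmap(page, vma, address);
	else			/* already-anonymous page, extra mapping */
		hugepage_add_anon_rmap(page, vma, address);
}
#endif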
#endif /* CONFIG_HUGETLB_PAGE */
