Commit f01ef569cddb1a8627b1c6b3a134998ad1cf4b22
Exists in master and in 4 other branches.
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
    mm: properly reflect task dirty limits in dirty_exceeded logic
    writeback: don't busy retry writeback on new/freeing inodes
    writeback: scale IO chunk size up to half device bandwidth
    writeback: trace global_dirty_state
    writeback: introduce max-pause and pass-good dirty limits
    writeback: introduce smoothed global dirty limit
    writeback: consolidate variable names in balance_dirty_pages()
    writeback: show bdi write bandwidth in debugfs
    writeback: bdi write bandwidth estimation
    writeback: account per-bdi accumulated written pages
    writeback: make writeback_control.nr_to_write straight
    writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
    writeback: trace event writeback_queue_io
    writeback: trace event writeback_single_inode
    writeback: remove .nonblocking and .encountered_congestion
    writeback: remove writeback_control.more_io
    writeback: skip balance_dirty_pages() for in-memory fs
    writeback: add bdi_dirty_limit() kernel-doc
    writeback: avoid extra sync work at enqueue time
    writeback: elevate queue_io() into wb_writeback()
    ...

Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c
Showing 15 changed files (inline diff).
fs/block_dev.c
/*
 *  linux/fs/block_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/kmemleak.h>
#include <asm/uaccess.h>
#include "internal.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static const struct address_space_operations def_blk_aops;

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

inline struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
-
 EXPORT_SYMBOL(I_BDEV);

 /*
- * move the inode from it's current bdi to the a new bdi. if the inode is dirty
- * we need to move it onto the dirty list of @dst so that the inode is always
- * on the right list.
+ * Move the inode from its current bdi to a new bdi.  If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
  */
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	spin_lock(&inode_wb_list_lock);
+	struct backing_dev_info *old = inode->i_data.backing_dev_info;
+
+	if (unlikely(dst == old))		/* deadlock avoidance */
+		return;
+	bdi_lock_two(&old->wb, &dst->wb);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&old->wb.list_lock);
+	spin_unlock(&dst->wb.list_lock);
 }
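
bdi_lock_two() itself is not part of this file; it is added elsewhere in this series as part of splitting the global inode_wb_list_lock into per-bdi_writeback list_locks. A sketch of the idea, assuming the helper takes the two list_locks in a stable (pointer) order so that two concurrent switches in opposite directions cannot deadlock:

	/* Sketch, not from this file: lock two wb list_locks lowest-address
	 * first so AB/BA acquisition order cannot deadlock. */
	static void bdi_lock_two(struct bdi_writeback *wb1,
				 struct bdi_writeback *wb2)
	{
		if (wb1 < wb2) {
			spin_lock(&wb1->list_lock);
			spin_lock_nested(&wb2->list_lock, 1);
		} else {
			spin_lock(&wb2->list_lock);
			spin_lock_nested(&wb1->list_lock, 1);
		}
	}

This is also why the new code bails out early when dst == old: handing the same bdi_writeback to such a helper twice would self-deadlock, hence the /* deadlock avoidance */ check above.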

static sector_t max_block(struct block_device *bdev)
{
	sector_t retval = ~((sector_t)0);
	loff_t sz = i_size_read(bdev->bd_inode);

	if (sz) {
		unsigned int size = block_size(bdev);
		unsigned int sizebits = blksize_bits(size);
		retval = (sz >> sizebits);
	}
	return retval;
}

/* Kill _all_ buffers and pagecache , dirty or not.. */
static void kill_bdev(struct block_device *bdev)
{
	if (bdev->bd_inode->i_mapping->nrpages == 0)
		return;
	invalidate_bh_lrus();
	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
}

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is power of two
	 * and it's value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);
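
A typical caller of this family is a filesystem's mount path. A minimal sketch, assuming a hypothetical examplefs_fill_super (the function name and the 1024-byte preference are illustrative, not from this commit):

	/* Hypothetical fill_super fragment: sb_min_blocksize() clamps the
	 * requested size to the device's logical block size, and returns 0
	 * (via sb_set_blocksize) if the size is rejected. */
	static int examplefs_fill_super(struct super_block *sb, void *data,
					int silent)
	{
		if (!sb_min_blocksize(sb, 1024))
			return -EINVAL;
		/* sb->s_blocksize and sb->s_blocksize_bits are now valid */
		return 0;
	}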

static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	if (iblock >= max_block(I_BDEV(inode))) {
		if (create)
			return -EIO;

		/*
		 * for reads, we're just trying to fill a partial page.
		 * return a hole, they will have to call get_block again
		 * before they can fill it, and they will get -EIO at that
		 * time
		 */
		return 0;
	}
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static int
blkdev_get_blocks(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	sector_t end_block = max_block(I_BDEV(inode));
	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;

	if ((iblock + max_blocks) > end_block) {
		max_blocks = end_block - iblock;
		if ((long)max_blocks <= 0) {
			if (create)
				return -EIO;	/* write fully beyond EOF */
			/*
			 * It is a read which is fully beyond EOF.  We return
			 * a !buffer_mapped buffer
			 */
			max_blocks = 0;
		}
	}

	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	bh->b_size = max_blocks << inode->i_blkbits;
	if (max_blocks)
		set_buffer_mapped(bh);
	return 0;
}

static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
			loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;

	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
}

int __sync_blockdev(struct block_device *bdev, int wait)
{
	if (!bdev)
		return 0;
	if (!wait)
		return filemap_flush(bdev->bd_inode->i_mapping);
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	return __sync_blockdev(bdev, 1);
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = sync_filesystem(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can unfreeze the frozen filesystem actually when multiple
 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
 * actually.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (++bdev->bd_fsfreeze_count > 1) {
		/*
		 * We don't even need to grab a reference - the first call
		 * to freeze_bdev grab an active reference and only the last
		 * thaw_bdev drops it.
		 */
		sb = get_super(bdev);
		drop_super(sb);
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return sb;
	}

	sb = get_active_super(bdev);
	if (!sb)
		goto out;
	error = freeze_super(sb);
	if (error) {
		deactivate_super(sb);
		bdev->bd_fsfreeze_count--;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return ERR_PTR(error);
	}
	deactivate_super(sb);
out:
	sync_blockdev(bdev);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return sb;	/* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	int error = -EINVAL;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (!bdev->bd_fsfreeze_count)
		goto out;

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
		goto out;

	if (!sb)
		goto out;

	error = thaw_super(sb);
	if (error) {
		bdev->bd_fsfreeze_count++;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return error;
	}
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return 0;
}
EXPORT_SYMBOL(thaw_bdev);
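
Because bd_fsfreeze_count makes freeze/thaw pairs nest, only the final thaw_bdev() actually unfreezes. A minimal usage sketch based on the kernel-doc above (the snapshot step is illustrative):

	/* Illustrative snapshot sequence: freeze_bdev() returns the superblock
	 * (or NULL if nothing is mounted), and the matching thaw_bdev()
	 * drops the freeze count again. */
	static int snapshot_example(struct block_device *bdev)
	{
		struct super_block *sb = freeze_bdev(bdev);

		if (IS_ERR(sb))
			return PTR_ERR(sb);
		/* ... take the snapshot while the fs is quiesced ... */
		return thaw_bdev(bdev, sb);
	}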

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static int blkdev_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return block_write_begin(mapping, pos, len, flags, pagep,
				 blkdev_get_block);
}

static int blkdev_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	int ret;
	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	unlock_page(page);
	page_cache_release(page);

	return ret;
}

/*
 * private llseek:
 * for a block special file file->f_path.dentry->d_inode->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t block_llseek(struct file *file, loff_t offset, int origin)
{
	struct inode *bd_inode = file->f_mapping->host;
	loff_t size;
	loff_t retval;

	mutex_lock(&bd_inode->i_mutex);
	size = i_size_read(bd_inode);

	retval = -EINVAL;
	switch (origin) {
		case SEEK_END:
			offset += size;
			break;
		case SEEK_CUR:
			offset += file->f_pos;
		case SEEK_SET:
			break;
		default:
			goto out;
	}
	if (offset >= 0 && offset <= size) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
		}
		retval = offset;
	}
out:
	mutex_unlock(&bd_inode->i_mutex);
	return retval;
}

int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *bd_inode = filp->f_mapping->host;
	struct block_device *bdev = I_BDEV(bd_inode);
	int error;

	/*
	 * There is no need to serialise calls to blkdev_issue_flush with
	 * i_mutex and doing so causes performance issues with concurrent
	 * O_SYNC writers to a block device.
	 */
	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
	if (error == -EOPNOTSUPP)
		error = 0;

	return error;
}
EXPORT_SYMBOL(blkdev_fsync);

/*
 * pseudo-fs
 */

static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static struct kmem_cache * bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;
}

static void bdev_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	struct bdev_inode *bdi = BDEV_I(inode);

	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(bdev_cachep, bdi);
}

static void bdev_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, bdev_i_callback);
}

static void init_once(void *foo)
{
	struct bdev_inode *ei = (struct bdev_inode *) foo;
	struct block_device *bdev = &ei->bdev;

	memset(bdev, 0, sizeof(*bdev));
	mutex_init(&bdev->bd_mutex);
	INIT_LIST_HEAD(&bdev->bd_inodes);
	INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
	INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
	inode_init_once(&ei->vfs_inode);
	/* Initialize mutex for freeze. */
	mutex_init(&bdev->bd_fsfreeze_mutex);
}

static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}

static void bdev_evict_inode(struct inode *inode)
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
	struct list_head *p;
	truncate_inode_pages(&inode->i_data, 0);
	invalidate_inode_buffers(inode); /* is it needed here? */
	end_writeback(inode);
	spin_lock(&bdev_lock);
	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
		__bd_forget(list_entry(p, struct inode, i_devices));
	}
	list_del_init(&bdev->bd_list);
	spin_unlock(&bdev_lock);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static struct dentry *bd_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
}

static struct file_system_type bd_type = {
	.name = "bdev",
	.mount = bd_mount,
	.kill_sb = kill_anon_super,
};

struct super_block *blockdev_superblock __read_mostly;

void __init bdev_cache_init(void)
{
	int err;
	struct vfsmount *bd_mnt;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	/*
	 * This vfsmount structure is only used to obtain the
	 * blockdev_superblock, so tell kmemleak not to report it.
	 */
	kmemleak_not_leak(bd_mnt);
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

/*
 * Most likely _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody will need really large one.
 * Keep in mind that it will be fed through icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
	return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}

static LIST_HEAD(all_bdevs);

struct block_device *bdget(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = iget5_locked(blockdev_superblock, hash(dev),
			bdev_test, bdev_set, &dev);

	if (!inode)
		return NULL;

	bdev = &BDEV_I(inode)->bdev;

	if (inode->i_state & I_NEW) {
		bdev->bd_contains = NULL;
		bdev->bd_inode = inode;
		bdev->bd_block_size = (1 << inode->i_blkbits);
		bdev->bd_part_count = 0;
		bdev->bd_invalidated = 0;
		inode->i_mode = S_IFBLK;
		inode->i_rdev = dev;
		inode->i_bdev = bdev;
		inode->i_data.a_ops = &def_blk_aops;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		inode->i_data.backing_dev_info = &default_backing_dev_info;
		spin_lock(&bdev_lock);
		list_add(&bdev->bd_list, &all_bdevs);
		spin_unlock(&bdev_lock);
		unlock_new_inode(inode);
	}
	return bdev;
}

EXPORT_SYMBOL(bdget);
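
bdget() either finds the existing inode in the bdev pseudo-fs hash or initializes a fresh one, and every successful bdget() (or bdgrab()) must be balanced by bdput(). A minimal sketch of the pattern; the device number 8:0 is only an illustration:

	/* Illustrative only: look up the block_device for dev_t 8:0 and
	 * report its page-cache footprint. */
	static long example_bdev_pages(void)
	{
		struct block_device *bdev = bdget(MKDEV(8, 0));
		long pages;

		if (!bdev)
			return -ENOMEM;
		pages = bdev->bd_inode->i_mapping->nrpages;
		bdput(bdev);	/* balances the bdget() above */
		return pages;
	}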

/**
 * bdgrab -- Grab a reference to an already referenced block device
 * @bdev:	Block device to grab a reference to.
 */
struct block_device *bdgrab(struct block_device *bdev)
{
	ihold(bdev->bd_inode);
	return bdev;
}

long nr_blockdev_pages(void)
{
	struct block_device *bdev;
	long ret = 0;
	spin_lock(&bdev_lock);
	list_for_each_entry(bdev, &all_bdevs, bd_list) {
		ret += bdev->bd_inode->i_mapping->nrpages;
	}
	spin_unlock(&bdev_lock);
	return ret;
}

void bdput(struct block_device *bdev)
{
	iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);

static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
	if (bdev) {
		ihold(bdev->bd_inode);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
		if (!inode->i_bdev) {
			/*
			 * We take an additional reference to bd_inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			ihold(bdev->bd_inode);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}

/* Call when you free inode */

void bd_forget(struct inode *inode)
{
	struct block_device *bdev = NULL;

	spin_lock(&bdev_lock);
	if (inode->i_bdev) {
		if (!sb_is_blkdev_sb(inode->i_sb))
			bdev = inode->i_bdev;
		__bd_forget(inode);
	}
	spin_unlock(&bdev_lock);

	if (bdev)
		iput(bdev->bd_inode);
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @whole: whole block device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
			 void *holder)
{
	if (bdev->bd_holder == holder)
		return true;	 /* already a holder */
	else if (bdev->bd_holder != NULL)
		return false;	 /* held by someone else */
	else if (bdev->bd_contains == bdev)
		return true;	 /* is a whole device which isn't held */

	else if (whole->bd_holder == bd_may_claim)
		return true;	 /* is a partition of a device that is being partitioned */
	else if (whole->bd_holder != NULL)
		return false;	 /* is a partition of a held device */
	else
		return true;	 /* is a partition of an un-held device */
}

/**
 * bd_prepare_to_claim - prepare to claim a block device
 * @bdev: block device of interest
 * @whole: the whole device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Prepare to claim @bdev.  This function fails if @bdev is already
 * claimed by another holder and waits if another claiming is in
 * progress.  This function doesn't actually claim.  On successful
 * return, the caller has ownership of bd_claiming and bd_holder[s].
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
 * it multiple times.
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
static int bd_prepare_to_claim(struct block_device *bdev,
			       struct block_device *whole, void *holder)
{
retry:
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, whole, holder))
		return -EBUSY;

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		spin_lock(&bdev_lock);
		goto retry;
	}

	/* yay, all mine */
	return 0;
}

/**
 * bd_start_claiming - start claiming a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 *
 * @bdev is about to be opened exclusively.  Check @bdev can be opened
 * exclusively and mark that an exclusive open is in progress.  Each
 * successful call to this function must be matched with a call to
 * either bd_finish_claiming() or bd_abort_claiming() (which do not
 * fail).
 *
 * This function is used to gain exclusive access to the block device
 * without actually causing other exclusive open attempts to fail. It
 * should be used when the open sequence itself requires exclusive
 * access but may subsequently fail.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to the block device containing @bdev on success, ERR_PTR()
 * value on failure.
 */
static struct block_device *bd_start_claiming(struct block_device *bdev,
					      void *holder)
{
	struct gendisk *disk;
	struct block_device *whole;
	int partno, err;

	might_sleep();

	/*
	 * @bdev might not have been initialized properly yet, look up
	 * and grab the outer block device the hard way.
	 */
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		return ERR_PTR(-ENXIO);

	/*
	 * Normally, @bdev should equal what's returned from bdget_disk()
	 * if partno is 0; however, some drivers (floppy) use multiple
	 * bdev's for the same physical device and @bdev may be one of the
	 * aliases.  Keep @bdev if partno is 0.  This means claimer
	 * tracking is broken for those devices but it has always been that
	 * way.
	 */
	if (partno)
		whole = bdget_disk(disk, 0);
	else
		whole = bdgrab(bdev);

	module_put(disk->fops->owner);
	put_disk(disk);
	if (!whole)
		return ERR_PTR(-ENOMEM);

	/* prepare to claim, if successful, mark claiming in progress */
	spin_lock(&bdev_lock);

	err = bd_prepare_to_claim(bdev, whole, holder);
	if (err == 0) {
		whole->bd_claiming = holder;
		spin_unlock(&bdev_lock);
		return whole;
	} else {
		spin_unlock(&bdev_lock);
		bdput(whole);
		return ERR_PTR(err);
	}
}
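
bd_start_claiming() is the first half of the exclusive-open handshake; per the kernel-doc above, every successful call must be matched by bd_finish_claiming() or bd_abort_claiming(), neither of which appears in this excerpt. A hedged sketch of the pattern, where do_open() is hypothetical and the finish/abort signatures are assumptions:

	/* Sketch only: bd_finish_claiming()/bd_abort_claiming() are not shown
	 * in this excerpt, so their exact signatures here are assumed. */
	static int example_exclusive_open(struct block_device *bdev, void *holder)
	{
		struct block_device *whole = bd_start_claiming(bdev, holder);
		int err;

		if (IS_ERR(whole))
			return PTR_ERR(whole);
		err = do_open(bdev);	/* hypothetical open step that may fail */
		if (err)
			bd_abort_claiming(whole, holder);
		else
			bd_finish_claiming(bdev, whole, holder);
		return err;
	}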
798 | 802 | ||
799 | #ifdef CONFIG_SYSFS | 803 | #ifdef CONFIG_SYSFS |
800 | struct bd_holder_disk { | 804 | struct bd_holder_disk { |
801 | struct list_head list; | 805 | struct list_head list; |
802 | struct gendisk *disk; | 806 | struct gendisk *disk; |
803 | int refcnt; | 807 | int refcnt; |
804 | }; | 808 | }; |
805 | 809 | ||
806 | static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, | 810 | static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, |
807 | struct gendisk *disk) | 811 | struct gendisk *disk) |
808 | { | 812 | { |
809 | struct bd_holder_disk *holder; | 813 | struct bd_holder_disk *holder; |
810 | 814 | ||
811 | list_for_each_entry(holder, &bdev->bd_holder_disks, list) | 815 | list_for_each_entry(holder, &bdev->bd_holder_disks, list) |
812 | if (holder->disk == disk) | 816 | if (holder->disk == disk) |
813 | return holder; | 817 | return holder; |
814 | return NULL; | 818 | return NULL; |
815 | } | 819 | } |
816 | 820 | ||
817 | static int add_symlink(struct kobject *from, struct kobject *to) | 821 | static int add_symlink(struct kobject *from, struct kobject *to) |
818 | { | 822 | { |
819 | return sysfs_create_link(from, to, kobject_name(to)); | 823 | return sysfs_create_link(from, to, kobject_name(to)); |
820 | } | 824 | } |
821 | 825 | ||
822 | static void del_symlink(struct kobject *from, struct kobject *to) | 826 | static void del_symlink(struct kobject *from, struct kobject *to) |
823 | { | 827 | { |
824 | sysfs_remove_link(from, kobject_name(to)); | 828 | sysfs_remove_link(from, kobject_name(to)); |
825 | } | 829 | } |
826 | 830 | ||
827 | /** | 831 | /** |
828 | * bd_link_disk_holder - create symlinks between holding disk and slave bdev | 832 | * bd_link_disk_holder - create symlinks between holding disk and slave bdev |
829 | * @bdev: the claimed slave bdev | 833 | * @bdev: the claimed slave bdev |
830 | * @disk: the holding disk | 834 | * @disk: the holding disk |
831 | * | 835 | * |
832 | * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. | 836 | * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. |
833 | * | 837 | * |
834 | * This functions creates the following sysfs symlinks. | 838 | * This functions creates the following sysfs symlinks. |
835 | * | 839 | * |
836 | * - from "slaves" directory of the holder @disk to the claimed @bdev | 840 | * - from "slaves" directory of the holder @disk to the claimed @bdev |
837 | * - from "holders" directory of the @bdev to the holder @disk | 841 | * - from "holders" directory of the @bdev to the holder @disk |
838 | * | 842 | * |
839 | * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is | 843 | * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is |
840 | * passed to bd_link_disk_holder(), then: | 844 | * passed to bd_link_disk_holder(), then: |
841 | * | 845 | * |
842 | * /sys/block/dm-0/slaves/sda --> /sys/block/sda | 846 | * /sys/block/dm-0/slaves/sda --> /sys/block/sda |
843 | * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 | 847 | * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 |
844 | * | 848 | * |
845 | * The caller must have claimed @bdev before calling this function and | 849 | * The caller must have claimed @bdev before calling this function and |
846 | * ensure that both @bdev and @disk are valid during the creation and | 850 | * ensure that both @bdev and @disk are valid during the creation and |
847 | * lifetime of these symlinks. | 851 | * lifetime of these symlinks. |
848 | * | 852 | * |
849 | * CONTEXT: | 853 | * CONTEXT: |
850 | * Might sleep. | 854 | * Might sleep. |
851 | * | 855 | * |
852 | * RETURNS: | 856 | * RETURNS: |
853 | * 0 on success, -errno on failure. | 857 | * 0 on success, -errno on failure. |
854 | */ | 858 | */ |
855 | int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) | 859 | int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) |
856 | { | 860 | { |
857 | struct bd_holder_disk *holder; | 861 | struct bd_holder_disk *holder; |
858 | int ret = 0; | 862 | int ret = 0; |
859 | 863 | ||
860 | mutex_lock(&bdev->bd_mutex); | 864 | mutex_lock(&bdev->bd_mutex); |
861 | 865 | ||
862 | WARN_ON_ONCE(!bdev->bd_holder); | 866 | WARN_ON_ONCE(!bdev->bd_holder); |
863 | 867 | ||
864 | /* FIXME: remove the following once add_disk() handles errors */ | 868 | /* FIXME: remove the following once add_disk() handles errors */ |
865 | if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) | 869 | if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) |
866 | goto out_unlock; | 870 | goto out_unlock; |
867 | 871 | ||
868 | holder = bd_find_holder_disk(bdev, disk); | 872 | holder = bd_find_holder_disk(bdev, disk); |
869 | if (holder) { | 873 | if (holder) { |
870 | holder->refcnt++; | 874 | holder->refcnt++; |
871 | goto out_unlock; | 875 | goto out_unlock; |
872 | } | 876 | } |
873 | 877 | ||
874 | holder = kzalloc(sizeof(*holder), GFP_KERNEL); | 878 | holder = kzalloc(sizeof(*holder), GFP_KERNEL); |
875 | if (!holder) { | 879 | if (!holder) { |
876 | ret = -ENOMEM; | 880 | ret = -ENOMEM; |
877 | goto out_unlock; | 881 | goto out_unlock; |
878 | } | 882 | } |
879 | 883 | ||
880 | INIT_LIST_HEAD(&holder->list); | 884 | INIT_LIST_HEAD(&holder->list); |
881 | holder->disk = disk; | 885 | holder->disk = disk; |
882 | holder->refcnt = 1; | 886 | holder->refcnt = 1; |
883 | 887 | ||
884 | ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); | 888 | ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); |
885 | if (ret) | 889 | if (ret) |
886 | goto out_free; | 890 | goto out_free; |
887 | 891 | ||
888 | ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); | 892 | ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); |
889 | if (ret) | 893 | if (ret) |
890 | goto out_del; | 894 | goto out_del; |
891 | /* | 895 | /* |
892 | * bdev could be deleted beneath us which would implicitly destroy | 896 | * bdev could be deleted beneath us which would implicitly destroy |
893 | * the holder directory. Hold on to it. | 897 | * the holder directory. Hold on to it. |
894 | */ | 898 | */ |
895 | kobject_get(bdev->bd_part->holder_dir); | 899 | kobject_get(bdev->bd_part->holder_dir); |
896 | 900 | ||
897 | list_add(&holder->list, &bdev->bd_holder_disks); | 901 | list_add(&holder->list, &bdev->bd_holder_disks); |
898 | goto out_unlock; | 902 | goto out_unlock; |
899 | 903 | ||
900 | out_del: | 904 | out_del: |
901 | del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); | 905 | del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); |
902 | out_free: | 906 | out_free: |
903 | kfree(holder); | 907 | kfree(holder); |
904 | out_unlock: | 908 | out_unlock: |
905 | mutex_unlock(&bdev->bd_mutex); | 909 | mutex_unlock(&bdev->bd_mutex); |
906 | return ret; | 910 | return ret; |
907 | } | 911 | } |
908 | EXPORT_SYMBOL_GPL(bd_link_disk_holder); | 912 | EXPORT_SYMBOL_GPL(bd_link_disk_holder); |
909 | 913 | ||
910 | /** | 914 | /** |
911 | * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() | 915 | * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() |
912 | * @bdev: the claimed slave bdev | 916 | * @bdev: the claimed slave bdev |
913 | * @disk: the holding disk | 917 | * @disk: the holding disk |
914 | * | 918 | * |
915 | * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. | 919 | * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. |
916 | * | 920 | * |
917 | * CONTEXT: | 921 | * CONTEXT: |
918 | * Might sleep. | 922 | * Might sleep. |
919 | */ | 923 | */ |
920 | void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) | 924 | void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) |
921 | { | 925 | { |
922 | struct bd_holder_disk *holder; | 926 | struct bd_holder_disk *holder; |
923 | 927 | ||
924 | mutex_lock(&bdev->bd_mutex); | 928 | mutex_lock(&bdev->bd_mutex); |
925 | 929 | ||
926 | holder = bd_find_holder_disk(bdev, disk); | 930 | holder = bd_find_holder_disk(bdev, disk); |
927 | 931 | ||
928 | if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { | 932 | if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { |
929 | del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); | 933 | del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); |
930 | del_symlink(bdev->bd_part->holder_dir, | 934 | del_symlink(bdev->bd_part->holder_dir, |
931 | &disk_to_dev(disk)->kobj); | 935 | &disk_to_dev(disk)->kobj); |
932 | kobject_put(bdev->bd_part->holder_dir); | 936 | kobject_put(bdev->bd_part->holder_dir); |
933 | list_del_init(&holder->list); | 937 | list_del_init(&holder->list); |
934 | kfree(holder); | 938 | kfree(holder); |
935 | } | 939 | } |
936 | 940 | ||
937 | mutex_unlock(&bdev->bd_mutex); | 941 | mutex_unlock(&bdev->bd_mutex); |
938 | } | 942 | } |
939 | EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); | 943 | EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); |
940 | #endif | 944 | #endif |
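For context, a minimal sketch of how a stacking driver in the dm/md mold would pair these two exports around an exclusively claimed component device. The helper names, the mode mask and the error handling are illustrative assumptions, not code from any real driver:

    #include <linux/fs.h>
    #include <linux/err.h>
    #include <linux/genhd.h>

    /* Hypothetical stacking driver: claim a slave bdev and create the
     * slaves/holders symlinks described above. */
    static int example_add_slave(struct gendisk *holder_disk,
                                 const char *path, void *holder)
    {
            const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
            struct block_device *bdev;
            int ret;

            bdev = blkdev_get_by_path(path, mode, holder);
            if (IS_ERR(bdev))
                    return PTR_ERR(bdev);

            ret = bd_link_disk_holder(bdev, holder_disk);
            if (ret)
                    blkdev_put(bdev, mode);
            return ret;
    }

    /* Teardown mirrors setup: unlink first, then drop the claim. */
    static void example_del_slave(struct gendisk *holder_disk,
                                  struct block_device *bdev)
    {
            bd_unlink_disk_holder(bdev, holder_disk);
            blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
    }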
941 | 945 | ||
942 | /** | 946 | /** |
943 | * flush_disk - invalidates all buffer-cache entries on a disk | 947 | * flush_disk - invalidates all buffer-cache entries on a disk |
944 | * | 948 | * |
945 | * @bdev: struct block device to be flushed | 949 | * @bdev: struct block device to be flushed |
946 | * @kill_dirty: flag to guide handling of dirty inodes | 950 | * @kill_dirty: flag to guide handling of dirty inodes |
947 | * | 951 | * |
948 | * Invalidates all buffer-cache entries on a disk. It should be called | 952 | * Invalidates all buffer-cache entries on a disk. It should be called |
949 | * when a disk has been changed -- either by a media change or online | 953 | * when a disk has been changed -- either by a media change or online |
950 | * resize. | 954 | * resize. |
951 | */ | 955 | */ |
952 | static void flush_disk(struct block_device *bdev, bool kill_dirty) | 956 | static void flush_disk(struct block_device *bdev, bool kill_dirty) |
953 | { | 957 | { |
954 | if (__invalidate_device(bdev, kill_dirty)) { | 958 | if (__invalidate_device(bdev, kill_dirty)) { |
955 | char name[BDEVNAME_SIZE] = ""; | 959 | char name[BDEVNAME_SIZE] = ""; |
956 | 960 | ||
957 | if (bdev->bd_disk) | 961 | if (bdev->bd_disk) |
958 | disk_name(bdev->bd_disk, 0, name); | 962 | disk_name(bdev->bd_disk, 0, name); |
959 | printk(KERN_WARNING "VFS: busy inodes on changed media or " | 963 | printk(KERN_WARNING "VFS: busy inodes on changed media or " |
960 | "resized disk %s\n", name); | 964 | "resized disk %s\n", name); |
961 | } | 965 | } |
962 | 966 | ||
963 | if (!bdev->bd_disk) | 967 | if (!bdev->bd_disk) |
964 | return; | 968 | return; |
965 | if (disk_partitionable(bdev->bd_disk)) | 969 | if (disk_partitionable(bdev->bd_disk)) |
966 | bdev->bd_invalidated = 1; | 970 | bdev->bd_invalidated = 1; |
967 | } | 971 | } |
968 | 972 | ||
969 | /** | 973 | /** |
970 | * check_disk_size_change - checks for disk size change and adjusts bdev size. | 974 | * check_disk_size_change - checks for disk size change and adjusts bdev size. |
971 | * @disk: struct gendisk to check | 975 | * @disk: struct gendisk to check |
972 | * @bdev: struct bdev to adjust. | 976 | * @bdev: struct bdev to adjust. |
973 | * | 977 | * |
974 | * This routine checks whether the bdev size matches the disk size and | 978 | * This routine checks whether the bdev size matches the disk size and |
975 | * adjusts it if they differ. | 979 | * adjusts it if they differ. |
976 | */ | 980 | */ |
977 | void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) | 981 | void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) |
978 | { | 982 | { |
979 | loff_t disk_size, bdev_size; | 983 | loff_t disk_size, bdev_size; |
980 | 984 | ||
981 | disk_size = (loff_t)get_capacity(disk) << 9; | 985 | disk_size = (loff_t)get_capacity(disk) << 9; |
982 | bdev_size = i_size_read(bdev->bd_inode); | 986 | bdev_size = i_size_read(bdev->bd_inode); |
983 | if (disk_size != bdev_size) { | 987 | if (disk_size != bdev_size) { |
984 | char name[BDEVNAME_SIZE]; | 988 | char name[BDEVNAME_SIZE]; |
985 | 989 | ||
986 | disk_name(disk, 0, name); | 990 | disk_name(disk, 0, name); |
987 | printk(KERN_INFO | 991 | printk(KERN_INFO |
988 | "%s: detected capacity change from %lld to %lld\n", | 992 | "%s: detected capacity change from %lld to %lld\n", |
989 | name, bdev_size, disk_size); | 993 | name, bdev_size, disk_size); |
990 | i_size_write(bdev->bd_inode, disk_size); | 994 | i_size_write(bdev->bd_inode, disk_size); |
991 | flush_disk(bdev, false); | 995 | flush_disk(bdev, false); |
992 | } | 996 | } |
993 | } | 997 | } |
994 | EXPORT_SYMBOL(check_disk_size_change); | 998 | EXPORT_SYMBOL(check_disk_size_change); |
995 | 999 | ||
996 | /** | 1000 | /** |
997 | * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back | 1001 | * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back |
998 | * @disk: struct gendisk to be revalidated | 1002 | * @disk: struct gendisk to be revalidated |
999 | * | 1003 | * |
1000 | * This routine is a wrapper for the lower-level driver's revalidate_disk | 1004 | * This routine is a wrapper for the lower-level driver's revalidate_disk |
1001 | * call-back. It performs the common pre- and post-processing needed | 1005 | * call-back. It performs the common pre- and post-processing needed |
1002 | * for all revalidate_disk operations. | 1006 | * for all revalidate_disk operations. |
1003 | */ | 1007 | */ |
1004 | int revalidate_disk(struct gendisk *disk) | 1008 | int revalidate_disk(struct gendisk *disk) |
1005 | { | 1009 | { |
1006 | struct block_device *bdev; | 1010 | struct block_device *bdev; |
1007 | int ret = 0; | 1011 | int ret = 0; |
1008 | 1012 | ||
1009 | if (disk->fops->revalidate_disk) | 1013 | if (disk->fops->revalidate_disk) |
1010 | ret = disk->fops->revalidate_disk(disk); | 1014 | ret = disk->fops->revalidate_disk(disk); |
1011 | 1015 | ||
1012 | bdev = bdget_disk(disk, 0); | 1016 | bdev = bdget_disk(disk, 0); |
1013 | if (!bdev) | 1017 | if (!bdev) |
1014 | return ret; | 1018 | return ret; |
1015 | 1019 | ||
1016 | mutex_lock(&bdev->bd_mutex); | 1020 | mutex_lock(&bdev->bd_mutex); |
1017 | check_disk_size_change(disk, bdev); | 1021 | check_disk_size_change(disk, bdev); |
1018 | mutex_unlock(&bdev->bd_mutex); | 1022 | mutex_unlock(&bdev->bd_mutex); |
1019 | bdput(bdev); | 1023 | bdput(bdev); |
1020 | return ret; | 1024 | return ret; |
1021 | } | 1025 | } |
1022 | EXPORT_SYMBOL(revalidate_disk); | 1026 | EXPORT_SYMBOL(revalidate_disk); |
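The usual caller side, as a hedged sketch: a driver that learns of a capacity change updates the gendisk and then lets revalidate_disk() push the new size into the whole-disk bdev through check_disk_size_change() above. The function name and the assumption that @new_sectors comes from the hardware are illustrative:

    #include <linux/genhd.h>

    /* Hypothetical resize path: update the gendisk capacity, then let
     * revalidate_disk() resize the bdev inode and flush stale pages. */
    static void example_resize(struct gendisk *disk, sector_t new_sectors)
    {
            set_capacity(disk, new_sectors);
            revalidate_disk(disk);
    }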
1023 | 1027 | ||
1024 | /* | 1028 | /* |
1025 | * This routine checks whether a removable medium has been changed, | 1029 | * This routine checks whether a removable medium has been changed, |
1026 | * and invalidates all buffer-cache-entries in that case. This | 1030 | * and invalidates all buffer-cache-entries in that case. This |
1027 | * is a relatively slow routine, so we have to try to minimize using | 1031 | * is a relatively slow routine, so we have to try to minimize using |
1028 | * it. Thus it is called only upon a 'mount' or 'open'. This | 1032 | * it. Thus it is called only upon a 'mount' or 'open'. This |
1029 | * is the best way of combining speed and utility, I think. | 1033 | * is the best way of combining speed and utility, I think. |
1030 | * People changing diskettes in the middle of an operation deserve | 1034 | * People changing diskettes in the middle of an operation deserve |
1031 | * to lose :-) | 1035 | * to lose :-) |
1032 | */ | 1036 | */ |
1033 | int check_disk_change(struct block_device *bdev) | 1037 | int check_disk_change(struct block_device *bdev) |
1034 | { | 1038 | { |
1035 | struct gendisk *disk = bdev->bd_disk; | 1039 | struct gendisk *disk = bdev->bd_disk; |
1036 | const struct block_device_operations *bdops = disk->fops; | 1040 | const struct block_device_operations *bdops = disk->fops; |
1037 | unsigned int events; | 1041 | unsigned int events; |
1038 | 1042 | ||
1039 | events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | | 1043 | events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | |
1040 | DISK_EVENT_EJECT_REQUEST); | 1044 | DISK_EVENT_EJECT_REQUEST); |
1041 | if (!(events & DISK_EVENT_MEDIA_CHANGE)) | 1045 | if (!(events & DISK_EVENT_MEDIA_CHANGE)) |
1042 | return 0; | 1046 | return 0; |
1043 | 1047 | ||
1044 | flush_disk(bdev, true); | 1048 | flush_disk(bdev, true); |
1045 | if (bdops->revalidate_disk) | 1049 | if (bdops->revalidate_disk) |
1046 | bdops->revalidate_disk(bdev->bd_disk); | 1050 | bdops->revalidate_disk(bdev->bd_disk); |
1047 | return 1; | 1051 | return 1; |
1048 | } | 1052 | } |
1049 | 1053 | ||
1050 | EXPORT_SYMBOL(check_disk_change); | 1054 | EXPORT_SYMBOL(check_disk_change); |
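A hedged sketch of where check_disk_change() sits in a removable-media driver's ->open(); the driver and message are invented for illustration, and the real work happens in the flush_disk() and ->revalidate_disk() calls above:

    #include <linux/blkdev.h>
    #include <linux/kernel.h>

    /* Hypothetical removable-media ->open(): check_disk_change()
     * returns 1 if the medium was replaced, in which case the caches
     * have already been invalidated. */
    static int example_open(struct block_device *bdev, fmode_t mode)
    {
            if (check_disk_change(bdev))
                    pr_info("example: medium changed\n");
            return 0;
    }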
1051 | 1055 | ||
1052 | void bd_set_size(struct block_device *bdev, loff_t size) | 1056 | void bd_set_size(struct block_device *bdev, loff_t size) |
1053 | { | 1057 | { |
1054 | unsigned bsize = bdev_logical_block_size(bdev); | 1058 | unsigned bsize = bdev_logical_block_size(bdev); |
1055 | 1059 | ||
1056 | bdev->bd_inode->i_size = size; | 1060 | bdev->bd_inode->i_size = size; |
1057 | while (bsize < PAGE_CACHE_SIZE) { | 1061 | while (bsize < PAGE_CACHE_SIZE) { |
1058 | if (size & bsize) | 1062 | if (size & bsize) |
1059 | break; | 1063 | break; |
1060 | bsize <<= 1; | 1064 | bsize <<= 1; |
1061 | } | 1065 | } |
1062 | bdev->bd_block_size = bsize; | 1066 | bdev->bd_block_size = bsize; |
1063 | bdev->bd_inode->i_blkbits = blksize_bits(bsize); | 1067 | bdev->bd_inode->i_blkbits = blksize_bits(bsize); |
1064 | } | 1068 | } |
1065 | EXPORT_SYMBOL(bd_set_size); | 1069 | EXPORT_SYMBOL(bd_set_size); |
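A worked example of the loop above, with illustrative numbers: for a 512-byte logical block size and size = 77824 bytes (19 * 4096), size & bsize is zero for 512, 1024 and 2048, so bsize doubles each time; at 4096 the corresponding bit of size is set and the loop stops. The soft block size is therefore the largest power of two, capped at PAGE_CACHE_SIZE, that still divides the device size.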
1066 | 1070 | ||
1067 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); | 1071 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); |
1068 | 1072 | ||
1069 | /* | 1073 | /* |
1070 | * bd_mutex locking: | 1074 | * bd_mutex locking: |
1071 | * | 1075 | * |
1072 | * mutex_lock(part->bd_mutex) | 1076 | * mutex_lock(part->bd_mutex) |
1073 | * mutex_lock_nested(whole->bd_mutex, 1) | 1077 | * mutex_lock_nested(whole->bd_mutex, 1) |
1074 | */ | 1078 | */ |
1075 | 1079 | ||
1076 | static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) | 1080 | static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) |
1077 | { | 1081 | { |
1078 | struct gendisk *disk; | 1082 | struct gendisk *disk; |
1079 | int ret; | 1083 | int ret; |
1080 | int partno; | 1084 | int partno; |
1081 | int perm = 0; | 1085 | int perm = 0; |
1082 | 1086 | ||
1083 | if (mode & FMODE_READ) | 1087 | if (mode & FMODE_READ) |
1084 | perm |= MAY_READ; | 1088 | perm |= MAY_READ; |
1085 | if (mode & FMODE_WRITE) | 1089 | if (mode & FMODE_WRITE) |
1086 | perm |= MAY_WRITE; | 1090 | perm |= MAY_WRITE; |
1087 | /* | 1091 | /* |
1088 | * hooks: /n/, see "layering violations". | 1092 | * hooks: /n/, see "layering violations". |
1089 | */ | 1093 | */ |
1090 | if (!for_part) { | 1094 | if (!for_part) { |
1091 | ret = devcgroup_inode_permission(bdev->bd_inode, perm); | 1095 | ret = devcgroup_inode_permission(bdev->bd_inode, perm); |
1092 | if (ret != 0) { | 1096 | if (ret != 0) { |
1093 | bdput(bdev); | 1097 | bdput(bdev); |
1094 | return ret; | 1098 | return ret; |
1095 | } | 1099 | } |
1096 | } | 1100 | } |
1097 | 1101 | ||
1098 | restart: | 1102 | restart: |
1099 | 1103 | ||
1100 | ret = -ENXIO; | 1104 | ret = -ENXIO; |
1101 | disk = get_gendisk(bdev->bd_dev, &partno); | 1105 | disk = get_gendisk(bdev->bd_dev, &partno); |
1102 | if (!disk) | 1106 | if (!disk) |
1103 | goto out; | 1107 | goto out; |
1104 | 1108 | ||
1105 | disk_block_events(disk); | 1109 | disk_block_events(disk); |
1106 | mutex_lock_nested(&bdev->bd_mutex, for_part); | 1110 | mutex_lock_nested(&bdev->bd_mutex, for_part); |
1107 | if (!bdev->bd_openers) { | 1111 | if (!bdev->bd_openers) { |
1108 | bdev->bd_disk = disk; | 1112 | bdev->bd_disk = disk; |
1109 | bdev->bd_contains = bdev; | 1113 | bdev->bd_contains = bdev; |
1110 | if (!partno) { | 1114 | if (!partno) { |
1111 | struct backing_dev_info *bdi; | 1115 | struct backing_dev_info *bdi; |
1112 | 1116 | ||
1113 | ret = -ENXIO; | 1117 | ret = -ENXIO; |
1114 | bdev->bd_part = disk_get_part(disk, partno); | 1118 | bdev->bd_part = disk_get_part(disk, partno); |
1115 | if (!bdev->bd_part) | 1119 | if (!bdev->bd_part) |
1116 | goto out_clear; | 1120 | goto out_clear; |
1117 | 1121 | ||
1118 | ret = 0; | 1122 | ret = 0; |
1119 | if (disk->fops->open) { | 1123 | if (disk->fops->open) { |
1120 | ret = disk->fops->open(bdev, mode); | 1124 | ret = disk->fops->open(bdev, mode); |
1121 | if (ret == -ERESTARTSYS) { | 1125 | if (ret == -ERESTARTSYS) { |
1122 | /* Lost a race with 'disk' being | 1126 | /* Lost a race with 'disk' being |
1123 | * deleted, try again. | 1127 | * deleted, try again. |
1124 | * See md.c | 1128 | * See md.c |
1125 | */ | 1129 | */ |
1126 | disk_put_part(bdev->bd_part); | 1130 | disk_put_part(bdev->bd_part); |
1127 | bdev->bd_part = NULL; | 1131 | bdev->bd_part = NULL; |
1128 | bdev->bd_disk = NULL; | 1132 | bdev->bd_disk = NULL; |
1129 | mutex_unlock(&bdev->bd_mutex); | 1133 | mutex_unlock(&bdev->bd_mutex); |
1130 | disk_unblock_events(disk); | 1134 | disk_unblock_events(disk); |
1131 | module_put(disk->fops->owner); | 1135 | module_put(disk->fops->owner); |
1132 | put_disk(disk); | 1136 | put_disk(disk); |
1133 | goto restart; | 1137 | goto restart; |
1134 | } | 1138 | } |
1135 | } | 1139 | } |
1136 | 1140 | ||
1137 | if (!ret && !bdev->bd_openers) { | 1141 | if (!ret && !bdev->bd_openers) { |
1138 | bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); | 1142 | bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); |
1139 | bdi = blk_get_backing_dev_info(bdev); | 1143 | bdi = blk_get_backing_dev_info(bdev); |
1140 | if (bdi == NULL) | 1144 | if (bdi == NULL) |
1141 | bdi = &default_backing_dev_info; | 1145 | bdi = &default_backing_dev_info; |
1142 | bdev_inode_switch_bdi(bdev->bd_inode, bdi); | 1146 | bdev_inode_switch_bdi(bdev->bd_inode, bdi); |
1143 | } | 1147 | } |
1144 | 1148 | ||
1145 | /* | 1149 | /* |
1146 | * If the device is invalidated, rescan partition | 1150 | * If the device is invalidated, rescan partition |
1147 | * if open succeeded or failed with -ENOMEDIUM. | 1151 | * if open succeeded or failed with -ENOMEDIUM. |
1148 | * The latter is necessary to prevent ghost | 1152 | * The latter is necessary to prevent ghost |
1149 | * partitions on a removed medium. | 1153 | * partitions on a removed medium. |
1150 | */ | 1154 | */ |
1151 | if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) | 1155 | if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) |
1152 | rescan_partitions(disk, bdev); | 1156 | rescan_partitions(disk, bdev); |
1153 | if (ret) | 1157 | if (ret) |
1154 | goto out_clear; | 1158 | goto out_clear; |
1155 | } else { | 1159 | } else { |
1156 | struct block_device *whole; | 1160 | struct block_device *whole; |
1157 | whole = bdget_disk(disk, 0); | 1161 | whole = bdget_disk(disk, 0); |
1158 | ret = -ENOMEM; | 1162 | ret = -ENOMEM; |
1159 | if (!whole) | 1163 | if (!whole) |
1160 | goto out_clear; | 1164 | goto out_clear; |
1161 | BUG_ON(for_part); | 1165 | BUG_ON(for_part); |
1162 | ret = __blkdev_get(whole, mode, 1); | 1166 | ret = __blkdev_get(whole, mode, 1); |
1163 | if (ret) | 1167 | if (ret) |
1164 | goto out_clear; | 1168 | goto out_clear; |
1165 | bdev->bd_contains = whole; | 1169 | bdev->bd_contains = whole; |
1166 | bdev_inode_switch_bdi(bdev->bd_inode, | 1170 | bdev_inode_switch_bdi(bdev->bd_inode, |
1167 | whole->bd_inode->i_data.backing_dev_info); | 1171 | whole->bd_inode->i_data.backing_dev_info); |
1168 | bdev->bd_part = disk_get_part(disk, partno); | 1172 | bdev->bd_part = disk_get_part(disk, partno); |
1169 | if (!(disk->flags & GENHD_FL_UP) || | 1173 | if (!(disk->flags & GENHD_FL_UP) || |
1170 | !bdev->bd_part || !bdev->bd_part->nr_sects) { | 1174 | !bdev->bd_part || !bdev->bd_part->nr_sects) { |
1171 | ret = -ENXIO; | 1175 | ret = -ENXIO; |
1172 | goto out_clear; | 1176 | goto out_clear; |
1173 | } | 1177 | } |
1174 | bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); | 1178 | bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); |
1175 | } | 1179 | } |
1176 | } else { | 1180 | } else { |
1177 | if (bdev->bd_contains == bdev) { | 1181 | if (bdev->bd_contains == bdev) { |
1178 | ret = 0; | 1182 | ret = 0; |
1179 | if (bdev->bd_disk->fops->open) | 1183 | if (bdev->bd_disk->fops->open) |
1180 | ret = bdev->bd_disk->fops->open(bdev, mode); | 1184 | ret = bdev->bd_disk->fops->open(bdev, mode); |
1181 | /* the same as first opener case, read comment there */ | 1185 | /* the same as first opener case, read comment there */ |
1182 | if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) | 1186 | if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) |
1183 | rescan_partitions(bdev->bd_disk, bdev); | 1187 | rescan_partitions(bdev->bd_disk, bdev); |
1184 | if (ret) | 1188 | if (ret) |
1185 | goto out_unlock_bdev; | 1189 | goto out_unlock_bdev; |
1186 | } | 1190 | } |
1187 | /* only one opener holds refs to the module and disk */ | 1191 | /* only one opener holds refs to the module and disk */ |
1188 | module_put(disk->fops->owner); | 1192 | module_put(disk->fops->owner); |
1189 | put_disk(disk); | 1193 | put_disk(disk); |
1190 | } | 1194 | } |
1191 | bdev->bd_openers++; | 1195 | bdev->bd_openers++; |
1192 | if (for_part) | 1196 | if (for_part) |
1193 | bdev->bd_part_count++; | 1197 | bdev->bd_part_count++; |
1194 | mutex_unlock(&bdev->bd_mutex); | 1198 | mutex_unlock(&bdev->bd_mutex); |
1195 | disk_unblock_events(disk); | 1199 | disk_unblock_events(disk); |
1196 | return 0; | 1200 | return 0; |
1197 | 1201 | ||
1198 | out_clear: | 1202 | out_clear: |
1199 | disk_put_part(bdev->bd_part); | 1203 | disk_put_part(bdev->bd_part); |
1200 | bdev->bd_disk = NULL; | 1204 | bdev->bd_disk = NULL; |
1201 | bdev->bd_part = NULL; | 1205 | bdev->bd_part = NULL; |
1202 | bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); | 1206 | bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); |
1203 | if (bdev != bdev->bd_contains) | 1207 | if (bdev != bdev->bd_contains) |
1204 | __blkdev_put(bdev->bd_contains, mode, 1); | 1208 | __blkdev_put(bdev->bd_contains, mode, 1); |
1205 | bdev->bd_contains = NULL; | 1209 | bdev->bd_contains = NULL; |
1206 | out_unlock_bdev: | 1210 | out_unlock_bdev: |
1207 | mutex_unlock(&bdev->bd_mutex); | 1211 | mutex_unlock(&bdev->bd_mutex); |
1208 | disk_unblock_events(disk); | 1212 | disk_unblock_events(disk); |
1209 | module_put(disk->fops->owner); | 1213 | module_put(disk->fops->owner); |
1210 | put_disk(disk); | 1214 | put_disk(disk); |
1211 | out: | 1215 | out: |
1212 | bdput(bdev); | 1216 | bdput(bdev); |
1213 | 1217 | ||
1214 | return ret; | 1218 | return ret; |
1215 | } | 1219 | } |
1216 | 1220 | ||
1217 | /** | 1221 | /** |
1218 | * blkdev_get - open a block device | 1222 | * blkdev_get - open a block device |
1219 | * @bdev: block_device to open | 1223 | * @bdev: block_device to open |
1220 | * @mode: FMODE_* mask | 1224 | * @mode: FMODE_* mask |
1221 | * @holder: exclusive holder identifier | 1225 | * @holder: exclusive holder identifier |
1222 | * | 1226 | * |
1223 | * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is | 1227 | * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is |
1224 | * open with exclusive access. Specifying %FMODE_EXCL with %NULL | 1228 | * open with exclusive access. Specifying %FMODE_EXCL with %NULL |
1225 | * @holder is invalid. Exclusive opens may nest for the same @holder. | 1229 | * @holder is invalid. Exclusive opens may nest for the same @holder. |
1226 | * | 1230 | * |
1227 | * On success, the reference count of @bdev is unchanged. On failure, | 1231 | * On success, the reference count of @bdev is unchanged. On failure, |
1228 | * @bdev is put. | 1232 | * @bdev is put. |
1229 | * | 1233 | * |
1230 | * CONTEXT: | 1234 | * CONTEXT: |
1231 | * Might sleep. | 1235 | * Might sleep. |
1232 | * | 1236 | * |
1233 | * RETURNS: | 1237 | * RETURNS: |
1234 | * 0 on success, -errno on failure. | 1238 | * 0 on success, -errno on failure. |
1235 | */ | 1239 | */ |
1236 | int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) | 1240 | int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) |
1237 | { | 1241 | { |
1238 | struct block_device *whole = NULL; | 1242 | struct block_device *whole = NULL; |
1239 | int res; | 1243 | int res; |
1240 | 1244 | ||
1241 | WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); | 1245 | WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); |
1242 | 1246 | ||
1243 | if ((mode & FMODE_EXCL) && holder) { | 1247 | if ((mode & FMODE_EXCL) && holder) { |
1244 | whole = bd_start_claiming(bdev, holder); | 1248 | whole = bd_start_claiming(bdev, holder); |
1245 | if (IS_ERR(whole)) { | 1249 | if (IS_ERR(whole)) { |
1246 | bdput(bdev); | 1250 | bdput(bdev); |
1247 | return PTR_ERR(whole); | 1251 | return PTR_ERR(whole); |
1248 | } | 1252 | } |
1249 | } | 1253 | } |
1250 | 1254 | ||
1251 | res = __blkdev_get(bdev, mode, 0); | 1255 | res = __blkdev_get(bdev, mode, 0); |
1252 | 1256 | ||
1253 | if (whole) { | 1257 | if (whole) { |
1254 | struct gendisk *disk = whole->bd_disk; | 1258 | struct gendisk *disk = whole->bd_disk; |
1255 | 1259 | ||
1256 | /* finish claiming */ | 1260 | /* finish claiming */ |
1257 | mutex_lock(&bdev->bd_mutex); | 1261 | mutex_lock(&bdev->bd_mutex); |
1258 | spin_lock(&bdev_lock); | 1262 | spin_lock(&bdev_lock); |
1259 | 1263 | ||
1260 | if (!res) { | 1264 | if (!res) { |
1261 | BUG_ON(!bd_may_claim(bdev, whole, holder)); | 1265 | BUG_ON(!bd_may_claim(bdev, whole, holder)); |
1262 | /* | 1266 | /* |
1263 | * Note that for a whole device bd_holders | 1267 | * Note that for a whole device bd_holders |
1264 | * will be incremented twice, and bd_holder | 1268 | * will be incremented twice, and bd_holder |
1265 | * will be set to bd_may_claim before being | 1269 | * will be set to bd_may_claim before being |
1266 | * set to holder | 1270 | * set to holder |
1267 | */ | 1271 | */ |
1268 | whole->bd_holders++; | 1272 | whole->bd_holders++; |
1269 | whole->bd_holder = bd_may_claim; | 1273 | whole->bd_holder = bd_may_claim; |
1270 | bdev->bd_holders++; | 1274 | bdev->bd_holders++; |
1271 | bdev->bd_holder = holder; | 1275 | bdev->bd_holder = holder; |
1272 | } | 1276 | } |
1273 | 1277 | ||
1274 | /* tell others that we're done */ | 1278 | /* tell others that we're done */ |
1275 | BUG_ON(whole->bd_claiming != holder); | 1279 | BUG_ON(whole->bd_claiming != holder); |
1276 | whole->bd_claiming = NULL; | 1280 | whole->bd_claiming = NULL; |
1277 | wake_up_bit(&whole->bd_claiming, 0); | 1281 | wake_up_bit(&whole->bd_claiming, 0); |
1278 | 1282 | ||
1279 | spin_unlock(&bdev_lock); | 1283 | spin_unlock(&bdev_lock); |
1280 | 1284 | ||
1281 | /* | 1285 | /* |
1282 | * Block event polling for write claims if requested. Any | 1286 | * Block event polling for write claims if requested. Any |
1283 | * write holder makes the write_holder state stick until | 1287 | * write holder makes the write_holder state stick until |
1284 | * all are released. This is good enough and tracking | 1288 | * all are released. This is good enough and tracking |
1285 | * individual writeable references is too fragile given the | 1289 | * individual writeable references is too fragile given the |
1286 | * way @mode is used in blkdev_get/put(). | 1290 | * way @mode is used in blkdev_get/put(). |
1287 | */ | 1291 | */ |
1288 | if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && | 1292 | if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && |
1289 | (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { | 1293 | (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { |
1290 | bdev->bd_write_holder = true; | 1294 | bdev->bd_write_holder = true; |
1291 | disk_block_events(disk); | 1295 | disk_block_events(disk); |
1292 | } | 1296 | } |
1293 | 1297 | ||
1294 | mutex_unlock(&bdev->bd_mutex); | 1298 | mutex_unlock(&bdev->bd_mutex); |
1295 | bdput(whole); | 1299 | bdput(whole); |
1296 | } | 1300 | } |
1297 | 1301 | ||
1298 | return res; | 1302 | return res; |
1299 | } | 1303 | } |
1300 | EXPORT_SYMBOL(blkdev_get); | 1304 | EXPORT_SYMBOL(blkdev_get); |
1301 | 1305 | ||
1302 | /** | 1306 | /** |
1303 | * blkdev_get_by_path - open a block device by name | 1307 | * blkdev_get_by_path - open a block device by name |
1304 | * @path: path to the block device to open | 1308 | * @path: path to the block device to open |
1305 | * @mode: FMODE_* mask | 1309 | * @mode: FMODE_* mask |
1306 | * @holder: exclusive holder identifier | 1310 | * @holder: exclusive holder identifier |
1307 | * | 1311 | * |
1308 | * Open the blockdevice described by the device file at @path. @mode | 1312 | * Open the blockdevice described by the device file at @path. @mode |
1309 | * and @holder are identical to blkdev_get(). | 1313 | * and @holder are identical to blkdev_get(). |
1310 | * | 1314 | * |
1311 | * On success, the returned block_device has reference count of one. | 1315 | * On success, the returned block_device has reference count of one. |
1312 | * | 1316 | * |
1313 | * CONTEXT: | 1317 | * CONTEXT: |
1314 | * Might sleep. | 1318 | * Might sleep. |
1315 | * | 1319 | * |
1316 | * RETURNS: | 1320 | * RETURNS: |
1317 | * Pointer to block_device on success, ERR_PTR(-errno) on failure. | 1321 | * Pointer to block_device on success, ERR_PTR(-errno) on failure. |
1318 | */ | 1322 | */ |
1319 | struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, | 1323 | struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, |
1320 | void *holder) | 1324 | void *holder) |
1321 | { | 1325 | { |
1322 | struct block_device *bdev; | 1326 | struct block_device *bdev; |
1323 | int err; | 1327 | int err; |
1324 | 1328 | ||
1325 | bdev = lookup_bdev(path); | 1329 | bdev = lookup_bdev(path); |
1326 | if (IS_ERR(bdev)) | 1330 | if (IS_ERR(bdev)) |
1327 | return bdev; | 1331 | return bdev; |
1328 | 1332 | ||
1329 | err = blkdev_get(bdev, mode, holder); | 1333 | err = blkdev_get(bdev, mode, holder); |
1330 | if (err) | 1334 | if (err) |
1331 | return ERR_PTR(err); | 1335 | return ERR_PTR(err); |
1332 | 1336 | ||
1333 | if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { | 1337 | if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { |
1334 | blkdev_put(bdev, mode); | 1338 | blkdev_put(bdev, mode); |
1335 | return ERR_PTR(-EACCES); | 1339 | return ERR_PTR(-EACCES); |
1336 | } | 1340 | } |
1337 | 1341 | ||
1338 | return bdev; | 1342 | return bdev; |
1339 | } | 1343 | } |
1340 | EXPORT_SYMBOL(blkdev_get_by_path); | 1344 | EXPORT_SYMBOL(blkdev_get_by_path); |
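A minimal usage sketch for the by-path variant; the path and holder are illustrative, and the mode must match between get and put:

    #include <linux/fs.h>
    #include <linux/err.h>

    static int example_use_device(const char *path, void *holder)
    {
            const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
            struct block_device *bdev;

            bdev = blkdev_get_by_path(path, mode, holder);
            if (IS_ERR(bdev))
                    return PTR_ERR(bdev); /* e.g. -EACCES if read-only */

            /* ... I/O against bdev ... */

            blkdev_put(bdev, mode);       /* drops the FMODE_EXCL claim */
            return 0;
    }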
1341 | 1345 | ||
1342 | /** | 1346 | /** |
1343 | * blkdev_get_by_dev - open a block device by device number | 1347 | * blkdev_get_by_dev - open a block device by device number |
1344 | * @dev: device number of block device to open | 1348 | * @dev: device number of block device to open |
1345 | * @mode: FMODE_* mask | 1349 | * @mode: FMODE_* mask |
1346 | * @holder: exclusive holder identifier | 1350 | * @holder: exclusive holder identifier |
1347 | * | 1351 | * |
1348 | * Open the blockdevice described by device number @dev. @mode and | 1352 | * Open the blockdevice described by device number @dev. @mode and |
1349 | * @holder are identical to blkdev_get(). | 1353 | * @holder are identical to blkdev_get(). |
1350 | * | 1354 | * |
1351 | * Use it ONLY if you really do not have anything better - i.e. when | 1355 | * Use it ONLY if you really do not have anything better - i.e. when |
1352 | * you are behind a truly sucky interface and all you are given is a | 1356 | * you are behind a truly sucky interface and all you are given is a |
1353 | * device number. _Never_ to be used for internal purposes. If you | 1357 | * device number. _Never_ to be used for internal purposes. If you |
1354 | * ever need it - reconsider your API. | 1358 | * ever need it - reconsider your API. |
1355 | * | 1359 | * |
1356 | * On success, the returned block_device has reference count of one. | 1360 | * On success, the returned block_device has reference count of one. |
1357 | * | 1361 | * |
1358 | * CONTEXT: | 1362 | * CONTEXT: |
1359 | * Might sleep. | 1363 | * Might sleep. |
1360 | * | 1364 | * |
1361 | * RETURNS: | 1365 | * RETURNS: |
1362 | * Pointer to block_device on success, ERR_PTR(-errno) on failure. | 1366 | * Pointer to block_device on success, ERR_PTR(-errno) on failure. |
1363 | */ | 1367 | */ |
1364 | struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) | 1368 | struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) |
1365 | { | 1369 | { |
1366 | struct block_device *bdev; | 1370 | struct block_device *bdev; |
1367 | int err; | 1371 | int err; |
1368 | 1372 | ||
1369 | bdev = bdget(dev); | 1373 | bdev = bdget(dev); |
1370 | if (!bdev) | 1374 | if (!bdev) |
1371 | return ERR_PTR(-ENOMEM); | 1375 | return ERR_PTR(-ENOMEM); |
1372 | 1376 | ||
1373 | err = blkdev_get(bdev, mode, holder); | 1377 | err = blkdev_get(bdev, mode, holder); |
1374 | if (err) | 1378 | if (err) |
1375 | return ERR_PTR(err); | 1379 | return ERR_PTR(err); |
1376 | 1380 | ||
1377 | return bdev; | 1381 | return bdev; |
1378 | } | 1382 | } |
1379 | EXPORT_SYMBOL(blkdev_get_by_dev); | 1383 | EXPORT_SYMBOL(blkdev_get_by_dev); |
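And the discouraged by-number variant, sketched under the assumption that a dev_t really is all you have; 8:0 is an arbitrary example number, and without FMODE_EXCL the holder may be NULL:

    #include <linux/fs.h>
    #include <linux/kdev_t.h>

    static struct block_device *example_open_by_devt(void)
    {
            /* Non-exclusive read-only open of device 8:0. */
            return blkdev_get_by_dev(MKDEV(8, 0), FMODE_READ, NULL);
    }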
1380 | 1384 | ||
1381 | static int blkdev_open(struct inode * inode, struct file * filp) | 1385 | static int blkdev_open(struct inode * inode, struct file * filp) |
1382 | { | 1386 | { |
1383 | struct block_device *bdev; | 1387 | struct block_device *bdev; |
1384 | 1388 | ||
1385 | /* | 1389 | /* |
1386 | * Preserve backwards compatibility and allow large file access | 1390 | * Preserve backwards compatibility and allow large file access |
1387 | * even if userspace doesn't ask for it explicitly. Some mkfs | 1391 | * even if userspace doesn't ask for it explicitly. Some mkfs |
1388 | * binaries need it. We might want to drop this workaround | 1392 | * binaries need it. We might want to drop this workaround |
1389 | * during an unstable branch. | 1393 | * during an unstable branch. |
1390 | */ | 1394 | */ |
1391 | filp->f_flags |= O_LARGEFILE; | 1395 | filp->f_flags |= O_LARGEFILE; |
1392 | 1396 | ||
1393 | if (filp->f_flags & O_NDELAY) | 1397 | if (filp->f_flags & O_NDELAY) |
1394 | filp->f_mode |= FMODE_NDELAY; | 1398 | filp->f_mode |= FMODE_NDELAY; |
1395 | if (filp->f_flags & O_EXCL) | 1399 | if (filp->f_flags & O_EXCL) |
1396 | filp->f_mode |= FMODE_EXCL; | 1400 | filp->f_mode |= FMODE_EXCL; |
1397 | if ((filp->f_flags & O_ACCMODE) == 3) | 1401 | if ((filp->f_flags & O_ACCMODE) == 3) |
1398 | filp->f_mode |= FMODE_WRITE_IOCTL; | 1402 | filp->f_mode |= FMODE_WRITE_IOCTL; |
1399 | 1403 | ||
1400 | bdev = bd_acquire(inode); | 1404 | bdev = bd_acquire(inode); |
1401 | if (bdev == NULL) | 1405 | if (bdev == NULL) |
1402 | return -ENOMEM; | 1406 | return -ENOMEM; |
1403 | 1407 | ||
1404 | filp->f_mapping = bdev->bd_inode->i_mapping; | 1408 | filp->f_mapping = bdev->bd_inode->i_mapping; |
1405 | 1409 | ||
1406 | return blkdev_get(bdev, filp->f_mode, filp); | 1410 | return blkdev_get(bdev, filp->f_mode, filp); |
1407 | } | 1411 | } |
1408 | 1412 | ||
1409 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) | 1413 | static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) |
1410 | { | 1414 | { |
1411 | int ret = 0; | 1415 | int ret = 0; |
1412 | struct gendisk *disk = bdev->bd_disk; | 1416 | struct gendisk *disk = bdev->bd_disk; |
1413 | struct block_device *victim = NULL; | 1417 | struct block_device *victim = NULL; |
1414 | 1418 | ||
1415 | mutex_lock_nested(&bdev->bd_mutex, for_part); | 1419 | mutex_lock_nested(&bdev->bd_mutex, for_part); |
1416 | if (for_part) | 1420 | if (for_part) |
1417 | bdev->bd_part_count--; | 1421 | bdev->bd_part_count--; |
1418 | 1422 | ||
1419 | if (!--bdev->bd_openers) { | 1423 | if (!--bdev->bd_openers) { |
1420 | WARN_ON_ONCE(bdev->bd_holders); | 1424 | WARN_ON_ONCE(bdev->bd_holders); |
1421 | sync_blockdev(bdev); | 1425 | sync_blockdev(bdev); |
1422 | kill_bdev(bdev); | 1426 | kill_bdev(bdev); |
1423 | } | 1427 | } |
1424 | if (bdev->bd_contains == bdev) { | 1428 | if (bdev->bd_contains == bdev) { |
1425 | if (disk->fops->release) | 1429 | if (disk->fops->release) |
1426 | ret = disk->fops->release(disk, mode); | 1430 | ret = disk->fops->release(disk, mode); |
1427 | } | 1431 | } |
1428 | if (!bdev->bd_openers) { | 1432 | if (!bdev->bd_openers) { |
1429 | struct module *owner = disk->fops->owner; | 1433 | struct module *owner = disk->fops->owner; |
1430 | 1434 | ||
1431 | put_disk(disk); | 1435 | put_disk(disk); |
1432 | module_put(owner); | 1436 | module_put(owner); |
1433 | disk_put_part(bdev->bd_part); | 1437 | disk_put_part(bdev->bd_part); |
1434 | bdev->bd_part = NULL; | 1438 | bdev->bd_part = NULL; |
1435 | bdev->bd_disk = NULL; | 1439 | bdev->bd_disk = NULL; |
1436 | bdev_inode_switch_bdi(bdev->bd_inode, | 1440 | bdev_inode_switch_bdi(bdev->bd_inode, |
1437 | &default_backing_dev_info); | 1441 | &default_backing_dev_info); |
1438 | if (bdev != bdev->bd_contains) | 1442 | if (bdev != bdev->bd_contains) |
1439 | victim = bdev->bd_contains; | 1443 | victim = bdev->bd_contains; |
1440 | bdev->bd_contains = NULL; | 1444 | bdev->bd_contains = NULL; |
1441 | } | 1445 | } |
1442 | mutex_unlock(&bdev->bd_mutex); | 1446 | mutex_unlock(&bdev->bd_mutex); |
1443 | bdput(bdev); | 1447 | bdput(bdev); |
1444 | if (victim) | 1448 | if (victim) |
1445 | __blkdev_put(victim, mode, 1); | 1449 | __blkdev_put(victim, mode, 1); |
1446 | return ret; | 1450 | return ret; |
1447 | } | 1451 | } |
1448 | 1452 | ||
1449 | int blkdev_put(struct block_device *bdev, fmode_t mode) | 1453 | int blkdev_put(struct block_device *bdev, fmode_t mode) |
1450 | { | 1454 | { |
1451 | mutex_lock(&bdev->bd_mutex); | 1455 | mutex_lock(&bdev->bd_mutex); |
1452 | 1456 | ||
1453 | if (mode & FMODE_EXCL) { | 1457 | if (mode & FMODE_EXCL) { |
1454 | bool bdev_free; | 1458 | bool bdev_free; |
1455 | 1459 | ||
1456 | /* | 1460 | /* |
1457 | * Release a claim on the device. The holder fields | 1461 | * Release a claim on the device. The holder fields |
1458 | * are protected with bdev_lock. bd_mutex is to | 1462 | * are protected with bdev_lock. bd_mutex is to |
1459 | * synchronize disk_holder unlinking. | 1463 | * synchronize disk_holder unlinking. |
1460 | */ | 1464 | */ |
1461 | spin_lock(&bdev_lock); | 1465 | spin_lock(&bdev_lock); |
1462 | 1466 | ||
1463 | WARN_ON_ONCE(--bdev->bd_holders < 0); | 1467 | WARN_ON_ONCE(--bdev->bd_holders < 0); |
1464 | WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); | 1468 | WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); |
1465 | 1469 | ||
1466 | /* bd_contains might point to self, check in a separate step */ | 1470 | /* bd_contains might point to self, check in a separate step */ |
1467 | if ((bdev_free = !bdev->bd_holders)) | 1471 | if ((bdev_free = !bdev->bd_holders)) |
1468 | bdev->bd_holder = NULL; | 1472 | bdev->bd_holder = NULL; |
1469 | if (!bdev->bd_contains->bd_holders) | 1473 | if (!bdev->bd_contains->bd_holders) |
1470 | bdev->bd_contains->bd_holder = NULL; | 1474 | bdev->bd_contains->bd_holder = NULL; |
1471 | 1475 | ||
1472 | spin_unlock(&bdev_lock); | 1476 | spin_unlock(&bdev_lock); |
1473 | 1477 | ||
1474 | /* | 1478 | /* |
1475 | * If this was the last claim, remove the holder link and | 1479 | * If this was the last claim, remove the holder link and |
1476 | * unblock event polling if it was a write holder. | 1480 | * unblock event polling if it was a write holder. |
1477 | */ | 1481 | */ |
1478 | if (bdev_free && bdev->bd_write_holder) { | 1482 | if (bdev_free && bdev->bd_write_holder) { |
1479 | disk_unblock_events(bdev->bd_disk); | 1483 | disk_unblock_events(bdev->bd_disk); |
1480 | bdev->bd_write_holder = false; | 1484 | bdev->bd_write_holder = false; |
1481 | } | 1485 | } |
1482 | } | 1486 | } |
1483 | 1487 | ||
1484 | /* | 1488 | /* |
1485 | * Trigger event checking and tell drivers to flush MEDIA_CHANGE | 1489 | * Trigger event checking and tell drivers to flush MEDIA_CHANGE |
1486 | * event. This is to ensure detection of media removal commanded | 1490 | * event. This is to ensure detection of media removal commanded |
1487 | * from userland - e.g. eject(1). | 1491 | * from userland - e.g. eject(1). |
1488 | */ | 1492 | */ |
1489 | disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); | 1493 | disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); |
1490 | 1494 | ||
1491 | mutex_unlock(&bdev->bd_mutex); | 1495 | mutex_unlock(&bdev->bd_mutex); |
1492 | 1496 | ||
1493 | return __blkdev_put(bdev, mode, 0); | 1497 | return __blkdev_put(bdev, mode, 0); |
1494 | } | 1498 | } |
1495 | EXPORT_SYMBOL(blkdev_put); | 1499 | EXPORT_SYMBOL(blkdev_put); |
1496 | 1500 | ||
1497 | static int blkdev_close(struct inode * inode, struct file * filp) | 1501 | static int blkdev_close(struct inode * inode, struct file * filp) |
1498 | { | 1502 | { |
1499 | struct block_device *bdev = I_BDEV(filp->f_mapping->host); | 1503 | struct block_device *bdev = I_BDEV(filp->f_mapping->host); |
1500 | 1504 | ||
1501 | return blkdev_put(bdev, filp->f_mode); | 1505 | return blkdev_put(bdev, filp->f_mode); |
1502 | } | 1506 | } |
1503 | 1507 | ||
1504 | static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) | 1508 | static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) |
1505 | { | 1509 | { |
1506 | struct block_device *bdev = I_BDEV(file->f_mapping->host); | 1510 | struct block_device *bdev = I_BDEV(file->f_mapping->host); |
1507 | fmode_t mode = file->f_mode; | 1511 | fmode_t mode = file->f_mode; |
1508 | 1512 | ||
1509 | /* | 1513 | /* |
1510 | * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have | 1514 | * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have |
1511 | * to update it before every ioctl. | 1515 | * to update it before every ioctl. |
1512 | */ | 1516 | */ |
1513 | if (file->f_flags & O_NDELAY) | 1517 | if (file->f_flags & O_NDELAY) |
1514 | mode |= FMODE_NDELAY; | 1518 | mode |= FMODE_NDELAY; |
1515 | else | 1519 | else |
1516 | mode &= ~FMODE_NDELAY; | 1520 | mode &= ~FMODE_NDELAY; |
1517 | 1521 | ||
1518 | return blkdev_ioctl(bdev, mode, cmd, arg); | 1522 | return blkdev_ioctl(bdev, mode, cmd, arg); |
1519 | } | 1523 | } |
1520 | 1524 | ||
1521 | /* | 1525 | /* |
1522 | * Write data to the block device. Only intended for the block device itself | 1526 | * Write data to the block device. Only intended for the block device itself |
1523 | * and the raw driver which basically is a fake block device. | 1527 | * and the raw driver which basically is a fake block device. |
1524 | * | 1528 | * |
1525 | * Does not take i_mutex for the write and thus is not for general purpose | 1529 | * Does not take i_mutex for the write and thus is not for general purpose |
1526 | * use. | 1530 | * use. |
1527 | */ | 1531 | */ |
1528 | ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, | 1532 | ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, |
1529 | unsigned long nr_segs, loff_t pos) | 1533 | unsigned long nr_segs, loff_t pos) |
1530 | { | 1534 | { |
1531 | struct file *file = iocb->ki_filp; | 1535 | struct file *file = iocb->ki_filp; |
1532 | ssize_t ret; | 1536 | ssize_t ret; |
1533 | 1537 | ||
1534 | BUG_ON(iocb->ki_pos != pos); | 1538 | BUG_ON(iocb->ki_pos != pos); |
1535 | 1539 | ||
1536 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 1540 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
1537 | if (ret > 0 || ret == -EIOCBQUEUED) { | 1541 | if (ret > 0 || ret == -EIOCBQUEUED) { |
1538 | ssize_t err; | 1542 | ssize_t err; |
1539 | 1543 | ||
1540 | err = generic_write_sync(file, pos, ret); | 1544 | err = generic_write_sync(file, pos, ret); |
1541 | if (err < 0 && ret > 0) | 1545 | if (err < 0 && ret > 0) |
1542 | ret = err; | 1546 | ret = err; |
1543 | } | 1547 | } |
1544 | return ret; | 1548 | return ret; |
1545 | } | 1549 | } |
1546 | EXPORT_SYMBOL_GPL(blkdev_aio_write); | 1550 | EXPORT_SYMBOL_GPL(blkdev_aio_write); |
1547 | 1551 | ||
1548 | /* | 1552 | /* |
1549 | * Try to release a page associated with a block device when the system | 1553 | * Try to release a page associated with a block device when the system |
1550 | * is under memory pressure. | 1554 | * is under memory pressure. |
1551 | */ | 1555 | */ |
1552 | static int blkdev_releasepage(struct page *page, gfp_t wait) | 1556 | static int blkdev_releasepage(struct page *page, gfp_t wait) |
1553 | { | 1557 | { |
1554 | struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; | 1558 | struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; |
1555 | 1559 | ||
1556 | if (super && super->s_op->bdev_try_to_free_page) | 1560 | if (super && super->s_op->bdev_try_to_free_page) |
1557 | return super->s_op->bdev_try_to_free_page(super, page, wait); | 1561 | return super->s_op->bdev_try_to_free_page(super, page, wait); |
1558 | 1562 | ||
1559 | return try_to_free_buffers(page); | 1563 | return try_to_free_buffers(page); |
1560 | } | 1564 | } |
1561 | 1565 | ||
1562 | static const struct address_space_operations def_blk_aops = { | 1566 | static const struct address_space_operations def_blk_aops = { |
1563 | .readpage = blkdev_readpage, | 1567 | .readpage = blkdev_readpage, |
1564 | .writepage = blkdev_writepage, | 1568 | .writepage = blkdev_writepage, |
1565 | .write_begin = blkdev_write_begin, | 1569 | .write_begin = blkdev_write_begin, |
1566 | .write_end = blkdev_write_end, | 1570 | .write_end = blkdev_write_end, |
1567 | .writepages = generic_writepages, | 1571 | .writepages = generic_writepages, |
1568 | .releasepage = blkdev_releasepage, | 1572 | .releasepage = blkdev_releasepage, |
1569 | .direct_IO = blkdev_direct_IO, | 1573 | .direct_IO = blkdev_direct_IO, |
1570 | }; | 1574 | }; |
1571 | 1575 | ||
1572 | const struct file_operations def_blk_fops = { | 1576 | const struct file_operations def_blk_fops = { |
1573 | .open = blkdev_open, | 1577 | .open = blkdev_open, |
1574 | .release = blkdev_close, | 1578 | .release = blkdev_close, |
1575 | .llseek = block_llseek, | 1579 | .llseek = block_llseek, |
1576 | .read = do_sync_read, | 1580 | .read = do_sync_read, |
1577 | .write = do_sync_write, | 1581 | .write = do_sync_write, |
1578 | .aio_read = generic_file_aio_read, | 1582 | .aio_read = generic_file_aio_read, |
1579 | .aio_write = blkdev_aio_write, | 1583 | .aio_write = blkdev_aio_write, |
1580 | .mmap = generic_file_mmap, | 1584 | .mmap = generic_file_mmap, |
1581 | .fsync = blkdev_fsync, | 1585 | .fsync = blkdev_fsync, |
1582 | .unlocked_ioctl = block_ioctl, | 1586 | .unlocked_ioctl = block_ioctl, |
1583 | #ifdef CONFIG_COMPAT | 1587 | #ifdef CONFIG_COMPAT |
1584 | .compat_ioctl = compat_blkdev_ioctl, | 1588 | .compat_ioctl = compat_blkdev_ioctl, |
1585 | #endif | 1589 | #endif |
1586 | .splice_read = generic_file_splice_read, | 1590 | .splice_read = generic_file_splice_read, |
1587 | .splice_write = generic_file_splice_write, | 1591 | .splice_write = generic_file_splice_write, |
1588 | }; | 1592 | }; |
1589 | 1593 | ||
1590 | int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) | 1594 | int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) |
1591 | { | 1595 | { |
1592 | int res; | 1596 | int res; |
1593 | mm_segment_t old_fs = get_fs(); | 1597 | mm_segment_t old_fs = get_fs(); |
1594 | set_fs(KERNEL_DS); | 1598 | set_fs(KERNEL_DS); |
1595 | res = blkdev_ioctl(bdev, 0, cmd, arg); | 1599 | res = blkdev_ioctl(bdev, 0, cmd, arg); |
1596 | set_fs(old_fs); | 1600 | set_fs(old_fs); |
1597 | return res; | 1601 | return res; |
1598 | } | 1602 | } |
1599 | 1603 | ||
1600 | EXPORT_SYMBOL(ioctl_by_bdev); | 1604 | EXPORT_SYMBOL(ioctl_by_bdev); |
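A hedged sketch of in-kernel use: because the set_fs(KERNEL_DS) dance above lifts the user-pointer check, callers may pass kernel arguments; BLKFLSBUF needs none at all, which is the pattern the loop driver uses on detach:

    #include <linux/fs.h>

    /* Flush a component device's buffer cache from kernel context. */
    static void example_flush(struct block_device *bdev)
    {
            ioctl_by_bdev(bdev, BLKFLSBUF, 0);
    }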
1601 | 1605 | ||
1602 | /** | 1606 | /** |
1603 | * lookup_bdev - lookup a struct block_device by name | 1607 | * lookup_bdev - lookup a struct block_device by name |
1604 | * @pathname: special file representing the block device | 1608 | * @pathname: special file representing the block device |
1605 | * | 1609 | * |
1606 | * Get a reference to the blockdevice at @pathname in the current | 1610 | * Get a reference to the blockdevice at @pathname in the current |
1607 | * namespace if possible and return it. Return ERR_PTR(error) | 1611 | * namespace if possible and return it. Return ERR_PTR(error) |
1608 | * otherwise. | 1612 | * otherwise. |
1609 | */ | 1613 | */ |
1610 | struct block_device *lookup_bdev(const char *pathname) | 1614 | struct block_device *lookup_bdev(const char *pathname) |
1611 | { | 1615 | { |
1612 | struct block_device *bdev; | 1616 | struct block_device *bdev; |
1613 | struct inode *inode; | 1617 | struct inode *inode; |
1614 | struct path path; | 1618 | struct path path; |
1615 | int error; | 1619 | int error; |
1616 | 1620 | ||
1617 | if (!pathname || !*pathname) | 1621 | if (!pathname || !*pathname) |
1618 | return ERR_PTR(-EINVAL); | 1622 | return ERR_PTR(-EINVAL); |
1619 | 1623 | ||
1620 | error = kern_path(pathname, LOOKUP_FOLLOW, &path); | 1624 | error = kern_path(pathname, LOOKUP_FOLLOW, &path); |
1621 | if (error) | 1625 | if (error) |
1622 | return ERR_PTR(error); | 1626 | return ERR_PTR(error); |
1623 | 1627 | ||
1624 | inode = path.dentry->d_inode; | 1628 | inode = path.dentry->d_inode; |
1625 | error = -ENOTBLK; | 1629 | error = -ENOTBLK; |
1626 | if (!S_ISBLK(inode->i_mode)) | 1630 | if (!S_ISBLK(inode->i_mode)) |
1627 | goto fail; | 1631 | goto fail; |
1628 | error = -EACCES; | 1632 | error = -EACCES; |
1629 | if (path.mnt->mnt_flags & MNT_NODEV) | 1633 | if (path.mnt->mnt_flags & MNT_NODEV) |
1630 | goto fail; | 1634 | goto fail; |
1631 | error = -ENOMEM; | 1635 | error = -ENOMEM; |
1632 | bdev = bd_acquire(inode); | 1636 | bdev = bd_acquire(inode); |
1633 | if (!bdev) | 1637 | if (!bdev) |
1634 | goto fail; | 1638 | goto fail; |
1635 | out: | 1639 | out: |
1636 | path_put(&path); | 1640 | path_put(&path); |
1637 | return bdev; | 1641 | return bdev; |
1638 | fail: | 1642 | fail: |
1639 | bdev = ERR_PTR(error); | 1643 | bdev = ERR_PTR(error); |
1640 | goto out; | 1644 | goto out; |
1641 | } | 1645 | } |
1642 | EXPORT_SYMBOL(lookup_bdev); | 1646 | EXPORT_SYMBOL(lookup_bdev); |
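A small sketch of the lookup-and-compare pattern built on lookup_bdev(); the helper name is invented, and the reference the lookup takes must be dropped with bdput():

    #include <linux/fs.h>
    #include <linux/err.h>

    /* Does @path name the same device as @bdev? */
    static bool example_same_device(const char *path,
                                    struct block_device *bdev)
    {
            struct block_device *found = lookup_bdev(path);
            bool same;

            if (IS_ERR(found))
                    return false;
            same = (found == bdev);
            bdput(found);
            return same;
    }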
1643 | 1647 | ||
1644 | int __invalidate_device(struct block_device *bdev, bool kill_dirty) | 1648 | int __invalidate_device(struct block_device *bdev, bool kill_dirty) |
1645 | { | 1649 | { |
1646 | struct super_block *sb = get_super(bdev); | 1650 | struct super_block *sb = get_super(bdev); |
1647 | int res = 0; | 1651 | int res = 0; |
1648 | 1652 | ||
1649 | if (sb) { | 1653 | if (sb) { |
1650 | /* | 1654 | /* |
1651 | * no need to lock the super, get_super holds the | 1655 | * no need to lock the super, get_super holds the |
1652 | * read mutex so the filesystem cannot go away | 1656 | * read mutex so the filesystem cannot go away |
1653 | * under us (->put_super runs with the write lock | 1657 | * under us (->put_super runs with the write lock |
1654 | * held). | 1658 | * held). |
1655 | */ | 1659 | */ |
1656 | shrink_dcache_sb(sb); | 1660 | shrink_dcache_sb(sb); |
1657 | res = invalidate_inodes(sb, kill_dirty); | 1661 | res = invalidate_inodes(sb, kill_dirty); |
1658 | drop_super(sb); | 1662 | drop_super(sb); |
1659 | } | 1663 | } |
1660 | invalidate_bdev(bdev); | 1664 | invalidate_bdev(bdev); |
1661 | return res; | 1665 | return res; |
1662 | } | 1666 | } |
1663 | EXPORT_SYMBOL(__invalidate_device); | 1667 | EXPORT_SYMBOL(__invalidate_device); |
fs/btrfs/extent_io.c
1 | #include <linux/bitops.h> | 1 | #include <linux/bitops.h> |
2 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
3 | #include <linux/bio.h> | 3 | #include <linux/bio.h> |
4 | #include <linux/mm.h> | 4 | #include <linux/mm.h> |
5 | #include <linux/pagemap.h> | 5 | #include <linux/pagemap.h> |
6 | #include <linux/page-flags.h> | 6 | #include <linux/page-flags.h> |
7 | #include <linux/module.h> | 7 | #include <linux/module.h> |
8 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
9 | #include <linux/blkdev.h> | 9 | #include <linux/blkdev.h> |
10 | #include <linux/swap.h> | 10 | #include <linux/swap.h> |
11 | #include <linux/writeback.h> | 11 | #include <linux/writeback.h> |
12 | #include <linux/pagevec.h> | 12 | #include <linux/pagevec.h> |
13 | #include <linux/prefetch.h> | 13 | #include <linux/prefetch.h> |
14 | #include <linux/cleancache.h> | 14 | #include <linux/cleancache.h> |
15 | #include "extent_io.h" | 15 | #include "extent_io.h" |
16 | #include "extent_map.h" | 16 | #include "extent_map.h" |
17 | #include "compat.h" | 17 | #include "compat.h" |
18 | #include "ctree.h" | 18 | #include "ctree.h" |
19 | #include "btrfs_inode.h" | 19 | #include "btrfs_inode.h" |
20 | 20 | ||
21 | static struct kmem_cache *extent_state_cache; | 21 | static struct kmem_cache *extent_state_cache; |
22 | static struct kmem_cache *extent_buffer_cache; | 22 | static struct kmem_cache *extent_buffer_cache; |
23 | 23 | ||
24 | static LIST_HEAD(buffers); | 24 | static LIST_HEAD(buffers); |
25 | static LIST_HEAD(states); | 25 | static LIST_HEAD(states); |
26 | 26 | ||
27 | #define LEAK_DEBUG 0 | 27 | #define LEAK_DEBUG 0 |
28 | #if LEAK_DEBUG | 28 | #if LEAK_DEBUG |
29 | static DEFINE_SPINLOCK(leak_lock); | 29 | static DEFINE_SPINLOCK(leak_lock); |
30 | #endif | 30 | #endif |
31 | 31 | ||
32 | #define BUFFER_LRU_MAX 64 | 32 | #define BUFFER_LRU_MAX 64 |
33 | 33 | ||
34 | struct tree_entry { | 34 | struct tree_entry { |
35 | u64 start; | 35 | u64 start; |
36 | u64 end; | 36 | u64 end; |
37 | struct rb_node rb_node; | 37 | struct rb_node rb_node; |
38 | }; | 38 | }; |
39 | 39 | ||
40 | struct extent_page_data { | 40 | struct extent_page_data { |
41 | struct bio *bio; | 41 | struct bio *bio; |
42 | struct extent_io_tree *tree; | 42 | struct extent_io_tree *tree; |
43 | get_extent_t *get_extent; | 43 | get_extent_t *get_extent; |
44 | 44 | ||
45 | /* tells writepage not to lock the state bits for this range; | 45 | /* tells writepage not to lock the state bits for this range; |
46 | * it still does the unlocking | 46 | * it still does the unlocking |
47 | */ | 47 | */ |
48 | unsigned int extent_locked:1; | 48 | unsigned int extent_locked:1; |
49 | 49 | ||
50 | /* tells the submit_bio code to use a WRITE_SYNC */ | 50 | /* tells the submit_bio code to use a WRITE_SYNC */ |
51 | unsigned int sync_io:1; | 51 | unsigned int sync_io:1; |
52 | }; | 52 | }; |
53 | 53 | ||
54 | int __init extent_io_init(void) | 54 | int __init extent_io_init(void) |
55 | { | 55 | { |
56 | extent_state_cache = kmem_cache_create("extent_state", | 56 | extent_state_cache = kmem_cache_create("extent_state", |
57 | sizeof(struct extent_state), 0, | 57 | sizeof(struct extent_state), 0, |
58 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | 58 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); |
59 | if (!extent_state_cache) | 59 | if (!extent_state_cache) |
60 | return -ENOMEM; | 60 | return -ENOMEM; |
61 | 61 | ||
62 | extent_buffer_cache = kmem_cache_create("extent_buffers", | 62 | extent_buffer_cache = kmem_cache_create("extent_buffers", |
63 | sizeof(struct extent_buffer), 0, | 63 | sizeof(struct extent_buffer), 0, |
64 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | 64 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); |
65 | if (!extent_buffer_cache) | 65 | if (!extent_buffer_cache) |
66 | goto free_state_cache; | 66 | goto free_state_cache; |
67 | return 0; | 67 | return 0; |
68 | 68 | ||
69 | free_state_cache: | 69 | free_state_cache: |
70 | kmem_cache_destroy(extent_state_cache); | 70 | kmem_cache_destroy(extent_state_cache); |
71 | return -ENOMEM; | 71 | return -ENOMEM; |
72 | } | 72 | } |
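The intended call site, as a sketch: module init runs extent_io_init() once and propagates -ENOMEM, mirroring the free_state_cache unwinding above. The function name abbreviates the real btrfs module init and is not taken from this diff:

    #include <linux/init.h>

    static int __init example_init_btrfs_fs(void)
    {
            int err = extent_io_init();

            if (err)
                    return err;
            /* ... register the filesystem, set up other caches ... */
            return 0;
    }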
73 | 73 | ||
74 | void extent_io_exit(void) | 74 | void extent_io_exit(void) |
75 | { | 75 | { |
76 | struct extent_state *state; | 76 | struct extent_state *state; |
77 | struct extent_buffer *eb; | 77 | struct extent_buffer *eb; |
78 | 78 | ||
79 | while (!list_empty(&states)) { | 79 | while (!list_empty(&states)) { |
80 | state = list_entry(states.next, struct extent_state, leak_list); | 80 | state = list_entry(states.next, struct extent_state, leak_list); |
81 | printk(KERN_ERR "btrfs state leak: start %llu end %llu " | 81 | printk(KERN_ERR "btrfs state leak: start %llu end %llu " |
82 | "state %lu in tree %p refs %d\n", | 82 | "state %lu in tree %p refs %d\n", |
83 | (unsigned long long)state->start, | 83 | (unsigned long long)state->start, |
84 | (unsigned long long)state->end, | 84 | (unsigned long long)state->end, |
85 | state->state, state->tree, atomic_read(&state->refs)); | 85 | state->state, state->tree, atomic_read(&state->refs)); |
86 | list_del(&state->leak_list); | 86 | list_del(&state->leak_list); |
87 | kmem_cache_free(extent_state_cache, state); | 87 | kmem_cache_free(extent_state_cache, state); |
88 | 88 | ||
89 | } | 89 | } |
90 | 90 | ||
91 | while (!list_empty(&buffers)) { | 91 | while (!list_empty(&buffers)) { |
92 | eb = list_entry(buffers.next, struct extent_buffer, leak_list); | 92 | eb = list_entry(buffers.next, struct extent_buffer, leak_list); |
93 | printk(KERN_ERR "btrfs buffer leak start %llu len %lu " | 93 | printk(KERN_ERR "btrfs buffer leak start %llu len %lu " |
94 | "refs %d\n", (unsigned long long)eb->start, | 94 | "refs %d\n", (unsigned long long)eb->start, |
95 | eb->len, atomic_read(&eb->refs)); | 95 | eb->len, atomic_read(&eb->refs)); |
96 | list_del(&eb->leak_list); | 96 | list_del(&eb->leak_list); |
97 | kmem_cache_free(extent_buffer_cache, eb); | 97 | kmem_cache_free(extent_buffer_cache, eb); |
98 | } | 98 | } |
99 | if (extent_state_cache) | 99 | if (extent_state_cache) |
100 | kmem_cache_destroy(extent_state_cache); | 100 | kmem_cache_destroy(extent_state_cache); |
101 | if (extent_buffer_cache) | 101 | if (extent_buffer_cache) |
102 | kmem_cache_destroy(extent_buffer_cache); | 102 | kmem_cache_destroy(extent_buffer_cache); |
103 | } | 103 | } |
104 | 104 | ||
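The states and buffers lists walked here are leak trackers: each allocation
links itself in under leak_lock and each free unlinks itself, so anything
still listed at exit time was leaked and gets reported, then reclaimed. A
sketch of the globals this relies on, consistent with how they are used in
this file (the real file keys parts of this off the LEAK_DEBUG define):

        static LIST_HEAD(states);
        static LIST_HEAD(buffers);
        static DEFINE_SPINLOCK(leak_lock);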
105 | void extent_io_tree_init(struct extent_io_tree *tree, | 105 | void extent_io_tree_init(struct extent_io_tree *tree, |
106 | struct address_space *mapping) | 106 | struct address_space *mapping) |
107 | { | 107 | { |
108 | tree->state = RB_ROOT; | 108 | tree->state = RB_ROOT; |
109 | INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); | 109 | INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); |
110 | tree->ops = NULL; | 110 | tree->ops = NULL; |
111 | tree->dirty_bytes = 0; | 111 | tree->dirty_bytes = 0; |
112 | spin_lock_init(&tree->lock); | 112 | spin_lock_init(&tree->lock); |
113 | spin_lock_init(&tree->buffer_lock); | 113 | spin_lock_init(&tree->buffer_lock); |
114 | tree->mapping = mapping; | 114 | tree->mapping = mapping; |
115 | } | 115 | } |
116 | 116 | ||
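A caller typically embeds one tree per inode and points it at that inode's
page cache. A hedged usage sketch (the embedding struct here is
hypothetical; in btrfs the tree lives inside the filesystem's per-inode
structure):

        struct demo_inode {
                struct inode vfs_inode;
                struct extent_io_tree io_tree;
        };

        static void demo_inode_init(struct demo_inode *di)
        {
                extent_io_tree_init(&di->io_tree, di->vfs_inode.i_mapping);
        }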
117 | static struct extent_state *alloc_extent_state(gfp_t mask) | 117 | static struct extent_state *alloc_extent_state(gfp_t mask) |
118 | { | 118 | { |
119 | struct extent_state *state; | 119 | struct extent_state *state; |
120 | #if LEAK_DEBUG | 120 | #if LEAK_DEBUG |
121 | unsigned long flags; | 121 | unsigned long flags; |
122 | #endif | 122 | #endif |
123 | 123 | ||
124 | state = kmem_cache_alloc(extent_state_cache, mask); | 124 | state = kmem_cache_alloc(extent_state_cache, mask); |
125 | if (!state) | 125 | if (!state) |
126 | return state; | 126 | return state; |
127 | state->state = 0; | 127 | state->state = 0; |
128 | state->private = 0; | 128 | state->private = 0; |
129 | state->tree = NULL; | 129 | state->tree = NULL; |
130 | #if LEAK_DEBUG | 130 | #if LEAK_DEBUG |
131 | spin_lock_irqsave(&leak_lock, flags); | 131 | spin_lock_irqsave(&leak_lock, flags); |
132 | list_add(&state->leak_list, &states); | 132 | list_add(&state->leak_list, &states); |
133 | spin_unlock_irqrestore(&leak_lock, flags); | 133 | spin_unlock_irqrestore(&leak_lock, flags); |
134 | #endif | 134 | #endif |
135 | atomic_set(&state->refs, 1); | 135 | atomic_set(&state->refs, 1); |
136 | init_waitqueue_head(&state->wq); | 136 | init_waitqueue_head(&state->wq); |
137 | return state; | 137 | return state; |
138 | } | 138 | } |
139 | 139 | ||
140 | void free_extent_state(struct extent_state *state) | 140 | void free_extent_state(struct extent_state *state) |
141 | { | 141 | { |
142 | if (!state) | 142 | if (!state) |
143 | return; | 143 | return; |
144 | if (atomic_dec_and_test(&state->refs)) { | 144 | if (atomic_dec_and_test(&state->refs)) { |
145 | #if LEAK_DEBUG | 145 | #if LEAK_DEBUG |
146 | unsigned long flags; | 146 | unsigned long flags; |
147 | #endif | 147 | #endif |
148 | WARN_ON(state->tree); | 148 | WARN_ON(state->tree); |
149 | #if LEAK_DEBUG | 149 | #if LEAK_DEBUG |
150 | spin_lock_irqsave(&leak_lock, flags); | 150 | spin_lock_irqsave(&leak_lock, flags); |
151 | list_del(&state->leak_list); | 151 | list_del(&state->leak_list); |
152 | spin_unlock_irqrestore(&leak_lock, flags); | 152 | spin_unlock_irqrestore(&leak_lock, flags); |
153 | #endif | 153 | #endif |
154 | kmem_cache_free(extent_state_cache, state); | 154 | kmem_cache_free(extent_state_cache, state); |
155 | } | 155 | } |
156 | } | 156 | } |
157 | 157 | ||
158 | static struct rb_node *tree_insert(struct rb_root *root, u64 offset, | 158 | static struct rb_node *tree_insert(struct rb_root *root, u64 offset, |
159 | struct rb_node *node) | 159 | struct rb_node *node) |
160 | { | 160 | { |
161 | struct rb_node **p = &root->rb_node; | 161 | struct rb_node **p = &root->rb_node; |
162 | struct rb_node *parent = NULL; | 162 | struct rb_node *parent = NULL; |
163 | struct tree_entry *entry; | 163 | struct tree_entry *entry; |
164 | 164 | ||
165 | while (*p) { | 165 | while (*p) { |
166 | parent = *p; | 166 | parent = *p; |
167 | entry = rb_entry(parent, struct tree_entry, rb_node); | 167 | entry = rb_entry(parent, struct tree_entry, rb_node); |
168 | 168 | ||
169 | if (offset < entry->start) | 169 | if (offset < entry->start) |
170 | p = &(*p)->rb_left; | 170 | p = &(*p)->rb_left; |
171 | else if (offset > entry->end) | 171 | else if (offset > entry->end) |
172 | p = &(*p)->rb_right; | 172 | p = &(*p)->rb_right; |
173 | else | 173 | else |
174 | return parent; | 174 | return parent; |
175 | } | 175 | } |
176 | 176 | ||
177 | entry = rb_entry(node, struct tree_entry, rb_node); | 177 | entry = rb_entry(node, struct tree_entry, rb_node); |
178 | rb_link_node(node, parent, p); | 178 | rb_link_node(node, parent, p); |
179 | rb_insert_color(node, root); | 179 | rb_insert_color(node, root); |
180 | return NULL; | 180 | return NULL; |
181 | } | 181 | } |
182 | 182 | ||
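tree_insert() keys nodes by an inclusive [start, end] range and rejects
overlaps by returning the colliding node instead of linking the new one.
It relies only on the leading fields that every tree member shares; the
struct tree_entry it casts through is defined earlier in the file,
essentially:

        /* common key layout shared by everything linked into the rb tree */
        struct tree_entry {
                u64 start;
                u64 end;
                struct rb_node rb_node;
        };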
183 | static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, | 183 | static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, |
184 | struct rb_node **prev_ret, | 184 | struct rb_node **prev_ret, |
185 | struct rb_node **next_ret) | 185 | struct rb_node **next_ret) |
186 | { | 186 | { |
187 | struct rb_root *root = &tree->state; | 187 | struct rb_root *root = &tree->state; |
188 | struct rb_node *n = root->rb_node; | 188 | struct rb_node *n = root->rb_node; |
189 | struct rb_node *prev = NULL; | 189 | struct rb_node *prev = NULL; |
190 | struct rb_node *orig_prev = NULL; | 190 | struct rb_node *orig_prev = NULL; |
191 | struct tree_entry *entry; | 191 | struct tree_entry *entry; |
192 | struct tree_entry *prev_entry = NULL; | 192 | struct tree_entry *prev_entry = NULL; |
193 | 193 | ||
194 | while (n) { | 194 | while (n) { |
195 | entry = rb_entry(n, struct tree_entry, rb_node); | 195 | entry = rb_entry(n, struct tree_entry, rb_node); |
196 | prev = n; | 196 | prev = n; |
197 | prev_entry = entry; | 197 | prev_entry = entry; |
198 | 198 | ||
199 | if (offset < entry->start) | 199 | if (offset < entry->start) |
200 | n = n->rb_left; | 200 | n = n->rb_left; |
201 | else if (offset > entry->end) | 201 | else if (offset > entry->end) |
202 | n = n->rb_right; | 202 | n = n->rb_right; |
203 | else | 203 | else |
204 | return n; | 204 | return n; |
205 | } | 205 | } |
206 | 206 | ||
207 | if (prev_ret) { | 207 | if (prev_ret) { |
208 | orig_prev = prev; | 208 | orig_prev = prev; |
209 | while (prev && offset > prev_entry->end) { | 209 | while (prev && offset > prev_entry->end) { |
210 | prev = rb_next(prev); | 210 | prev = rb_next(prev); |
211 | prev_entry = rb_entry(prev, struct tree_entry, rb_node); | 211 | prev_entry = rb_entry(prev, struct tree_entry, rb_node); |
212 | } | 212 | } |
213 | *prev_ret = prev; | 213 | *prev_ret = prev; |
214 | prev = orig_prev; | 214 | prev = orig_prev; |
215 | } | 215 | } |
216 | 216 | ||
217 | if (next_ret) { | 217 | if (next_ret) { |
218 | prev_entry = rb_entry(prev, struct tree_entry, rb_node); | 218 | prev_entry = rb_entry(prev, struct tree_entry, rb_node); |
219 | while (prev && offset < prev_entry->start) { | 219 | while (prev && offset < prev_entry->start) { |
220 | prev = rb_prev(prev); | 220 | prev = rb_prev(prev); |
221 | prev_entry = rb_entry(prev, struct tree_entry, rb_node); | 221 | prev_entry = rb_entry(prev, struct tree_entry, rb_node); |
222 | } | 222 | } |
223 | *next_ret = prev; | 223 | *next_ret = prev; |
224 | } | 224 | } |
225 | return NULL; | 225 | return NULL; |
226 | } | 226 | } |
227 | 227 | ||
228 | static inline struct rb_node *tree_search(struct extent_io_tree *tree, | 228 | static inline struct rb_node *tree_search(struct extent_io_tree *tree, |
229 | u64 offset) | 229 | u64 offset) |
230 | { | 230 | { |
231 | struct rb_node *prev = NULL; | 231 | struct rb_node *prev = NULL; |
232 | struct rb_node *ret; | 232 | struct rb_node *ret; |
233 | 233 | ||
234 | ret = __etree_search(tree, offset, &prev, NULL); | 234 | ret = __etree_search(tree, offset, &prev, NULL); |
235 | if (!ret) | 235 | if (!ret) |
236 | return prev; | 236 | return prev; |
237 | return ret; | 237 | return ret; |
238 | } | 238 | } |
239 | 239 | ||
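Together the two searches give "exact match, otherwise the first extent
ending at or after offset" semantics. A worked example, assuming the tree
holds exactly the states [0,4095] and [8192,12287]:

        /*
         * tree_search(tree, 9000)  -> [8192,12287]  (offset falls inside it)
         * tree_search(tree, 4096)  -> [8192,12287]  (first state ending after)
         * tree_search(tree, 20000) -> NULL          (nothing ends that late)
         */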
240 | static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, | 240 | static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, |
241 | struct extent_state *other) | 241 | struct extent_state *other) |
242 | { | 242 | { |
243 | if (tree->ops && tree->ops->merge_extent_hook) | 243 | if (tree->ops && tree->ops->merge_extent_hook) |
244 | tree->ops->merge_extent_hook(tree->mapping->host, new, | 244 | tree->ops->merge_extent_hook(tree->mapping->host, new, |
245 | other); | 245 | other); |
246 | } | 246 | } |
247 | 247 | ||
248 | /* | 248 | /* |
249 | * utility function to look for merge candidates inside a given range. | 249 | * utility function to look for merge candidates inside a given range. |
250 | * Any extents with matching state are merged together into a single | 250 | * Any extents with matching state are merged together into a single |
251 | * extent in the tree. Extents with EXTENT_IOBITS or EXTENT_BOUNDARY in | 251 | * extent in the tree. Extents with EXTENT_IOBITS or EXTENT_BOUNDARY in |
252 | * their state field are not merged because the end_io handlers need to | 252 | * their state field are not merged because the end_io handlers need to |
253 | * be able to operate on them without sleeping (or doing allocations/splits). | 253 | * be able to operate on them without sleeping (or doing allocations/splits). |
254 | * | 254 | * |
255 | * This should be called with the tree lock held. | 255 | * This should be called with the tree lock held. |
256 | */ | 256 | */ |
257 | static int merge_state(struct extent_io_tree *tree, | 257 | static int merge_state(struct extent_io_tree *tree, |
258 | struct extent_state *state) | 258 | struct extent_state *state) |
259 | { | 259 | { |
260 | struct extent_state *other; | 260 | struct extent_state *other; |
261 | struct rb_node *other_node; | 261 | struct rb_node *other_node; |
262 | 262 | ||
263 | if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) | 263 | if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) |
264 | return 0; | 264 | return 0; |
265 | 265 | ||
266 | other_node = rb_prev(&state->rb_node); | 266 | other_node = rb_prev(&state->rb_node); |
267 | if (other_node) { | 267 | if (other_node) { |
268 | other = rb_entry(other_node, struct extent_state, rb_node); | 268 | other = rb_entry(other_node, struct extent_state, rb_node); |
269 | if (other->end == state->start - 1 && | 269 | if (other->end == state->start - 1 && |
270 | other->state == state->state) { | 270 | other->state == state->state) { |
271 | merge_cb(tree, state, other); | 271 | merge_cb(tree, state, other); |
272 | state->start = other->start; | 272 | state->start = other->start; |
273 | other->tree = NULL; | 273 | other->tree = NULL; |
274 | rb_erase(&other->rb_node, &tree->state); | 274 | rb_erase(&other->rb_node, &tree->state); |
275 | free_extent_state(other); | 275 | free_extent_state(other); |
276 | } | 276 | } |
277 | } | 277 | } |
278 | other_node = rb_next(&state->rb_node); | 278 | other_node = rb_next(&state->rb_node); |
279 | if (other_node) { | 279 | if (other_node) { |
280 | other = rb_entry(other_node, struct extent_state, rb_node); | 280 | other = rb_entry(other_node, struct extent_state, rb_node); |
281 | if (other->start == state->end + 1 && | 281 | if (other->start == state->end + 1 && |
282 | other->state == state->state) { | 282 | other->state == state->state) { |
283 | merge_cb(tree, state, other); | 283 | merge_cb(tree, state, other); |
284 | other->start = state->start; | 284 | other->start = state->start; |
285 | state->tree = NULL; | 285 | state->tree = NULL; |
286 | rb_erase(&state->rb_node, &tree->state); | 286 | rb_erase(&state->rb_node, &tree->state); |
287 | free_extent_state(state); | 287 | free_extent_state(state); |
288 | state = NULL; | 288 | state = NULL; |
289 | } | 289 | } |
290 | } | 290 | } |
291 | 291 | ||
292 | return 0; | 292 | return 0; |
293 | } | 293 | } |
294 | 294 | ||
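A concrete instance of the merge rule:

        /* before: [0,4095] DELALLOC      [4096,8191] DELALLOC
         * after:  [0,8191] DELALLOC      (one extent_state freed)
         *
         * but [0,4095] DELALLOC|LOCKED would stay split: LOCKED is part
         * of EXTENT_IOBITS, so the early return above skips the merge,
         * and the state fields would not compare equal anyway. */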
295 | static int set_state_cb(struct extent_io_tree *tree, | 295 | static int set_state_cb(struct extent_io_tree *tree, |
296 | struct extent_state *state, int *bits) | 296 | struct extent_state *state, int *bits) |
297 | { | 297 | { |
298 | if (tree->ops && tree->ops->set_bit_hook) { | 298 | if (tree->ops && tree->ops->set_bit_hook) { |
299 | return tree->ops->set_bit_hook(tree->mapping->host, | 299 | return tree->ops->set_bit_hook(tree->mapping->host, |
300 | state, bits); | 300 | state, bits); |
301 | } | 301 | } |
302 | 302 | ||
303 | return 0; | 303 | return 0; |
304 | } | 304 | } |
305 | 305 | ||
306 | static void clear_state_cb(struct extent_io_tree *tree, | 306 | static void clear_state_cb(struct extent_io_tree *tree, |
307 | struct extent_state *state, int *bits) | 307 | struct extent_state *state, int *bits) |
308 | { | 308 | { |
309 | if (tree->ops && tree->ops->clear_bit_hook) | 309 | if (tree->ops && tree->ops->clear_bit_hook) |
310 | tree->ops->clear_bit_hook(tree->mapping->host, state, bits); | 310 | tree->ops->clear_bit_hook(tree->mapping->host, state, bits); |
311 | } | 311 | } |
312 | 312 | ||
313 | /* | 313 | /* |
314 | * insert an extent_state struct into the tree. 'bits' are set on the | 314 | * insert an extent_state struct into the tree. 'bits' are set on the |
315 | * struct before it is inserted. | 315 | * struct before it is inserted. |
316 | * | 316 | * |
317 | * This may return -EEXIST if the extent is already there, in which case the | 317 | * This may return -EEXIST if the extent is already there, in which case the |
318 | * state struct is freed. | 318 | * state struct is freed. |
319 | * | 319 | * |
320 | * The tree lock is not taken internally. This is a utility function and | 320 | * The tree lock is not taken internally. This is a utility function and |
321 | * probably isn't what you want to call (see set/clear_extent_bit). | 321 | * probably isn't what you want to call (see set/clear_extent_bit). |
322 | */ | 322 | */ |
323 | static int insert_state(struct extent_io_tree *tree, | 323 | static int insert_state(struct extent_io_tree *tree, |
324 | struct extent_state *state, u64 start, u64 end, | 324 | struct extent_state *state, u64 start, u64 end, |
325 | int *bits) | 325 | int *bits) |
326 | { | 326 | { |
327 | struct rb_node *node; | 327 | struct rb_node *node; |
328 | int bits_to_set = *bits & ~EXTENT_CTLBITS; | 328 | int bits_to_set = *bits & ~EXTENT_CTLBITS; |
329 | int ret; | 329 | int ret; |
330 | 330 | ||
331 | if (end < start) { | 331 | if (end < start) { |
332 | printk(KERN_ERR "btrfs end < start %llu %llu\n", | 332 | printk(KERN_ERR "btrfs end < start %llu %llu\n", |
333 | (unsigned long long)end, | 333 | (unsigned long long)end, |
334 | (unsigned long long)start); | 334 | (unsigned long long)start); |
335 | WARN_ON(1); | 335 | WARN_ON(1); |
336 | } | 336 | } |
337 | state->start = start; | 337 | state->start = start; |
338 | state->end = end; | 338 | state->end = end; |
339 | ret = set_state_cb(tree, state, bits); | 339 | ret = set_state_cb(tree, state, bits); |
340 | if (ret) | 340 | if (ret) |
341 | return ret; | 341 | return ret; |
342 | 342 | ||
343 | if (bits_to_set & EXTENT_DIRTY) | 343 | if (bits_to_set & EXTENT_DIRTY) |
344 | tree->dirty_bytes += end - start + 1; | 344 | tree->dirty_bytes += end - start + 1; |
345 | state->state |= bits_to_set; | 345 | state->state |= bits_to_set; |
346 | node = tree_insert(&tree->state, end, &state->rb_node); | 346 | node = tree_insert(&tree->state, end, &state->rb_node); |
347 | if (node) { | 347 | if (node) { |
348 | struct extent_state *found; | 348 | struct extent_state *found; |
349 | found = rb_entry(node, struct extent_state, rb_node); | 349 | found = rb_entry(node, struct extent_state, rb_node); |
350 | printk(KERN_ERR "btrfs found node %llu %llu on insert of " | 350 | printk(KERN_ERR "btrfs found node %llu %llu on insert of " |
351 | "%llu %llu\n", (unsigned long long)found->start, | 351 | "%llu %llu\n", (unsigned long long)found->start, |
352 | (unsigned long long)found->end, | 352 | (unsigned long long)found->end, |
353 | (unsigned long long)start, (unsigned long long)end); | 353 | (unsigned long long)start, (unsigned long long)end); |
354 | free_extent_state(state); | 354 | free_extent_state(state); |
355 | return -EEXIST; | 355 | return -EEXIST; |
356 | } | 356 | } |
357 | state->tree = tree; | 357 | state->tree = tree; |
358 | merge_state(tree, state); | 358 | merge_state(tree, state); |
359 | return 0; | 359 | return 0; |
360 | } | 360 | } |
361 | 361 | ||
362 | static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, | 362 | static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, |
363 | u64 split) | 363 | u64 split) |
364 | { | 364 | { |
365 | if (tree->ops && tree->ops->split_extent_hook) | 365 | if (tree->ops && tree->ops->split_extent_hook) |
366 | return tree->ops->split_extent_hook(tree->mapping->host, | 366 | return tree->ops->split_extent_hook(tree->mapping->host, |
367 | orig, split); | 367 | orig, split); |
368 | return 0; | 368 | return 0; |
369 | } | 369 | } |
370 | 370 | ||
371 | /* | 371 | /* |
372 | * split a given extent state struct in two, inserting the preallocated | 372 | * split a given extent state struct in two, inserting the preallocated |
373 | * struct 'prealloc' as the newly created second half. 'split' indicates an | 373 | * struct 'prealloc' as the newly created second half. 'split' indicates an |
374 | * offset inside 'orig' where it should be split. | 374 | * offset inside 'orig' where it should be split. |
375 | * | 375 | * |
376 | * Before calling, | 376 | * Before calling, |
377 | * the tree has 'orig' at [orig->start, orig->end]. After calling, there | 377 | * the tree has 'orig' at [orig->start, orig->end]. After calling, there |
378 | * are two extent state structs in the tree: | 378 | * are two extent state structs in the tree: |
379 | * prealloc: [orig->start, split - 1] | 379 | * prealloc: [orig->start, split - 1] |
380 | * orig: [ split, orig->end ] | 380 | * orig: [ split, orig->end ] |
381 | * | 381 | * |
382 | * The tree locks are not taken by this function. They need to be held | 382 | * The tree locks are not taken by this function. They need to be held |
383 | * by the caller. | 383 | * by the caller. |
384 | */ | 384 | */ |
385 | static int split_state(struct extent_io_tree *tree, struct extent_state *orig, | 385 | static int split_state(struct extent_io_tree *tree, struct extent_state *orig, |
386 | struct extent_state *prealloc, u64 split) | 386 | struct extent_state *prealloc, u64 split) |
387 | { | 387 | { |
388 | struct rb_node *node; | 388 | struct rb_node *node; |
389 | 389 | ||
390 | split_cb(tree, orig, split); | 390 | split_cb(tree, orig, split); |
391 | 391 | ||
392 | prealloc->start = orig->start; | 392 | prealloc->start = orig->start; |
393 | prealloc->end = split - 1; | 393 | prealloc->end = split - 1; |
394 | prealloc->state = orig->state; | 394 | prealloc->state = orig->state; |
395 | orig->start = split; | 395 | orig->start = split; |
396 | 396 | ||
397 | node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); | 397 | node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); |
398 | if (node) { | 398 | if (node) { |
399 | free_extent_state(prealloc); | 399 | free_extent_state(prealloc); |
400 | return -EEXIST; | 400 | return -EEXIST; |
401 | } | 401 | } |
402 | prealloc->tree = tree; | 402 | prealloc->tree = tree; |
403 | return 0; | 403 | return 0; |
404 | } | 404 | } |
405 | 405 | ||
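A worked example of the split contract, assuming orig covers [0,8191] and
split == 4096:

        /* before: orig     = [0, 8191]
         * after:  prealloc = [0, 4095]    (newly linked into the tree)
         *         orig     = [4096, 8191] (same node, start moved up)
         * prealloc inherits orig's state bits, and the tree stays keyed
         * by each node's 'end' offset */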
406 | /* | 406 | /* |
407 | * utility function to clear some bits in an extent state struct. | 407 | * utility function to clear some bits in an extent state struct. |
408 | * it will optionally wake up anyone waiting on this state (wake == 1), or | 408 | * it will optionally wake up anyone waiting on this state (wake == 1), or |
409 | * forcibly remove the state from the tree (delete == 1). | 409 | * forcibly remove the state from the tree (delete == 1). |
410 | * | 410 | * |
411 | * If no bits are set on the state struct after clearing things, the | 411 | * If no bits are set on the state struct after clearing things, the |
412 | * struct is freed and removed from the tree | 412 | * struct is freed and removed from the tree |
413 | */ | 413 | */ |
414 | static int clear_state_bit(struct extent_io_tree *tree, | 414 | static int clear_state_bit(struct extent_io_tree *tree, |
415 | struct extent_state *state, | 415 | struct extent_state *state, |
416 | int *bits, int wake) | 416 | int *bits, int wake) |
417 | { | 417 | { |
418 | int bits_to_clear = *bits & ~EXTENT_CTLBITS; | 418 | int bits_to_clear = *bits & ~EXTENT_CTLBITS; |
419 | int ret = state->state & bits_to_clear; | 419 | int ret = state->state & bits_to_clear; |
420 | 420 | ||
421 | if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { | 421 | if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { |
422 | u64 range = state->end - state->start + 1; | 422 | u64 range = state->end - state->start + 1; |
423 | WARN_ON(range > tree->dirty_bytes); | 423 | WARN_ON(range > tree->dirty_bytes); |
424 | tree->dirty_bytes -= range; | 424 | tree->dirty_bytes -= range; |
425 | } | 425 | } |
426 | clear_state_cb(tree, state, bits); | 426 | clear_state_cb(tree, state, bits); |
427 | state->state &= ~bits_to_clear; | 427 | state->state &= ~bits_to_clear; |
428 | if (wake) | 428 | if (wake) |
429 | wake_up(&state->wq); | 429 | wake_up(&state->wq); |
430 | if (state->state == 0) { | 430 | if (state->state == 0) { |
431 | if (state->tree) { | 431 | if (state->tree) { |
432 | rb_erase(&state->rb_node, &tree->state); | 432 | rb_erase(&state->rb_node, &tree->state); |
433 | state->tree = NULL; | 433 | state->tree = NULL; |
434 | free_extent_state(state); | 434 | free_extent_state(state); |
435 | } else { | 435 | } else { |
436 | WARN_ON(1); | 436 | WARN_ON(1); |
437 | } | 437 | } |
438 | } else { | 438 | } else { |
439 | merge_state(tree, state); | 439 | merge_state(tree, state); |
440 | } | 440 | } |
441 | return ret; | 441 | return ret; |
442 | } | 442 | } |
443 | 443 | ||
444 | static struct extent_state * | 444 | static struct extent_state * |
445 | alloc_extent_state_atomic(struct extent_state *prealloc) | 445 | alloc_extent_state_atomic(struct extent_state *prealloc) |
446 | { | 446 | { |
447 | if (!prealloc) | 447 | if (!prealloc) |
448 | prealloc = alloc_extent_state(GFP_ATOMIC); | 448 | prealloc = alloc_extent_state(GFP_ATOMIC); |
449 | 449 | ||
450 | return prealloc; | 450 | return prealloc; |
451 | } | 451 | } |
452 | 452 | ||
453 | /* | 453 | /* |
454 | * clear some bits on a range in the tree. This may require splitting | 454 | * clear some bits on a range in the tree. This may require splitting |
455 | * or inserting elements in the tree, so the gfp mask is used to | 455 | * or inserting elements in the tree, so the gfp mask is used to |
456 | * indicate whether allocations and sleeping are allowed. | 456 | * indicate whether allocations and sleeping are allowed. |
457 | * | 457 | * |
458 | * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove | 458 | * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove |
459 | * the given range from the tree regardless of state (i.e. for truncate). | 459 | * the given range from the tree regardless of state (i.e. for truncate). |
460 | * | 460 | * |
461 | * the range [start, end] is inclusive. | 461 | * the range [start, end] is inclusive. |
462 | * | 462 | * |
463 | * This takes the tree lock, and returns < 0 on error, > 0 if any of the | 463 | * This takes the tree lock, and returns < 0 on error, > 0 if any of the |
464 | * bits were already set, or zero if none of the bits were already set. | 464 | * bits were already set, or zero if none of the bits were already set. |
465 | */ | 465 | */ |
466 | int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | 466 | int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
467 | int bits, int wake, int delete, | 467 | int bits, int wake, int delete, |
468 | struct extent_state **cached_state, | 468 | struct extent_state **cached_state, |
469 | gfp_t mask) | 469 | gfp_t mask) |
470 | { | 470 | { |
471 | struct extent_state *state; | 471 | struct extent_state *state; |
472 | struct extent_state *cached; | 472 | struct extent_state *cached; |
473 | struct extent_state *prealloc = NULL; | 473 | struct extent_state *prealloc = NULL; |
474 | struct rb_node *next_node; | 474 | struct rb_node *next_node; |
475 | struct rb_node *node; | 475 | struct rb_node *node; |
476 | u64 last_end; | 476 | u64 last_end; |
477 | int err; | 477 | int err; |
478 | int set = 0; | 478 | int set = 0; |
479 | int clear = 0; | 479 | int clear = 0; |
480 | 480 | ||
481 | if (delete) | 481 | if (delete) |
482 | bits |= ~EXTENT_CTLBITS; | 482 | bits |= ~EXTENT_CTLBITS; |
483 | bits |= EXTENT_FIRST_DELALLOC; | 483 | bits |= EXTENT_FIRST_DELALLOC; |
484 | 484 | ||
485 | if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) | 485 | if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) |
486 | clear = 1; | 486 | clear = 1; |
487 | again: | 487 | again: |
488 | if (!prealloc && (mask & __GFP_WAIT)) { | 488 | if (!prealloc && (mask & __GFP_WAIT)) { |
489 | prealloc = alloc_extent_state(mask); | 489 | prealloc = alloc_extent_state(mask); |
490 | if (!prealloc) | 490 | if (!prealloc) |
491 | return -ENOMEM; | 491 | return -ENOMEM; |
492 | } | 492 | } |
493 | 493 | ||
494 | spin_lock(&tree->lock); | 494 | spin_lock(&tree->lock); |
495 | if (cached_state) { | 495 | if (cached_state) { |
496 | cached = *cached_state; | 496 | cached = *cached_state; |
497 | 497 | ||
498 | if (clear) { | 498 | if (clear) { |
499 | *cached_state = NULL; | 499 | *cached_state = NULL; |
500 | cached_state = NULL; | 500 | cached_state = NULL; |
501 | } | 501 | } |
502 | 502 | ||
503 | if (cached && cached->tree && cached->start == start) { | 503 | if (cached && cached->tree && cached->start == start) { |
504 | if (clear) | 504 | if (clear) |
505 | atomic_dec(&cached->refs); | 505 | atomic_dec(&cached->refs); |
506 | state = cached; | 506 | state = cached; |
507 | goto hit_next; | 507 | goto hit_next; |
508 | } | 508 | } |
509 | if (clear) | 509 | if (clear) |
510 | free_extent_state(cached); | 510 | free_extent_state(cached); |
511 | } | 511 | } |
512 | /* | 512 | /* |
513 | * this search will find the extents that end after | 513 | * this search will find the extents that end after |
514 | * our range starts | 514 | * our range starts |
515 | */ | 515 | */ |
516 | node = tree_search(tree, start); | 516 | node = tree_search(tree, start); |
517 | if (!node) | 517 | if (!node) |
518 | goto out; | 518 | goto out; |
519 | state = rb_entry(node, struct extent_state, rb_node); | 519 | state = rb_entry(node, struct extent_state, rb_node); |
520 | hit_next: | 520 | hit_next: |
521 | if (state->start > end) | 521 | if (state->start > end) |
522 | goto out; | 522 | goto out; |
523 | WARN_ON(state->end < start); | 523 | WARN_ON(state->end < start); |
524 | last_end = state->end; | 524 | last_end = state->end; |
525 | 525 | ||
526 | /* | 526 | /* |
527 | * | ---- desired range ---- | | 527 | * | ---- desired range ---- | |
528 | * | state | or | 528 | * | state | or |
529 | * | ------------- state -------------- | | 529 | * | ------------- state -------------- | |
530 | * | 530 | * |
531 | * We need to split the extent we found, and may flip | 531 | * We need to split the extent we found, and may flip |
532 | * bits on second half. | 532 | * bits on second half. |
533 | * | 533 | * |
534 | * If the extent we found extends past our range, we | 534 | * If the extent we found extends past our range, we |
535 | * just split and search again. It'll get split again | 535 | * just split and search again. It'll get split again |
536 | * the next time though. | 536 | * the next time though. |
537 | * | 537 | * |
538 | * If the extent we found is inside our range, we clear | 538 | * If the extent we found is inside our range, we clear |
539 | * the desired bit on it. | 539 | * the desired bit on it. |
540 | */ | 540 | */ |
541 | 541 | ||
542 | if (state->start < start) { | 542 | if (state->start < start) { |
543 | prealloc = alloc_extent_state_atomic(prealloc); | 543 | prealloc = alloc_extent_state_atomic(prealloc); |
544 | BUG_ON(!prealloc); | 544 | BUG_ON(!prealloc); |
545 | err = split_state(tree, state, prealloc, start); | 545 | err = split_state(tree, state, prealloc, start); |
546 | BUG_ON(err == -EEXIST); | 546 | BUG_ON(err == -EEXIST); |
547 | prealloc = NULL; | 547 | prealloc = NULL; |
548 | if (err) | 548 | if (err) |
549 | goto out; | 549 | goto out; |
550 | if (state->end <= end) { | 550 | if (state->end <= end) { |
551 | set |= clear_state_bit(tree, state, &bits, wake); | 551 | set |= clear_state_bit(tree, state, &bits, wake); |
552 | if (last_end == (u64)-1) | 552 | if (last_end == (u64)-1) |
553 | goto out; | 553 | goto out; |
554 | start = last_end + 1; | 554 | start = last_end + 1; |
555 | } | 555 | } |
556 | goto search_again; | 556 | goto search_again; |
557 | } | 557 | } |
558 | /* | 558 | /* |
559 | * | ---- desired range ---- | | 559 | * | ---- desired range ---- | |
560 | * | state | | 560 | * | state | |
561 | * We need to split the extent, and clear the bit | 561 | * We need to split the extent, and clear the bit |
562 | * on the first half | 562 | * on the first half |
563 | */ | 563 | */ |
564 | if (state->start <= end && state->end > end) { | 564 | if (state->start <= end && state->end > end) { |
565 | prealloc = alloc_extent_state_atomic(prealloc); | 565 | prealloc = alloc_extent_state_atomic(prealloc); |
566 | BUG_ON(!prealloc); | 566 | BUG_ON(!prealloc); |
567 | err = split_state(tree, state, prealloc, end + 1); | 567 | err = split_state(tree, state, prealloc, end + 1); |
568 | BUG_ON(err == -EEXIST); | 568 | BUG_ON(err == -EEXIST); |
569 | if (wake) | 569 | if (wake) |
570 | wake_up(&state->wq); | 570 | wake_up(&state->wq); |
571 | 571 | ||
572 | set |= clear_state_bit(tree, prealloc, &bits, wake); | 572 | set |= clear_state_bit(tree, prealloc, &bits, wake); |
573 | 573 | ||
574 | prealloc = NULL; | 574 | prealloc = NULL; |
575 | goto out; | 575 | goto out; |
576 | } | 576 | } |
577 | 577 | ||
578 | if (state->end < end && prealloc && !need_resched()) | 578 | if (state->end < end && prealloc && !need_resched()) |
579 | next_node = rb_next(&state->rb_node); | 579 | next_node = rb_next(&state->rb_node); |
580 | else | 580 | else |
581 | next_node = NULL; | 581 | next_node = NULL; |
582 | 582 | ||
583 | set |= clear_state_bit(tree, state, &bits, wake); | 583 | set |= clear_state_bit(tree, state, &bits, wake); |
584 | if (last_end == (u64)-1) | 584 | if (last_end == (u64)-1) |
585 | goto out; | 585 | goto out; |
586 | start = last_end + 1; | 586 | start = last_end + 1; |
587 | if (start <= end && next_node) { | 587 | if (start <= end && next_node) { |
588 | state = rb_entry(next_node, struct extent_state, | 588 | state = rb_entry(next_node, struct extent_state, |
589 | rb_node); | 589 | rb_node); |
590 | if (state->start == start) | 590 | if (state->start == start) |
591 | goto hit_next; | 591 | goto hit_next; |
592 | } | 592 | } |
593 | goto search_again; | 593 | goto search_again; |
594 | 594 | ||
595 | out: | 595 | out: |
596 | spin_unlock(&tree->lock); | 596 | spin_unlock(&tree->lock); |
597 | if (prealloc) | 597 | if (prealloc) |
598 | free_extent_state(prealloc); | 598 | free_extent_state(prealloc); |
599 | 599 | ||
600 | return set; | 600 | return set; |
601 | 601 | ||
602 | search_again: | 602 | search_again: |
603 | if (start > end) | 603 | if (start > end) |
604 | goto out; | 604 | goto out; |
605 | spin_unlock(&tree->lock); | 605 | spin_unlock(&tree->lock); |
606 | if (mask & __GFP_WAIT) | 606 | if (mask & __GFP_WAIT) |
607 | cond_resched(); | 607 | cond_resched(); |
608 | goto again; | 608 | goto again; |
609 | } | 609 | } |
610 | 610 | ||
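A hedged usage sketch: clearing the dirty-tracking bits over one page worth
of the tree, roughly what the clear_extent_dirty() wrapper further down
does for its callers:

        /* sketch: drop dirty/delalloc state covering a single page */
        static int demo_clear_page(struct extent_io_tree *tree, u64 start)
        {
                return clear_extent_bit(tree, start,
                                        start + PAGE_CACHE_SIZE - 1,
                                        EXTENT_DIRTY | EXTENT_DELALLOC,
                                        0, 0, NULL, GFP_NOFS);
        }

Per the comment above the function, the return value is > 0 if any of the
bits were actually set somewhere in the range and 0 if none were.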
611 | static int wait_on_state(struct extent_io_tree *tree, | 611 | static int wait_on_state(struct extent_io_tree *tree, |
612 | struct extent_state *state) | 612 | struct extent_state *state) |
613 | __releases(tree->lock) | 613 | __releases(tree->lock) |
614 | __acquires(tree->lock) | 614 | __acquires(tree->lock) |
615 | { | 615 | { |
616 | DEFINE_WAIT(wait); | 616 | DEFINE_WAIT(wait); |
617 | prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); | 617 | prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); |
618 | spin_unlock(&tree->lock); | 618 | spin_unlock(&tree->lock); |
619 | schedule(); | 619 | schedule(); |
620 | spin_lock(&tree->lock); | 620 | spin_lock(&tree->lock); |
621 | finish_wait(&state->wq, &wait); | 621 | finish_wait(&state->wq, &wait); |
622 | return 0; | 622 | return 0; |
623 | } | 623 | } |
624 | 624 | ||
625 | /* | 625 | /* |
626 | * waits for one or more bits to clear on a range in the state tree. | 626 | * waits for one or more bits to clear on a range in the state tree. |
627 | * The range [start, end] is inclusive. | 627 | * The range [start, end] is inclusive. |
628 | * The tree lock is taken by this function | 628 | * The tree lock is taken by this function |
629 | */ | 629 | */ |
630 | int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) | 630 | int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) |
631 | { | 631 | { |
632 | struct extent_state *state; | 632 | struct extent_state *state; |
633 | struct rb_node *node; | 633 | struct rb_node *node; |
634 | 634 | ||
635 | spin_lock(&tree->lock); | 635 | spin_lock(&tree->lock); |
636 | again: | 636 | again: |
637 | while (1) { | 637 | while (1) { |
638 | /* | 638 | /* |
639 | * this search will find all the extents that end after | 639 | * this search will find all the extents that end after |
640 | * our range starts | 640 | * our range starts |
641 | */ | 641 | */ |
642 | node = tree_search(tree, start); | 642 | node = tree_search(tree, start); |
643 | if (!node) | 643 | if (!node) |
644 | break; | 644 | break; |
645 | 645 | ||
646 | state = rb_entry(node, struct extent_state, rb_node); | 646 | state = rb_entry(node, struct extent_state, rb_node); |
647 | 647 | ||
648 | if (state->start > end) | 648 | if (state->start > end) |
649 | goto out; | 649 | goto out; |
650 | 650 | ||
651 | if (state->state & bits) { | 651 | if (state->state & bits) { |
652 | start = state->start; | 652 | start = state->start; |
653 | atomic_inc(&state->refs); | 653 | atomic_inc(&state->refs); |
654 | wait_on_state(tree, state); | 654 | wait_on_state(tree, state); |
655 | free_extent_state(state); | 655 | free_extent_state(state); |
656 | goto again; | 656 | goto again; |
657 | } | 657 | } |
658 | start = state->end + 1; | 658 | start = state->end + 1; |
659 | 659 | ||
660 | if (start > end) | 660 | if (start > end) |
661 | break; | 661 | break; |
662 | 662 | ||
663 | if (need_resched()) { | 663 | if (need_resched()) { |
664 | spin_unlock(&tree->lock); | 664 | spin_unlock(&tree->lock); |
665 | cond_resched(); | 665 | cond_resched(); |
666 | spin_lock(&tree->lock); | 666 | spin_lock(&tree->lock); |
667 | } | 667 | } |
668 | } | 668 | } |
669 | out: | 669 | out: |
670 | spin_unlock(&tree->lock); | 670 | spin_unlock(&tree->lock); |
671 | return 0; | 671 | return 0; |
672 | } | 672 | } |
673 | 673 | ||
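Note the restart discipline: before sleeping, the state is pinned with
atomic_inc() so it cannot be freed while the lock is dropped, and after
waking the search restarts from state->start because the tree may have
changed arbitrarily in the meantime. The typical call mirrors what
lock_extent_bits() does further down:

        /* sketch: block until no LOCKED state remains in [start, end] */
        wait_extent_bit(tree, start, end, EXTENT_LOCKED);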
674 | static int set_state_bits(struct extent_io_tree *tree, | 674 | static int set_state_bits(struct extent_io_tree *tree, |
675 | struct extent_state *state, | 675 | struct extent_state *state, |
676 | int *bits) | 676 | int *bits) |
677 | { | 677 | { |
678 | int ret; | 678 | int ret; |
679 | int bits_to_set = *bits & ~EXTENT_CTLBITS; | 679 | int bits_to_set = *bits & ~EXTENT_CTLBITS; |
680 | 680 | ||
681 | ret = set_state_cb(tree, state, bits); | 681 | ret = set_state_cb(tree, state, bits); |
682 | if (ret) | 682 | if (ret) |
683 | return ret; | 683 | return ret; |
684 | if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { | 684 | if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { |
685 | u64 range = state->end - state->start + 1; | 685 | u64 range = state->end - state->start + 1; |
686 | tree->dirty_bytes += range; | 686 | tree->dirty_bytes += range; |
687 | } | 687 | } |
688 | state->state |= bits_to_set; | 688 | state->state |= bits_to_set; |
689 | 689 | ||
690 | return 0; | 690 | return 0; |
691 | } | 691 | } |
692 | 692 | ||
693 | static void cache_state(struct extent_state *state, | 693 | static void cache_state(struct extent_state *state, |
694 | struct extent_state **cached_ptr) | 694 | struct extent_state **cached_ptr) |
695 | { | 695 | { |
696 | if (cached_ptr && !(*cached_ptr)) { | 696 | if (cached_ptr && !(*cached_ptr)) { |
697 | if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { | 697 | if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { |
698 | *cached_ptr = state; | 698 | *cached_ptr = state; |
699 | atomic_inc(&state->refs); | 699 | atomic_inc(&state->refs); |
700 | } | 700 | } |
701 | } | 701 | } |
702 | } | 702 | } |
703 | 703 | ||
704 | static void uncache_state(struct extent_state **cached_ptr) | 704 | static void uncache_state(struct extent_state **cached_ptr) |
705 | { | 705 | { |
706 | if (cached_ptr && (*cached_ptr)) { | 706 | if (cached_ptr && (*cached_ptr)) { |
707 | struct extent_state *state = *cached_ptr; | 707 | struct extent_state *state = *cached_ptr; |
708 | *cached_ptr = NULL; | 708 | *cached_ptr = NULL; |
709 | free_extent_state(state); | 709 | free_extent_state(state); |
710 | } | 710 | } |
711 | } | 711 | } |
712 | 712 | ||
713 | /* | 713 | /* |
714 | * set some bits on a range in the tree. This may require allocations or | 714 | * set some bits on a range in the tree. This may require allocations or |
715 | * sleeping, so the gfp mask is used to indicate what is allowed. | 715 | * sleeping, so the gfp mask is used to indicate what is allowed. |
716 | * | 716 | * |
717 | * If any of the exclusive bits are set, this will fail with -EEXIST if some | 717 | * If any of the exclusive bits are set, this will fail with -EEXIST if some |
718 | * part of the range already has the desired bits set. The start of the | 718 | * part of the range already has the desired bits set. The start of the |
719 | * existing range is returned in failed_start in this case. | 719 | * existing range is returned in failed_start in this case. |
720 | * | 720 | * |
721 | * [start, end] is inclusive. This takes the tree lock. | 721 | * [start, end] is inclusive. This takes the tree lock. |
722 | */ | 722 | */ |
723 | 723 | ||
724 | int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | 724 | int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
725 | int bits, int exclusive_bits, u64 *failed_start, | 725 | int bits, int exclusive_bits, u64 *failed_start, |
726 | struct extent_state **cached_state, gfp_t mask) | 726 | struct extent_state **cached_state, gfp_t mask) |
727 | { | 727 | { |
728 | struct extent_state *state; | 728 | struct extent_state *state; |
729 | struct extent_state *prealloc = NULL; | 729 | struct extent_state *prealloc = NULL; |
730 | struct rb_node *node; | 730 | struct rb_node *node; |
731 | int err = 0; | 731 | int err = 0; |
732 | u64 last_start; | 732 | u64 last_start; |
733 | u64 last_end; | 733 | u64 last_end; |
734 | 734 | ||
735 | bits |= EXTENT_FIRST_DELALLOC; | 735 | bits |= EXTENT_FIRST_DELALLOC; |
736 | again: | 736 | again: |
737 | if (!prealloc && (mask & __GFP_WAIT)) { | 737 | if (!prealloc && (mask & __GFP_WAIT)) { |
738 | prealloc = alloc_extent_state(mask); | 738 | prealloc = alloc_extent_state(mask); |
739 | BUG_ON(!prealloc); | 739 | BUG_ON(!prealloc); |
740 | } | 740 | } |
741 | 741 | ||
742 | spin_lock(&tree->lock); | 742 | spin_lock(&tree->lock); |
743 | if (cached_state && *cached_state) { | 743 | if (cached_state && *cached_state) { |
744 | state = *cached_state; | 744 | state = *cached_state; |
745 | if (state->start == start && state->tree) { | 745 | if (state->start == start && state->tree) { |
746 | node = &state->rb_node; | 746 | node = &state->rb_node; |
747 | goto hit_next; | 747 | goto hit_next; |
748 | } | 748 | } |
749 | } | 749 | } |
750 | /* | 750 | /* |
751 | * this search will find all the extents that end after | 751 | * this search will find all the extents that end after |
752 | * our range starts. | 752 | * our range starts. |
753 | */ | 753 | */ |
754 | node = tree_search(tree, start); | 754 | node = tree_search(tree, start); |
755 | if (!node) { | 755 | if (!node) { |
756 | prealloc = alloc_extent_state_atomic(prealloc); | 756 | prealloc = alloc_extent_state_atomic(prealloc); |
757 | BUG_ON(!prealloc); | 757 | BUG_ON(!prealloc); |
758 | err = insert_state(tree, prealloc, start, end, &bits); | 758 | err = insert_state(tree, prealloc, start, end, &bits); |
759 | prealloc = NULL; | 759 | prealloc = NULL; |
760 | BUG_ON(err == -EEXIST); | 760 | BUG_ON(err == -EEXIST); |
761 | goto out; | 761 | goto out; |
762 | } | 762 | } |
763 | state = rb_entry(node, struct extent_state, rb_node); | 763 | state = rb_entry(node, struct extent_state, rb_node); |
764 | hit_next: | 764 | hit_next: |
765 | last_start = state->start; | 765 | last_start = state->start; |
766 | last_end = state->end; | 766 | last_end = state->end; |
767 | 767 | ||
768 | /* | 768 | /* |
769 | * | ---- desired range ---- | | 769 | * | ---- desired range ---- | |
770 | * | state | | 770 | * | state | |
771 | * | 771 | * |
772 | * Just lock what we found and keep going | 772 | * Just lock what we found and keep going |
773 | */ | 773 | */ |
774 | if (state->start == start && state->end <= end) { | 774 | if (state->start == start && state->end <= end) { |
775 | struct rb_node *next_node; | 775 | struct rb_node *next_node; |
776 | if (state->state & exclusive_bits) { | 776 | if (state->state & exclusive_bits) { |
777 | *failed_start = state->start; | 777 | *failed_start = state->start; |
778 | err = -EEXIST; | 778 | err = -EEXIST; |
779 | goto out; | 779 | goto out; |
780 | } | 780 | } |
781 | 781 | ||
782 | err = set_state_bits(tree, state, &bits); | 782 | err = set_state_bits(tree, state, &bits); |
783 | if (err) | 783 | if (err) |
784 | goto out; | 784 | goto out; |
785 | 785 | ||
786 | next_node = rb_next(node); | 786 | next_node = rb_next(node); |
787 | cache_state(state, cached_state); | 787 | cache_state(state, cached_state); |
788 | merge_state(tree, state); | 788 | merge_state(tree, state); |
789 | if (last_end == (u64)-1) | 789 | if (last_end == (u64)-1) |
790 | goto out; | 790 | goto out; |
791 | 791 | ||
792 | start = last_end + 1; | 792 | start = last_end + 1; |
793 | if (next_node && start < end && prealloc && !need_resched()) { | 793 | if (next_node && start < end && prealloc && !need_resched()) { |
794 | state = rb_entry(next_node, struct extent_state, | 794 | state = rb_entry(next_node, struct extent_state, |
795 | rb_node); | 795 | rb_node); |
796 | if (state->start == start) | 796 | if (state->start == start) |
797 | goto hit_next; | 797 | goto hit_next; |
798 | } | 798 | } |
799 | goto search_again; | 799 | goto search_again; |
800 | } | 800 | } |
801 | 801 | ||
802 | /* | 802 | /* |
803 | * | ---- desired range ---- | | 803 | * | ---- desired range ---- | |
804 | * | state | | 804 | * | state | |
805 | * or | 805 | * or |
806 | * | ------------- state -------------- | | 806 | * | ------------- state -------------- | |
807 | * | 807 | * |
808 | * We need to split the extent we found, and may flip bits on | 808 | * We need to split the extent we found, and may flip bits on |
809 | * second half. | 809 | * second half. |
810 | * | 810 | * |
811 | * If the extent we found extends past our | 811 | * If the extent we found extends past our |
812 | * range, we just split and search again. It'll get split | 812 | * range, we just split and search again. It'll get split |
813 | * again the next time though. | 813 | * again the next time though. |
814 | * | 814 | * |
815 | * If the extent we found is inside our range, we set the | 815 | * If the extent we found is inside our range, we set the |
816 | * desired bit on it. | 816 | * desired bit on it. |
817 | */ | 817 | */ |
818 | if (state->start < start) { | 818 | if (state->start < start) { |
819 | if (state->state & exclusive_bits) { | 819 | if (state->state & exclusive_bits) { |
820 | *failed_start = start; | 820 | *failed_start = start; |
821 | err = -EEXIST; | 821 | err = -EEXIST; |
822 | goto out; | 822 | goto out; |
823 | } | 823 | } |
824 | 824 | ||
825 | prealloc = alloc_extent_state_atomic(prealloc); | 825 | prealloc = alloc_extent_state_atomic(prealloc); |
826 | BUG_ON(!prealloc); | 826 | BUG_ON(!prealloc); |
827 | err = split_state(tree, state, prealloc, start); | 827 | err = split_state(tree, state, prealloc, start); |
828 | BUG_ON(err == -EEXIST); | 828 | BUG_ON(err == -EEXIST); |
829 | prealloc = NULL; | 829 | prealloc = NULL; |
830 | if (err) | 830 | if (err) |
831 | goto out; | 831 | goto out; |
832 | if (state->end <= end) { | 832 | if (state->end <= end) { |
833 | err = set_state_bits(tree, state, &bits); | 833 | err = set_state_bits(tree, state, &bits); |
834 | if (err) | 834 | if (err) |
835 | goto out; | 835 | goto out; |
836 | cache_state(state, cached_state); | 836 | cache_state(state, cached_state); |
837 | merge_state(tree, state); | 837 | merge_state(tree, state); |
838 | if (last_end == (u64)-1) | 838 | if (last_end == (u64)-1) |
839 | goto out; | 839 | goto out; |
840 | start = last_end + 1; | 840 | start = last_end + 1; |
841 | } | 841 | } |
842 | goto search_again; | 842 | goto search_again; |
843 | } | 843 | } |
844 | /* | 844 | /* |
845 | * | ---- desired range ---- | | 845 | * | ---- desired range ---- | |
846 | * | state | or | state | | 846 | * | state | or | state | |
847 | * | 847 | * |
848 | * There's a hole, we need to insert something in it and | 848 | * There's a hole, we need to insert something in it and |
849 | * ignore the extent we found. | 849 | * ignore the extent we found. |
850 | */ | 850 | */ |
851 | if (state->start > start) { | 851 | if (state->start > start) { |
852 | u64 this_end; | 852 | u64 this_end; |
853 | if (end < last_start) | 853 | if (end < last_start) |
854 | this_end = end; | 854 | this_end = end; |
855 | else | 855 | else |
856 | this_end = last_start - 1; | 856 | this_end = last_start - 1; |
857 | 857 | ||
858 | prealloc = alloc_extent_state_atomic(prealloc); | 858 | prealloc = alloc_extent_state_atomic(prealloc); |
859 | BUG_ON(!prealloc); | 859 | BUG_ON(!prealloc); |
860 | 860 | ||
861 | /* | 861 | /* |
862 | * Avoid freeing 'prealloc' if it can be merged with | 862 | * Avoid freeing 'prealloc' if it can be merged with |
863 | * the later extent. | 863 | * the later extent. |
864 | */ | 864 | */ |
865 | atomic_inc(&prealloc->refs); | 865 | atomic_inc(&prealloc->refs); |
866 | err = insert_state(tree, prealloc, start, this_end, | 866 | err = insert_state(tree, prealloc, start, this_end, |
867 | &bits); | 867 | &bits); |
868 | BUG_ON(err == -EEXIST); | 868 | BUG_ON(err == -EEXIST); |
869 | if (err) { | 869 | if (err) { |
870 | free_extent_state(prealloc); | 870 | free_extent_state(prealloc); |
871 | prealloc = NULL; | 871 | prealloc = NULL; |
872 | goto out; | 872 | goto out; |
873 | } | 873 | } |
874 | cache_state(prealloc, cached_state); | 874 | cache_state(prealloc, cached_state); |
875 | free_extent_state(prealloc); | 875 | free_extent_state(prealloc); |
876 | prealloc = NULL; | 876 | prealloc = NULL; |
877 | start = this_end + 1; | 877 | start = this_end + 1; |
878 | goto search_again; | 878 | goto search_again; |
879 | } | 879 | } |
880 | /* | 880 | /* |
881 | * | ---- desired range ---- | | 881 | * | ---- desired range ---- | |
882 | * | state | | 882 | * | state | |
883 | * We need to split the extent, and set the bit | 883 | * We need to split the extent, and set the bit |
884 | * on the first half | 884 | * on the first half |
885 | */ | 885 | */ |
886 | if (state->start <= end && state->end > end) { | 886 | if (state->start <= end && state->end > end) { |
887 | if (state->state & exclusive_bits) { | 887 | if (state->state & exclusive_bits) { |
888 | *failed_start = start; | 888 | *failed_start = start; |
889 | err = -EEXIST; | 889 | err = -EEXIST; |
890 | goto out; | 890 | goto out; |
891 | } | 891 | } |
892 | 892 | ||
893 | prealloc = alloc_extent_state_atomic(prealloc); | 893 | prealloc = alloc_extent_state_atomic(prealloc); |
894 | BUG_ON(!prealloc); | 894 | BUG_ON(!prealloc); |
895 | err = split_state(tree, state, prealloc, end + 1); | 895 | err = split_state(tree, state, prealloc, end + 1); |
896 | BUG_ON(err == -EEXIST); | 896 | BUG_ON(err == -EEXIST); |
897 | 897 | ||
898 | err = set_state_bits(tree, prealloc, &bits); | 898 | err = set_state_bits(tree, prealloc, &bits); |
899 | if (err) { | 899 | if (err) { |
900 | prealloc = NULL; | 900 | prealloc = NULL; |
901 | goto out; | 901 | goto out; |
902 | } | 902 | } |
903 | cache_state(prealloc, cached_state); | 903 | cache_state(prealloc, cached_state); |
904 | merge_state(tree, prealloc); | 904 | merge_state(tree, prealloc); |
905 | prealloc = NULL; | 905 | prealloc = NULL; |
906 | goto out; | 906 | goto out; |
907 | } | 907 | } |
908 | 908 | ||
909 | goto search_again; | 909 | goto search_again; |
910 | 910 | ||
911 | out: | 911 | out: |
912 | spin_unlock(&tree->lock); | 912 | spin_unlock(&tree->lock); |
913 | if (prealloc) | 913 | if (prealloc) |
914 | free_extent_state(prealloc); | 914 | free_extent_state(prealloc); |
915 | 915 | ||
916 | return err; | 916 | return err; |
917 | 917 | ||
918 | search_again: | 918 | search_again: |
919 | if (start > end) | 919 | if (start > end) |
920 | goto out; | 920 | goto out; |
921 | spin_unlock(&tree->lock); | 921 | spin_unlock(&tree->lock); |
922 | if (mask & __GFP_WAIT) | 922 | if (mask & __GFP_WAIT) |
923 | cond_resched(); | 923 | cond_resched(); |
924 | goto again; | 924 | goto again; |
925 | } | 925 | } |
926 | 926 | ||
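The exclusive_bits argument is what turns set_extent_bit() into a lock
primitive: pass EXTENT_LOCKED as both bits and exclusive_bits, and any
overlap with an already-locked state fails with -EEXIST while failed_start
reports where the collision begins. A minimal sketch, essentially the core
of the lock helpers below:

        /* sketch: one non-blocking attempt to lock [start, end] */
        static int demo_lock_once(struct extent_io_tree *tree,
                                  u64 start, u64 end, u64 *failed_start)
        {
                return set_extent_bit(tree, start, end, EXTENT_LOCKED,
                                      EXTENT_LOCKED, failed_start,
                                      NULL, GFP_NOFS);
        }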
927 | /* wrappers around set/clear extent bit */ | 927 | /* wrappers around set/clear extent bit */ |
928 | int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, | 928 | int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, |
929 | gfp_t mask) | 929 | gfp_t mask) |
930 | { | 930 | { |
931 | return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, | 931 | return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, |
932 | NULL, mask); | 932 | NULL, mask); |
933 | } | 933 | } |
934 | 934 | ||
935 | int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, | 935 | int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, |
936 | int bits, gfp_t mask) | 936 | int bits, gfp_t mask) |
937 | { | 937 | { |
938 | return set_extent_bit(tree, start, end, bits, 0, NULL, | 938 | return set_extent_bit(tree, start, end, bits, 0, NULL, |
939 | NULL, mask); | 939 | NULL, mask); |
940 | } | 940 | } |
941 | 941 | ||
942 | int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, | 942 | int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, |
943 | int bits, gfp_t mask) | 943 | int bits, gfp_t mask) |
944 | { | 944 | { |
945 | return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); | 945 | return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); |
946 | } | 946 | } |
947 | 947 | ||
948 | int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, | 948 | int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, |
949 | struct extent_state **cached_state, gfp_t mask) | 949 | struct extent_state **cached_state, gfp_t mask) |
950 | { | 950 | { |
951 | return set_extent_bit(tree, start, end, | 951 | return set_extent_bit(tree, start, end, |
952 | EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, | 952 | EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, |
953 | 0, NULL, cached_state, mask); | 953 | 0, NULL, cached_state, mask); |
954 | } | 954 | } |
955 | 955 | ||
956 | int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, | 956 | int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, |
957 | gfp_t mask) | 957 | gfp_t mask) |
958 | { | 958 | { |
959 | return clear_extent_bit(tree, start, end, | 959 | return clear_extent_bit(tree, start, end, |
960 | EXTENT_DIRTY | EXTENT_DELALLOC | | 960 | EXTENT_DIRTY | EXTENT_DELALLOC | |
961 | EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); | 961 | EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); |
962 | } | 962 | } |
963 | 963 | ||
964 | int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, | 964 | int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, |
965 | gfp_t mask) | 965 | gfp_t mask) |
966 | { | 966 | { |
967 | return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, | 967 | return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, |
968 | NULL, mask); | 968 | NULL, mask); |
969 | } | 969 | } |
970 | 970 | ||
971 | int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, | 971 | int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, |
972 | struct extent_state **cached_state, gfp_t mask) | 972 | struct extent_state **cached_state, gfp_t mask) |
973 | { | 973 | { |
974 | return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, | 974 | return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, |
975 | NULL, cached_state, mask); | 975 | NULL, cached_state, mask); |
976 | } | 976 | } |
977 | 977 | ||
978 | static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, | 978 | static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, |
979 | u64 end, struct extent_state **cached_state, | 979 | u64 end, struct extent_state **cached_state, |
980 | gfp_t mask) | 980 | gfp_t mask) |
981 | { | 981 | { |
982 | return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, | 982 | return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, |
983 | cached_state, mask); | 983 | cached_state, mask); |
984 | } | 984 | } |
985 | 985 | ||
986 | /* | 986 | /* |
987 | * either insert or lock the state struct between start and end; use mask | 987 | * either insert or lock the state struct between start and end; use mask |
988 | * to tell us if waiting is desired. | 988 | * to tell us if waiting is desired. |
989 | */ | 989 | */ |
990 | int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, | 990 | int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, |
991 | int bits, struct extent_state **cached_state, gfp_t mask) | 991 | int bits, struct extent_state **cached_state, gfp_t mask) |
992 | { | 992 | { |
993 | int err; | 993 | int err; |
994 | u64 failed_start; | 994 | u64 failed_start; |
995 | while (1) { | 995 | while (1) { |
996 | err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, | 996 | err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, |
997 | EXTENT_LOCKED, &failed_start, | 997 | EXTENT_LOCKED, &failed_start, |
998 | cached_state, mask); | 998 | cached_state, mask); |
999 | if (err == -EEXIST && (mask & __GFP_WAIT)) { | 999 | if (err == -EEXIST && (mask & __GFP_WAIT)) { |
1000 | wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); | 1000 | wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); |
1001 | start = failed_start; | 1001 | start = failed_start; |
1002 | } else { | 1002 | } else { |
1003 | break; | 1003 | break; |
1004 | } | 1004 | } |
1005 | WARN_ON(start > end); | 1005 | WARN_ON(start > end); |
1006 | } | 1006 | } |
1007 | return err; | 1007 | return err; |
1008 | } | 1008 | } |
1009 | 1009 | ||
1010 | int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) | 1010 | int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) |
1011 | { | 1011 | { |
1012 | return lock_extent_bits(tree, start, end, 0, NULL, mask); | 1012 | return lock_extent_bits(tree, start, end, 0, NULL, mask); |
1013 | } | 1013 | } |
1014 | 1014 | ||
1015 | int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, | 1015 | int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, |
1016 | gfp_t mask) | 1016 | gfp_t mask) |
1017 | { | 1017 | { |
1018 | int err; | 1018 | int err; |
1019 | u64 failed_start; | 1019 | u64 failed_start; |
1020 | 1020 | ||
1021 | err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, | 1021 | err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, |
1022 | &failed_start, NULL, mask); | 1022 | &failed_start, NULL, mask); |
1023 | if (err == -EEXIST) { | 1023 | if (err == -EEXIST) { |
1024 | if (failed_start > start) | 1024 | if (failed_start > start) |
1025 | clear_extent_bit(tree, start, failed_start - 1, | 1025 | clear_extent_bit(tree, start, failed_start - 1, |
1026 | EXTENT_LOCKED, 1, 0, NULL, mask); | 1026 | EXTENT_LOCKED, 1, 0, NULL, mask); |
1027 | return 0; | 1027 | return 0; |
1028 | } | 1028 | } |
1029 | return 1; | 1029 | return 1; |
1030 | } | 1030 | } |
1031 | 1031 | ||
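Note that try_lock_extent() inverts the usual kernel convention: it returns 1 when the whole range was locked and 0 when it hit contention, after rolling back any partially taken lock. A hypothetical non-blocking caller (not from this commit; the -EAGAIN choice is illustrative) could look like:

	/* Hypothetical: back off instead of sleeping when the range is busy. */
	static int example_try_lock(struct extent_io_tree *tree, u64 start, u64 end)
	{
		if (!try_lock_extent(tree, start, end, GFP_NOFS))
			return -EAGAIN;		/* illustrative error choice */

		/* ... work on the locked range ... */

		unlock_extent(tree, start, end, GFP_NOFS);
		return 0;
	}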
1032 | int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, | 1032 | int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, |
1033 | struct extent_state **cached, gfp_t mask) | 1033 | struct extent_state **cached, gfp_t mask) |
1034 | { | 1034 | { |
1035 | return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, | 1035 | return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, |
1036 | mask); | 1036 | mask); |
1037 | } | 1037 | } |
1038 | 1038 | ||
1039 | int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) | 1039 | int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) |
1040 | { | 1040 | { |
1041 | return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, | 1041 | return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, |
1042 | mask); | 1042 | mask); |
1043 | } | 1043 | } |
1044 | 1044 | ||
1045 | /* | 1045 | /* |
1046 | * helper function to set both pages and extents in the tree writeback | 1046 | * helper function to set both pages and extents in the tree writeback |
1047 | */ | 1047 | */ |
1048 | static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) | 1048 | static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) |
1049 | { | 1049 | { |
1050 | unsigned long index = start >> PAGE_CACHE_SHIFT; | 1050 | unsigned long index = start >> PAGE_CACHE_SHIFT; |
1051 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | 1051 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; |
1052 | struct page *page; | 1052 | struct page *page; |
1053 | 1053 | ||
1054 | while (index <= end_index) { | 1054 | while (index <= end_index) { |
1055 | page = find_get_page(tree->mapping, index); | 1055 | page = find_get_page(tree->mapping, index); |
1056 | BUG_ON(!page); | 1056 | BUG_ON(!page); |
1057 | set_page_writeback(page); | 1057 | set_page_writeback(page); |
1058 | page_cache_release(page); | 1058 | page_cache_release(page); |
1059 | index++; | 1059 | index++; |
1060 | } | 1060 | } |
1061 | return 0; | 1061 | return 0; |
1062 | } | 1062 | } |
1063 | 1063 | ||
1064 | /* | 1064 | /* |
1065 | * find the first offset in the io tree with 'bits' set. zero is | 1065 | * find the first offset in the io tree with 'bits' set. zero is |
1066 | * returned if we find something, and *start_ret and *end_ret are | 1066 | * returned if we find something, and *start_ret and *end_ret are |
1067 | * set to reflect the state struct that was found. | 1067 | * set to reflect the state struct that was found. |
1068 | * | 1068 | * |
1069 | * If nothing was found, 1 is returned; < 0 on error | 1069 | * If nothing was found, 1 is returned; < 0 on error |
1070 | */ | 1070 | */ |
1071 | int find_first_extent_bit(struct extent_io_tree *tree, u64 start, | 1071 | int find_first_extent_bit(struct extent_io_tree *tree, u64 start, |
1072 | u64 *start_ret, u64 *end_ret, int bits) | 1072 | u64 *start_ret, u64 *end_ret, int bits) |
1073 | { | 1073 | { |
1074 | struct rb_node *node; | 1074 | struct rb_node *node; |
1075 | struct extent_state *state; | 1075 | struct extent_state *state; |
1076 | int ret = 1; | 1076 | int ret = 1; |
1077 | 1077 | ||
1078 | spin_lock(&tree->lock); | 1078 | spin_lock(&tree->lock); |
1079 | /* | 1079 | /* |
1080 | * this search will find all the extents that end after | 1080 | * this search will find all the extents that end after |
1081 | * our range starts. | 1081 | * our range starts. |
1082 | */ | 1082 | */ |
1083 | node = tree_search(tree, start); | 1083 | node = tree_search(tree, start); |
1084 | if (!node) | 1084 | if (!node) |
1085 | goto out; | 1085 | goto out; |
1086 | 1086 | ||
1087 | while (1) { | 1087 | while (1) { |
1088 | state = rb_entry(node, struct extent_state, rb_node); | 1088 | state = rb_entry(node, struct extent_state, rb_node); |
1089 | if (state->end >= start && (state->state & bits)) { | 1089 | if (state->end >= start && (state->state & bits)) { |
1090 | *start_ret = state->start; | 1090 | *start_ret = state->start; |
1091 | *end_ret = state->end; | 1091 | *end_ret = state->end; |
1092 | ret = 0; | 1092 | ret = 0; |
1093 | break; | 1093 | break; |
1094 | } | 1094 | } |
1095 | node = rb_next(node); | 1095 | node = rb_next(node); |
1096 | if (!node) | 1096 | if (!node) |
1097 | break; | 1097 | break; |
1098 | } | 1098 | } |
1099 | out: | 1099 | out: |
1100 | spin_unlock(&tree->lock); | 1100 | spin_unlock(&tree->lock); |
1101 | return ret; | 1101 | return ret; |
1102 | } | 1102 | } |
1103 | 1103 | ||
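Because 0 means "found" here, the function composes naturally into a scan loop. A minimal hypothetical sketch, using only the API above, that walks every dirty range in the tree:

	/* Hypothetical walker over all EXTENT_DIRTY ranges in the tree. */
	static void example_walk_dirty(struct extent_io_tree *tree)
	{
		u64 cur = 0, found_start, found_end;

		while (find_first_extent_bit(tree, cur, &found_start,
					     &found_end, EXTENT_DIRTY) == 0) {
			/* [found_start, found_end] has EXTENT_DIRTY set */
			cur = found_end + 1;
			if (cur == 0)	/* wrapped past (u64)-1 */
				break;
		}
	}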
1104 | /* find the first state struct with 'bits' set after 'start', and | 1104 | /* find the first state struct with 'bits' set after 'start', and |
1105 | * return it. tree->lock must be held. NULL will be returned if | 1105 | * return it. tree->lock must be held. NULL will be returned if |
1106 | * nothing was found after 'start' | 1106 | * nothing was found after 'start' |
1107 | */ | 1107 | */ |
1108 | struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, | 1108 | struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, |
1109 | u64 start, int bits) | 1109 | u64 start, int bits) |
1110 | { | 1110 | { |
1111 | struct rb_node *node; | 1111 | struct rb_node *node; |
1112 | struct extent_state *state; | 1112 | struct extent_state *state; |
1113 | 1113 | ||
1114 | /* | 1114 | /* |
1115 | * this search will find all the extents that end after | 1115 | * this search will find all the extents that end after |
1116 | * our range starts. | 1116 | * our range starts. |
1117 | */ | 1117 | */ |
1118 | node = tree_search(tree, start); | 1118 | node = tree_search(tree, start); |
1119 | if (!node) | 1119 | if (!node) |
1120 | goto out; | 1120 | goto out; |
1121 | 1121 | ||
1122 | while (1) { | 1122 | while (1) { |
1123 | state = rb_entry(node, struct extent_state, rb_node); | 1123 | state = rb_entry(node, struct extent_state, rb_node); |
1124 | if (state->end >= start && (state->state & bits)) | 1124 | if (state->end >= start && (state->state & bits)) |
1125 | return state; | 1125 | return state; |
1126 | 1126 | ||
1127 | node = rb_next(node); | 1127 | node = rb_next(node); |
1128 | if (!node) | 1128 | if (!node) |
1129 | break; | 1129 | break; |
1130 | } | 1130 | } |
1131 | out: | 1131 | out: |
1132 | return NULL; | 1132 | return NULL; |
1133 | } | 1133 | } |
1134 | 1134 | ||
1135 | /* | 1135 | /* |
1136 | * find a contiguous range of bytes in the file marked as delalloc, not | 1136 | * find a contiguous range of bytes in the file marked as delalloc, not |
1137 | * more than 'max_bytes'. start and end are used to return the range. | 1137 | * more than 'max_bytes'. start and end are used to return the range. |
1138 | * | 1138 | * |
1139 | * 1 is returned if we find something, 0 if nothing was in the tree | 1139 | * 1 is returned if we find something, 0 if nothing was in the tree |
1140 | */ | 1140 | */ |
1141 | static noinline u64 find_delalloc_range(struct extent_io_tree *tree, | 1141 | static noinline u64 find_delalloc_range(struct extent_io_tree *tree, |
1142 | u64 *start, u64 *end, u64 max_bytes, | 1142 | u64 *start, u64 *end, u64 max_bytes, |
1143 | struct extent_state **cached_state) | 1143 | struct extent_state **cached_state) |
1144 | { | 1144 | { |
1145 | struct rb_node *node; | 1145 | struct rb_node *node; |
1146 | struct extent_state *state; | 1146 | struct extent_state *state; |
1147 | u64 cur_start = *start; | 1147 | u64 cur_start = *start; |
1148 | u64 found = 0; | 1148 | u64 found = 0; |
1149 | u64 total_bytes = 0; | 1149 | u64 total_bytes = 0; |
1150 | 1150 | ||
1151 | spin_lock(&tree->lock); | 1151 | spin_lock(&tree->lock); |
1152 | 1152 | ||
1153 | /* | 1153 | /* |
1154 | * this search will find all the extents that end after | 1154 | * this search will find all the extents that end after |
1155 | * our range starts. | 1155 | * our range starts. |
1156 | */ | 1156 | */ |
1157 | node = tree_search(tree, cur_start); | 1157 | node = tree_search(tree, cur_start); |
1158 | if (!node) { | 1158 | if (!node) { |
1159 | if (!found) | 1159 | if (!found) |
1160 | *end = (u64)-1; | 1160 | *end = (u64)-1; |
1161 | goto out; | 1161 | goto out; |
1162 | } | 1162 | } |
1163 | 1163 | ||
1164 | while (1) { | 1164 | while (1) { |
1165 | state = rb_entry(node, struct extent_state, rb_node); | 1165 | state = rb_entry(node, struct extent_state, rb_node); |
1166 | if (found && (state->start != cur_start || | 1166 | if (found && (state->start != cur_start || |
1167 | (state->state & EXTENT_BOUNDARY))) { | 1167 | (state->state & EXTENT_BOUNDARY))) { |
1168 | goto out; | 1168 | goto out; |
1169 | } | 1169 | } |
1170 | if (!(state->state & EXTENT_DELALLOC)) { | 1170 | if (!(state->state & EXTENT_DELALLOC)) { |
1171 | if (!found) | 1171 | if (!found) |
1172 | *end = state->end; | 1172 | *end = state->end; |
1173 | goto out; | 1173 | goto out; |
1174 | } | 1174 | } |
1175 | if (!found) { | 1175 | if (!found) { |
1176 | *start = state->start; | 1176 | *start = state->start; |
1177 | *cached_state = state; | 1177 | *cached_state = state; |
1178 | atomic_inc(&state->refs); | 1178 | atomic_inc(&state->refs); |
1179 | } | 1179 | } |
1180 | found++; | 1180 | found++; |
1181 | *end = state->end; | 1181 | *end = state->end; |
1182 | cur_start = state->end + 1; | 1182 | cur_start = state->end + 1; |
1183 | node = rb_next(node); | 1183 | node = rb_next(node); |
1184 | if (!node) | 1184 | if (!node) |
1185 | break; | 1185 | break; |
1186 | total_bytes += state->end - state->start + 1; | 1186 | total_bytes += state->end - state->start + 1; |
1187 | if (total_bytes >= max_bytes) | 1187 | if (total_bytes >= max_bytes) |
1188 | break; | 1188 | break; |
1189 | } | 1189 | } |
1190 | out: | 1190 | out: |
1191 | spin_unlock(&tree->lock); | 1191 | spin_unlock(&tree->lock); |
1192 | return found; | 1192 | return found; |
1193 | } | 1193 | } |
1194 | 1194 | ||
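Note the contract: the return value is a count of state records (so any non-zero value means "found"), *start and *end are in/out parameters, and on success the function takes a reference on *cached_state that the caller must drop with free_extent_state(). A hypothetical probe illustrating that ownership (the 64MiB cap is arbitrary):

	/* Hypothetical probe for delalloc starting at 'from'. */
	static void example_probe_delalloc(struct extent_io_tree *tree, u64 from)
	{
		struct extent_state *cached = NULL;
		u64 start = from, end = 0;

		if (find_delalloc_range(tree, &start, &end, 64 * 1024 * 1024,
					&cached)) {
			/* [start, end] is contiguous delalloc; drop our ref */
			free_extent_state(cached);
		}
	}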
1195 | static noinline int __unlock_for_delalloc(struct inode *inode, | 1195 | static noinline int __unlock_for_delalloc(struct inode *inode, |
1196 | struct page *locked_page, | 1196 | struct page *locked_page, |
1197 | u64 start, u64 end) | 1197 | u64 start, u64 end) |
1198 | { | 1198 | { |
1199 | int ret; | 1199 | int ret; |
1200 | struct page *pages[16]; | 1200 | struct page *pages[16]; |
1201 | unsigned long index = start >> PAGE_CACHE_SHIFT; | 1201 | unsigned long index = start >> PAGE_CACHE_SHIFT; |
1202 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | 1202 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; |
1203 | unsigned long nr_pages = end_index - index + 1; | 1203 | unsigned long nr_pages = end_index - index + 1; |
1204 | int i; | 1204 | int i; |
1205 | 1205 | ||
1206 | if (index == locked_page->index && end_index == index) | 1206 | if (index == locked_page->index && end_index == index) |
1207 | return 0; | 1207 | return 0; |
1208 | 1208 | ||
1209 | while (nr_pages > 0) { | 1209 | while (nr_pages > 0) { |
1210 | ret = find_get_pages_contig(inode->i_mapping, index, | 1210 | ret = find_get_pages_contig(inode->i_mapping, index, |
1211 | min_t(unsigned long, nr_pages, | 1211 | min_t(unsigned long, nr_pages, |
1212 | ARRAY_SIZE(pages)), pages); | 1212 | ARRAY_SIZE(pages)), pages); |
1213 | for (i = 0; i < ret; i++) { | 1213 | for (i = 0; i < ret; i++) { |
1214 | if (pages[i] != locked_page) | 1214 | if (pages[i] != locked_page) |
1215 | unlock_page(pages[i]); | 1215 | unlock_page(pages[i]); |
1216 | page_cache_release(pages[i]); | 1216 | page_cache_release(pages[i]); |
1217 | } | 1217 | } |
1218 | nr_pages -= ret; | 1218 | nr_pages -= ret; |
1219 | index += ret; | 1219 | index += ret; |
1220 | cond_resched(); | 1220 | cond_resched(); |
1221 | } | 1221 | } |
1222 | return 0; | 1222 | return 0; |
1223 | } | 1223 | } |
1224 | 1224 | ||
1225 | static noinline int lock_delalloc_pages(struct inode *inode, | 1225 | static noinline int lock_delalloc_pages(struct inode *inode, |
1226 | struct page *locked_page, | 1226 | struct page *locked_page, |
1227 | u64 delalloc_start, | 1227 | u64 delalloc_start, |
1228 | u64 delalloc_end) | 1228 | u64 delalloc_end) |
1229 | { | 1229 | { |
1230 | unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; | 1230 | unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; |
1231 | unsigned long start_index = index; | 1231 | unsigned long start_index = index; |
1232 | unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; | 1232 | unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; |
1233 | unsigned long pages_locked = 0; | 1233 | unsigned long pages_locked = 0; |
1234 | struct page *pages[16]; | 1234 | struct page *pages[16]; |
1235 | unsigned long nrpages; | 1235 | unsigned long nrpages; |
1236 | int ret; | 1236 | int ret; |
1237 | int i; | 1237 | int i; |
1238 | 1238 | ||
1239 | /* the caller is responsible for locking the start index */ | 1239 | /* the caller is responsible for locking the start index */ |
1240 | if (index == locked_page->index && index == end_index) | 1240 | if (index == locked_page->index && index == end_index) |
1241 | return 0; | 1241 | return 0; |
1242 | 1242 | ||
1243 | /* skip the page at the start index */ | 1243 | /* skip the page at the start index */ |
1244 | nrpages = end_index - index + 1; | 1244 | nrpages = end_index - index + 1; |
1245 | while (nrpages > 0) { | 1245 | while (nrpages > 0) { |
1246 | ret = find_get_pages_contig(inode->i_mapping, index, | 1246 | ret = find_get_pages_contig(inode->i_mapping, index, |
1247 | min_t(unsigned long, | 1247 | min_t(unsigned long, |
1248 | nrpages, ARRAY_SIZE(pages)), pages); | 1248 | nrpages, ARRAY_SIZE(pages)), pages); |
1249 | if (ret == 0) { | 1249 | if (ret == 0) { |
1250 | ret = -EAGAIN; | 1250 | ret = -EAGAIN; |
1251 | goto done; | 1251 | goto done; |
1252 | } | 1252 | } |
1253 | /* now we have an array of pages, lock them all */ | 1253 | /* now we have an array of pages, lock them all */ |
1254 | for (i = 0; i < ret; i++) { | 1254 | for (i = 0; i < ret; i++) { |
1255 | /* | 1255 | /* |
1256 | * the caller is taking responsibility for | 1256 | * the caller is taking responsibility for |
1257 | * locked_page | 1257 | * locked_page |
1258 | */ | 1258 | */ |
1259 | if (pages[i] != locked_page) { | 1259 | if (pages[i] != locked_page) { |
1260 | lock_page(pages[i]); | 1260 | lock_page(pages[i]); |
1261 | if (!PageDirty(pages[i]) || | 1261 | if (!PageDirty(pages[i]) || |
1262 | pages[i]->mapping != inode->i_mapping) { | 1262 | pages[i]->mapping != inode->i_mapping) { |
1263 | ret = -EAGAIN; | 1263 | ret = -EAGAIN; |
1264 | unlock_page(pages[i]); | 1264 | unlock_page(pages[i]); |
1265 | page_cache_release(pages[i]); | 1265 | page_cache_release(pages[i]); |
1266 | goto done; | 1266 | goto done; |
1267 | } | 1267 | } |
1268 | } | 1268 | } |
1269 | page_cache_release(pages[i]); | 1269 | page_cache_release(pages[i]); |
1270 | pages_locked++; | 1270 | pages_locked++; |
1271 | } | 1271 | } |
1272 | nrpages -= ret; | 1272 | nrpages -= ret; |
1273 | index += ret; | 1273 | index += ret; |
1274 | cond_resched(); | 1274 | cond_resched(); |
1275 | } | 1275 | } |
1276 | ret = 0; | 1276 | ret = 0; |
1277 | done: | 1277 | done: |
1278 | if (ret && pages_locked) { | 1278 | if (ret && pages_locked) { |
1279 | __unlock_for_delalloc(inode, locked_page, | 1279 | __unlock_for_delalloc(inode, locked_page, |
1280 | delalloc_start, | 1280 | delalloc_start, |
1281 | ((u64)(start_index + pages_locked - 1)) << | 1281 | ((u64)(start_index + pages_locked - 1)) << |
1282 | PAGE_CACHE_SHIFT); | 1282 | PAGE_CACHE_SHIFT); |
1283 | } | 1283 | } |
1284 | return ret; | 1284 | return ret; |
1285 | } | 1285 | } |
1286 | 1286 | ||
1287 | /* | 1287 | /* |
1288 | * find a contiguous range of bytes in the file marked as delalloc, not | 1288 | * find a contiguous range of bytes in the file marked as delalloc, not |
1289 | * more than 'max_bytes'. start and end are used to return the range. | 1289 | * more than 'max_bytes'. start and end are used to return the range. |
1290 | * | 1290 | * |
1291 | * 1 is returned if we find something, 0 if nothing was in the tree | 1291 | * 1 is returned if we find something, 0 if nothing was in the tree |
1292 | */ | 1292 | */ |
1293 | static noinline u64 find_lock_delalloc_range(struct inode *inode, | 1293 | static noinline u64 find_lock_delalloc_range(struct inode *inode, |
1294 | struct extent_io_tree *tree, | 1294 | struct extent_io_tree *tree, |
1295 | struct page *locked_page, | 1295 | struct page *locked_page, |
1296 | u64 *start, u64 *end, | 1296 | u64 *start, u64 *end, |
1297 | u64 max_bytes) | 1297 | u64 max_bytes) |
1298 | { | 1298 | { |
1299 | u64 delalloc_start; | 1299 | u64 delalloc_start; |
1300 | u64 delalloc_end; | 1300 | u64 delalloc_end; |
1301 | u64 found; | 1301 | u64 found; |
1302 | struct extent_state *cached_state = NULL; | 1302 | struct extent_state *cached_state = NULL; |
1303 | int ret; | 1303 | int ret; |
1304 | int loops = 0; | 1304 | int loops = 0; |
1305 | 1305 | ||
1306 | again: | 1306 | again: |
1307 | /* step one, find a bunch of delalloc bytes starting at start */ | 1307 | /* step one, find a bunch of delalloc bytes starting at start */ |
1308 | delalloc_start = *start; | 1308 | delalloc_start = *start; |
1309 | delalloc_end = 0; | 1309 | delalloc_end = 0; |
1310 | found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, | 1310 | found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, |
1311 | max_bytes, &cached_state); | 1311 | max_bytes, &cached_state); |
1312 | if (!found || delalloc_end <= *start) { | 1312 | if (!found || delalloc_end <= *start) { |
1313 | *start = delalloc_start; | 1313 | *start = delalloc_start; |
1314 | *end = delalloc_end; | 1314 | *end = delalloc_end; |
1315 | free_extent_state(cached_state); | 1315 | free_extent_state(cached_state); |
1316 | return found; | 1316 | return found; |
1317 | } | 1317 | } |
1318 | 1318 | ||
1319 | /* | 1319 | /* |
1320 | * start comes from the offset of locked_page. We have to lock | 1320 | * start comes from the offset of locked_page. We have to lock |
1321 | * pages in order, so we can't process delalloc bytes before | 1321 | * pages in order, so we can't process delalloc bytes before |
1322 | * locked_page | 1322 | * locked_page |
1323 | */ | 1323 | */ |
1324 | if (delalloc_start < *start) | 1324 | if (delalloc_start < *start) |
1325 | delalloc_start = *start; | 1325 | delalloc_start = *start; |
1326 | 1326 | ||
1327 | /* | 1327 | /* |
1328 | * make sure to limit the number of pages we try to lock down | 1328 | * make sure to limit the number of pages we try to lock down |
1329 | * if we're looping. | 1329 | * if we're looping. |
1330 | */ | 1330 | */ |
1331 | if (delalloc_end + 1 - delalloc_start > max_bytes && loops) | 1331 | if (delalloc_end + 1 - delalloc_start > max_bytes && loops) |
1332 | delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; | 1332 | delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; |
1333 | 1333 | ||
1334 | /* step two, lock all the pages after the page that has start */ | 1334 | /* step two, lock all the pages after the page that has start */ |
1335 | ret = lock_delalloc_pages(inode, locked_page, | 1335 | ret = lock_delalloc_pages(inode, locked_page, |
1336 | delalloc_start, delalloc_end); | 1336 | delalloc_start, delalloc_end); |
1337 | if (ret == -EAGAIN) { | 1337 | if (ret == -EAGAIN) { |
1338 | /* some of the pages are gone, let's avoid looping by | 1338 | /* some of the pages are gone, let's avoid looping by |
1339 | * shortening the size of the delalloc range we're searching | 1339 | * shortening the size of the delalloc range we're searching |
1340 | */ | 1340 | */ |
1341 | free_extent_state(cached_state); | 1341 | free_extent_state(cached_state); |
1342 | if (!loops) { | 1342 | if (!loops) { |
1343 | unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); | 1343 | unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); |
1344 | max_bytes = PAGE_CACHE_SIZE - offset; | 1344 | max_bytes = PAGE_CACHE_SIZE - offset; |
1345 | loops = 1; | 1345 | loops = 1; |
1346 | goto again; | 1346 | goto again; |
1347 | } else { | 1347 | } else { |
1348 | found = 0; | 1348 | found = 0; |
1349 | goto out_failed; | 1349 | goto out_failed; |
1350 | } | 1350 | } |
1351 | } | 1351 | } |
1352 | BUG_ON(ret); | 1352 | BUG_ON(ret); |
1353 | 1353 | ||
1354 | /* step three, lock the state bits for the whole range */ | 1354 | /* step three, lock the state bits for the whole range */ |
1355 | lock_extent_bits(tree, delalloc_start, delalloc_end, | 1355 | lock_extent_bits(tree, delalloc_start, delalloc_end, |
1356 | 0, &cached_state, GFP_NOFS); | 1356 | 0, &cached_state, GFP_NOFS); |
1357 | 1357 | ||
1358 | /* then test to make sure it is all still delalloc */ | 1358 | /* then test to make sure it is all still delalloc */ |
1359 | ret = test_range_bit(tree, delalloc_start, delalloc_end, | 1359 | ret = test_range_bit(tree, delalloc_start, delalloc_end, |
1360 | EXTENT_DELALLOC, 1, cached_state); | 1360 | EXTENT_DELALLOC, 1, cached_state); |
1361 | if (!ret) { | 1361 | if (!ret) { |
1362 | unlock_extent_cached(tree, delalloc_start, delalloc_end, | 1362 | unlock_extent_cached(tree, delalloc_start, delalloc_end, |
1363 | &cached_state, GFP_NOFS); | 1363 | &cached_state, GFP_NOFS); |
1364 | __unlock_for_delalloc(inode, locked_page, | 1364 | __unlock_for_delalloc(inode, locked_page, |
1365 | delalloc_start, delalloc_end); | 1365 | delalloc_start, delalloc_end); |
1366 | cond_resched(); | 1366 | cond_resched(); |
1367 | goto again; | 1367 | goto again; |
1368 | } | 1368 | } |
1369 | free_extent_state(cached_state); | 1369 | free_extent_state(cached_state); |
1370 | *start = delalloc_start; | 1370 | *start = delalloc_start; |
1371 | *end = delalloc_end; | 1371 | *end = delalloc_end; |
1372 | out_failed: | 1372 | out_failed: |
1373 | return found; | 1373 | return found; |
1374 | } | 1374 | } |
1375 | 1375 | ||
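This is the page-locking wrapper used on the writepage path: step one finds candidate delalloc bytes, step two locks the pages behind them, step three locks the extent bits and revalidates, retrying with a one-page window if pages vanished underneath it. A simplified, hypothetical caller sketch (the real caller lives in the writepage code outside this hunk; the 128MiB cap is illustrative):

	/* Hypothetical writepage-style caller. */
	static void example_handle_delalloc(struct inode *inode,
					    struct extent_io_tree *tree,
					    struct page *locked_page)
	{
		u64 start = (u64)locked_page->index << PAGE_CACHE_SHIFT;
		u64 end = 0;

		if (find_lock_delalloc_range(inode, tree, locked_page,
					     &start, &end, 128 * 1024 * 1024)) {
			/* pages and extent bits in [start, end] are now locked;
			 * hand the range to the allocation/writeback machinery */
		}
	}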
1376 | int extent_clear_unlock_delalloc(struct inode *inode, | 1376 | int extent_clear_unlock_delalloc(struct inode *inode, |
1377 | struct extent_io_tree *tree, | 1377 | struct extent_io_tree *tree, |
1378 | u64 start, u64 end, struct page *locked_page, | 1378 | u64 start, u64 end, struct page *locked_page, |
1379 | unsigned long op) | 1379 | unsigned long op) |
1380 | { | 1380 | { |
1381 | int ret; | 1381 | int ret; |
1382 | struct page *pages[16]; | 1382 | struct page *pages[16]; |
1383 | unsigned long index = start >> PAGE_CACHE_SHIFT; | 1383 | unsigned long index = start >> PAGE_CACHE_SHIFT; |
1384 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | 1384 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; |
1385 | unsigned long nr_pages = end_index - index + 1; | 1385 | unsigned long nr_pages = end_index - index + 1; |
1386 | int i; | 1386 | int i; |
1387 | int clear_bits = 0; | 1387 | int clear_bits = 0; |
1388 | 1388 | ||
1389 | if (op & EXTENT_CLEAR_UNLOCK) | 1389 | if (op & EXTENT_CLEAR_UNLOCK) |
1390 | clear_bits |= EXTENT_LOCKED; | 1390 | clear_bits |= EXTENT_LOCKED; |
1391 | if (op & EXTENT_CLEAR_DIRTY) | 1391 | if (op & EXTENT_CLEAR_DIRTY) |
1392 | clear_bits |= EXTENT_DIRTY; | 1392 | clear_bits |= EXTENT_DIRTY; |
1393 | 1393 | ||
1394 | if (op & EXTENT_CLEAR_DELALLOC) | 1394 | if (op & EXTENT_CLEAR_DELALLOC) |
1395 | clear_bits |= EXTENT_DELALLOC; | 1395 | clear_bits |= EXTENT_DELALLOC; |
1396 | 1396 | ||
1397 | clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); | 1397 | clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); |
1398 | if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | | 1398 | if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | |
1399 | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | | 1399 | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | |
1400 | EXTENT_SET_PRIVATE2))) | 1400 | EXTENT_SET_PRIVATE2))) |
1401 | return 0; | 1401 | return 0; |
1402 | 1402 | ||
1403 | while (nr_pages > 0) { | 1403 | while (nr_pages > 0) { |
1404 | ret = find_get_pages_contig(inode->i_mapping, index, | 1404 | ret = find_get_pages_contig(inode->i_mapping, index, |
1405 | min_t(unsigned long, | 1405 | min_t(unsigned long, |
1406 | nr_pages, ARRAY_SIZE(pages)), pages); | 1406 | nr_pages, ARRAY_SIZE(pages)), pages); |
1407 | for (i = 0; i < ret; i++) { | 1407 | for (i = 0; i < ret; i++) { |
1408 | 1408 | ||
1409 | if (op & EXTENT_SET_PRIVATE2) | 1409 | if (op & EXTENT_SET_PRIVATE2) |
1410 | SetPagePrivate2(pages[i]); | 1410 | SetPagePrivate2(pages[i]); |
1411 | 1411 | ||
1412 | if (pages[i] == locked_page) { | 1412 | if (pages[i] == locked_page) { |
1413 | page_cache_release(pages[i]); | 1413 | page_cache_release(pages[i]); |
1414 | continue; | 1414 | continue; |
1415 | } | 1415 | } |
1416 | if (op & EXTENT_CLEAR_DIRTY) | 1416 | if (op & EXTENT_CLEAR_DIRTY) |
1417 | clear_page_dirty_for_io(pages[i]); | 1417 | clear_page_dirty_for_io(pages[i]); |
1418 | if (op & EXTENT_SET_WRITEBACK) | 1418 | if (op & EXTENT_SET_WRITEBACK) |
1419 | set_page_writeback(pages[i]); | 1419 | set_page_writeback(pages[i]); |
1420 | if (op & EXTENT_END_WRITEBACK) | 1420 | if (op & EXTENT_END_WRITEBACK) |
1421 | end_page_writeback(pages[i]); | 1421 | end_page_writeback(pages[i]); |
1422 | if (op & EXTENT_CLEAR_UNLOCK_PAGE) | 1422 | if (op & EXTENT_CLEAR_UNLOCK_PAGE) |
1423 | unlock_page(pages[i]); | 1423 | unlock_page(pages[i]); |
1424 | page_cache_release(pages[i]); | 1424 | page_cache_release(pages[i]); |
1425 | } | 1425 | } |
1426 | nr_pages -= ret; | 1426 | nr_pages -= ret; |
1427 | index += ret; | 1427 | index += ret; |
1428 | cond_resched(); | 1428 | cond_resched(); |
1429 | } | 1429 | } |
1430 | return 0; | 1430 | return 0; |
1431 | } | 1431 | } |
1432 | 1432 | ||
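The op bitmask lets one call clear extent bits and fix up page state in a single pass over the range. As an illustration of how the flags compose, here is a hypothetical error-path cleanup (not taken from this commit):

	/* Hypothetical failure cleanup: unlock everything, retire writeback. */
	static void example_cleanup_range(struct inode *inode,
					  struct extent_io_tree *tree,
					  u64 start, u64 end,
					  struct page *locked_page)
	{
		extent_clear_unlock_delalloc(inode, tree, start, end, locked_page,
					     EXTENT_CLEAR_UNLOCK |
					     EXTENT_CLEAR_UNLOCK_PAGE |
					     EXTENT_CLEAR_DELALLOC |
					     EXTENT_CLEAR_DIRTY |
					     EXTENT_SET_WRITEBACK |
					     EXTENT_END_WRITEBACK);
	}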
1433 | /* | 1433 | /* |
1434 | * count the number of bytes in the tree that have a given bit(s) | 1434 | * count the number of bytes in the tree that have a given bit(s) |
1435 | * set. This can be fairly slow, except for EXTENT_DIRTY which is | 1435 | * set. This can be fairly slow, except for EXTENT_DIRTY which is |
1436 | * cached. The total number found is returned. | 1436 | * cached. The total number found is returned. |
1437 | */ | 1437 | */ |
1438 | u64 count_range_bits(struct extent_io_tree *tree, | 1438 | u64 count_range_bits(struct extent_io_tree *tree, |
1439 | u64 *start, u64 search_end, u64 max_bytes, | 1439 | u64 *start, u64 search_end, u64 max_bytes, |
1440 | unsigned long bits, int contig) | 1440 | unsigned long bits, int contig) |
1441 | { | 1441 | { |
1442 | struct rb_node *node; | 1442 | struct rb_node *node; |
1443 | struct extent_state *state; | 1443 | struct extent_state *state; |
1444 | u64 cur_start = *start; | 1444 | u64 cur_start = *start; |
1445 | u64 total_bytes = 0; | 1445 | u64 total_bytes = 0; |
1446 | u64 last = 0; | 1446 | u64 last = 0; |
1447 | int found = 0; | 1447 | int found = 0; |
1448 | 1448 | ||
1449 | if (search_end <= cur_start) { | 1449 | if (search_end <= cur_start) { |
1450 | WARN_ON(1); | 1450 | WARN_ON(1); |
1451 | return 0; | 1451 | return 0; |
1452 | } | 1452 | } |
1453 | 1453 | ||
1454 | spin_lock(&tree->lock); | 1454 | spin_lock(&tree->lock); |
1455 | if (cur_start == 0 && bits == EXTENT_DIRTY) { | 1455 | if (cur_start == 0 && bits == EXTENT_DIRTY) { |
1456 | total_bytes = tree->dirty_bytes; | 1456 | total_bytes = tree->dirty_bytes; |
1457 | goto out; | 1457 | goto out; |
1458 | } | 1458 | } |
1459 | /* | 1459 | /* |
1460 | * this search will find all the extents that end after | 1460 | * this search will find all the extents that end after |
1461 | * our range starts. | 1461 | * our range starts. |
1462 | */ | 1462 | */ |
1463 | node = tree_search(tree, cur_start); | 1463 | node = tree_search(tree, cur_start); |
1464 | if (!node) | 1464 | if (!node) |
1465 | goto out; | 1465 | goto out; |
1466 | 1466 | ||
1467 | while (1) { | 1467 | while (1) { |
1468 | state = rb_entry(node, struct extent_state, rb_node); | 1468 | state = rb_entry(node, struct extent_state, rb_node); |
1469 | if (state->start > search_end) | 1469 | if (state->start > search_end) |
1470 | break; | 1470 | break; |
1471 | if (contig && found && state->start > last + 1) | 1471 | if (contig && found && state->start > last + 1) |
1472 | break; | 1472 | break; |
1473 | if (state->end >= cur_start && (state->state & bits) == bits) { | 1473 | if (state->end >= cur_start && (state->state & bits) == bits) { |
1474 | total_bytes += min(search_end, state->end) + 1 - | 1474 | total_bytes += min(search_end, state->end) + 1 - |
1475 | max(cur_start, state->start); | 1475 | max(cur_start, state->start); |
1476 | if (total_bytes >= max_bytes) | 1476 | if (total_bytes >= max_bytes) |
1477 | break; | 1477 | break; |
1478 | if (!found) { | 1478 | if (!found) { |
1479 | *start = max(cur_start, state->start); | 1479 | *start = max(cur_start, state->start); |
1480 | found = 1; | 1480 | found = 1; |
1481 | } | 1481 | } |
1482 | last = state->end; | 1482 | last = state->end; |
1483 | } else if (contig && found) { | 1483 | } else if (contig && found) { |
1484 | break; | 1484 | break; |
1485 | } | 1485 | } |
1486 | node = rb_next(node); | 1486 | node = rb_next(node); |
1487 | if (!node) | 1487 | if (!node) |
1488 | break; | 1488 | break; |
1489 | } | 1489 | } |
1490 | out: | 1490 | out: |
1491 | spin_unlock(&tree->lock); | 1491 | spin_unlock(&tree->lock); |
1492 | return total_bytes; | 1492 | return total_bytes; |
1493 | } | 1493 | } |
1494 | 1494 | ||
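A typical query is "how many delalloc bytes sit in some window of the file". In this hypothetical sketch, passing (u64)-1 for max_bytes means "no cap", and contig == 0 counts discontiguous ranges as well:

	/* Hypothetical: count delalloc bytes in the first 1MiB of the file. */
	static u64 example_count_delalloc(struct extent_io_tree *tree)
	{
		u64 start = 0;

		return count_range_bits(tree, &start, 1024 * 1024 - 1, (u64)-1,
					EXTENT_DELALLOC, 0);
	}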
1495 | /* | 1495 | /* |
1496 | * set the private field for a given byte offset in the tree. If there isn't | 1496 | * set the private field for a given byte offset in the tree. If there isn't |
1497 | * an extent_state there already, this does nothing. | 1497 | * an extent_state there already, this does nothing. |
1498 | */ | 1498 | */ |
1499 | int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) | 1499 | int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) |
1500 | { | 1500 | { |
1501 | struct rb_node *node; | 1501 | struct rb_node *node; |
1502 | struct extent_state *state; | 1502 | struct extent_state *state; |
1503 | int ret = 0; | 1503 | int ret = 0; |
1504 | 1504 | ||
1505 | spin_lock(&tree->lock); | 1505 | spin_lock(&tree->lock); |
1506 | /* | 1506 | /* |
1507 | * this search will find all the extents that end after | 1507 | * this search will find all the extents that end after |
1508 | * our range starts. | 1508 | * our range starts. |
1509 | */ | 1509 | */ |
1510 | node = tree_search(tree, start); | 1510 | node = tree_search(tree, start); |
1511 | if (!node) { | 1511 | if (!node) { |
1512 | ret = -ENOENT; | 1512 | ret = -ENOENT; |
1513 | goto out; | 1513 | goto out; |
1514 | } | 1514 | } |
1515 | state = rb_entry(node, struct extent_state, rb_node); | 1515 | state = rb_entry(node, struct extent_state, rb_node); |
1516 | if (state->start != start) { | 1516 | if (state->start != start) { |
1517 | ret = -ENOENT; | 1517 | ret = -ENOENT; |
1518 | goto out; | 1518 | goto out; |
1519 | } | 1519 | } |
1520 | state->private = private; | 1520 | state->private = private; |
1521 | out: | 1521 | out: |
1522 | spin_unlock(&tree->lock); | 1522 | spin_unlock(&tree->lock); |
1523 | return ret; | 1523 | return ret; |
1524 | } | 1524 | } |
1525 | 1525 | ||
1526 | int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) | 1526 | int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) |
1527 | { | 1527 | { |
1528 | struct rb_node *node; | 1528 | struct rb_node *node; |
1529 | struct extent_state *state; | 1529 | struct extent_state *state; |
1530 | int ret = 0; | 1530 | int ret = 0; |
1531 | 1531 | ||
1532 | spin_lock(&tree->lock); | 1532 | spin_lock(&tree->lock); |
1533 | /* | 1533 | /* |
1534 | * this search will find all the extents that end after | 1534 | * this search will find all the extents that end after |
1535 | * our range starts. | 1535 | * our range starts. |
1536 | */ | 1536 | */ |
1537 | node = tree_search(tree, start); | 1537 | node = tree_search(tree, start); |
1538 | if (!node) { | 1538 | if (!node) { |
1539 | ret = -ENOENT; | 1539 | ret = -ENOENT; |
1540 | goto out; | 1540 | goto out; |
1541 | } | 1541 | } |
1542 | state = rb_entry(node, struct extent_state, rb_node); | 1542 | state = rb_entry(node, struct extent_state, rb_node); |
1543 | if (state->start != start) { | 1543 | if (state->start != start) { |
1544 | ret = -ENOENT; | 1544 | ret = -ENOENT; |
1545 | goto out; | 1545 | goto out; |
1546 | } | 1546 | } |
1547 | *private = state->private; | 1547 | *private = state->private; |
1548 | out: | 1548 | out: |
1549 | spin_unlock(&tree->lock); | 1549 | spin_unlock(&tree->lock); |
1550 | return ret; | 1550 | return ret; |
1551 | } | 1551 | } |
1552 | 1552 | ||
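Both helpers require that an extent_state record begins exactly at 'start', which is why -ENOENT appears twice in each. A hypothetical round trip (the cookie value is arbitrary):

	/* Hypothetical: stash and recover a per-extent cookie (e.g. a csum). */
	static int example_private_roundtrip(struct extent_io_tree *tree, u64 start)
	{
		u64 cookie = 0;
		int ret;

		ret = set_state_private(tree, start, 0xdeadbeef);
		if (ret)
			return ret;	/* no state begins exactly at 'start' */

		ret = get_state_private(tree, start, &cookie);
		/* on success, cookie == 0xdeadbeef */
		return ret;
	}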
1553 | /* | 1553 | /* |
1554 | * searches a range in the state tree for a given mask. | 1554 | * searches a range in the state tree for a given mask. |
1555 | * If 'filled' == 1, this returns 1 only if every extent in the range | 1555 | * If 'filled' == 1, this returns 1 only if every extent in the range |
1556 | * has the bits set. Otherwise, 1 is returned if any bit in the | 1556 | * has the bits set. Otherwise, 1 is returned if any bit in the |
1557 | * range is found set. | 1557 | * range is found set. |
1558 | */ | 1558 | */ |
1559 | int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, | 1559 | int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, |
1560 | int bits, int filled, struct extent_state *cached) | 1560 | int bits, int filled, struct extent_state *cached) |
1561 | { | 1561 | { |
1562 | struct extent_state *state = NULL; | 1562 | struct extent_state *state = NULL; |
1563 | struct rb_node *node; | 1563 | struct rb_node *node; |
1564 | int bitset = 0; | 1564 | int bitset = 0; |
1565 | 1565 | ||
1566 | spin_lock(&tree->lock); | 1566 | spin_lock(&tree->lock); |
1567 | if (cached && cached->tree && cached->start == start) | 1567 | if (cached && cached->tree && cached->start == start) |
1568 | node = &cached->rb_node; | 1568 | node = &cached->rb_node; |
1569 | else | 1569 | else |
1570 | node = tree_search(tree, start); | 1570 | node = tree_search(tree, start); |
1571 | while (node && start <= end) { | 1571 | while (node && start <= end) { |
1572 | state = rb_entry(node, struct extent_state, rb_node); | 1572 | state = rb_entry(node, struct extent_state, rb_node); |
1573 | 1573 | ||
1574 | if (filled && state->start > start) { | 1574 | if (filled && state->start > start) { |
1575 | bitset = 0; | 1575 | bitset = 0; |
1576 | break; | 1576 | break; |
1577 | } | 1577 | } |
1578 | 1578 | ||
1579 | if (state->start > end) | 1579 | if (state->start > end) |
1580 | break; | 1580 | break; |
1581 | 1581 | ||
1582 | if (state->state & bits) { | 1582 | if (state->state & bits) { |
1583 | bitset = 1; | 1583 | bitset = 1; |
1584 | if (!filled) | 1584 | if (!filled) |
1585 | break; | 1585 | break; |
1586 | } else if (filled) { | 1586 | } else if (filled) { |
1587 | bitset = 0; | 1587 | bitset = 0; |
1588 | break; | 1588 | break; |
1589 | } | 1589 | } |
1590 | 1590 | ||
1591 | if (state->end == (u64)-1) | 1591 | if (state->end == (u64)-1) |
1592 | break; | 1592 | break; |
1593 | 1593 | ||
1594 | start = state->end + 1; | 1594 | start = state->end + 1; |
1595 | if (start > end) | 1595 | if (start > end) |
1596 | break; | 1596 | break; |
1597 | node = rb_next(node); | 1597 | node = rb_next(node); |
1598 | if (!node) { | 1598 | if (!node) { |
1599 | if (filled) | 1599 | if (filled) |
1600 | bitset = 0; | 1600 | bitset = 0; |
1601 | break; | 1601 | break; |
1602 | } | 1602 | } |
1603 | } | 1603 | } |
1604 | spin_unlock(&tree->lock); | 1604 | spin_unlock(&tree->lock); |
1605 | return bitset; | 1605 | return bitset; |
1606 | } | 1606 | } |
1607 | 1607 | ||
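'filled' flips the quantifier: 1 asks "does every byte of the range have the bits set", 0 asks "does any byte". A hypothetical sketch showing both modes:

	/* Hypothetical: the two query modes of test_range_bit(). */
	static void example_test_modes(struct extent_io_tree *tree,
				       u64 start, u64 end)
	{
		/* filled == 1: the whole range must be EXTENT_UPTODATE */
		int all_uptodate = test_range_bit(tree, start, end,
						  EXTENT_UPTODATE, 1, NULL);
		/* filled == 0: true if any part of the range is locked */
		int any_locked = test_range_bit(tree, start, end,
						EXTENT_LOCKED, 0, NULL);

		(void)all_uptodate;
		(void)any_locked;
	}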
1608 | /* | 1608 | /* |
1609 | * helper function to set a given page up to date if all the | 1609 | * helper function to set a given page up to date if all the |
1610 | * extents in the tree for that page are up to date | 1610 | * extents in the tree for that page are up to date |
1611 | */ | 1611 | */ |
1612 | static int check_page_uptodate(struct extent_io_tree *tree, | 1612 | static int check_page_uptodate(struct extent_io_tree *tree, |
1613 | struct page *page) | 1613 | struct page *page) |
1614 | { | 1614 | { |
1615 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 1615 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; |
1616 | u64 end = start + PAGE_CACHE_SIZE - 1; | 1616 | u64 end = start + PAGE_CACHE_SIZE - 1; |
1617 | if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) | 1617 | if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) |
1618 | SetPageUptodate(page); | 1618 | SetPageUptodate(page); |
1619 | return 0; | 1619 | return 0; |
1620 | } | 1620 | } |
1621 | 1621 | ||
1622 | /* | 1622 | /* |
1623 | * helper function to unlock a page if all the extents in the tree | 1623 | * helper function to unlock a page if all the extents in the tree |
1624 | * for that page are unlocked | 1624 | * for that page are unlocked |
1625 | */ | 1625 | */ |
1626 | static int check_page_locked(struct extent_io_tree *tree, | 1626 | static int check_page_locked(struct extent_io_tree *tree, |
1627 | struct page *page) | 1627 | struct page *page) |
1628 | { | 1628 | { |
1629 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 1629 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; |
1630 | u64 end = start + PAGE_CACHE_SIZE - 1; | 1630 | u64 end = start + PAGE_CACHE_SIZE - 1; |
1631 | if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) | 1631 | if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) |
1632 | unlock_page(page); | 1632 | unlock_page(page); |
1633 | return 0; | 1633 | return 0; |
1634 | } | 1634 | } |
1635 | 1635 | ||
1636 | /* | 1636 | /* |
1637 | * helper function to end page writeback if all the extents | 1637 | * helper function to end page writeback if all the extents |
1638 | * in the tree for that page are done with writeback | 1638 | * in the tree for that page are done with writeback |
1639 | */ | 1639 | */ |
1640 | static int check_page_writeback(struct extent_io_tree *tree, | 1640 | static int check_page_writeback(struct extent_io_tree *tree, |
1641 | struct page *page) | 1641 | struct page *page) |
1642 | { | 1642 | { |
1643 | end_page_writeback(page); | 1643 | end_page_writeback(page); |
1644 | return 0; | 1644 | return 0; |
1645 | } | 1645 | } |
1646 | 1646 | ||
1647 | /* lots and lots of room for performance fixes in the end_bio funcs */ | 1647 | /* lots and lots of room for performance fixes in the end_bio funcs */ |
1648 | 1648 | ||
1649 | /* | 1649 | /* |
1650 | * after a writepage IO is done, we need to: | 1650 | * after a writepage IO is done, we need to: |
1651 | * clear the uptodate bits on error | 1651 | * clear the uptodate bits on error |
1652 | * clear the writeback bits in the extent tree for this IO | 1652 | * clear the writeback bits in the extent tree for this IO |
1653 | * end_page_writeback if the page has no more pending IO | 1653 | * end_page_writeback if the page has no more pending IO |
1654 | * | 1654 | * |
1655 | * Scheduling is not allowed, so the extent state tree is expected | 1655 | * Scheduling is not allowed, so the extent state tree is expected |
1656 | * to have one and only one object corresponding to this IO. | 1656 | * to have one and only one object corresponding to this IO. |
1657 | */ | 1657 | */ |
1658 | static void end_bio_extent_writepage(struct bio *bio, int err) | 1658 | static void end_bio_extent_writepage(struct bio *bio, int err) |
1659 | { | 1659 | { |
1660 | int uptodate = err == 0; | 1660 | int uptodate = err == 0; |
1661 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | 1661 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; |
1662 | struct extent_io_tree *tree; | 1662 | struct extent_io_tree *tree; |
1663 | u64 start; | 1663 | u64 start; |
1664 | u64 end; | 1664 | u64 end; |
1665 | int whole_page; | 1665 | int whole_page; |
1666 | int ret; | 1666 | int ret; |
1667 | 1667 | ||
1668 | do { | 1668 | do { |
1669 | struct page *page = bvec->bv_page; | 1669 | struct page *page = bvec->bv_page; |
1670 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 1670 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
1671 | 1671 | ||
1672 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | 1672 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + |
1673 | bvec->bv_offset; | 1673 | bvec->bv_offset; |
1674 | end = start + bvec->bv_len - 1; | 1674 | end = start + bvec->bv_len - 1; |
1675 | 1675 | ||
1676 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) | 1676 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) |
1677 | whole_page = 1; | 1677 | whole_page = 1; |
1678 | else | 1678 | else |
1679 | whole_page = 0; | 1679 | whole_page = 0; |
1680 | 1680 | ||
1681 | if (--bvec >= bio->bi_io_vec) | 1681 | if (--bvec >= bio->bi_io_vec) |
1682 | prefetchw(&bvec->bv_page->flags); | 1682 | prefetchw(&bvec->bv_page->flags); |
1683 | if (tree->ops && tree->ops->writepage_end_io_hook) { | 1683 | if (tree->ops && tree->ops->writepage_end_io_hook) { |
1684 | ret = tree->ops->writepage_end_io_hook(page, start, | 1684 | ret = tree->ops->writepage_end_io_hook(page, start, |
1685 | end, NULL, uptodate); | 1685 | end, NULL, uptodate); |
1686 | if (ret) | 1686 | if (ret) |
1687 | uptodate = 0; | 1687 | uptodate = 0; |
1688 | } | 1688 | } |
1689 | 1689 | ||
1690 | if (!uptodate && tree->ops && | 1690 | if (!uptodate && tree->ops && |
1691 | tree->ops->writepage_io_failed_hook) { | 1691 | tree->ops->writepage_io_failed_hook) { |
1692 | ret = tree->ops->writepage_io_failed_hook(bio, page, | 1692 | ret = tree->ops->writepage_io_failed_hook(bio, page, |
1693 | start, end, NULL); | 1693 | start, end, NULL); |
1694 | if (ret == 0) { | 1694 | if (ret == 0) { |
1695 | uptodate = (err == 0); | 1695 | uptodate = (err == 0); |
1696 | continue; | 1696 | continue; |
1697 | } | 1697 | } |
1698 | } | 1698 | } |
1699 | 1699 | ||
1700 | if (!uptodate) { | 1700 | if (!uptodate) { |
1701 | clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); | 1701 | clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); |
1702 | ClearPageUptodate(page); | 1702 | ClearPageUptodate(page); |
1703 | SetPageError(page); | 1703 | SetPageError(page); |
1704 | } | 1704 | } |
1705 | 1705 | ||
1706 | if (whole_page) | 1706 | if (whole_page) |
1707 | end_page_writeback(page); | 1707 | end_page_writeback(page); |
1708 | else | 1708 | else |
1709 | check_page_writeback(tree, page); | 1709 | check_page_writeback(tree, page); |
1710 | } while (bvec >= bio->bi_io_vec); | 1710 | } while (bvec >= bio->bi_io_vec); |
1711 | 1711 | ||
1712 | bio_put(bio); | 1712 | bio_put(bio); |
1713 | } | 1713 | } |
1714 | 1714 | ||
1715 | /* | 1715 | /* |
1716 | * after a readpage IO is done, we need to: | 1716 | * after a readpage IO is done, we need to: |
1717 | * clear the uptodate bits on error | 1717 | * clear the uptodate bits on error |
1718 | * set the uptodate bits if things worked | 1718 | * set the uptodate bits if things worked |
1719 | * set the page up to date if all extents in the tree are uptodate | 1719 | * set the page up to date if all extents in the tree are uptodate |
1720 | * clear the lock bit in the extent tree | 1720 | * clear the lock bit in the extent tree |
1721 | * unlock the page if there are no other extents locked for it | 1721 | * unlock the page if there are no other extents locked for it |
1722 | * | 1722 | * |
1723 | * Scheduling is not allowed, so the extent state tree is expected | 1723 | * Scheduling is not allowed, so the extent state tree is expected |
1724 | * to have one and only one object corresponding to this IO. | 1724 | * to have one and only one object corresponding to this IO. |
1725 | */ | 1725 | */ |
1726 | static void end_bio_extent_readpage(struct bio *bio, int err) | 1726 | static void end_bio_extent_readpage(struct bio *bio, int err) |
1727 | { | 1727 | { |
1728 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 1728 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
1729 | struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; | 1729 | struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; |
1730 | struct bio_vec *bvec = bio->bi_io_vec; | 1730 | struct bio_vec *bvec = bio->bi_io_vec; |
1731 | struct extent_io_tree *tree; | 1731 | struct extent_io_tree *tree; |
1732 | u64 start; | 1732 | u64 start; |
1733 | u64 end; | 1733 | u64 end; |
1734 | int whole_page; | 1734 | int whole_page; |
1735 | int ret; | 1735 | int ret; |
1736 | 1736 | ||
1737 | if (err) | 1737 | if (err) |
1738 | uptodate = 0; | 1738 | uptodate = 0; |
1739 | 1739 | ||
1740 | do { | 1740 | do { |
1741 | struct page *page = bvec->bv_page; | 1741 | struct page *page = bvec->bv_page; |
1742 | struct extent_state *cached = NULL; | 1742 | struct extent_state *cached = NULL; |
1743 | struct extent_state *state; | 1743 | struct extent_state *state; |
1744 | 1744 | ||
1745 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 1745 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
1746 | 1746 | ||
1747 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | 1747 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + |
1748 | bvec->bv_offset; | 1748 | bvec->bv_offset; |
1749 | end = start + bvec->bv_len - 1; | 1749 | end = start + bvec->bv_len - 1; |
1750 | 1750 | ||
1751 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) | 1751 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) |
1752 | whole_page = 1; | 1752 | whole_page = 1; |
1753 | else | 1753 | else |
1754 | whole_page = 0; | 1754 | whole_page = 0; |
1755 | 1755 | ||
1756 | if (++bvec <= bvec_end) | 1756 | if (++bvec <= bvec_end) |
1757 | prefetchw(&bvec->bv_page->flags); | 1757 | prefetchw(&bvec->bv_page->flags); |
1758 | 1758 | ||
1759 | spin_lock(&tree->lock); | 1759 | spin_lock(&tree->lock); |
1760 | state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); | 1760 | state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); |
1761 | if (state && state->start == start) { | 1761 | if (state && state->start == start) { |
1762 | /* | 1762 | /* |
1763 | * take a reference on the state, unlock will drop | 1763 | * take a reference on the state, unlock will drop |
1764 | * the ref | 1764 | * the ref |
1765 | */ | 1765 | */ |
1766 | cache_state(state, &cached); | 1766 | cache_state(state, &cached); |
1767 | } | 1767 | } |
1768 | spin_unlock(&tree->lock); | 1768 | spin_unlock(&tree->lock); |
1769 | 1769 | ||
1770 | if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { | 1770 | if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { |
1771 | ret = tree->ops->readpage_end_io_hook(page, start, end, | 1771 | ret = tree->ops->readpage_end_io_hook(page, start, end, |
1772 | state); | 1772 | state); |
1773 | if (ret) | 1773 | if (ret) |
1774 | uptodate = 0; | 1774 | uptodate = 0; |
1775 | } | 1775 | } |
1776 | if (!uptodate && tree->ops && | 1776 | if (!uptodate && tree->ops && |
1777 | tree->ops->readpage_io_failed_hook) { | 1777 | tree->ops->readpage_io_failed_hook) { |
1778 | ret = tree->ops->readpage_io_failed_hook(bio, page, | 1778 | ret = tree->ops->readpage_io_failed_hook(bio, page, |
1779 | start, end, NULL); | 1779 | start, end, NULL); |
1780 | if (ret == 0) { | 1780 | if (ret == 0) { |
1781 | uptodate = | 1781 | uptodate = |
1782 | test_bit(BIO_UPTODATE, &bio->bi_flags); | 1782 | test_bit(BIO_UPTODATE, &bio->bi_flags); |
1783 | if (err) | 1783 | if (err) |
1784 | uptodate = 0; | 1784 | uptodate = 0; |
1785 | uncache_state(&cached); | 1785 | uncache_state(&cached); |
1786 | continue; | 1786 | continue; |
1787 | } | 1787 | } |
1788 | } | 1788 | } |
1789 | 1789 | ||
1790 | if (uptodate) { | 1790 | if (uptodate) { |
1791 | set_extent_uptodate(tree, start, end, &cached, | 1791 | set_extent_uptodate(tree, start, end, &cached, |
1792 | GFP_ATOMIC); | 1792 | GFP_ATOMIC); |
1793 | } | 1793 | } |
1794 | unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); | 1794 | unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); |
1795 | 1795 | ||
1796 | if (whole_page) { | 1796 | if (whole_page) { |
1797 | if (uptodate) { | 1797 | if (uptodate) { |
1798 | SetPageUptodate(page); | 1798 | SetPageUptodate(page); |
1799 | } else { | 1799 | } else { |
1800 | ClearPageUptodate(page); | 1800 | ClearPageUptodate(page); |
1801 | SetPageError(page); | 1801 | SetPageError(page); |
1802 | } | 1802 | } |
1803 | unlock_page(page); | 1803 | unlock_page(page); |
1804 | } else { | 1804 | } else { |
1805 | if (uptodate) { | 1805 | if (uptodate) { |
1806 | check_page_uptodate(tree, page); | 1806 | check_page_uptodate(tree, page); |
1807 | } else { | 1807 | } else { |
1808 | ClearPageUptodate(page); | 1808 | ClearPageUptodate(page); |
1809 | SetPageError(page); | 1809 | SetPageError(page); |
1810 | } | 1810 | } |
1811 | check_page_locked(tree, page); | 1811 | check_page_locked(tree, page); |
1812 | } | 1812 | } |
1813 | } while (bvec <= bvec_end); | 1813 | } while (bvec <= bvec_end); |
1814 | 1814 | ||
1815 | bio_put(bio); | 1815 | bio_put(bio); |
1816 | } | 1816 | } |
1817 | 1817 | ||
1818 | struct bio * | 1818 | struct bio * |
1819 | btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, | 1819 | btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, |
1820 | gfp_t gfp_flags) | 1820 | gfp_t gfp_flags) |
1821 | { | 1821 | { |
1822 | struct bio *bio; | 1822 | struct bio *bio; |
1823 | 1823 | ||
1824 | bio = bio_alloc(gfp_flags, nr_vecs); | 1824 | bio = bio_alloc(gfp_flags, nr_vecs); |
1825 | 1825 | ||
1826 | if (bio == NULL && (current->flags & PF_MEMALLOC)) { | 1826 | if (bio == NULL && (current->flags & PF_MEMALLOC)) { |
1827 | while (!bio && (nr_vecs /= 2)) | 1827 | while (!bio && (nr_vecs /= 2)) |
1828 | bio = bio_alloc(gfp_flags, nr_vecs); | 1828 | bio = bio_alloc(gfp_flags, nr_vecs); |
1829 | } | 1829 | } |
1830 | 1830 | ||
1831 | if (bio) { | 1831 | if (bio) { |
1832 | bio->bi_size = 0; | 1832 | bio->bi_size = 0; |
1833 | bio->bi_bdev = bdev; | 1833 | bio->bi_bdev = bdev; |
1834 | bio->bi_sector = first_sector; | 1834 | bio->bi_sector = first_sector; |
1835 | } | 1835 | } |
1836 | return bio; | 1836 | return bio; |
1837 | } | 1837 | } |
1838 | 1838 | ||
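The halving loop is the interesting part: if the first allocation fails while the task is already in memory-reclaim context (PF_MEMALLOC), the helper retries with half as many biovecs rather than failing outright, so callers may get back a smaller bio than requested. A hypothetical call site:

	/* Hypothetical: ask for up to 16 biovecs; may get a smaller bio back. */
	static struct bio *example_alloc(struct block_device *bdev,
					 sector_t sector)
	{
		return btrfs_bio_alloc(bdev, (u64)sector, 16,
				       GFP_NOFS | __GFP_HIGH);
	}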
1839 | static int submit_one_bio(int rw, struct bio *bio, int mirror_num, | 1839 | static int submit_one_bio(int rw, struct bio *bio, int mirror_num, |
1840 | unsigned long bio_flags) | 1840 | unsigned long bio_flags) |
1841 | { | 1841 | { |
1842 | int ret = 0; | 1842 | int ret = 0; |
1843 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | 1843 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; |
1844 | struct page *page = bvec->bv_page; | 1844 | struct page *page = bvec->bv_page; |
1845 | struct extent_io_tree *tree = bio->bi_private; | 1845 | struct extent_io_tree *tree = bio->bi_private; |
1846 | u64 start; | 1846 | u64 start; |
1847 | 1847 | ||
1848 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; | 1848 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; |
1849 | 1849 | ||
1850 | bio->bi_private = NULL; | 1850 | bio->bi_private = NULL; |
1851 | 1851 | ||
1852 | bio_get(bio); | 1852 | bio_get(bio); |
1853 | 1853 | ||
1854 | if (tree->ops && tree->ops->submit_bio_hook) | 1854 | if (tree->ops && tree->ops->submit_bio_hook) |
1855 | ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, | 1855 | ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, |
1856 | mirror_num, bio_flags, start); | 1856 | mirror_num, bio_flags, start); |
1857 | else | 1857 | else |
1858 | submit_bio(rw, bio); | 1858 | submit_bio(rw, bio); |
1859 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 1859 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) |
1860 | ret = -EOPNOTSUPP; | 1860 | ret = -EOPNOTSUPP; |
1861 | bio_put(bio); | 1861 | bio_put(bio); |
1862 | return ret; | 1862 | return ret; |
1863 | } | 1863 | } |
1864 | 1864 | ||
1865 | static int submit_extent_page(int rw, struct extent_io_tree *tree, | 1865 | static int submit_extent_page(int rw, struct extent_io_tree *tree, |
1866 | struct page *page, sector_t sector, | 1866 | struct page *page, sector_t sector, |
1867 | size_t size, unsigned long offset, | 1867 | size_t size, unsigned long offset, |
1868 | struct block_device *bdev, | 1868 | struct block_device *bdev, |
1869 | struct bio **bio_ret, | 1869 | struct bio **bio_ret, |
1870 | unsigned long max_pages, | 1870 | unsigned long max_pages, |
1871 | bio_end_io_t end_io_func, | 1871 | bio_end_io_t end_io_func, |
1872 | int mirror_num, | 1872 | int mirror_num, |
1873 | unsigned long prev_bio_flags, | 1873 | unsigned long prev_bio_flags, |
1874 | unsigned long bio_flags) | 1874 | unsigned long bio_flags) |
1875 | { | 1875 | { |
1876 | int ret = 0; | 1876 | int ret = 0; |
1877 | struct bio *bio; | 1877 | struct bio *bio; |
1878 | int nr; | 1878 | int nr; |
1879 | int contig = 0; | 1879 | int contig = 0; |
1880 | int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; | 1880 | int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; |
1881 | int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; | 1881 | int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; |
1882 | size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); | 1882 | size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); |
1883 | 1883 | ||
1884 | if (bio_ret && *bio_ret) { | 1884 | if (bio_ret && *bio_ret) { |
1885 | bio = *bio_ret; | 1885 | bio = *bio_ret; |
1886 | if (old_compressed) | 1886 | if (old_compressed) |
1887 | contig = bio->bi_sector == sector; | 1887 | contig = bio->bi_sector == sector; |
1888 | else | 1888 | else |
1889 | contig = bio->bi_sector + (bio->bi_size >> 9) == | 1889 | contig = bio->bi_sector + (bio->bi_size >> 9) == |
1890 | sector; | 1890 | sector; |
1891 | 1891 | ||
1892 | if (prev_bio_flags != bio_flags || !contig || | 1892 | if (prev_bio_flags != bio_flags || !contig || |
1893 | (tree->ops && tree->ops->merge_bio_hook && | 1893 | (tree->ops && tree->ops->merge_bio_hook && |
1894 | tree->ops->merge_bio_hook(page, offset, page_size, bio, | 1894 | tree->ops->merge_bio_hook(page, offset, page_size, bio, |
1895 | bio_flags)) || | 1895 | bio_flags)) || |
1896 | bio_add_page(bio, page, page_size, offset) < page_size) { | 1896 | bio_add_page(bio, page, page_size, offset) < page_size) { |
1897 | ret = submit_one_bio(rw, bio, mirror_num, | 1897 | ret = submit_one_bio(rw, bio, mirror_num, |
1898 | prev_bio_flags); | 1898 | prev_bio_flags); |
1899 | bio = NULL; | 1899 | bio = NULL; |
1900 | } else { | 1900 | } else { |
1901 | return 0; | 1901 | return 0; |
1902 | } | 1902 | } |
1903 | } | 1903 | } |
1904 | if (this_compressed) | 1904 | if (this_compressed) |
1905 | nr = BIO_MAX_PAGES; | 1905 | nr = BIO_MAX_PAGES; |
1906 | else | 1906 | else |
1907 | nr = bio_get_nr_vecs(bdev); | 1907 | nr = bio_get_nr_vecs(bdev); |
1908 | 1908 | ||
1909 | bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); | 1909 | bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); |
1910 | if (!bio) | 1910 | if (!bio) |
1911 | return -ENOMEM; | 1911 | return -ENOMEM; |
1912 | 1912 | ||
1913 | bio_add_page(bio, page, page_size, offset); | 1913 | bio_add_page(bio, page, page_size, offset); |
1914 | bio->bi_end_io = end_io_func; | 1914 | bio->bi_end_io = end_io_func; |
1915 | bio->bi_private = tree; | 1915 | bio->bi_private = tree; |
1916 | 1916 | ||
1917 | if (bio_ret) | 1917 | if (bio_ret) |
1918 | *bio_ret = bio; | 1918 | *bio_ret = bio; |
1919 | else | 1919 | else |
1920 | ret = submit_one_bio(rw, bio, mirror_num, bio_flags); | 1920 | ret = submit_one_bio(rw, bio, mirror_num, bio_flags); |
1921 | 1921 | ||
1922 | return ret; | 1922 | return ret; |
1923 | } | 1923 | } |
1924 | 1924 | ||
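Callers thread a struct bio * through successive submit_extent_page() calls via bio_ret, so physically contiguous pages with matching flags accumulate in one bio; the bio is only submitted when contiguity breaks, the flags change, bio_add_page() refuses the page, or the caller flushes the last open bio itself. A hypothetical end-of-loop flush for such a batched read:

	/* Hypothetical end-of-loop flush for a batched read. */
	static int example_flush_bio(struct bio *bio, unsigned long bio_flags)
	{
		if (bio)
			return submit_one_bio(READ, bio, 0, bio_flags);
		return 0;
	}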
1925 | void set_page_extent_mapped(struct page *page) | 1925 | void set_page_extent_mapped(struct page *page) |
1926 | { | 1926 | { |
1927 | if (!PagePrivate(page)) { | 1927 | if (!PagePrivate(page)) { |
1928 | SetPagePrivate(page); | 1928 | SetPagePrivate(page); |
1929 | page_cache_get(page); | 1929 | page_cache_get(page); |
1930 | set_page_private(page, EXTENT_PAGE_PRIVATE); | 1930 | set_page_private(page, EXTENT_PAGE_PRIVATE); |
1931 | } | 1931 | } |
1932 | } | 1932 | } |
1933 | 1933 | ||
1934 | static void set_page_extent_head(struct page *page, unsigned long len) | 1934 | static void set_page_extent_head(struct page *page, unsigned long len) |
1935 | { | 1935 | { |
1936 | WARN_ON(!PagePrivate(page)); | 1936 | WARN_ON(!PagePrivate(page)); |
1937 | set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); | 1937 | set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); |
1938 | } | 1938 | } |
1939 | 1939 | ||
1940 | /* | 1940 | /* |
1941 | * basic readpage implementation. Locked extent state structs are inserted | 1941 | * basic readpage implementation. Locked extent state structs are inserted |
1942 | * into the tree and removed when the IO is done (by the end_io | 1942 | * into the tree and removed when the IO is done (by the end_io |
1943 | * handlers). | 1943 | * handlers). |
1944 | */ | 1944 | */ |
1945 | static int __extent_read_full_page(struct extent_io_tree *tree, | 1945 | static int __extent_read_full_page(struct extent_io_tree *tree, |
1946 | struct page *page, | 1946 | struct page *page, |
1947 | get_extent_t *get_extent, | 1947 | get_extent_t *get_extent, |
1948 | struct bio **bio, int mirror_num, | 1948 | struct bio **bio, int mirror_num, |
1949 | unsigned long *bio_flags) | 1949 | unsigned long *bio_flags) |
1950 | { | 1950 | { |
1951 | struct inode *inode = page->mapping->host; | 1951 | struct inode *inode = page->mapping->host; |
1952 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 1952 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; |
1953 | u64 page_end = start + PAGE_CACHE_SIZE - 1; | 1953 | u64 page_end = start + PAGE_CACHE_SIZE - 1; |
1954 | u64 end; | 1954 | u64 end; |
1955 | u64 cur = start; | 1955 | u64 cur = start; |
1956 | u64 extent_offset; | 1956 | u64 extent_offset; |
1957 | u64 last_byte = i_size_read(inode); | 1957 | u64 last_byte = i_size_read(inode); |
1958 | u64 block_start; | 1958 | u64 block_start; |
1959 | u64 cur_end; | 1959 | u64 cur_end; |
1960 | sector_t sector; | 1960 | sector_t sector; |
1961 | struct extent_map *em; | 1961 | struct extent_map *em; |
1962 | struct block_device *bdev; | 1962 | struct block_device *bdev; |
1963 | struct btrfs_ordered_extent *ordered; | 1963 | struct btrfs_ordered_extent *ordered; |
1964 | int ret; | 1964 | int ret; |
1965 | int nr = 0; | 1965 | int nr = 0; |
1966 | size_t pg_offset = 0; | 1966 | size_t pg_offset = 0; |
1967 | size_t iosize; | 1967 | size_t iosize; |
1968 | size_t disk_io_size; | 1968 | size_t disk_io_size; |
1969 | size_t blocksize = inode->i_sb->s_blocksize; | 1969 | size_t blocksize = inode->i_sb->s_blocksize; |
1970 | unsigned long this_bio_flag = 0; | 1970 | unsigned long this_bio_flag = 0; |
1971 | 1971 | ||
1972 | set_page_extent_mapped(page); | 1972 | set_page_extent_mapped(page); |
1973 | 1973 | ||
1974 | if (!PageUptodate(page)) { | 1974 | if (!PageUptodate(page)) { |
1975 | if (cleancache_get_page(page) == 0) { | 1975 | if (cleancache_get_page(page) == 0) { |
1976 | BUG_ON(blocksize != PAGE_SIZE); | 1976 | BUG_ON(blocksize != PAGE_SIZE); |
1977 | goto out; | 1977 | goto out; |
1978 | } | 1978 | } |
1979 | } | 1979 | } |
1980 | 1980 | ||
1981 | end = page_end; | 1981 | end = page_end; |
1982 | while (1) { | 1982 | while (1) { |
1983 | lock_extent(tree, start, end, GFP_NOFS); | 1983 | lock_extent(tree, start, end, GFP_NOFS); |
1984 | ordered = btrfs_lookup_ordered_extent(inode, start); | 1984 | ordered = btrfs_lookup_ordered_extent(inode, start); |
1985 | if (!ordered) | 1985 | if (!ordered) |
1986 | break; | 1986 | break; |
1987 | unlock_extent(tree, start, end, GFP_NOFS); | 1987 | unlock_extent(tree, start, end, GFP_NOFS); |
1988 | btrfs_start_ordered_extent(inode, ordered, 1); | 1988 | btrfs_start_ordered_extent(inode, ordered, 1); |
1989 | btrfs_put_ordered_extent(ordered); | 1989 | btrfs_put_ordered_extent(ordered); |
1990 | } | 1990 | } |
1991 | 1991 | ||
1992 | if (page->index == last_byte >> PAGE_CACHE_SHIFT) { | 1992 | if (page->index == last_byte >> PAGE_CACHE_SHIFT) { |
1993 | char *userpage; | 1993 | char *userpage; |
1994 | size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); | 1994 | size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); |
1995 | 1995 | ||
1996 | if (zero_offset) { | 1996 | if (zero_offset) { |
1997 | iosize = PAGE_CACHE_SIZE - zero_offset; | 1997 | iosize = PAGE_CACHE_SIZE - zero_offset; |
1998 | userpage = kmap_atomic(page, KM_USER0); | 1998 | userpage = kmap_atomic(page, KM_USER0); |
1999 | memset(userpage + zero_offset, 0, iosize); | 1999 | memset(userpage + zero_offset, 0, iosize); |
2000 | flush_dcache_page(page); | 2000 | flush_dcache_page(page); |
2001 | kunmap_atomic(userpage, KM_USER0); | 2001 | kunmap_atomic(userpage, KM_USER0); |
2002 | } | 2002 | } |
2003 | } | 2003 | } |
2004 | while (cur <= end) { | 2004 | while (cur <= end) { |
2005 | if (cur >= last_byte) { | 2005 | if (cur >= last_byte) { |
2006 | char *userpage; | 2006 | char *userpage; |
2007 | struct extent_state *cached = NULL; | 2007 | struct extent_state *cached = NULL; |
2008 | 2008 | ||
2009 | iosize = PAGE_CACHE_SIZE - pg_offset; | 2009 | iosize = PAGE_CACHE_SIZE - pg_offset; |
2010 | userpage = kmap_atomic(page, KM_USER0); | 2010 | userpage = kmap_atomic(page, KM_USER0); |
2011 | memset(userpage + pg_offset, 0, iosize); | 2011 | memset(userpage + pg_offset, 0, iosize); |
2012 | flush_dcache_page(page); | 2012 | flush_dcache_page(page); |
2013 | kunmap_atomic(userpage, KM_USER0); | 2013 | kunmap_atomic(userpage, KM_USER0); |
2014 | set_extent_uptodate(tree, cur, cur + iosize - 1, | 2014 | set_extent_uptodate(tree, cur, cur + iosize - 1, |
2015 | &cached, GFP_NOFS); | 2015 | &cached, GFP_NOFS); |
2016 | unlock_extent_cached(tree, cur, cur + iosize - 1, | 2016 | unlock_extent_cached(tree, cur, cur + iosize - 1, |
2017 | &cached, GFP_NOFS); | 2017 | &cached, GFP_NOFS); |
2018 | break; | 2018 | break; |
2019 | } | 2019 | } |
2020 | em = get_extent(inode, page, pg_offset, cur, | 2020 | em = get_extent(inode, page, pg_offset, cur, |
2021 | end - cur + 1, 0); | 2021 | end - cur + 1, 0); |
2022 | if (IS_ERR_OR_NULL(em)) { | 2022 | if (IS_ERR_OR_NULL(em)) { |
2023 | SetPageError(page); | 2023 | SetPageError(page); |
2024 | unlock_extent(tree, cur, end, GFP_NOFS); | 2024 | unlock_extent(tree, cur, end, GFP_NOFS); |
2025 | break; | 2025 | break; |
2026 | } | 2026 | } |
2027 | extent_offset = cur - em->start; | 2027 | extent_offset = cur - em->start; |
2028 | BUG_ON(extent_map_end(em) <= cur); | 2028 | BUG_ON(extent_map_end(em) <= cur); |
2029 | BUG_ON(end < cur); | 2029 | BUG_ON(end < cur); |
2030 | 2030 | ||
2031 | if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { | 2031 | if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { |
2032 | this_bio_flag = EXTENT_BIO_COMPRESSED; | 2032 | this_bio_flag = EXTENT_BIO_COMPRESSED; |
2033 | extent_set_compress_type(&this_bio_flag, | 2033 | extent_set_compress_type(&this_bio_flag, |
2034 | em->compress_type); | 2034 | em->compress_type); |
2035 | } | 2035 | } |
2036 | 2036 | ||
2037 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | 2037 | iosize = min(extent_map_end(em) - cur, end - cur + 1); |
2038 | cur_end = min(extent_map_end(em) - 1, end); | 2038 | cur_end = min(extent_map_end(em) - 1, end); |
2039 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | 2039 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); |
2040 | if (this_bio_flag & EXTENT_BIO_COMPRESSED) { | 2040 | if (this_bio_flag & EXTENT_BIO_COMPRESSED) { |
2041 | disk_io_size = em->block_len; | 2041 | disk_io_size = em->block_len; |
2042 | sector = em->block_start >> 9; | 2042 | sector = em->block_start >> 9; |
2043 | } else { | 2043 | } else { |
2044 | sector = (em->block_start + extent_offset) >> 9; | 2044 | sector = (em->block_start + extent_offset) >> 9; |
2045 | disk_io_size = iosize; | 2045 | disk_io_size = iosize; |
2046 | } | 2046 | } |
2047 | bdev = em->bdev; | 2047 | bdev = em->bdev; |
2048 | block_start = em->block_start; | 2048 | block_start = em->block_start; |
2049 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) | 2049 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) |
2050 | block_start = EXTENT_MAP_HOLE; | 2050 | block_start = EXTENT_MAP_HOLE; |
2051 | free_extent_map(em); | 2051 | free_extent_map(em); |
2052 | em = NULL; | 2052 | em = NULL; |
2053 | 2053 | ||
2054 | /* we've found a hole, just zero and go on */ | 2054 | /* we've found a hole, just zero and go on */ |
2055 | if (block_start == EXTENT_MAP_HOLE) { | 2055 | if (block_start == EXTENT_MAP_HOLE) { |
2056 | char *userpage; | 2056 | char *userpage; |
2057 | struct extent_state *cached = NULL; | 2057 | struct extent_state *cached = NULL; |
2058 | 2058 | ||
2059 | userpage = kmap_atomic(page, KM_USER0); | 2059 | userpage = kmap_atomic(page, KM_USER0); |
2060 | memset(userpage + pg_offset, 0, iosize); | 2060 | memset(userpage + pg_offset, 0, iosize); |
2061 | flush_dcache_page(page); | 2061 | flush_dcache_page(page); |
2062 | kunmap_atomic(userpage, KM_USER0); | 2062 | kunmap_atomic(userpage, KM_USER0); |
2063 | 2063 | ||
2064 | set_extent_uptodate(tree, cur, cur + iosize - 1, | 2064 | set_extent_uptodate(tree, cur, cur + iosize - 1, |
2065 | &cached, GFP_NOFS); | 2065 | &cached, GFP_NOFS); |
2066 | unlock_extent_cached(tree, cur, cur + iosize - 1, | 2066 | unlock_extent_cached(tree, cur, cur + iosize - 1, |
2067 | &cached, GFP_NOFS); | 2067 | &cached, GFP_NOFS); |
2068 | cur = cur + iosize; | 2068 | cur = cur + iosize; |
2069 | pg_offset += iosize; | 2069 | pg_offset += iosize; |
2070 | continue; | 2070 | continue; |
2071 | } | 2071 | } |
2072 | /* the get_extent function already copied into the page */ | 2072 | /* the get_extent function already copied into the page */ |
2073 | if (test_range_bit(tree, cur, cur_end, | 2073 | if (test_range_bit(tree, cur, cur_end, |
2074 | EXTENT_UPTODATE, 1, NULL)) { | 2074 | EXTENT_UPTODATE, 1, NULL)) { |
2075 | check_page_uptodate(tree, page); | 2075 | check_page_uptodate(tree, page); |
2076 | unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); | 2076 | unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); |
2077 | cur = cur + iosize; | 2077 | cur = cur + iosize; |
2078 | pg_offset += iosize; | 2078 | pg_offset += iosize; |
2079 | continue; | 2079 | continue; |
2080 | } | 2080 | } |
2081 | /* we have an inline extent but it didn't get marked | 2081 | /* we have an inline extent but it didn't get marked |
2082 | * uptodate. Error out. | 2082 | * uptodate. Error out. |
2083 | */ | 2083 | */ |
2084 | if (block_start == EXTENT_MAP_INLINE) { | 2084 | if (block_start == EXTENT_MAP_INLINE) { |
2085 | SetPageError(page); | 2085 | SetPageError(page); |
2086 | unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); | 2086 | unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); |
2087 | cur = cur + iosize; | 2087 | cur = cur + iosize; |
2088 | pg_offset += iosize; | 2088 | pg_offset += iosize; |
2089 | continue; | 2089 | continue; |
2090 | } | 2090 | } |
2091 | 2091 | ||
2092 | ret = 0; | 2092 | ret = 0; |
2093 | if (tree->ops && tree->ops->readpage_io_hook) { | 2093 | if (tree->ops && tree->ops->readpage_io_hook) { |
2094 | ret = tree->ops->readpage_io_hook(page, cur, | 2094 | ret = tree->ops->readpage_io_hook(page, cur, |
2095 | cur + iosize - 1); | 2095 | cur + iosize - 1); |
2096 | } | 2096 | } |
2097 | if (!ret) { | 2097 | if (!ret) { |
2098 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; | 2098 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; |
2099 | pnr -= page->index; | 2099 | pnr -= page->index; |
2100 | ret = submit_extent_page(READ, tree, page, | 2100 | ret = submit_extent_page(READ, tree, page, |
2101 | sector, disk_io_size, pg_offset, | 2101 | sector, disk_io_size, pg_offset, |
2102 | bdev, bio, pnr, | 2102 | bdev, bio, pnr, |
2103 | end_bio_extent_readpage, mirror_num, | 2103 | end_bio_extent_readpage, mirror_num, |
2104 | *bio_flags, | 2104 | *bio_flags, |
2105 | this_bio_flag); | 2105 | this_bio_flag); |
2106 | nr++; | 2106 | nr++; |
2107 | *bio_flags = this_bio_flag; | 2107 | *bio_flags = this_bio_flag; |
2108 | } | 2108 | } |
2109 | if (ret) | 2109 | if (ret) |
2110 | SetPageError(page); | 2110 | SetPageError(page); |
2111 | cur = cur + iosize; | 2111 | cur = cur + iosize; |
2112 | pg_offset += iosize; | 2112 | pg_offset += iosize; |
2113 | } | 2113 | } |
2114 | out: | 2114 | out: |
2115 | if (!nr) { | 2115 | if (!nr) { |
2116 | if (!PageError(page)) | 2116 | if (!PageError(page)) |
2117 | SetPageUptodate(page); | 2117 | SetPageUptodate(page); |
2118 | unlock_page(page); | 2118 | unlock_page(page); |
2119 | } | 2119 | } |
2120 | return 0; | 2120 | return 0; |
2121 | } | 2121 | } |
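Both the read and write paths round iosize with (iosize + blocksize - 1) & ~(blocksize - 1), the standard trick for rounding up to a power-of-two multiple without a divide. A standalone illustration of the idiom:

    #include <assert.h>
    #include <stdint.h>

    /* round 'x' up to the next multiple of 'align'; align must be a
     * power of two, or the mask trick silently gives garbage */
    static uint64_t round_up_pow2(uint64_t x, uint64_t align)
    {
        return (x + align - 1) & ~(align - 1);
    }

    int main(void)
    {
        assert(round_up_pow2(1, 4096) == 4096);
        assert(round_up_pow2(4096, 4096) == 4096);  /* already aligned */
        assert(round_up_pow2(4097, 4096) == 8192);
        return 0;
    }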
2122 | 2122 | ||
2123 | int extent_read_full_page(struct extent_io_tree *tree, struct page *page, | 2123 | int extent_read_full_page(struct extent_io_tree *tree, struct page *page, |
2124 | get_extent_t *get_extent) | 2124 | get_extent_t *get_extent) |
2125 | { | 2125 | { |
2126 | struct bio *bio = NULL; | 2126 | struct bio *bio = NULL; |
2127 | unsigned long bio_flags = 0; | 2127 | unsigned long bio_flags = 0; |
2128 | int ret; | 2128 | int ret; |
2129 | 2129 | ||
2130 | ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, | 2130 | ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, |
2131 | &bio_flags); | 2131 | &bio_flags); |
2132 | if (bio) | 2132 | if (bio) |
2133 | ret = submit_one_bio(READ, bio, 0, bio_flags); | 2133 | ret = submit_one_bio(READ, bio, 0, bio_flags); |
2134 | return ret; | 2134 | return ret; |
2135 | } | 2135 | } |
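extent_read_full_page() shows the submission contract used throughout this file: the worker takes a struct bio ** so a single bio can keep accumulating pages across calls, and whoever owns the pointer must submit whatever is left at the end. A toy model of that ownership pattern (flush() and add_page() are stand-ins, not kernel calls):

    #include <stdio.h>

    /* struct batch plays the bio, flush() plays submit_one_bio() */
    struct batch { int n; };

    static void flush(struct batch *b)
    {
        printf("submitting batch of %d pages\n", b->n);
        b->n = 0;
    }

    static void add_page(struct batch **bp, struct batch *storage)
    {
        if (!*bp)               /* btrfs_bio_alloc() analogue */
            *bp = storage;
        (*bp)->n++;             /* bio_add_page() analogue */
    }

    int main(void)
    {
        struct batch storage = { 0 }, *b = NULL;

        for (int i = 0; i < 5; i++)   /* __extent_read_full_page calls */
            add_page(&b, &storage);
        if (b)          /* the pointer's owner submits the leftover bio */
            flush(b);
        return 0;
    }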
2136 | 2136 | ||
2137 | static noinline void update_nr_written(struct page *page, | 2137 | static noinline void update_nr_written(struct page *page, |
2138 | struct writeback_control *wbc, | 2138 | struct writeback_control *wbc, |
2139 | unsigned long nr_written) | 2139 | unsigned long nr_written) |
2140 | { | 2140 | { |
2141 | wbc->nr_to_write -= nr_written; | 2141 | wbc->nr_to_write -= nr_written; |
2142 | if (wbc->range_cyclic || (wbc->nr_to_write > 0 && | 2142 | if (wbc->range_cyclic || (wbc->nr_to_write > 0 && |
2143 | wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) | 2143 | wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) |
2144 | page->mapping->writeback_index = page->index + nr_written; | 2144 | page->mapping->writeback_index = page->index + nr_written; |
2145 | } | 2145 | } |
2146 | 2146 | ||
2147 | /* | 2147 | /* |
2148 | * the writepage semantics are similar to regular writepage. extent | 2148 | * the writepage semantics are similar to regular writepage. extent |
2149 | * records are inserted to lock ranges in the tree, and as dirty areas | 2149 | * records are inserted to lock ranges in the tree, and as dirty areas |
2150 | * are found, they are marked writeback. Then the lock bits are removed | 2150 | * are found, they are marked writeback. Then the lock bits are removed |
2151 | * and the end_io handler clears the writeback ranges | 2151 | * and the end_io handler clears the writeback ranges |
2152 | */ | 2152 | */ |
2153 | static int __extent_writepage(struct page *page, struct writeback_control *wbc, | 2153 | static int __extent_writepage(struct page *page, struct writeback_control *wbc, |
2154 | void *data) | 2154 | void *data) |
2155 | { | 2155 | { |
2156 | struct inode *inode = page->mapping->host; | 2156 | struct inode *inode = page->mapping->host; |
2157 | struct extent_page_data *epd = data; | 2157 | struct extent_page_data *epd = data; |
2158 | struct extent_io_tree *tree = epd->tree; | 2158 | struct extent_io_tree *tree = epd->tree; |
2159 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 2159 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; |
2160 | u64 delalloc_start; | 2160 | u64 delalloc_start; |
2161 | u64 page_end = start + PAGE_CACHE_SIZE - 1; | 2161 | u64 page_end = start + PAGE_CACHE_SIZE - 1; |
2162 | u64 end; | 2162 | u64 end; |
2163 | u64 cur = start; | 2163 | u64 cur = start; |
2164 | u64 extent_offset; | 2164 | u64 extent_offset; |
2165 | u64 last_byte = i_size_read(inode); | 2165 | u64 last_byte = i_size_read(inode); |
2166 | u64 block_start; | 2166 | u64 block_start; |
2167 | u64 iosize; | 2167 | u64 iosize; |
2168 | sector_t sector; | 2168 | sector_t sector; |
2169 | struct extent_state *cached_state = NULL; | 2169 | struct extent_state *cached_state = NULL; |
2170 | struct extent_map *em; | 2170 | struct extent_map *em; |
2171 | struct block_device *bdev; | 2171 | struct block_device *bdev; |
2172 | int ret; | 2172 | int ret; |
2173 | int nr = 0; | 2173 | int nr = 0; |
2174 | size_t pg_offset = 0; | 2174 | size_t pg_offset = 0; |
2175 | size_t blocksize; | 2175 | size_t blocksize; |
2176 | loff_t i_size = i_size_read(inode); | 2176 | loff_t i_size = i_size_read(inode); |
2177 | unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; | 2177 | unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; |
2178 | u64 nr_delalloc; | 2178 | u64 nr_delalloc; |
2179 | u64 delalloc_end; | 2179 | u64 delalloc_end; |
2180 | int page_started; | 2180 | int page_started; |
2181 | int compressed; | 2181 | int compressed; |
2182 | int write_flags; | 2182 | int write_flags; |
2183 | unsigned long nr_written = 0; | 2183 | unsigned long nr_written = 0; |
2184 | 2184 | ||
2185 | if (wbc->sync_mode == WB_SYNC_ALL) | 2185 | if (wbc->sync_mode == WB_SYNC_ALL) |
2186 | write_flags = WRITE_SYNC; | 2186 | write_flags = WRITE_SYNC; |
2187 | else | 2187 | else |
2188 | write_flags = WRITE; | 2188 | write_flags = WRITE; |
2189 | 2189 | ||
2190 | trace___extent_writepage(page, inode, wbc); | 2190 | trace___extent_writepage(page, inode, wbc); |
2191 | 2191 | ||
2192 | WARN_ON(!PageLocked(page)); | 2192 | WARN_ON(!PageLocked(page)); |
2193 | pg_offset = i_size & (PAGE_CACHE_SIZE - 1); | 2193 | pg_offset = i_size & (PAGE_CACHE_SIZE - 1); |
2194 | if (page->index > end_index || | 2194 | if (page->index > end_index || |
2195 | (page->index == end_index && !pg_offset)) { | 2195 | (page->index == end_index && !pg_offset)) { |
2196 | page->mapping->a_ops->invalidatepage(page, 0); | 2196 | page->mapping->a_ops->invalidatepage(page, 0); |
2197 | unlock_page(page); | 2197 | unlock_page(page); |
2198 | return 0; | 2198 | return 0; |
2199 | } | 2199 | } |
2200 | 2200 | ||
2201 | if (page->index == end_index) { | 2201 | if (page->index == end_index) { |
2202 | char *userpage; | 2202 | char *userpage; |
2203 | 2203 | ||
2204 | userpage = kmap_atomic(page, KM_USER0); | 2204 | userpage = kmap_atomic(page, KM_USER0); |
2205 | memset(userpage + pg_offset, 0, | 2205 | memset(userpage + pg_offset, 0, |
2206 | PAGE_CACHE_SIZE - pg_offset); | 2206 | PAGE_CACHE_SIZE - pg_offset); |
2207 | kunmap_atomic(userpage, KM_USER0); | 2207 | kunmap_atomic(userpage, KM_USER0); |
2208 | flush_dcache_page(page); | 2208 | flush_dcache_page(page); |
2209 | } | 2209 | } |
2210 | pg_offset = 0; | 2210 | pg_offset = 0; |
2211 | 2211 | ||
2212 | set_page_extent_mapped(page); | 2212 | set_page_extent_mapped(page); |
2213 | 2213 | ||
2214 | delalloc_start = start; | 2214 | delalloc_start = start; |
2215 | delalloc_end = 0; | 2215 | delalloc_end = 0; |
2216 | page_started = 0; | 2216 | page_started = 0; |
2217 | if (!epd->extent_locked) { | 2217 | if (!epd->extent_locked) { |
2218 | u64 delalloc_to_write = 0; | 2218 | u64 delalloc_to_write = 0; |
2219 | /* | 2219 | /* |
2220 | * make sure the wbc mapping index is at least updated | 2220 | * make sure the wbc mapping index is at least updated |
2221 | * to this page. | 2221 | * to this page. |
2222 | */ | 2222 | */ |
2223 | update_nr_written(page, wbc, 0); | 2223 | update_nr_written(page, wbc, 0); |
2224 | 2224 | ||
2225 | while (delalloc_end < page_end) { | 2225 | while (delalloc_end < page_end) { |
2226 | nr_delalloc = find_lock_delalloc_range(inode, tree, | 2226 | nr_delalloc = find_lock_delalloc_range(inode, tree, |
2227 | page, | 2227 | page, |
2228 | &delalloc_start, | 2228 | &delalloc_start, |
2229 | &delalloc_end, | 2229 | &delalloc_end, |
2230 | 128 * 1024 * 1024); | 2230 | 128 * 1024 * 1024); |
2231 | if (nr_delalloc == 0) { | 2231 | if (nr_delalloc == 0) { |
2232 | delalloc_start = delalloc_end + 1; | 2232 | delalloc_start = delalloc_end + 1; |
2233 | continue; | 2233 | continue; |
2234 | } | 2234 | } |
2235 | tree->ops->fill_delalloc(inode, page, delalloc_start, | 2235 | tree->ops->fill_delalloc(inode, page, delalloc_start, |
2236 | delalloc_end, &page_started, | 2236 | delalloc_end, &page_started, |
2237 | &nr_written); | 2237 | &nr_written); |
2238 | /* | 2238 | /* |
2239 | * delalloc_end is already one less than the total | 2239 | * delalloc_end is already one less than the total |
2240 | * length, so we don't subtract one from | 2240 | * length, so we don't subtract one from |
2241 | * PAGE_CACHE_SIZE | 2241 | * PAGE_CACHE_SIZE |
2242 | */ | 2242 | */ |
2243 | delalloc_to_write += (delalloc_end - delalloc_start + | 2243 | delalloc_to_write += (delalloc_end - delalloc_start + |
2244 | PAGE_CACHE_SIZE) >> | 2244 | PAGE_CACHE_SIZE) >> |
2245 | PAGE_CACHE_SHIFT; | 2245 | PAGE_CACHE_SHIFT; |
2246 | delalloc_start = delalloc_end + 1; | 2246 | delalloc_start = delalloc_end + 1; |
2247 | } | 2247 | } |
2248 | if (wbc->nr_to_write < delalloc_to_write) { | 2248 | if (wbc->nr_to_write < delalloc_to_write) { |
2249 | int thresh = 8192; | 2249 | int thresh = 8192; |
2250 | 2250 | ||
2251 | if (delalloc_to_write < thresh * 2) | 2251 | if (delalloc_to_write < thresh * 2) |
2252 | thresh = delalloc_to_write; | 2252 | thresh = delalloc_to_write; |
2253 | wbc->nr_to_write = min_t(u64, delalloc_to_write, | 2253 | wbc->nr_to_write = min_t(u64, delalloc_to_write, |
2254 | thresh); | 2254 | thresh); |
2255 | } | 2255 | } |
2256 | 2256 | ||
2257 | /* did the fill delalloc function already unlock and start | 2257 | /* did the fill delalloc function already unlock and start |
2258 | * the IO? | 2258 | * the IO? |
2259 | */ | 2259 | */ |
2260 | if (page_started) { | 2260 | if (page_started) { |
2261 | ret = 0; | 2261 | ret = 0; |
2262 | /* | 2262 | /* |
2263 | * we've unlocked the page, so we can't update | 2263 | * we've unlocked the page, so we can't update |
2264 | * the mapping's writeback index, just update | 2264 | * the mapping's writeback index, just update |
2265 | * nr_to_write. | 2265 | * nr_to_write. |
2266 | */ | 2266 | */ |
2267 | wbc->nr_to_write -= nr_written; | 2267 | wbc->nr_to_write -= nr_written; |
2268 | goto done_unlocked; | 2268 | goto done_unlocked; |
2269 | } | 2269 | } |
2270 | } | 2270 | } |
2271 | if (tree->ops && tree->ops->writepage_start_hook) { | 2271 | if (tree->ops && tree->ops->writepage_start_hook) { |
2272 | ret = tree->ops->writepage_start_hook(page, start, | 2272 | ret = tree->ops->writepage_start_hook(page, start, |
2273 | page_end); | 2273 | page_end); |
2274 | if (ret == -EAGAIN) { | 2274 | if (ret == -EAGAIN) { |
2275 | redirty_page_for_writepage(wbc, page); | 2275 | redirty_page_for_writepage(wbc, page); |
2276 | update_nr_written(page, wbc, nr_written); | 2276 | update_nr_written(page, wbc, nr_written); |
2277 | unlock_page(page); | 2277 | unlock_page(page); |
2278 | ret = 0; | 2278 | ret = 0; |
2279 | goto done_unlocked; | 2279 | goto done_unlocked; |
2280 | } | 2280 | } |
2281 | } | 2281 | } |
2282 | 2282 | ||
2283 | /* | 2283 | /* |
2284 | * we don't want to touch the inode after unlocking the page, | 2284 | * we don't want to touch the inode after unlocking the page, |
2285 | * so we update the mapping writeback index now | 2285 | * so we update the mapping writeback index now |
2286 | */ | 2286 | */ |
2287 | update_nr_written(page, wbc, nr_written + 1); | 2287 | update_nr_written(page, wbc, nr_written + 1); |
2288 | 2288 | ||
2289 | end = page_end; | 2289 | end = page_end; |
2290 | if (last_byte <= start) { | 2290 | if (last_byte <= start) { |
2291 | if (tree->ops && tree->ops->writepage_end_io_hook) | 2291 | if (tree->ops && tree->ops->writepage_end_io_hook) |
2292 | tree->ops->writepage_end_io_hook(page, start, | 2292 | tree->ops->writepage_end_io_hook(page, start, |
2293 | page_end, NULL, 1); | 2293 | page_end, NULL, 1); |
2294 | goto done; | 2294 | goto done; |
2295 | } | 2295 | } |
2296 | 2296 | ||
2297 | blocksize = inode->i_sb->s_blocksize; | 2297 | blocksize = inode->i_sb->s_blocksize; |
2298 | 2298 | ||
2299 | while (cur <= end) { | 2299 | while (cur <= end) { |
2300 | if (cur >= last_byte) { | 2300 | if (cur >= last_byte) { |
2301 | if (tree->ops && tree->ops->writepage_end_io_hook) | 2301 | if (tree->ops && tree->ops->writepage_end_io_hook) |
2302 | tree->ops->writepage_end_io_hook(page, cur, | 2302 | tree->ops->writepage_end_io_hook(page, cur, |
2303 | page_end, NULL, 1); | 2303 | page_end, NULL, 1); |
2304 | break; | 2304 | break; |
2305 | } | 2305 | } |
2306 | em = epd->get_extent(inode, page, pg_offset, cur, | 2306 | em = epd->get_extent(inode, page, pg_offset, cur, |
2307 | end - cur + 1, 1); | 2307 | end - cur + 1, 1); |
2308 | if (IS_ERR_OR_NULL(em)) { | 2308 | if (IS_ERR_OR_NULL(em)) { |
2309 | SetPageError(page); | 2309 | SetPageError(page); |
2310 | break; | 2310 | break; |
2311 | } | 2311 | } |
2312 | 2312 | ||
2313 | extent_offset = cur - em->start; | 2313 | extent_offset = cur - em->start; |
2314 | BUG_ON(extent_map_end(em) <= cur); | 2314 | BUG_ON(extent_map_end(em) <= cur); |
2315 | BUG_ON(end < cur); | 2315 | BUG_ON(end < cur); |
2316 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | 2316 | iosize = min(extent_map_end(em) - cur, end - cur + 1); |
2317 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | 2317 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); |
2318 | sector = (em->block_start + extent_offset) >> 9; | 2318 | sector = (em->block_start + extent_offset) >> 9; |
2319 | bdev = em->bdev; | 2319 | bdev = em->bdev; |
2320 | block_start = em->block_start; | 2320 | block_start = em->block_start; |
2321 | compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | 2321 | compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); |
2322 | free_extent_map(em); | 2322 | free_extent_map(em); |
2323 | em = NULL; | 2323 | em = NULL; |
2324 | 2324 | ||
2325 | /* | 2325 | /* |
2326 | * compressed and inline extents are written through other | 2326 | * compressed and inline extents are written through other |
2327 | * paths in the FS | 2327 | * paths in the FS |
2328 | */ | 2328 | */ |
2329 | if (compressed || block_start == EXTENT_MAP_HOLE || | 2329 | if (compressed || block_start == EXTENT_MAP_HOLE || |
2330 | block_start == EXTENT_MAP_INLINE) { | 2330 | block_start == EXTENT_MAP_INLINE) { |
2331 | /* | 2331 | /* |
2332 | * end_io notification does not happen here for | 2332 | * end_io notification does not happen here for |
2333 | * compressed extents | 2333 | * compressed extents |
2334 | */ | 2334 | */ |
2335 | if (!compressed && tree->ops && | 2335 | if (!compressed && tree->ops && |
2336 | tree->ops->writepage_end_io_hook) | 2336 | tree->ops->writepage_end_io_hook) |
2337 | tree->ops->writepage_end_io_hook(page, cur, | 2337 | tree->ops->writepage_end_io_hook(page, cur, |
2338 | cur + iosize - 1, | 2338 | cur + iosize - 1, |
2339 | NULL, 1); | 2339 | NULL, 1); |
2340 | else if (compressed) { | 2340 | else if (compressed) { |
2341 | /* we don't want to end_page_writeback on | 2341 | /* we don't want to end_page_writeback on |
2342 | * a compressed extent. This happens | 2342 | * a compressed extent. This happens |
2343 | * elsewhere. | 2343 | * elsewhere. |
2344 | */ | 2344 | */ |
2345 | nr++; | 2345 | nr++; |
2346 | } | 2346 | } |
2347 | 2347 | ||
2348 | cur += iosize; | 2348 | cur += iosize; |
2349 | pg_offset += iosize; | 2349 | pg_offset += iosize; |
2350 | continue; | 2350 | continue; |
2351 | } | 2351 | } |
2352 | /* leave this out until we have a page_mkwrite call */ | 2352 | /* leave this out until we have a page_mkwrite call */ |
2353 | if (0 && !test_range_bit(tree, cur, cur + iosize - 1, | 2353 | if (0 && !test_range_bit(tree, cur, cur + iosize - 1, |
2354 | EXTENT_DIRTY, 0, NULL)) { | 2354 | EXTENT_DIRTY, 0, NULL)) { |
2355 | cur = cur + iosize; | 2355 | cur = cur + iosize; |
2356 | pg_offset += iosize; | 2356 | pg_offset += iosize; |
2357 | continue; | 2357 | continue; |
2358 | } | 2358 | } |
2359 | 2359 | ||
2360 | if (tree->ops && tree->ops->writepage_io_hook) { | 2360 | if (tree->ops && tree->ops->writepage_io_hook) { |
2361 | ret = tree->ops->writepage_io_hook(page, cur, | 2361 | ret = tree->ops->writepage_io_hook(page, cur, |
2362 | cur + iosize - 1); | 2362 | cur + iosize - 1); |
2363 | } else { | 2363 | } else { |
2364 | ret = 0; | 2364 | ret = 0; |
2365 | } | 2365 | } |
2366 | if (ret) { | 2366 | if (ret) { |
2367 | SetPageError(page); | 2367 | SetPageError(page); |
2368 | } else { | 2368 | } else { |
2369 | unsigned long max_nr = end_index + 1; | 2369 | unsigned long max_nr = end_index + 1; |
2370 | 2370 | ||
2371 | set_range_writeback(tree, cur, cur + iosize - 1); | 2371 | set_range_writeback(tree, cur, cur + iosize - 1); |
2372 | if (!PageWriteback(page)) { | 2372 | if (!PageWriteback(page)) { |
2373 | printk(KERN_ERR "btrfs warning page %lu not " | 2373 | printk(KERN_ERR "btrfs warning page %lu not " |
2374 | "writeback, cur %llu end %llu\n", | 2374 | "writeback, cur %llu end %llu\n", |
2375 | page->index, (unsigned long long)cur, | 2375 | page->index, (unsigned long long)cur, |
2376 | (unsigned long long)end); | 2376 | (unsigned long long)end); |
2377 | } | 2377 | } |
2378 | 2378 | ||
2379 | ret = submit_extent_page(write_flags, tree, page, | 2379 | ret = submit_extent_page(write_flags, tree, page, |
2380 | sector, iosize, pg_offset, | 2380 | sector, iosize, pg_offset, |
2381 | bdev, &epd->bio, max_nr, | 2381 | bdev, &epd->bio, max_nr, |
2382 | end_bio_extent_writepage, | 2382 | end_bio_extent_writepage, |
2383 | 0, 0, 0); | 2383 | 0, 0, 0); |
2384 | if (ret) | 2384 | if (ret) |
2385 | SetPageError(page); | 2385 | SetPageError(page); |
2386 | } | 2386 | } |
2387 | cur = cur + iosize; | 2387 | cur = cur + iosize; |
2388 | pg_offset += iosize; | 2388 | pg_offset += iosize; |
2389 | nr++; | 2389 | nr++; |
2390 | } | 2390 | } |
2391 | done: | 2391 | done: |
2392 | if (nr == 0) { | 2392 | if (nr == 0) { |
2393 | /* make sure the mapping tag for page dirty gets cleared */ | 2393 | /* make sure the mapping tag for page dirty gets cleared */ |
2394 | set_page_writeback(page); | 2394 | set_page_writeback(page); |
2395 | end_page_writeback(page); | 2395 | end_page_writeback(page); |
2396 | } | 2396 | } |
2397 | unlock_page(page); | 2397 | unlock_page(page); |
2398 | 2398 | ||
2399 | done_unlocked: | 2399 | done_unlocked: |
2400 | 2400 | ||
2401 | /* drop our reference on any cached states */ | 2401 | /* drop our reference on any cached states */ |
2402 | free_extent_state(cached_state); | 2402 | free_extent_state(cached_state); |
2403 | return 0; | 2403 | return 0; |
2404 | } | 2404 | } |
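The delalloc accounting above relies on delalloc_end being inclusive: (end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT is the byte length rounded up to whole pages, with the usual "+ size - 1" folded into the missing "+ 1", exactly as the in-code comment says. A small check of that arithmetic, assuming 4K pages:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* inclusive byte range [start, end] -> its length in pages, rounded
     * up; end is inclusive, so "+ PAGE_SIZE" already is "+ size - 1" */
    static uint64_t range_to_pages(uint64_t start, uint64_t end)
    {
        return (end - start + PAGE_SIZE) >> PAGE_SHIFT;
    }

    int main(void)
    {
        assert(range_to_pages(0, 4095) == 1);       /* exactly one page */
        assert(range_to_pages(0, 4096) == 2);       /* one byte over */
        assert(range_to_pages(4096, 12287) == 2);
        return 0;
    }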
2405 | 2405 | ||
2406 | /** | 2406 | /** |
2407 | * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. | 2407 | * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. |
2408 | * @mapping: address space structure to write | 2408 | * @mapping: address space structure to write |
2409 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | 2409 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write |
2410 | * @writepage: function called for each page | 2410 | * @writepage: function called for each page |
2411 | * @data: data passed to writepage function | 2411 | * @data: data passed to writepage function |
2412 | * | 2412 | * |
2413 | * If a page is already under I/O, write_cache_pages() skips it, even | 2413 | * If a page is already under I/O, write_cache_pages() skips it, even |
2414 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, | 2414 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, |
2415 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() | 2415 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() |
2416 | * and msync() need to guarantee that all the data which was dirty at the time | 2416 | * and msync() need to guarantee that all the data which was dirty at the time |
2417 | * the call was made get new I/O started against them. If wbc->sync_mode is | 2417 | * the call was made get new I/O started against them. If wbc->sync_mode is |
2418 | * WB_SYNC_ALL then we were called for data integrity and we must wait for | 2418 | * WB_SYNC_ALL then we were called for data integrity and we must wait for |
2419 | * existing IO to complete. | 2419 | * existing IO to complete. |
2420 | */ | 2420 | */ |
2421 | static int extent_write_cache_pages(struct extent_io_tree *tree, | 2421 | static int extent_write_cache_pages(struct extent_io_tree *tree, |
2422 | struct address_space *mapping, | 2422 | struct address_space *mapping, |
2423 | struct writeback_control *wbc, | 2423 | struct writeback_control *wbc, |
2424 | writepage_t writepage, void *data, | 2424 | writepage_t writepage, void *data, |
2425 | void (*flush_fn)(void *)) | 2425 | void (*flush_fn)(void *)) |
2426 | { | 2426 | { |
2427 | int ret = 0; | 2427 | int ret = 0; |
2428 | int done = 0; | 2428 | int done = 0; |
2429 | int nr_to_write_done = 0; | 2429 | int nr_to_write_done = 0; |
2430 | struct pagevec pvec; | 2430 | struct pagevec pvec; |
2431 | int nr_pages; | 2431 | int nr_pages; |
2432 | pgoff_t index; | 2432 | pgoff_t index; |
2433 | pgoff_t end; /* Inclusive */ | 2433 | pgoff_t end; /* Inclusive */ |
2434 | int scanned = 0; | 2434 | int scanned = 0; |
2435 | 2435 | ||
2436 | pagevec_init(&pvec, 0); | 2436 | pagevec_init(&pvec, 0); |
2437 | if (wbc->range_cyclic) { | 2437 | if (wbc->range_cyclic) { |
2438 | index = mapping->writeback_index; /* Start from prev offset */ | 2438 | index = mapping->writeback_index; /* Start from prev offset */ |
2439 | end = -1; | 2439 | end = -1; |
2440 | } else { | 2440 | } else { |
2441 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 2441 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
2442 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 2442 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
2443 | scanned = 1; | 2443 | scanned = 1; |
2444 | } | 2444 | } |
2445 | retry: | 2445 | retry: |
2446 | while (!done && !nr_to_write_done && (index <= end) && | 2446 | while (!done && !nr_to_write_done && (index <= end) && |
2447 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 2447 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
2448 | PAGECACHE_TAG_DIRTY, min(end - index, | 2448 | PAGECACHE_TAG_DIRTY, min(end - index, |
2449 | (pgoff_t)PAGEVEC_SIZE-1) + 1))) { | 2449 | (pgoff_t)PAGEVEC_SIZE-1) + 1))) { |
2450 | unsigned i; | 2450 | unsigned i; |
2451 | 2451 | ||
2452 | scanned = 1; | 2452 | scanned = 1; |
2453 | for (i = 0; i < nr_pages; i++) { | 2453 | for (i = 0; i < nr_pages; i++) { |
2454 | struct page *page = pvec.pages[i]; | 2454 | struct page *page = pvec.pages[i]; |
2455 | 2455 | ||
2456 | /* | 2456 | /* |
2457 | * At this point we hold neither mapping->tree_lock nor | 2457 | * At this point we hold neither mapping->tree_lock nor |
2458 | * lock on the page itself: the page may be truncated or | 2458 | * lock on the page itself: the page may be truncated or |
2459 | * invalidated (changing page->mapping to NULL), or even | 2459 | * invalidated (changing page->mapping to NULL), or even |
2460 | * swizzled back from swapper_space to tmpfs file | 2460 | * swizzled back from swapper_space to tmpfs file |
2461 | * mapping | 2461 | * mapping |
2462 | */ | 2462 | */ |
2463 | if (tree->ops && tree->ops->write_cache_pages_lock_hook) | 2463 | if (tree->ops && tree->ops->write_cache_pages_lock_hook) |
2464 | tree->ops->write_cache_pages_lock_hook(page); | 2464 | tree->ops->write_cache_pages_lock_hook(page); |
2465 | else | 2465 | else |
2466 | lock_page(page); | 2466 | lock_page(page); |
2467 | 2467 | ||
2468 | if (unlikely(page->mapping != mapping)) { | 2468 | if (unlikely(page->mapping != mapping)) { |
2469 | unlock_page(page); | 2469 | unlock_page(page); |
2470 | continue; | 2470 | continue; |
2471 | } | 2471 | } |
2472 | 2472 | ||
2473 | if (!wbc->range_cyclic && page->index > end) { | 2473 | if (!wbc->range_cyclic && page->index > end) { |
2474 | done = 1; | 2474 | done = 1; |
2475 | unlock_page(page); | 2475 | unlock_page(page); |
2476 | continue; | 2476 | continue; |
2477 | } | 2477 | } |
2478 | 2478 | ||
2479 | if (wbc->sync_mode != WB_SYNC_NONE) { | 2479 | if (wbc->sync_mode != WB_SYNC_NONE) { |
2480 | if (PageWriteback(page)) | 2480 | if (PageWriteback(page)) |
2481 | flush_fn(data); | 2481 | flush_fn(data); |
2482 | wait_on_page_writeback(page); | 2482 | wait_on_page_writeback(page); |
2483 | } | 2483 | } |
2484 | 2484 | ||
2485 | if (PageWriteback(page) || | 2485 | if (PageWriteback(page) || |
2486 | !clear_page_dirty_for_io(page)) { | 2486 | !clear_page_dirty_for_io(page)) { |
2487 | unlock_page(page); | 2487 | unlock_page(page); |
2488 | continue; | 2488 | continue; |
2489 | } | 2489 | } |
2490 | 2490 | ||
2491 | ret = (*writepage)(page, wbc, data); | 2491 | ret = (*writepage)(page, wbc, data); |
2492 | 2492 | ||
2493 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { | 2493 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { |
2494 | unlock_page(page); | 2494 | unlock_page(page); |
2495 | ret = 0; | 2495 | ret = 0; |
2496 | } | 2496 | } |
2497 | if (ret) | 2497 | if (ret) |
2498 | done = 1; | 2498 | done = 1; |
2499 | 2499 | ||
2500 | /* | 2500 | /* |
2501 | * the filesystem may choose to bump up nr_to_write. | 2501 | * the filesystem may choose to bump up nr_to_write. |
2502 | * We have to make sure to honor the new nr_to_write | 2502 | * We have to make sure to honor the new nr_to_write |
2503 | * at any time | 2503 | * at any time |
2504 | */ | 2504 | */ |
2505 | nr_to_write_done = wbc->nr_to_write <= 0; | 2505 | nr_to_write_done = wbc->nr_to_write <= 0; |
2506 | } | 2506 | } |
2507 | pagevec_release(&pvec); | 2507 | pagevec_release(&pvec); |
2508 | cond_resched(); | 2508 | cond_resched(); |
2509 | } | 2509 | } |
2510 | if (!scanned && !done) { | 2510 | if (!scanned && !done) { |
2511 | /* | 2511 | /* |
2512 | * We hit the last page and there is more work to be done: wrap | 2512 | * We hit the last page and there is more work to be done: wrap |
2513 | * back to the start of the file | 2513 | * back to the start of the file |
2514 | */ | 2514 | */ |
2515 | scanned = 1; | 2515 | scanned = 1; |
2516 | index = 0; | 2516 | index = 0; |
2517 | goto retry; | 2517 | goto retry; |
2518 | } | 2518 | } |
2519 | return ret; | 2519 | return ret; |
2520 | } | 2520 | } |
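The retry logic at the bottom is what makes range_cyclic work: the first pass runs from mapping->writeback_index to EOF, and only if nothing stopped the scan early does it wrap exactly once back to index 0 to pick up dirty pages behind the starting point. A compact skeleton of that control flow (page_is_dirty() is a toy predicate, not the tagged radix-tree lookup):

    #include <stdio.h>

    static int page_is_dirty(unsigned long idx) { return idx % 3 == 0; }

    static void cyclic_scan(unsigned long resume, unsigned long last)
    {
        unsigned long index = resume;
        int scanned = 0, done = 0;

    retry:
        while (!done && index <= last) {
            if (page_is_dirty(index)) {
                scanned = 1;    /* this pass covered some pages */
                printf("write page %lu\n", index);
            }
            index++;
        }
        if (!scanned && !done) {
            scanned = 1;        /* wrap back to the file start, once */
            index = 0;
            goto retry;
        }
    }

    int main(void)
    {
        cyclic_scan(7, 10);
        return 0;
    }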
2521 | 2521 | ||
2522 | static void flush_epd_write_bio(struct extent_page_data *epd) | 2522 | static void flush_epd_write_bio(struct extent_page_data *epd) |
2523 | { | 2523 | { |
2524 | if (epd->bio) { | 2524 | if (epd->bio) { |
2525 | if (epd->sync_io) | 2525 | if (epd->sync_io) |
2526 | submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); | 2526 | submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); |
2527 | else | 2527 | else |
2528 | submit_one_bio(WRITE, epd->bio, 0, 0); | 2528 | submit_one_bio(WRITE, epd->bio, 0, 0); |
2529 | epd->bio = NULL; | 2529 | epd->bio = NULL; |
2530 | } | 2530 | } |
2531 | } | 2531 | } |
2532 | 2532 | ||
2533 | static noinline void flush_write_bio(void *data) | 2533 | static noinline void flush_write_bio(void *data) |
2534 | { | 2534 | { |
2535 | struct extent_page_data *epd = data; | 2535 | struct extent_page_data *epd = data; |
2536 | flush_epd_write_bio(epd); | 2536 | flush_epd_write_bio(epd); |
2537 | } | 2537 | } |
2538 | 2538 | ||
2539 | int extent_write_full_page(struct extent_io_tree *tree, struct page *page, | 2539 | int extent_write_full_page(struct extent_io_tree *tree, struct page *page, |
2540 | get_extent_t *get_extent, | 2540 | get_extent_t *get_extent, |
2541 | struct writeback_control *wbc) | 2541 | struct writeback_control *wbc) |
2542 | { | 2542 | { |
2543 | int ret; | 2543 | int ret; |
2544 | struct address_space *mapping = page->mapping; | 2544 | struct address_space *mapping = page->mapping; |
2545 | struct extent_page_data epd = { | 2545 | struct extent_page_data epd = { |
2546 | .bio = NULL, | 2546 | .bio = NULL, |
2547 | .tree = tree, | 2547 | .tree = tree, |
2548 | .get_extent = get_extent, | 2548 | .get_extent = get_extent, |
2549 | .extent_locked = 0, | 2549 | .extent_locked = 0, |
2550 | .sync_io = wbc->sync_mode == WB_SYNC_ALL, | 2550 | .sync_io = wbc->sync_mode == WB_SYNC_ALL, |
2551 | }; | 2551 | }; |
2552 | struct writeback_control wbc_writepages = { | 2552 | struct writeback_control wbc_writepages = { |
2553 | .sync_mode = wbc->sync_mode, | 2553 | .sync_mode = wbc->sync_mode, |
2554 | .older_than_this = NULL, | ||
2555 | .nr_to_write = 64, | 2554 | .nr_to_write = 64, |
2556 | .range_start = page_offset(page) + PAGE_CACHE_SIZE, | 2555 | .range_start = page_offset(page) + PAGE_CACHE_SIZE, |
2557 | .range_end = (loff_t)-1, | 2556 | .range_end = (loff_t)-1, |
2558 | }; | 2557 | }; |
2559 | 2558 | ||
2560 | ret = __extent_writepage(page, wbc, &epd); | 2559 | ret = __extent_writepage(page, wbc, &epd); |
2561 | 2560 | ||
2562 | extent_write_cache_pages(tree, mapping, &wbc_writepages, | 2561 | extent_write_cache_pages(tree, mapping, &wbc_writepages, |
2563 | __extent_writepage, &epd, flush_write_bio); | 2562 | __extent_writepage, &epd, flush_write_bio); |
2564 | flush_epd_write_bio(&epd); | 2563 | flush_epd_write_bio(&epd); |
2565 | return ret; | 2564 | return ret; |
2566 | } | 2565 | } |
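Note the wbc_writepages window: after writing the target page itself, up to 64 further dirty pages are written starting at the byte immediately after it, which is why range_start is page_offset(page) + PAGE_CACHE_SIZE. The offset arithmetic, assuming 4K pages:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* byte offset of page 'index', and the first byte after it: the
     * start of the "write a few neighbouring pages too" window */
    static uint64_t page_offset(uint64_t index)  { return index << PAGE_SHIFT; }
    static uint64_t window_start(uint64_t index) { return page_offset(index) + PAGE_SIZE; }

    int main(void)
    {
        assert(window_start(3) == 4 * PAGE_SIZE);
        return 0;
    }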
2567 | 2566 | ||
2568 | int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, | 2567 | int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, |
2569 | u64 start, u64 end, get_extent_t *get_extent, | 2568 | u64 start, u64 end, get_extent_t *get_extent, |
2570 | int mode) | 2569 | int mode) |
2571 | { | 2570 | { |
2572 | int ret = 0; | 2571 | int ret = 0; |
2573 | struct address_space *mapping = inode->i_mapping; | 2572 | struct address_space *mapping = inode->i_mapping; |
2574 | struct page *page; | 2573 | struct page *page; |
2575 | unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> | 2574 | unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> |
2576 | PAGE_CACHE_SHIFT; | 2575 | PAGE_CACHE_SHIFT; |
2577 | 2576 | ||
2578 | struct extent_page_data epd = { | 2577 | struct extent_page_data epd = { |
2579 | .bio = NULL, | 2578 | .bio = NULL, |
2580 | .tree = tree, | 2579 | .tree = tree, |
2581 | .get_extent = get_extent, | 2580 | .get_extent = get_extent, |
2582 | .extent_locked = 1, | 2581 | .extent_locked = 1, |
2583 | .sync_io = mode == WB_SYNC_ALL, | 2582 | .sync_io = mode == WB_SYNC_ALL, |
2584 | }; | 2583 | }; |
2585 | struct writeback_control wbc_writepages = { | 2584 | struct writeback_control wbc_writepages = { |
2586 | .sync_mode = mode, | 2585 | .sync_mode = mode, |
2587 | .older_than_this = NULL, | ||
2588 | .nr_to_write = nr_pages * 2, | 2586 | .nr_to_write = nr_pages * 2, |
2589 | .range_start = start, | 2587 | .range_start = start, |
2590 | .range_end = end + 1, | 2588 | .range_end = end + 1, |
2591 | }; | 2589 | }; |
2592 | 2590 | ||
2593 | while (start <= end) { | 2591 | while (start <= end) { |
2594 | page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); | 2592 | page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); |
2595 | if (clear_page_dirty_for_io(page)) | 2593 | if (clear_page_dirty_for_io(page)) |
2596 | ret = __extent_writepage(page, &wbc_writepages, &epd); | 2594 | ret = __extent_writepage(page, &wbc_writepages, &epd); |
2597 | else { | 2595 | else { |
2598 | if (tree->ops && tree->ops->writepage_end_io_hook) | 2596 | if (tree->ops && tree->ops->writepage_end_io_hook) |
2599 | tree->ops->writepage_end_io_hook(page, start, | 2597 | tree->ops->writepage_end_io_hook(page, start, |
2600 | start + PAGE_CACHE_SIZE - 1, | 2598 | start + PAGE_CACHE_SIZE - 1, |
2601 | NULL, 1); | 2599 | NULL, 1); |
2602 | unlock_page(page); | 2600 | unlock_page(page); |
2603 | } | 2601 | } |
2604 | page_cache_release(page); | 2602 | page_cache_release(page); |
2605 | start += PAGE_CACHE_SIZE; | 2603 | start += PAGE_CACHE_SIZE; |
2606 | } | 2604 | } |
2607 | 2605 | ||
2608 | flush_epd_write_bio(&epd); | 2606 | flush_epd_write_bio(&epd); |
2609 | return ret; | 2607 | return ret; |
2610 | } | 2608 | } |
2611 | 2609 | ||
2612 | int extent_writepages(struct extent_io_tree *tree, | 2610 | int extent_writepages(struct extent_io_tree *tree, |
2613 | struct address_space *mapping, | 2611 | struct address_space *mapping, |
2614 | get_extent_t *get_extent, | 2612 | get_extent_t *get_extent, |
2615 | struct writeback_control *wbc) | 2613 | struct writeback_control *wbc) |
2616 | { | 2614 | { |
2617 | int ret = 0; | 2615 | int ret = 0; |
2618 | struct extent_page_data epd = { | 2616 | struct extent_page_data epd = { |
2619 | .bio = NULL, | 2617 | .bio = NULL, |
2620 | .tree = tree, | 2618 | .tree = tree, |
2621 | .get_extent = get_extent, | 2619 | .get_extent = get_extent, |
2622 | .extent_locked = 0, | 2620 | .extent_locked = 0, |
2623 | .sync_io = wbc->sync_mode == WB_SYNC_ALL, | 2621 | .sync_io = wbc->sync_mode == WB_SYNC_ALL, |
2624 | }; | 2622 | }; |
2625 | 2623 | ||
2626 | ret = extent_write_cache_pages(tree, mapping, wbc, | 2624 | ret = extent_write_cache_pages(tree, mapping, wbc, |
2627 | __extent_writepage, &epd, | 2625 | __extent_writepage, &epd, |
2628 | flush_write_bio); | 2626 | flush_write_bio); |
2629 | flush_epd_write_bio(&epd); | 2627 | flush_epd_write_bio(&epd); |
2630 | return ret; | 2628 | return ret; |
2631 | } | 2629 | } |
2632 | 2630 | ||
2633 | int extent_readpages(struct extent_io_tree *tree, | 2631 | int extent_readpages(struct extent_io_tree *tree, |
2634 | struct address_space *mapping, | 2632 | struct address_space *mapping, |
2635 | struct list_head *pages, unsigned nr_pages, | 2633 | struct list_head *pages, unsigned nr_pages, |
2636 | get_extent_t get_extent) | 2634 | get_extent_t get_extent) |
2637 | { | 2635 | { |
2638 | struct bio *bio = NULL; | 2636 | struct bio *bio = NULL; |
2639 | unsigned page_idx; | 2637 | unsigned page_idx; |
2640 | unsigned long bio_flags = 0; | 2638 | unsigned long bio_flags = 0; |
2641 | 2639 | ||
2642 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | 2640 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { |
2643 | struct page *page = list_entry(pages->prev, struct page, lru); | 2641 | struct page *page = list_entry(pages->prev, struct page, lru); |
2644 | 2642 | ||
2645 | prefetchw(&page->flags); | 2643 | prefetchw(&page->flags); |
2646 | list_del(&page->lru); | 2644 | list_del(&page->lru); |
2647 | if (!add_to_page_cache_lru(page, mapping, | 2645 | if (!add_to_page_cache_lru(page, mapping, |
2648 | page->index, GFP_NOFS)) { | 2646 | page->index, GFP_NOFS)) { |
2649 | __extent_read_full_page(tree, page, get_extent, | 2647 | __extent_read_full_page(tree, page, get_extent, |
2650 | &bio, 0, &bio_flags); | 2648 | &bio, 0, &bio_flags); |
2651 | } | 2649 | } |
2652 | page_cache_release(page); | 2650 | page_cache_release(page); |
2653 | } | 2651 | } |
2654 | BUG_ON(!list_empty(pages)); | 2652 | BUG_ON(!list_empty(pages)); |
2655 | if (bio) | 2653 | if (bio) |
2656 | submit_one_bio(READ, bio, 0, bio_flags); | 2654 | submit_one_bio(READ, bio, 0, bio_flags); |
2657 | return 0; | 2655 | return 0; |
2658 | } | 2656 | } |
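The readahead list arrives in reverse, so taking list_entry(pages->prev, ...) each time peels pages in ascending index order; pages that add_to_page_cache_lru() rejects (already cached) are simply released. A toy doubly-linked list showing the tail-first walk (not the kernel's list.h, just the same shape):

    #include <stdio.h>

    struct node { struct node *prev, *next; int index; };

    /* peel from head->prev (the tail), mirroring how extent_readpages
     * consumes the list so IO is issued in ascending page->index order */
    static void consume(struct node *head)
    {
        while (head->prev != head) {
            struct node *n = head->prev;   /* list_entry(pages->prev) */
            head->prev = n->prev;          /* list_del() analogue */
            n->prev->next = head;
            printf("read page %d\n", n->index);
        }
    }

    int main(void)
    {
        struct node head = { &head, &head, -1 };
        struct node a = { 0 }, b = { 0 };  /* pages 2 then 1: reversed */
        a.index = 2; b.index = 1;
        /* build head <-> a <-> b <-> head, so the tail is page 1 */
        head.next = &a; a.prev = &head; a.next = &b; b.prev = &a;
        b.next = &head; head.prev = &b;
        consume(&head);        /* prints page 1, then page 2 */
        return 0;
    }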
2659 | 2657 | ||
2660 | /* | 2658 | /* |
2661 | * basic invalidatepage code, this waits on any locked or writeback | 2659 | * basic invalidatepage code, this waits on any locked or writeback |
2662 | * ranges corresponding to the page, and then deletes any extent state | 2660 | * ranges corresponding to the page, and then deletes any extent state |
2663 | * records from the tree | 2661 | * records from the tree |
2664 | */ | 2662 | */ |
2665 | int extent_invalidatepage(struct extent_io_tree *tree, | 2663 | int extent_invalidatepage(struct extent_io_tree *tree, |
2666 | struct page *page, unsigned long offset) | 2664 | struct page *page, unsigned long offset) |
2667 | { | 2665 | { |
2668 | struct extent_state *cached_state = NULL; | 2666 | struct extent_state *cached_state = NULL; |
2669 | u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); | 2667 | u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); |
2670 | u64 end = start + PAGE_CACHE_SIZE - 1; | 2668 | u64 end = start + PAGE_CACHE_SIZE - 1; |
2671 | size_t blocksize = page->mapping->host->i_sb->s_blocksize; | 2669 | size_t blocksize = page->mapping->host->i_sb->s_blocksize; |
2672 | 2670 | ||
2673 | start += (offset + blocksize - 1) & ~(blocksize - 1); | 2671 | start += (offset + blocksize - 1) & ~(blocksize - 1); |
2674 | if (start > end) | 2672 | if (start > end) |
2675 | return 0; | 2673 | return 0; |
2676 | 2674 | ||
2677 | lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); | 2675 | lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); |
2678 | wait_on_page_writeback(page); | 2676 | wait_on_page_writeback(page); |
2679 | clear_extent_bit(tree, start, end, | 2677 | clear_extent_bit(tree, start, end, |
2680 | EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | | 2678 | EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | |
2681 | EXTENT_DO_ACCOUNTING, | 2679 | EXTENT_DO_ACCOUNTING, |
2682 | 1, 1, &cached_state, GFP_NOFS); | 2680 | 1, 1, &cached_state, GFP_NOFS); |
2683 | return 0; | 2681 | return 0; |
2684 | } | 2682 | } |
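The start adjustment keeps a partial first block alive: the in-page offset is rounded up to the next block boundary, and if that lands past the end of the page nothing is cleared at all. Checking the arithmetic for a 4K page of 1K blocks:

    #include <assert.h>
    #include <stdint.h>

    /* round the in-page truncation offset up to a block boundary;
     * blocksize must be a power of two */
    static uint64_t first_whole_block(uint64_t page_start, uint64_t offset,
                                      uint64_t blocksize)
    {
        return page_start + ((offset + blocksize - 1) & ~(blocksize - 1));
    }

    int main(void)
    {
        /* invalidate from byte 100: block 0 survives, extent state is
         * cleared from byte 1024 onward */
        assert(first_whole_block(0, 100, 1024) == 1024);
        assert(first_whole_block(0, 0, 1024) == 0);  /* whole page goes */
        return 0;
    }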
2685 | 2683 | ||
2686 | /* | 2684 | /* |
2687 | * a helper for releasepage, this tests for areas of the page that | 2685 | * a helper for releasepage, this tests for areas of the page that |
2688 | * are locked or under IO and drops the related state bits if it is safe | 2686 | * are locked or under IO and drops the related state bits if it is safe |
2689 | * to drop the page. | 2687 | * to drop the page. |
2690 | */ | 2688 | */ |
2691 | int try_release_extent_state(struct extent_map_tree *map, | 2689 | int try_release_extent_state(struct extent_map_tree *map, |
2692 | struct extent_io_tree *tree, struct page *page, | 2690 | struct extent_io_tree *tree, struct page *page, |
2693 | gfp_t mask) | 2691 | gfp_t mask) |
2694 | { | 2692 | { |
2695 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 2693 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; |
2696 | u64 end = start + PAGE_CACHE_SIZE - 1; | 2694 | u64 end = start + PAGE_CACHE_SIZE - 1; |
2697 | int ret = 1; | 2695 | int ret = 1; |
2698 | 2696 | ||
2699 | if (test_range_bit(tree, start, end, | 2697 | if (test_range_bit(tree, start, end, |
2700 | EXTENT_IOBITS, 0, NULL)) | 2698 | EXTENT_IOBITS, 0, NULL)) |
2701 | ret = 0; | 2699 | ret = 0; |
2702 | else { | 2700 | else { |
2703 | if ((mask & GFP_NOFS) == GFP_NOFS) | 2701 | if ((mask & GFP_NOFS) == GFP_NOFS) |
2704 | mask = GFP_NOFS; | 2702 | mask = GFP_NOFS; |
2705 | /* | 2703 | /* |
2706 | * at this point we can safely clear everything except the | 2704 | * at this point we can safely clear everything except the |
2707 | * locked bit and the nodatasum bit | 2705 | * locked bit and the nodatasum bit |
2708 | */ | 2706 | */ |
2709 | ret = clear_extent_bit(tree, start, end, | 2707 | ret = clear_extent_bit(tree, start, end, |
2710 | ~(EXTENT_LOCKED | EXTENT_NODATASUM), | 2708 | ~(EXTENT_LOCKED | EXTENT_NODATASUM), |
2711 | 0, 0, NULL, mask); | 2709 | 0, 0, NULL, mask); |
2712 | 2710 | ||
2713 | /* if clear_extent_bit failed for enomem reasons, | 2711 | /* if clear_extent_bit failed for enomem reasons, |
2714 | * we can't allow the release to continue. | 2712 | * we can't allow the release to continue. |
2715 | */ | 2713 | */ |
2716 | if (ret < 0) | 2714 | if (ret < 0) |
2717 | ret = 0; | 2715 | ret = 0; |
2718 | else | 2716 | else |
2719 | ret = 1; | 2717 | ret = 1; |
2720 | } | 2718 | } |
2721 | return ret; | 2719 | return ret; |
2722 | } | 2720 | } |
2723 | 2721 | ||
2724 | /* | 2722 | /* |
2725 | * a helper for releasepage. As long as there are no locked extents | 2723 | * a helper for releasepage. As long as there are no locked extents |
2726 | * in the range corresponding to the page, both state records and extent | 2724 | * in the range corresponding to the page, both state records and extent |
2727 | * map records are removed | 2725 | * map records are removed |
2728 | */ | 2726 | */ |
2729 | int try_release_extent_mapping(struct extent_map_tree *map, | 2727 | int try_release_extent_mapping(struct extent_map_tree *map, |
2730 | struct extent_io_tree *tree, struct page *page, | 2728 | struct extent_io_tree *tree, struct page *page, |
2731 | gfp_t mask) | 2729 | gfp_t mask) |
2732 | { | 2730 | { |
2733 | struct extent_map *em; | 2731 | struct extent_map *em; |
2734 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 2732 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; |
2735 | u64 end = start + PAGE_CACHE_SIZE - 1; | 2733 | u64 end = start + PAGE_CACHE_SIZE - 1; |
2736 | 2734 | ||
2737 | if ((mask & __GFP_WAIT) && | 2735 | if ((mask & __GFP_WAIT) && |
2738 | page->mapping->host->i_size > 16 * 1024 * 1024) { | 2736 | page->mapping->host->i_size > 16 * 1024 * 1024) { |
2739 | u64 len; | 2737 | u64 len; |
2740 | while (start <= end) { | 2738 | while (start <= end) { |
2741 | len = end - start + 1; | 2739 | len = end - start + 1; |
2742 | write_lock(&map->lock); | 2740 | write_lock(&map->lock); |
2743 | em = lookup_extent_mapping(map, start, len); | 2741 | em = lookup_extent_mapping(map, start, len); |
2744 | if (IS_ERR_OR_NULL(em)) { | 2742 | if (IS_ERR_OR_NULL(em)) { |
2745 | write_unlock(&map->lock); | 2743 | write_unlock(&map->lock); |
2746 | break; | 2744 | break; |
2747 | } | 2745 | } |
2748 | if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || | 2746 | if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || |
2749 | em->start != start) { | 2747 | em->start != start) { |
2750 | write_unlock(&map->lock); | 2748 | write_unlock(&map->lock); |
2751 | free_extent_map(em); | 2749 | free_extent_map(em); |
2752 | break; | 2750 | break; |
2753 | } | 2751 | } |
2754 | if (!test_range_bit(tree, em->start, | 2752 | if (!test_range_bit(tree, em->start, |
2755 | extent_map_end(em) - 1, | 2753 | extent_map_end(em) - 1, |
2756 | EXTENT_LOCKED | EXTENT_WRITEBACK, | 2754 | EXTENT_LOCKED | EXTENT_WRITEBACK, |
2757 | 0, NULL)) { | 2755 | 0, NULL)) { |
2758 | remove_extent_mapping(map, em); | 2756 | remove_extent_mapping(map, em); |
2759 | /* once for the rb tree */ | 2757 | /* once for the rb tree */ |
2760 | free_extent_map(em); | 2758 | free_extent_map(em); |
2761 | } | 2759 | } |
2762 | start = extent_map_end(em); | 2760 | start = extent_map_end(em); |
2763 | write_unlock(&map->lock); | 2761 | write_unlock(&map->lock); |
2764 | 2762 | ||
2765 | /* once for us */ | 2763 | /* once for us */ |
2766 | free_extent_map(em); | 2764 | free_extent_map(em); |
2767 | } | 2765 | } |
2768 | } | 2766 | } |
2769 | return try_release_extent_state(map, tree, page, mask); | 2767 | return try_release_extent_state(map, tree, page, mask); |
2770 | } | 2768 | } |
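The two free_extent_map() calls are reference drops, not double frees: lookup_extent_mapping() returns the map with one reference for the caller on top of the one the rb-tree holds, so removing the mapping releases the tree's reference ("once for the rb tree") and the caller then drops its own ("once for us"). A toy refcount trace of that pattern:

    #include <stdio.h>

    struct em { int refs; };

    static void em_put(struct em *e)    /* free_extent_map() analogue */
    {
        if (--e->refs == 0)
            printf("extent map freed\n");
    }

    int main(void)
    {
        struct em e = { .refs = 2 };    /* one ref: rb-tree, one: lookup */
        em_put(&e);    /* once for the rb tree, after removal */
        em_put(&e);    /* once for us, the lookup reference */
        return 0;
    }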
2771 | 2769 | ||
2772 | /* | 2770 | /* |
2773 | * helper function for fiemap, which doesn't want to see any holes. | 2771 | * helper function for fiemap, which doesn't want to see any holes. |
2774 | * This maps until we find something past 'last' | 2772 | * This maps until we find something past 'last' |
2775 | */ | 2773 | */ |
2776 | static struct extent_map *get_extent_skip_holes(struct inode *inode, | 2774 | static struct extent_map *get_extent_skip_holes(struct inode *inode, |
2777 | u64 offset, | 2775 | u64 offset, |
2778 | u64 last, | 2776 | u64 last, |
2779 | get_extent_t *get_extent) | 2777 | get_extent_t *get_extent) |
2780 | { | 2778 | { |
2781 | u64 sectorsize = BTRFS_I(inode)->root->sectorsize; | 2779 | u64 sectorsize = BTRFS_I(inode)->root->sectorsize; |
2782 | struct extent_map *em; | 2780 | struct extent_map *em; |
2783 | u64 len; | 2781 | u64 len; |
2784 | 2782 | ||
2785 | if (offset >= last) | 2783 | if (offset >= last) |
2786 | return NULL; | 2784 | return NULL; |
2787 | 2785 | ||
2788 | while (1) { | 2786 | while (1) { |
2789 | len = last - offset; | 2787 | len = last - offset; |
2790 | if (len == 0) | 2788 | if (len == 0) |
2791 | break; | 2789 | break; |
2792 | len = (len + sectorsize - 1) & ~(sectorsize - 1); | 2790 | len = (len + sectorsize - 1) & ~(sectorsize - 1); |
2793 | em = get_extent(inode, NULL, 0, offset, len, 0); | 2791 | em = get_extent(inode, NULL, 0, offset, len, 0); |
2794 | if (IS_ERR_OR_NULL(em)) | 2792 | if (IS_ERR_OR_NULL(em)) |
2795 | return em; | 2793 | return em; |
2796 | 2794 | ||
2797 | /* if this isn't a hole return it */ | 2795 | /* if this isn't a hole return it */ |
2798 | if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && | 2796 | if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && |
2799 | em->block_start != EXTENT_MAP_HOLE) { | 2797 | em->block_start != EXTENT_MAP_HOLE) { |
2800 | return em; | 2798 | return em; |
2801 | } | 2799 | } |
2802 | 2800 | ||
2803 | /* this is a hole, advance to the next extent */ | 2801 | /* this is a hole, advance to the next extent */ |
2804 | offset = extent_map_end(em); | 2802 | offset = extent_map_end(em); |
2805 | free_extent_map(em); | 2803 | free_extent_map(em); |
2806 | if (offset >= last) | 2804 | if (offset >= last) |
2807 | break; | 2805 | break; |
2808 | } | 2806 | } |
2809 | return NULL; | 2807 | return NULL; |
2810 | } | 2808 | } |
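fiemap never wants to see a hole, so this helper keeps asking get_extent for the next mapping and hops offset forward over anything vacant until it finds a real extent or passes 'last'. A self-contained skeleton of that loop, with a toy lookup() standing in for get_extent:

    #include <stdio.h>

    /* toy mapping: even 4K chunks are holes, odd ones are real extents */
    static int lookup(unsigned long off, unsigned long *end)
    {
        *end = (off | 0xfff) + 1;       /* extent_map_end() analogue */
        return (off >> 12) & 1;         /* 1 = real extent, 0 = hole */
    }

    /* advance past holes; returns 1 once a real extent is found */
    static int skip_holes(unsigned long *offset, unsigned long last)
    {
        while (*offset < last) {
            unsigned long end;
            if (lookup(*offset, &end))
                return 1;       /* real extent: caller reports it */
            *offset = end;      /* hole: advance past it and retry */
        }
        return 0;               /* nothing but holes up to 'last' */
    }

    int main(void)
    {
        unsigned long off = 0;
        if (skip_holes(&off, 1 << 20))
            printf("first real extent at %#lx\n", off);   /* 0x1000 */
        return 0;
    }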
2811 | 2809 | ||
2812 | int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 2810 | int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
2813 | __u64 start, __u64 len, get_extent_t *get_extent) | 2811 | __u64 start, __u64 len, get_extent_t *get_extent) |
2814 | { | 2812 | { |
2815 | int ret = 0; | 2813 | int ret = 0; |
2816 | u64 off = start; | 2814 | u64 off = start; |
2817 | u64 max = start + len; | 2815 | u64 max = start + len; |
2818 | u32 flags = 0; | 2816 | u32 flags = 0; |
2819 | u32 found_type; | 2817 | u32 found_type; |
2820 | u64 last; | 2818 | u64 last; |
2821 | u64 last_for_get_extent = 0; | 2819 | u64 last_for_get_extent = 0; |
2822 | u64 disko = 0; | 2820 | u64 disko = 0; |
2823 | u64 isize = i_size_read(inode); | 2821 | u64 isize = i_size_read(inode); |
2824 | struct btrfs_key found_key; | 2822 | struct btrfs_key found_key; |
2825 | struct extent_map *em = NULL; | 2823 | struct extent_map *em = NULL; |
2826 | struct extent_state *cached_state = NULL; | 2824 | struct extent_state *cached_state = NULL; |
2827 | struct btrfs_path *path; | 2825 | struct btrfs_path *path; |
2828 | struct btrfs_file_extent_item *item; | 2826 | struct btrfs_file_extent_item *item; |
2829 | int end = 0; | 2827 | int end = 0; |
2830 | u64 em_start = 0; | 2828 | u64 em_start = 0; |
2831 | u64 em_len = 0; | 2829 | u64 em_len = 0; |
2832 | u64 em_end = 0; | 2830 | u64 em_end = 0; |
2833 | unsigned long emflags; | 2831 | unsigned long emflags; |
2834 | 2832 | ||
2835 | if (len == 0) | 2833 | if (len == 0) |
2836 | return -EINVAL; | 2834 | return -EINVAL; |
2837 | 2835 | ||
2838 | path = btrfs_alloc_path(); | 2836 | path = btrfs_alloc_path(); |
2839 | if (!path) | 2837 | if (!path) |
2840 | return -ENOMEM; | 2838 | return -ENOMEM; |
2841 | path->leave_spinning = 1; | 2839 | path->leave_spinning = 1; |
2842 | 2840 | ||
2843 | /* | 2841 | /* |
2844 | * look up the last file extent. We're not using i_size here | 2842 | * look up the last file extent. We're not using i_size here
2845 | * because there might be preallocation past i_size | 2843 | * because there might be preallocation past i_size |
2846 | */ | 2844 | */ |
2847 | ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, | 2845 | ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, |
2848 | path, btrfs_ino(inode), -1, 0); | 2846 | path, btrfs_ino(inode), -1, 0); |
2849 | if (ret < 0) { | 2847 | if (ret < 0) { |
2850 | btrfs_free_path(path); | 2848 | btrfs_free_path(path); |
2851 | return ret; | 2849 | return ret; |
2852 | } | 2850 | } |
2853 | WARN_ON(!ret); | 2851 | WARN_ON(!ret); |
2854 | path->slots[0]--; | 2852 | path->slots[0]--; |
2855 | item = btrfs_item_ptr(path->nodes[0], path->slots[0], | 2853 | item = btrfs_item_ptr(path->nodes[0], path->slots[0], |
2856 | struct btrfs_file_extent_item); | 2854 | struct btrfs_file_extent_item); |
2857 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); | 2855 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); |
2858 | found_type = btrfs_key_type(&found_key); | 2856 | found_type = btrfs_key_type(&found_key); |
2859 | 2857 | ||
2860 | /* No extents, but there might be delalloc bits */ | 2858 | /* No extents, but there might be delalloc bits */ |
2861 | if (found_key.objectid != btrfs_ino(inode) || | 2859 | if (found_key.objectid != btrfs_ino(inode) || |
2862 | found_type != BTRFS_EXTENT_DATA_KEY) { | 2860 | found_type != BTRFS_EXTENT_DATA_KEY) { |
2863 | /* have to trust i_size as the end */ | 2861 | /* have to trust i_size as the end */ |
2864 | last = (u64)-1; | 2862 | last = (u64)-1; |
2865 | last_for_get_extent = isize; | 2863 | last_for_get_extent = isize; |
2866 | } else { | 2864 | } else { |
2867 | /* | 2865 | /* |
2868 | * remember the start of the last extent. There are a | 2866 | * remember the start of the last extent. There are a |
2869 | * bunch of different factors that go into the length of the | 2867 | * bunch of different factors that go into the length of the |
2870 | * extent, so it's much less complex to remember where it started | 2868 | * extent, so it's much less complex to remember where it started
2871 | */ | 2869 | */ |
2872 | last = found_key.offset; | 2870 | last = found_key.offset; |
2873 | last_for_get_extent = last + 1; | 2871 | last_for_get_extent = last + 1; |
2874 | } | 2872 | } |
2875 | btrfs_free_path(path); | 2873 | btrfs_free_path(path); |
2876 | 2874 | ||
2877 | /* | 2875 | /* |
2878 | * we might have some extents allocated but more delalloc past those | 2876 | * we might have some extents allocated but more delalloc past those |
2879 | * extents. So we trust isize unless the start of the last extent is | 2877 | * extents. So we trust isize unless the start of the last extent is
2880 | * beyond isize | 2878 | * beyond isize |
2881 | */ | 2879 | */ |
2882 | if (last < isize) { | 2880 | if (last < isize) { |
2883 | last = (u64)-1; | 2881 | last = (u64)-1; |
2884 | last_for_get_extent = isize; | 2882 | last_for_get_extent = isize; |
2885 | } | 2883 | } |
2886 | 2884 | ||
2887 | lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, | 2885 | lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, |
2888 | &cached_state, GFP_NOFS); | 2886 | &cached_state, GFP_NOFS); |
2889 | 2887 | ||
2890 | em = get_extent_skip_holes(inode, off, last_for_get_extent, | 2888 | em = get_extent_skip_holes(inode, off, last_for_get_extent, |
2891 | get_extent); | 2889 | get_extent); |
2892 | if (!em) | 2890 | if (!em) |
2893 | goto out; | 2891 | goto out; |
2894 | if (IS_ERR(em)) { | 2892 | if (IS_ERR(em)) { |
2895 | ret = PTR_ERR(em); | 2893 | ret = PTR_ERR(em); |
2896 | goto out; | 2894 | goto out; |
2897 | } | 2895 | } |
2898 | 2896 | ||
2899 | while (!end) { | 2897 | while (!end) { |
2900 | u64 offset_in_extent; | 2898 | u64 offset_in_extent; |
2901 | 2899 | ||
2902 | /* break if the extent we found is outside the range */ | 2900 | /* break if the extent we found is outside the range */ |
2903 | if (em->start >= max || extent_map_end(em) < off) | 2901 | if (em->start >= max || extent_map_end(em) < off) |
2904 | break; | 2902 | break; |
2905 | 2903 | ||
2906 | /* | 2904 | /* |
2907 | * get_extent may return an extent that starts before our | 2905 | * get_extent may return an extent that starts before our |
2908 | * requested range. We have to make sure the ranges | 2906 | * requested range. We have to make sure the ranges |
2909 | * we return to fiemap always move forward and don't | 2907 | * we return to fiemap always move forward and don't |
2910 | * overlap, so adjust the offsets here | 2908 | * overlap, so adjust the offsets here |
2911 | */ | 2909 | */ |
2912 | em_start = max(em->start, off); | 2910 | em_start = max(em->start, off); |
2913 | 2911 | ||
2914 | /* | 2912 | /* |
2915 | * record the offset from the start of the extent | 2913 | * record the offset from the start of the extent |
2916 | * for adjusting the disk offset below | 2914 | * for adjusting the disk offset below |
2917 | */ | 2915 | */ |
2918 | offset_in_extent = em_start - em->start; | 2916 | offset_in_extent = em_start - em->start; |
2919 | em_end = extent_map_end(em); | 2917 | em_end = extent_map_end(em); |
2920 | em_len = em_end - em_start; | 2918 | em_len = em_end - em_start; |
2921 | emflags = em->flags; | 2919 | emflags = em->flags; |
2922 | disko = 0; | 2920 | disko = 0; |
2923 | flags = 0; | 2921 | flags = 0; |
2924 | 2922 | ||
2925 | /* | 2923 | /* |
2926 | * bump off for our next call to get_extent | 2924 | * bump off for our next call to get_extent |
2927 | */ | 2925 | */ |
2928 | off = extent_map_end(em); | 2926 | off = extent_map_end(em); |
2929 | if (off >= max) | 2927 | if (off >= max) |
2930 | end = 1; | 2928 | end = 1; |
2931 | 2929 | ||
2932 | if (em->block_start == EXTENT_MAP_LAST_BYTE) { | 2930 | if (em->block_start == EXTENT_MAP_LAST_BYTE) { |
2933 | end = 1; | 2931 | end = 1; |
2934 | flags |= FIEMAP_EXTENT_LAST; | 2932 | flags |= FIEMAP_EXTENT_LAST; |
2935 | } else if (em->block_start == EXTENT_MAP_INLINE) { | 2933 | } else if (em->block_start == EXTENT_MAP_INLINE) { |
2936 | flags |= (FIEMAP_EXTENT_DATA_INLINE | | 2934 | flags |= (FIEMAP_EXTENT_DATA_INLINE | |
2937 | FIEMAP_EXTENT_NOT_ALIGNED); | 2935 | FIEMAP_EXTENT_NOT_ALIGNED); |
2938 | } else if (em->block_start == EXTENT_MAP_DELALLOC) { | 2936 | } else if (em->block_start == EXTENT_MAP_DELALLOC) { |
2939 | flags |= (FIEMAP_EXTENT_DELALLOC | | 2937 | flags |= (FIEMAP_EXTENT_DELALLOC | |
2940 | FIEMAP_EXTENT_UNKNOWN); | 2938 | FIEMAP_EXTENT_UNKNOWN); |
2941 | } else { | 2939 | } else { |
2942 | disko = em->block_start + offset_in_extent; | 2940 | disko = em->block_start + offset_in_extent; |
2943 | } | 2941 | } |
2944 | if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) | 2942 | if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) |
2945 | flags |= FIEMAP_EXTENT_ENCODED; | 2943 | flags |= FIEMAP_EXTENT_ENCODED; |
2946 | 2944 | ||
2947 | free_extent_map(em); | 2945 | free_extent_map(em); |
2948 | em = NULL; | 2946 | em = NULL; |
2949 | if ((em_start >= last) || em_len == (u64)-1 || | 2947 | if ((em_start >= last) || em_len == (u64)-1 || |
2950 | (last == (u64)-1 && isize <= em_end)) { | 2948 | (last == (u64)-1 && isize <= em_end)) { |
2951 | flags |= FIEMAP_EXTENT_LAST; | 2949 | flags |= FIEMAP_EXTENT_LAST; |
2952 | end = 1; | 2950 | end = 1; |
2953 | } | 2951 | } |
2954 | 2952 | ||
2955 | /* now scan forward to see if this is really the last extent. */ | 2953 | /* now scan forward to see if this is really the last extent. */ |
2956 | em = get_extent_skip_holes(inode, off, last_for_get_extent, | 2954 | em = get_extent_skip_holes(inode, off, last_for_get_extent, |
2957 | get_extent); | 2955 | get_extent); |
2958 | if (IS_ERR(em)) { | 2956 | if (IS_ERR(em)) { |
2959 | ret = PTR_ERR(em); | 2957 | ret = PTR_ERR(em); |
2960 | goto out; | 2958 | goto out; |
2961 | } | 2959 | } |
2962 | if (!em) { | 2960 | if (!em) { |
2963 | flags |= FIEMAP_EXTENT_LAST; | 2961 | flags |= FIEMAP_EXTENT_LAST; |
2964 | end = 1; | 2962 | end = 1; |
2965 | } | 2963 | } |
2966 | ret = fiemap_fill_next_extent(fieinfo, em_start, disko, | 2964 | ret = fiemap_fill_next_extent(fieinfo, em_start, disko, |
2967 | em_len, flags); | 2965 | em_len, flags); |
2968 | if (ret) | 2966 | if (ret) |
2969 | goto out_free; | 2967 | goto out_free; |
2970 | } | 2968 | } |
2971 | out_free: | 2969 | out_free: |
2972 | free_extent_map(em); | 2970 | free_extent_map(em); |
2973 | out: | 2971 | out: |
2974 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, | 2972 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, |
2975 | &cached_state, GFP_NOFS); | 2973 | &cached_state, GFP_NOFS); |
2976 | return ret; | 2974 | return ret; |
2977 | } | 2975 | } |
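
Editor's note: extent_fiemap() is reached through btrfs' ->fiemap hook when userspace issues the FS_IOC_FIEMAP ioctl. A minimal caller looks roughly like this, with error handling trimmed and the extent count of 32 chosen arbitrarily:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/fiemap.h>
    #include <linux/fs.h>

    int main(int argc, char **argv)
    {
        int fd = open(argv[1], O_RDONLY);
        /* header plus room for 32 extent records */
        struct fiemap *fm = calloc(1, sizeof(*fm) +
                                   32 * sizeof(struct fiemap_extent));

        fm->fm_start = 0;
        fm->fm_length = ~0ULL;          /* map the whole file */
        fm->fm_extent_count = 32;

        if (fd < 0 || ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
            perror("fiemap");
            return 1;
        }
        for (unsigned i = 0; i < fm->fm_mapped_extents; i++)
            printf("logical %llu physical %llu len %llu flags 0x%x\n",
                   (unsigned long long)fm->fm_extents[i].fe_logical,
                   (unsigned long long)fm->fm_extents[i].fe_physical,
                   (unsigned long long)fm->fm_extents[i].fe_length,
                   fm->fm_extents[i].fe_flags);
        return 0;
    }

The extra get_extent_skip_holes() lookahead inside the loop exists so FIEMAP_EXTENT_LAST is set on the final record even when the requested range ends past the last extent.
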
2978 | 2976 | ||
2979 | static inline struct page *extent_buffer_page(struct extent_buffer *eb, | 2977 | static inline struct page *extent_buffer_page(struct extent_buffer *eb, |
2980 | unsigned long i) | 2978 | unsigned long i) |
2981 | { | 2979 | { |
2982 | struct page *p; | 2980 | struct page *p; |
2983 | struct address_space *mapping; | 2981 | struct address_space *mapping; |
2984 | 2982 | ||
2985 | if (i == 0) | 2983 | if (i == 0) |
2986 | return eb->first_page; | 2984 | return eb->first_page; |
2987 | i += eb->start >> PAGE_CACHE_SHIFT; | 2985 | i += eb->start >> PAGE_CACHE_SHIFT; |
2988 | mapping = eb->first_page->mapping; | 2986 | mapping = eb->first_page->mapping; |
2989 | if (!mapping) | 2987 | if (!mapping) |
2990 | return NULL; | 2988 | return NULL; |
2991 | 2989 | ||
2992 | /* | 2990 | /* |
2993 | * extent_buffer_page is only called after pinning the page | 2991 | * extent_buffer_page is only called after pinning the page |
2994 | * by increasing the reference count. So we know the page must | 2992 | * by increasing the reference count. So we know the page must |
2995 | * be in the radix tree. | 2993 | * be in the radix tree. |
2996 | */ | 2994 | */ |
2997 | rcu_read_lock(); | 2995 | rcu_read_lock(); |
2998 | p = radix_tree_lookup(&mapping->page_tree, i); | 2996 | p = radix_tree_lookup(&mapping->page_tree, i); |
2999 | rcu_read_unlock(); | 2997 | rcu_read_unlock(); |
3000 | 2998 | ||
3001 | return p; | 2999 | return p; |
3002 | } | 3000 | } |
3003 | 3001 | ||
3004 | static inline unsigned long num_extent_pages(u64 start, u64 len) | 3002 | static inline unsigned long num_extent_pages(u64 start, u64 len) |
3005 | { | 3003 | { |
3006 | return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - | 3004 | return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - |
3007 | (start >> PAGE_CACHE_SHIFT); | 3005 | (start >> PAGE_CACHE_SHIFT); |
3008 | } | 3006 | } |
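
Editor's note: num_extent_pages() computes "index one past the last byte, rounded up, minus index of the first byte". With 4 KiB pages, a buffer at start 6144 with len 8192 ends at byte 14335 and touches pages 1, 2 and 3, so the helper returns 4 - 1 = 3. The same computation stands alone as:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12                   /* assuming 4 KiB pages */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    static unsigned long num_pages(uint64_t start, uint64_t len)
    {
        return ((start + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
               (start >> PAGE_SHIFT);
    }

    int main(void)
    {
        assert(num_pages(6144, 8192) == 3); /* pages 1..3 */
        return 0;
    }
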
3009 | 3007 | ||
3010 | static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, | 3008 | static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, |
3011 | u64 start, | 3009 | u64 start, |
3012 | unsigned long len, | 3010 | unsigned long len, |
3013 | gfp_t mask) | 3011 | gfp_t mask) |
3014 | { | 3012 | { |
3015 | struct extent_buffer *eb = NULL; | 3013 | struct extent_buffer *eb = NULL; |
3016 | #if LEAK_DEBUG | 3014 | #if LEAK_DEBUG |
3017 | unsigned long flags; | 3015 | unsigned long flags; |
3018 | #endif | 3016 | #endif |
3019 | 3017 | ||
3020 | eb = kmem_cache_zalloc(extent_buffer_cache, mask); | 3018 | eb = kmem_cache_zalloc(extent_buffer_cache, mask); |
3021 | if (eb == NULL) | 3019 | if (eb == NULL) |
3022 | return NULL; | 3020 | return NULL; |
3023 | eb->start = start; | 3021 | eb->start = start; |
3024 | eb->len = len; | 3022 | eb->len = len; |
3025 | spin_lock_init(&eb->lock); | 3023 | spin_lock_init(&eb->lock); |
3026 | init_waitqueue_head(&eb->lock_wq); | 3024 | init_waitqueue_head(&eb->lock_wq); |
3027 | 3025 | ||
3028 | #if LEAK_DEBUG | 3026 | #if LEAK_DEBUG |
3029 | spin_lock_irqsave(&leak_lock, flags); | 3027 | spin_lock_irqsave(&leak_lock, flags); |
3030 | list_add(&eb->leak_list, &buffers); | 3028 | list_add(&eb->leak_list, &buffers); |
3031 | spin_unlock_irqrestore(&leak_lock, flags); | 3029 | spin_unlock_irqrestore(&leak_lock, flags); |
3032 | #endif | 3030 | #endif |
3033 | atomic_set(&eb->refs, 1); | 3031 | atomic_set(&eb->refs, 1); |
3034 | 3032 | ||
3035 | return eb; | 3033 | return eb; |
3036 | } | 3034 | } |
3037 | 3035 | ||
3038 | static void __free_extent_buffer(struct extent_buffer *eb) | 3036 | static void __free_extent_buffer(struct extent_buffer *eb) |
3039 | { | 3037 | { |
3040 | #if LEAK_DEBUG | 3038 | #if LEAK_DEBUG |
3041 | unsigned long flags; | 3039 | unsigned long flags; |
3042 | spin_lock_irqsave(&leak_lock, flags); | 3040 | spin_lock_irqsave(&leak_lock, flags); |
3043 | list_del(&eb->leak_list); | 3041 | list_del(&eb->leak_list); |
3044 | spin_unlock_irqrestore(&leak_lock, flags); | 3042 | spin_unlock_irqrestore(&leak_lock, flags); |
3045 | #endif | 3043 | #endif |
3046 | kmem_cache_free(extent_buffer_cache, eb); | 3044 | kmem_cache_free(extent_buffer_cache, eb); |
3047 | } | 3045 | } |
3048 | 3046 | ||
3049 | /* | 3047 | /* |
3050 | * Helper for releasing extent buffer page. | 3048 | * Helper for releasing extent buffer page. |
3051 | */ | 3049 | */ |
3052 | static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, | 3050 | static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, |
3053 | unsigned long start_idx) | 3051 | unsigned long start_idx) |
3054 | { | 3052 | { |
3055 | unsigned long index; | 3053 | unsigned long index; |
3056 | struct page *page; | 3054 | struct page *page; |
3057 | 3055 | ||
3058 | if (!eb->first_page) | 3056 | if (!eb->first_page) |
3059 | return; | 3057 | return; |
3060 | 3058 | ||
3061 | index = num_extent_pages(eb->start, eb->len); | 3059 | index = num_extent_pages(eb->start, eb->len); |
3062 | if (start_idx >= index) | 3060 | if (start_idx >= index) |
3063 | return; | 3061 | return; |
3064 | 3062 | ||
3065 | do { | 3063 | do { |
3066 | index--; | 3064 | index--; |
3067 | page = extent_buffer_page(eb, index); | 3065 | page = extent_buffer_page(eb, index); |
3068 | if (page) | 3066 | if (page) |
3069 | page_cache_release(page); | 3067 | page_cache_release(page); |
3070 | } while (index != start_idx); | 3068 | } while (index != start_idx); |
3071 | } | 3069 | } |
3072 | 3070 | ||
3073 | /* | 3071 | /* |
3074 | * Helper for releasing the extent buffer. | 3072 | * Helper for releasing the extent buffer. |
3075 | */ | 3073 | */ |
3076 | static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) | 3074 | static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) |
3077 | { | 3075 | { |
3078 | btrfs_release_extent_buffer_page(eb, 0); | 3076 | btrfs_release_extent_buffer_page(eb, 0); |
3079 | __free_extent_buffer(eb); | 3077 | __free_extent_buffer(eb); |
3080 | } | 3078 | } |
3081 | 3079 | ||
3082 | struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, | 3080 | struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, |
3083 | u64 start, unsigned long len, | 3081 | u64 start, unsigned long len, |
3084 | struct page *page0) | 3082 | struct page *page0) |
3085 | { | 3083 | { |
3086 | unsigned long num_pages = num_extent_pages(start, len); | 3084 | unsigned long num_pages = num_extent_pages(start, len); |
3087 | unsigned long i; | 3085 | unsigned long i; |
3088 | unsigned long index = start >> PAGE_CACHE_SHIFT; | 3086 | unsigned long index = start >> PAGE_CACHE_SHIFT; |
3089 | struct extent_buffer *eb; | 3087 | struct extent_buffer *eb; |
3090 | struct extent_buffer *exists = NULL; | 3088 | struct extent_buffer *exists = NULL; |
3091 | struct page *p; | 3089 | struct page *p; |
3092 | struct address_space *mapping = tree->mapping; | 3090 | struct address_space *mapping = tree->mapping; |
3093 | int uptodate = 1; | 3091 | int uptodate = 1; |
3094 | int ret; | 3092 | int ret; |
3095 | 3093 | ||
3096 | rcu_read_lock(); | 3094 | rcu_read_lock(); |
3097 | eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); | 3095 | eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); |
3098 | if (eb && atomic_inc_not_zero(&eb->refs)) { | 3096 | if (eb && atomic_inc_not_zero(&eb->refs)) { |
3099 | rcu_read_unlock(); | 3097 | rcu_read_unlock(); |
3100 | mark_page_accessed(eb->first_page); | 3098 | mark_page_accessed(eb->first_page); |
3101 | return eb; | 3099 | return eb; |
3102 | } | 3100 | } |
3103 | rcu_read_unlock(); | 3101 | rcu_read_unlock(); |
3104 | 3102 | ||
3105 | eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); | 3103 | eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); |
3106 | if (!eb) | 3104 | if (!eb) |
3107 | return NULL; | 3105 | return NULL; |
3108 | 3106 | ||
3109 | if (page0) { | 3107 | if (page0) { |
3110 | eb->first_page = page0; | 3108 | eb->first_page = page0; |
3111 | i = 1; | 3109 | i = 1; |
3112 | index++; | 3110 | index++; |
3113 | page_cache_get(page0); | 3111 | page_cache_get(page0); |
3114 | mark_page_accessed(page0); | 3112 | mark_page_accessed(page0); |
3115 | set_page_extent_mapped(page0); | 3113 | set_page_extent_mapped(page0); |
3116 | set_page_extent_head(page0, len); | 3114 | set_page_extent_head(page0, len); |
3117 | uptodate = PageUptodate(page0); | 3115 | uptodate = PageUptodate(page0); |
3118 | } else { | 3116 | } else { |
3119 | i = 0; | 3117 | i = 0; |
3120 | } | 3118 | } |
3121 | for (; i < num_pages; i++, index++) { | 3119 | for (; i < num_pages; i++, index++) { |
3122 | p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); | 3120 | p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); |
3123 | if (!p) { | 3121 | if (!p) { |
3124 | WARN_ON(1); | 3122 | WARN_ON(1); |
3125 | goto free_eb; | 3123 | goto free_eb; |
3126 | } | 3124 | } |
3127 | set_page_extent_mapped(p); | 3125 | set_page_extent_mapped(p); |
3128 | mark_page_accessed(p); | 3126 | mark_page_accessed(p); |
3129 | if (i == 0) { | 3127 | if (i == 0) { |
3130 | eb->first_page = p; | 3128 | eb->first_page = p; |
3131 | set_page_extent_head(p, len); | 3129 | set_page_extent_head(p, len); |
3132 | } else { | 3130 | } else { |
3133 | set_page_private(p, EXTENT_PAGE_PRIVATE); | 3131 | set_page_private(p, EXTENT_PAGE_PRIVATE); |
3134 | } | 3132 | } |
3135 | if (!PageUptodate(p)) | 3133 | if (!PageUptodate(p)) |
3136 | uptodate = 0; | 3134 | uptodate = 0; |
3137 | 3135 | ||
3138 | /* | 3136 | /* |
3139 | * see below about how we avoid a nasty race with release page | 3137 | * see below about how we avoid a nasty race with release page |
3140 | * and why we unlock later | 3138 | * and why we unlock later |
3141 | */ | 3139 | */ |
3142 | if (i != 0) | 3140 | if (i != 0) |
3143 | unlock_page(p); | 3141 | unlock_page(p); |
3144 | } | 3142 | } |
3145 | if (uptodate) | 3143 | if (uptodate) |
3146 | set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); | 3144 | set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); |
3147 | 3145 | ||
3148 | ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); | 3146 | ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); |
3149 | if (ret) | 3147 | if (ret) |
3150 | goto free_eb; | 3148 | goto free_eb; |
3151 | 3149 | ||
3152 | spin_lock(&tree->buffer_lock); | 3150 | spin_lock(&tree->buffer_lock); |
3153 | ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); | 3151 | ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); |
3154 | if (ret == -EEXIST) { | 3152 | if (ret == -EEXIST) { |
3155 | exists = radix_tree_lookup(&tree->buffer, | 3153 | exists = radix_tree_lookup(&tree->buffer, |
3156 | start >> PAGE_CACHE_SHIFT); | 3154 | start >> PAGE_CACHE_SHIFT); |
3157 | /* add one reference for the caller */ | 3155 | /* add one reference for the caller */ |
3158 | atomic_inc(&exists->refs); | 3156 | atomic_inc(&exists->refs); |
3159 | spin_unlock(&tree->buffer_lock); | 3157 | spin_unlock(&tree->buffer_lock); |
3160 | radix_tree_preload_end(); | 3158 | radix_tree_preload_end(); |
3161 | goto free_eb; | 3159 | goto free_eb; |
3162 | } | 3160 | } |
3163 | /* add one reference for the tree */ | 3161 | /* add one reference for the tree */ |
3164 | atomic_inc(&eb->refs); | 3162 | atomic_inc(&eb->refs); |
3165 | spin_unlock(&tree->buffer_lock); | 3163 | spin_unlock(&tree->buffer_lock); |
3166 | radix_tree_preload_end(); | 3164 | radix_tree_preload_end(); |
3167 | 3165 | ||
3168 | /* | 3166 | /* |
3169 | * there is a race where release page may have | 3167 | * there is a race where release page may have |
3170 | * tried to find this extent buffer in the radix tree | 3168 | * tried to find this extent buffer in the radix tree
3171 | * but failed. It will tell the VM it is safe to | 3169 | * but failed. It will tell the VM it is safe to
3172 | * reclaim the page, and it will clear the page private bit. | 3170 | * reclaim the page, and it will clear the page private bit.
3173 | * We must make sure to set the page private bit properly | 3171 | * We must make sure to set the page private bit properly |
3174 | * after the extent buffer is in the radix tree so | 3172 | * after the extent buffer is in the radix tree so |
3175 | * it doesn't get lost | 3173 | * it doesn't get lost |
3176 | */ | 3174 | */ |
3177 | set_page_extent_mapped(eb->first_page); | 3175 | set_page_extent_mapped(eb->first_page); |
3178 | set_page_extent_head(eb->first_page, eb->len); | 3176 | set_page_extent_head(eb->first_page, eb->len); |
3179 | if (!page0) | 3177 | if (!page0) |
3180 | unlock_page(eb->first_page); | 3178 | unlock_page(eb->first_page); |
3181 | return eb; | 3179 | return eb; |
3182 | 3180 | ||
3183 | free_eb: | 3181 | free_eb: |
3184 | if (eb->first_page && !page0) | 3182 | if (eb->first_page && !page0) |
3185 | unlock_page(eb->first_page); | 3183 | unlock_page(eb->first_page); |
3186 | 3184 | ||
3187 | if (!atomic_dec_and_test(&eb->refs)) | 3185 | if (!atomic_dec_and_test(&eb->refs)) |
3188 | return exists; | 3186 | return exists; |
3189 | btrfs_release_extent_buffer(eb); | 3187 | btrfs_release_extent_buffer(eb); |
3190 | return exists; | 3188 | return exists; |
3191 | } | 3189 | } |
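
Editor's note: alloc_extent_buffer() is an instance of the usual lockless-lookup / allocate / insert-under-lock shape: if radix_tree_insert() reports -EEXIST, another thread won the race, so the function takes a reference on the winner and releases its own copy. A toy, single-slot version of the same race handling, with hypothetical names and a pthread mutex standing in for the tree lock:

    #include <errno.h>
    #include <pthread.h>
    #include <stdlib.h>

    /* toy "tree": one slot guarded by a lock */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node { int refs; } *slot;

    static int insert(struct node *n)
    {
        if (slot)
            return -EEXIST;         /* somebody won the race */
        slot = n;
        return 0;
    }

    static struct node *get_or_create(void)
    {
        struct node *n = calloc(1, sizeof(*n));

        if (!n)
            return NULL;
        pthread_mutex_lock(&lock);
        if (insert(n) == -EEXIST) {
            struct node *winner = slot;

            winner->refs++;         /* add one reference for the caller */
            pthread_mutex_unlock(&lock);
            free(n);                /* drop the losing copy */
            return winner;
        }
        n->refs = 2;                /* one for the tree, one for the caller */
        pthread_mutex_unlock(&lock);
        return n;
    }

    int main(void) { return get_or_create() ? 0 : 1; }
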
3192 | 3190 | ||
3193 | struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, | 3191 | struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, |
3194 | u64 start, unsigned long len) | 3192 | u64 start, unsigned long len) |
3195 | { | 3193 | { |
3196 | struct extent_buffer *eb; | 3194 | struct extent_buffer *eb; |
3197 | 3195 | ||
3198 | rcu_read_lock(); | 3196 | rcu_read_lock(); |
3199 | eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); | 3197 | eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); |
3200 | if (eb && atomic_inc_not_zero(&eb->refs)) { | 3198 | if (eb && atomic_inc_not_zero(&eb->refs)) { |
3201 | rcu_read_unlock(); | 3199 | rcu_read_unlock(); |
3202 | mark_page_accessed(eb->first_page); | 3200 | mark_page_accessed(eb->first_page); |
3203 | return eb; | 3201 | return eb; |
3204 | } | 3202 | } |
3205 | rcu_read_unlock(); | 3203 | rcu_read_unlock(); |
3206 | 3204 | ||
3207 | return NULL; | 3205 | return NULL; |
3208 | } | 3206 | } |
3209 | 3207 | ||
3210 | void free_extent_buffer(struct extent_buffer *eb) | 3208 | void free_extent_buffer(struct extent_buffer *eb) |
3211 | { | 3209 | { |
3212 | if (!eb) | 3210 | if (!eb) |
3213 | return; | 3211 | return; |
3214 | 3212 | ||
3215 | if (!atomic_dec_and_test(&eb->refs)) | 3213 | if (!atomic_dec_and_test(&eb->refs)) |
3216 | return; | 3214 | return; |
3217 | 3215 | ||
3218 | WARN_ON(1); | 3216 | WARN_ON(1); |
3219 | } | 3217 | } |
3220 | 3218 | ||
3221 | int clear_extent_buffer_dirty(struct extent_io_tree *tree, | 3219 | int clear_extent_buffer_dirty(struct extent_io_tree *tree, |
3222 | struct extent_buffer *eb) | 3220 | struct extent_buffer *eb) |
3223 | { | 3221 | { |
3224 | unsigned long i; | 3222 | unsigned long i; |
3225 | unsigned long num_pages; | 3223 | unsigned long num_pages; |
3226 | struct page *page; | 3224 | struct page *page; |
3227 | 3225 | ||
3228 | num_pages = num_extent_pages(eb->start, eb->len); | 3226 | num_pages = num_extent_pages(eb->start, eb->len); |
3229 | 3227 | ||
3230 | for (i = 0; i < num_pages; i++) { | 3228 | for (i = 0; i < num_pages; i++) { |
3231 | page = extent_buffer_page(eb, i); | 3229 | page = extent_buffer_page(eb, i); |
3232 | if (!PageDirty(page)) | 3230 | if (!PageDirty(page)) |
3233 | continue; | 3231 | continue; |
3234 | 3232 | ||
3235 | lock_page(page); | 3233 | lock_page(page); |
3236 | WARN_ON(!PagePrivate(page)); | 3234 | WARN_ON(!PagePrivate(page)); |
3237 | 3235 | ||
3238 | set_page_extent_mapped(page); | 3236 | set_page_extent_mapped(page); |
3239 | if (i == 0) | 3237 | if (i == 0) |
3240 | set_page_extent_head(page, eb->len); | 3238 | set_page_extent_head(page, eb->len); |
3241 | 3239 | ||
3242 | clear_page_dirty_for_io(page); | 3240 | clear_page_dirty_for_io(page); |
3243 | spin_lock_irq(&page->mapping->tree_lock); | 3241 | spin_lock_irq(&page->mapping->tree_lock); |
3244 | if (!PageDirty(page)) { | 3242 | if (!PageDirty(page)) { |
3245 | radix_tree_tag_clear(&page->mapping->page_tree, | 3243 | radix_tree_tag_clear(&page->mapping->page_tree, |
3246 | page_index(page), | 3244 | page_index(page), |
3247 | PAGECACHE_TAG_DIRTY); | 3245 | PAGECACHE_TAG_DIRTY); |
3248 | } | 3246 | } |
3249 | spin_unlock_irq(&page->mapping->tree_lock); | 3247 | spin_unlock_irq(&page->mapping->tree_lock); |
3250 | unlock_page(page); | 3248 | unlock_page(page); |
3251 | } | 3249 | } |
3252 | return 0; | 3250 | return 0; |
3253 | } | 3251 | } |
3254 | 3252 | ||
3255 | int set_extent_buffer_dirty(struct extent_io_tree *tree, | 3253 | int set_extent_buffer_dirty(struct extent_io_tree *tree, |
3256 | struct extent_buffer *eb) | 3254 | struct extent_buffer *eb) |
3257 | { | 3255 | { |
3258 | unsigned long i; | 3256 | unsigned long i; |
3259 | unsigned long num_pages; | 3257 | unsigned long num_pages; |
3260 | int was_dirty = 0; | 3258 | int was_dirty = 0; |
3261 | 3259 | ||
3262 | was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); | 3260 | was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); |
3263 | num_pages = num_extent_pages(eb->start, eb->len); | 3261 | num_pages = num_extent_pages(eb->start, eb->len); |
3264 | for (i = 0; i < num_pages; i++) | 3262 | for (i = 0; i < num_pages; i++) |
3265 | __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); | 3263 | __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); |
3266 | return was_dirty; | 3264 | return was_dirty; |
3267 | } | 3265 | } |
3268 | 3266 | ||
3269 | int clear_extent_buffer_uptodate(struct extent_io_tree *tree, | 3267 | int clear_extent_buffer_uptodate(struct extent_io_tree *tree, |
3270 | struct extent_buffer *eb, | 3268 | struct extent_buffer *eb, |
3271 | struct extent_state **cached_state) | 3269 | struct extent_state **cached_state) |
3272 | { | 3270 | { |
3273 | unsigned long i; | 3271 | unsigned long i; |
3274 | struct page *page; | 3272 | struct page *page; |
3275 | unsigned long num_pages; | 3273 | unsigned long num_pages; |
3276 | 3274 | ||
3277 | num_pages = num_extent_pages(eb->start, eb->len); | 3275 | num_pages = num_extent_pages(eb->start, eb->len); |
3278 | clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); | 3276 | clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); |
3279 | 3277 | ||
3280 | clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, | 3278 | clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, |
3281 | cached_state, GFP_NOFS); | 3279 | cached_state, GFP_NOFS); |
3282 | for (i = 0; i < num_pages; i++) { | 3280 | for (i = 0; i < num_pages; i++) { |
3283 | page = extent_buffer_page(eb, i); | 3281 | page = extent_buffer_page(eb, i); |
3284 | if (page) | 3282 | if (page) |
3285 | ClearPageUptodate(page); | 3283 | ClearPageUptodate(page); |
3286 | } | 3284 | } |
3287 | return 0; | 3285 | return 0; |
3288 | } | 3286 | } |
3289 | 3287 | ||
3290 | int set_extent_buffer_uptodate(struct extent_io_tree *tree, | 3288 | int set_extent_buffer_uptodate(struct extent_io_tree *tree, |
3291 | struct extent_buffer *eb) | 3289 | struct extent_buffer *eb) |
3292 | { | 3290 | { |
3293 | unsigned long i; | 3291 | unsigned long i; |
3294 | struct page *page; | 3292 | struct page *page; |
3295 | unsigned long num_pages; | 3293 | unsigned long num_pages; |
3296 | 3294 | ||
3297 | num_pages = num_extent_pages(eb->start, eb->len); | 3295 | num_pages = num_extent_pages(eb->start, eb->len); |
3298 | 3296 | ||
3299 | set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, | 3297 | set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, |
3300 | NULL, GFP_NOFS); | 3298 | NULL, GFP_NOFS); |
3301 | for (i = 0; i < num_pages; i++) { | 3299 | for (i = 0; i < num_pages; i++) { |
3302 | page = extent_buffer_page(eb, i); | 3300 | page = extent_buffer_page(eb, i); |
3303 | if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || | 3301 | if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || |
3304 | ((i == num_pages - 1) && | 3302 | ((i == num_pages - 1) && |
3305 | ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { | 3303 | ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { |
3306 | check_page_uptodate(tree, page); | 3304 | check_page_uptodate(tree, page); |
3307 | continue; | 3305 | continue; |
3308 | } | 3306 | } |
3309 | SetPageUptodate(page); | 3307 | SetPageUptodate(page); |
3310 | } | 3308 | } |
3311 | return 0; | 3309 | return 0; |
3312 | } | 3310 | } |
3313 | 3311 | ||
3314 | int extent_range_uptodate(struct extent_io_tree *tree, | 3312 | int extent_range_uptodate(struct extent_io_tree *tree, |
3315 | u64 start, u64 end) | 3313 | u64 start, u64 end) |
3316 | { | 3314 | { |
3317 | struct page *page; | 3315 | struct page *page; |
3318 | int ret; | 3316 | int ret; |
3319 | int pg_uptodate = 1; | 3317 | int pg_uptodate = 1; |
3320 | int uptodate; | 3318 | int uptodate; |
3321 | unsigned long index; | 3319 | unsigned long index; |
3322 | 3320 | ||
3323 | ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); | 3321 | ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); |
3324 | if (ret) | 3322 | if (ret) |
3325 | return 1; | 3323 | return 1; |
3326 | while (start <= end) { | 3324 | while (start <= end) { |
3327 | index = start >> PAGE_CACHE_SHIFT; | 3325 | index = start >> PAGE_CACHE_SHIFT; |
3328 | page = find_get_page(tree->mapping, index); | 3326 | page = find_get_page(tree->mapping, index); |
3329 | uptodate = PageUptodate(page); | 3327 | uptodate = PageUptodate(page); |
3330 | page_cache_release(page); | 3328 | page_cache_release(page); |
3331 | if (!uptodate) { | 3329 | if (!uptodate) { |
3332 | pg_uptodate = 0; | 3330 | pg_uptodate = 0; |
3333 | break; | 3331 | break; |
3334 | } | 3332 | } |
3335 | start += PAGE_CACHE_SIZE; | 3333 | start += PAGE_CACHE_SIZE; |
3336 | } | 3334 | } |
3337 | return pg_uptodate; | 3335 | return pg_uptodate; |
3338 | } | 3336 | } |
3339 | 3337 | ||
3340 | int extent_buffer_uptodate(struct extent_io_tree *tree, | 3338 | int extent_buffer_uptodate(struct extent_io_tree *tree, |
3341 | struct extent_buffer *eb, | 3339 | struct extent_buffer *eb, |
3342 | struct extent_state *cached_state) | 3340 | struct extent_state *cached_state) |
3343 | { | 3341 | { |
3344 | int ret = 0; | 3342 | int ret = 0; |
3345 | unsigned long num_pages; | 3343 | unsigned long num_pages; |
3346 | unsigned long i; | 3344 | unsigned long i; |
3347 | struct page *page; | 3345 | struct page *page; |
3348 | int pg_uptodate = 1; | 3346 | int pg_uptodate = 1; |
3349 | 3347 | ||
3350 | if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) | 3348 | if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) |
3351 | return 1; | 3349 | return 1; |
3352 | 3350 | ||
3353 | ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, | 3351 | ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, |
3354 | EXTENT_UPTODATE, 1, cached_state); | 3352 | EXTENT_UPTODATE, 1, cached_state); |
3355 | if (ret) | 3353 | if (ret) |
3356 | return ret; | 3354 | return ret; |
3357 | 3355 | ||
3358 | num_pages = num_extent_pages(eb->start, eb->len); | 3356 | num_pages = num_extent_pages(eb->start, eb->len); |
3359 | for (i = 0; i < num_pages; i++) { | 3357 | for (i = 0; i < num_pages; i++) { |
3360 | page = extent_buffer_page(eb, i); | 3358 | page = extent_buffer_page(eb, i); |
3361 | if (!PageUptodate(page)) { | 3359 | if (!PageUptodate(page)) { |
3362 | pg_uptodate = 0; | 3360 | pg_uptodate = 0; |
3363 | break; | 3361 | break; |
3364 | } | 3362 | } |
3365 | } | 3363 | } |
3366 | return pg_uptodate; | 3364 | return pg_uptodate; |
3367 | } | 3365 | } |
3368 | 3366 | ||
3369 | int read_extent_buffer_pages(struct extent_io_tree *tree, | 3367 | int read_extent_buffer_pages(struct extent_io_tree *tree, |
3370 | struct extent_buffer *eb, | 3368 | struct extent_buffer *eb, |
3371 | u64 start, int wait, | 3369 | u64 start, int wait, |
3372 | get_extent_t *get_extent, int mirror_num) | 3370 | get_extent_t *get_extent, int mirror_num) |
3373 | { | 3371 | { |
3374 | unsigned long i; | 3372 | unsigned long i; |
3375 | unsigned long start_i; | 3373 | unsigned long start_i; |
3376 | struct page *page; | 3374 | struct page *page; |
3377 | int err; | 3375 | int err; |
3378 | int ret = 0; | 3376 | int ret = 0; |
3379 | int locked_pages = 0; | 3377 | int locked_pages = 0; |
3380 | int all_uptodate = 1; | 3378 | int all_uptodate = 1; |
3381 | int inc_all_pages = 0; | 3379 | int inc_all_pages = 0; |
3382 | unsigned long num_pages; | 3380 | unsigned long num_pages; |
3383 | struct bio *bio = NULL; | 3381 | struct bio *bio = NULL; |
3384 | unsigned long bio_flags = 0; | 3382 | unsigned long bio_flags = 0; |
3385 | 3383 | ||
3386 | if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) | 3384 | if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) |
3387 | return 0; | 3385 | return 0; |
3388 | 3386 | ||
3389 | if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, | 3387 | if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, |
3390 | EXTENT_UPTODATE, 1, NULL)) { | 3388 | EXTENT_UPTODATE, 1, NULL)) { |
3391 | return 0; | 3389 | return 0; |
3392 | } | 3390 | } |
3393 | 3391 | ||
3394 | if (start) { | 3392 | if (start) { |
3395 | WARN_ON(start < eb->start); | 3393 | WARN_ON(start < eb->start); |
3396 | start_i = (start >> PAGE_CACHE_SHIFT) - | 3394 | start_i = (start >> PAGE_CACHE_SHIFT) - |
3397 | (eb->start >> PAGE_CACHE_SHIFT); | 3395 | (eb->start >> PAGE_CACHE_SHIFT); |
3398 | } else { | 3396 | } else { |
3399 | start_i = 0; | 3397 | start_i = 0; |
3400 | } | 3398 | } |
3401 | 3399 | ||
3402 | num_pages = num_extent_pages(eb->start, eb->len); | 3400 | num_pages = num_extent_pages(eb->start, eb->len); |
3403 | for (i = start_i; i < num_pages; i++) { | 3401 | for (i = start_i; i < num_pages; i++) { |
3404 | page = extent_buffer_page(eb, i); | 3402 | page = extent_buffer_page(eb, i); |
3405 | if (!wait) { | 3403 | if (!wait) { |
3406 | if (!trylock_page(page)) | 3404 | if (!trylock_page(page)) |
3407 | goto unlock_exit; | 3405 | goto unlock_exit; |
3408 | } else { | 3406 | } else { |
3409 | lock_page(page); | 3407 | lock_page(page); |
3410 | } | 3408 | } |
3411 | locked_pages++; | 3409 | locked_pages++; |
3412 | if (!PageUptodate(page)) | 3410 | if (!PageUptodate(page)) |
3413 | all_uptodate = 0; | 3411 | all_uptodate = 0; |
3414 | } | 3412 | } |
3415 | if (all_uptodate) { | 3413 | if (all_uptodate) { |
3416 | if (start_i == 0) | 3414 | if (start_i == 0) |
3417 | set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); | 3415 | set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); |
3418 | goto unlock_exit; | 3416 | goto unlock_exit; |
3419 | } | 3417 | } |
3420 | 3418 | ||
3421 | for (i = start_i; i < num_pages; i++) { | 3419 | for (i = start_i; i < num_pages; i++) { |
3422 | page = extent_buffer_page(eb, i); | 3420 | page = extent_buffer_page(eb, i); |
3423 | 3421 | ||
3424 | WARN_ON(!PagePrivate(page)); | 3422 | WARN_ON(!PagePrivate(page)); |
3425 | 3423 | ||
3426 | set_page_extent_mapped(page); | 3424 | set_page_extent_mapped(page); |
3427 | if (i == 0) | 3425 | if (i == 0) |
3428 | set_page_extent_head(page, eb->len); | 3426 | set_page_extent_head(page, eb->len); |
3429 | 3427 | ||
3430 | if (inc_all_pages) | 3428 | if (inc_all_pages) |
3431 | page_cache_get(page); | 3429 | page_cache_get(page); |
3432 | if (!PageUptodate(page)) { | 3430 | if (!PageUptodate(page)) { |
3433 | if (start_i == 0) | 3431 | if (start_i == 0) |
3434 | inc_all_pages = 1; | 3432 | inc_all_pages = 1; |
3435 | ClearPageError(page); | 3433 | ClearPageError(page); |
3436 | err = __extent_read_full_page(tree, page, | 3434 | err = __extent_read_full_page(tree, page, |
3437 | get_extent, &bio, | 3435 | get_extent, &bio, |
3438 | mirror_num, &bio_flags); | 3436 | mirror_num, &bio_flags); |
3439 | if (err) | 3437 | if (err) |
3440 | ret = err; | 3438 | ret = err; |
3441 | } else { | 3439 | } else { |
3442 | unlock_page(page); | 3440 | unlock_page(page); |
3443 | } | 3441 | } |
3444 | } | 3442 | } |
3445 | 3443 | ||
3446 | if (bio) | 3444 | if (bio) |
3447 | submit_one_bio(READ, bio, mirror_num, bio_flags); | 3445 | submit_one_bio(READ, bio, mirror_num, bio_flags); |
3448 | 3446 | ||
3449 | if (ret || !wait) | 3447 | if (ret || !wait) |
3450 | return ret; | 3448 | return ret; |
3451 | 3449 | ||
3452 | for (i = start_i; i < num_pages; i++) { | 3450 | for (i = start_i; i < num_pages; i++) { |
3453 | page = extent_buffer_page(eb, i); | 3451 | page = extent_buffer_page(eb, i); |
3454 | wait_on_page_locked(page); | 3452 | wait_on_page_locked(page); |
3455 | if (!PageUptodate(page)) | 3453 | if (!PageUptodate(page)) |
3456 | ret = -EIO; | 3454 | ret = -EIO; |
3457 | } | 3455 | } |
3458 | 3456 | ||
3459 | if (!ret) | 3457 | if (!ret) |
3460 | set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); | 3458 | set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); |
3461 | return ret; | 3459 | return ret; |
3462 | 3460 | ||
3463 | unlock_exit: | 3461 | unlock_exit: |
3464 | i = start_i; | 3462 | i = start_i; |
3465 | while (locked_pages > 0) { | 3463 | while (locked_pages > 0) { |
3466 | page = extent_buffer_page(eb, i); | 3464 | page = extent_buffer_page(eb, i); |
3467 | i++; | 3465 | i++; |
3468 | unlock_page(page); | 3466 | unlock_page(page); |
3469 | locked_pages--; | 3467 | locked_pages--; |
3470 | } | 3468 | } |
3471 | return ret; | 3469 | return ret; |
3472 | } | 3470 | } |
3473 | 3471 | ||
3474 | void read_extent_buffer(struct extent_buffer *eb, void *dstv, | 3472 | void read_extent_buffer(struct extent_buffer *eb, void *dstv, |
3475 | unsigned long start, | 3473 | unsigned long start, |
3476 | unsigned long len) | 3474 | unsigned long len) |
3477 | { | 3475 | { |
3478 | size_t cur; | 3476 | size_t cur; |
3479 | size_t offset; | 3477 | size_t offset; |
3480 | struct page *page; | 3478 | struct page *page; |
3481 | char *kaddr; | 3479 | char *kaddr; |
3482 | char *dst = (char *)dstv; | 3480 | char *dst = (char *)dstv; |
3483 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); | 3481 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); |
3484 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; | 3482 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; |
3485 | 3483 | ||
3486 | WARN_ON(start > eb->len); | 3484 | WARN_ON(start > eb->len); |
3487 | WARN_ON(start + len > eb->start + eb->len); | 3485 | WARN_ON(start + len > eb->start + eb->len); |
3488 | 3486 | ||
3489 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); | 3487 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); |
3490 | 3488 | ||
3491 | while (len > 0) { | 3489 | while (len > 0) { |
3492 | page = extent_buffer_page(eb, i); | 3490 | page = extent_buffer_page(eb, i); |
3493 | 3491 | ||
3494 | cur = min(len, (PAGE_CACHE_SIZE - offset)); | 3492 | cur = min(len, (PAGE_CACHE_SIZE - offset)); |
3495 | kaddr = kmap_atomic(page, KM_USER1); | 3493 | kaddr = kmap_atomic(page, KM_USER1); |
3496 | memcpy(dst, kaddr + offset, cur); | 3494 | memcpy(dst, kaddr + offset, cur); |
3497 | kunmap_atomic(kaddr, KM_USER1); | 3495 | kunmap_atomic(kaddr, KM_USER1); |
3498 | 3496 | ||
3499 | dst += cur; | 3497 | dst += cur; |
3500 | len -= cur; | 3498 | len -= cur; |
3501 | offset = 0; | 3499 | offset = 0; |
3502 | i++; | 3500 | i++; |
3503 | } | 3501 | } |
3504 | } | 3502 | } |
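
Editor's note: read_extent_buffer() and the write/memset/memcmp/copy helpers that follow all share one chunked walk: handle up to the end of the current page, then reset the in-page offset to zero for every subsequent page. The idiom, distilled into a self-contained hypothetical helper with 4 KiB chunks assumed:

    #include <stddef.h>
    #include <string.h>

    #define CHUNK 4096UL

    /* copy len bytes from an array of CHUNK-sized buffers, starting
     * at byte 'offset' inside buffer 'i' */
    static void copy_chunked(char *dst, char *const bufs[],
                             size_t i, size_t offset, size_t len)
    {
        while (len > 0) {
            size_t cur = len < CHUNK - offset ? len : CHUNK - offset;

            memcpy(dst, bufs[i] + offset, cur);
            dst += cur;
            len -= cur;
            offset = 0;             /* later chunks start at byte 0 */
            i++;
        }
    }

    int main(void)
    {
        static char a[CHUNK], b[CHUNK];
        char *bufs[] = { a, b }, out[8];

        /* cross-chunk copy: last 2 bytes of a, first 2 bytes of b */
        copy_chunked(out, bufs, 0, CHUNK - 2, 4);
        return 0;
    }
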
3505 | 3503 | ||
3506 | int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, | 3504 | int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, |
3507 | unsigned long min_len, char **token, char **map, | 3505 | unsigned long min_len, char **token, char **map, |
3508 | unsigned long *map_start, | 3506 | unsigned long *map_start, |
3509 | unsigned long *map_len, int km) | 3507 | unsigned long *map_len, int km) |
3510 | { | 3508 | { |
3511 | size_t offset = start & (PAGE_CACHE_SIZE - 1); | 3509 | size_t offset = start & (PAGE_CACHE_SIZE - 1); |
3512 | char *kaddr; | 3510 | char *kaddr; |
3513 | struct page *p; | 3511 | struct page *p; |
3514 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); | 3512 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); |
3515 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; | 3513 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; |
3516 | unsigned long end_i = (start_offset + start + min_len - 1) >> | 3514 | unsigned long end_i = (start_offset + start + min_len - 1) >> |
3517 | PAGE_CACHE_SHIFT; | 3515 | PAGE_CACHE_SHIFT; |
3518 | 3516 | ||
3519 | if (i != end_i) | 3517 | if (i != end_i) |
3520 | return -EINVAL; | 3518 | return -EINVAL; |
3521 | 3519 | ||
3522 | if (i == 0) { | 3520 | if (i == 0) { |
3523 | offset = start_offset; | 3521 | offset = start_offset; |
3524 | *map_start = 0; | 3522 | *map_start = 0; |
3525 | } else { | 3523 | } else { |
3526 | offset = 0; | 3524 | offset = 0; |
3527 | *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; | 3525 | *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; |
3528 | } | 3526 | } |
3529 | 3527 | ||
3530 | if (start + min_len > eb->len) { | 3528 | if (start + min_len > eb->len) { |
3531 | printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " | 3529 | printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " |
3532 | "wanted %lu %lu\n", (unsigned long long)eb->start, | 3530 | "wanted %lu %lu\n", (unsigned long long)eb->start, |
3533 | eb->len, start, min_len); | 3531 | eb->len, start, min_len); |
3534 | WARN_ON(1); | 3532 | WARN_ON(1); |
3535 | return -EINVAL; | 3533 | return -EINVAL; |
3536 | } | 3534 | } |
3537 | 3535 | ||
3538 | p = extent_buffer_page(eb, i); | 3536 | p = extent_buffer_page(eb, i); |
3539 | kaddr = kmap_atomic(p, km); | 3537 | kaddr = kmap_atomic(p, km); |
3540 | *token = kaddr; | 3538 | *token = kaddr; |
3541 | *map = kaddr + offset; | 3539 | *map = kaddr + offset; |
3542 | *map_len = PAGE_CACHE_SIZE - offset; | 3540 | *map_len = PAGE_CACHE_SIZE - offset; |
3543 | return 0; | 3541 | return 0; |
3544 | } | 3542 | } |
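
Editor's note: map_private_extent_buffer() can only hand back one atomically-mapped page, so it rejects any range that straddles a page boundary: when the page index of the first byte differs from that of the last, the caller gets -EINVAL and must fall back to a byte-wise path. A quick check of the boundary arithmetic, assuming 4 KiB pages and a start_offset of 0:

    #include <assert.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        unsigned long start = 4092, min_len = 8;
        unsigned long i = start >> PAGE_SHIFT;
        unsigned long end_i = (start + min_len - 1) >> PAGE_SHIFT;

        assert(i == 0 && end_i == 1);   /* crosses the boundary: -EINVAL */
        return 0;
    }
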
3545 | 3543 | ||
3546 | int map_extent_buffer(struct extent_buffer *eb, unsigned long start, | 3544 | int map_extent_buffer(struct extent_buffer *eb, unsigned long start, |
3547 | unsigned long min_len, | 3545 | unsigned long min_len, |
3548 | char **token, char **map, | 3546 | char **token, char **map, |
3549 | unsigned long *map_start, | 3547 | unsigned long *map_start, |
3550 | unsigned long *map_len, int km) | 3548 | unsigned long *map_len, int km) |
3551 | { | 3549 | { |
3552 | int err; | 3550 | int err; |
3553 | int save = 0; | 3551 | int save = 0; |
3554 | if (eb->map_token) { | 3552 | if (eb->map_token) { |
3555 | unmap_extent_buffer(eb, eb->map_token, km); | 3553 | unmap_extent_buffer(eb, eb->map_token, km); |
3556 | eb->map_token = NULL; | 3554 | eb->map_token = NULL; |
3557 | save = 1; | 3555 | save = 1; |
3558 | } | 3556 | } |
3559 | err = map_private_extent_buffer(eb, start, min_len, token, map, | 3557 | err = map_private_extent_buffer(eb, start, min_len, token, map, |
3560 | map_start, map_len, km); | 3558 | map_start, map_len, km); |
3561 | if (!err && save) { | 3559 | if (!err && save) { |
3562 | eb->map_token = *token; | 3560 | eb->map_token = *token; |
3563 | eb->kaddr = *map; | 3561 | eb->kaddr = *map; |
3564 | eb->map_start = *map_start; | 3562 | eb->map_start = *map_start; |
3565 | eb->map_len = *map_len; | 3563 | eb->map_len = *map_len; |
3566 | } | 3564 | } |
3567 | return err; | 3565 | return err; |
3568 | } | 3566 | } |
3569 | 3567 | ||
3570 | void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) | 3568 | void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) |
3571 | { | 3569 | { |
3572 | kunmap_atomic(token, km); | 3570 | kunmap_atomic(token, km); |
3573 | } | 3571 | } |
3574 | 3572 | ||
3575 | int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, | 3573 | int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, |
3576 | unsigned long start, | 3574 | unsigned long start, |
3577 | unsigned long len) | 3575 | unsigned long len) |
3578 | { | 3576 | { |
3579 | size_t cur; | 3577 | size_t cur; |
3580 | size_t offset; | 3578 | size_t offset; |
3581 | struct page *page; | 3579 | struct page *page; |
3582 | char *kaddr; | 3580 | char *kaddr; |
3583 | char *ptr = (char *)ptrv; | 3581 | char *ptr = (char *)ptrv; |
3584 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); | 3582 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); |
3585 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; | 3583 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; |
3586 | int ret = 0; | 3584 | int ret = 0; |
3587 | 3585 | ||
3588 | WARN_ON(start > eb->len); | 3586 | WARN_ON(start > eb->len); |
3589 | WARN_ON(start + len > eb->start + eb->len); | 3587 | WARN_ON(start + len > eb->start + eb->len); |
3590 | 3588 | ||
3591 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); | 3589 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); |
3592 | 3590 | ||
3593 | while (len > 0) { | 3591 | while (len > 0) { |
3594 | page = extent_buffer_page(eb, i); | 3592 | page = extent_buffer_page(eb, i); |
3595 | 3593 | ||
3596 | cur = min(len, (PAGE_CACHE_SIZE - offset)); | 3594 | cur = min(len, (PAGE_CACHE_SIZE - offset)); |
3597 | 3595 | ||
3598 | kaddr = kmap_atomic(page, KM_USER0); | 3596 | kaddr = kmap_atomic(page, KM_USER0); |
3599 | ret = memcmp(ptr, kaddr + offset, cur); | 3597 | ret = memcmp(ptr, kaddr + offset, cur); |
3600 | kunmap_atomic(kaddr, KM_USER0); | 3598 | kunmap_atomic(kaddr, KM_USER0); |
3601 | if (ret) | 3599 | if (ret) |
3602 | break; | 3600 | break; |
3603 | 3601 | ||
3604 | ptr += cur; | 3602 | ptr += cur; |
3605 | len -= cur; | 3603 | len -= cur; |
3606 | offset = 0; | 3604 | offset = 0; |
3607 | i++; | 3605 | i++; |
3608 | } | 3606 | } |
3609 | return ret; | 3607 | return ret; |
3610 | } | 3608 | } |
3611 | 3609 | ||
3612 | void write_extent_buffer(struct extent_buffer *eb, const void *srcv, | 3610 | void write_extent_buffer(struct extent_buffer *eb, const void *srcv, |
3613 | unsigned long start, unsigned long len) | 3611 | unsigned long start, unsigned long len) |
3614 | { | 3612 | { |
3615 | size_t cur; | 3613 | size_t cur; |
3616 | size_t offset; | 3614 | size_t offset; |
3617 | struct page *page; | 3615 | struct page *page; |
3618 | char *kaddr; | 3616 | char *kaddr; |
3619 | char *src = (char *)srcv; | 3617 | char *src = (char *)srcv; |
3620 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); | 3618 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); |
3621 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; | 3619 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; |
3622 | 3620 | ||
3623 | WARN_ON(start > eb->len); | 3621 | WARN_ON(start > eb->len); |
3624 | WARN_ON(start + len > eb->start + eb->len); | 3622 | WARN_ON(start + len > eb->start + eb->len); |
3625 | 3623 | ||
3626 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); | 3624 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); |
3627 | 3625 | ||
3628 | while (len > 0) { | 3626 | while (len > 0) { |
3629 | page = extent_buffer_page(eb, i); | 3627 | page = extent_buffer_page(eb, i); |
3630 | WARN_ON(!PageUptodate(page)); | 3628 | WARN_ON(!PageUptodate(page)); |
3631 | 3629 | ||
3632 | cur = min(len, PAGE_CACHE_SIZE - offset); | 3630 | cur = min(len, PAGE_CACHE_SIZE - offset); |
3633 | kaddr = kmap_atomic(page, KM_USER1); | 3631 | kaddr = kmap_atomic(page, KM_USER1); |
3634 | memcpy(kaddr + offset, src, cur); | 3632 | memcpy(kaddr + offset, src, cur); |
3635 | kunmap_atomic(kaddr, KM_USER1); | 3633 | kunmap_atomic(kaddr, KM_USER1); |
3636 | 3634 | ||
3637 | src += cur; | 3635 | src += cur; |
3638 | len -= cur; | 3636 | len -= cur; |
3639 | offset = 0; | 3637 | offset = 0; |
3640 | i++; | 3638 | i++; |
3641 | } | 3639 | } |
3642 | } | 3640 | } |
3643 | 3641 | ||
3644 | void memset_extent_buffer(struct extent_buffer *eb, char c, | 3642 | void memset_extent_buffer(struct extent_buffer *eb, char c, |
3645 | unsigned long start, unsigned long len) | 3643 | unsigned long start, unsigned long len) |
3646 | { | 3644 | { |
3647 | size_t cur; | 3645 | size_t cur; |
3648 | size_t offset; | 3646 | size_t offset; |
3649 | struct page *page; | 3647 | struct page *page; |
3650 | char *kaddr; | 3648 | char *kaddr; |
3651 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); | 3649 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); |
3652 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; | 3650 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; |
3653 | 3651 | ||
3654 | WARN_ON(start > eb->len); | 3652 | WARN_ON(start > eb->len); |
3655 | WARN_ON(start + len > eb->start + eb->len); | 3653 | WARN_ON(start + len > eb->start + eb->len); |
3656 | 3654 | ||
3657 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); | 3655 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); |
3658 | 3656 | ||
3659 | while (len > 0) { | 3657 | while (len > 0) { |
3660 | page = extent_buffer_page(eb, i); | 3658 | page = extent_buffer_page(eb, i); |
3661 | WARN_ON(!PageUptodate(page)); | 3659 | WARN_ON(!PageUptodate(page)); |
3662 | 3660 | ||
3663 | cur = min(len, PAGE_CACHE_SIZE - offset); | 3661 | cur = min(len, PAGE_CACHE_SIZE - offset); |
3664 | kaddr = kmap_atomic(page, KM_USER0); | 3662 | kaddr = kmap_atomic(page, KM_USER0); |
3665 | memset(kaddr + offset, c, cur); | 3663 | memset(kaddr + offset, c, cur); |
3666 | kunmap_atomic(kaddr, KM_USER0); | 3664 | kunmap_atomic(kaddr, KM_USER0); |
3667 | 3665 | ||
3668 | len -= cur; | 3666 | len -= cur; |
3669 | offset = 0; | 3667 | offset = 0; |
3670 | i++; | 3668 | i++; |
3671 | } | 3669 | } |
3672 | } | 3670 | } |
3673 | 3671 | ||
3674 | void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, | 3672 | void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, |
3675 | unsigned long dst_offset, unsigned long src_offset, | 3673 | unsigned long dst_offset, unsigned long src_offset, |
3676 | unsigned long len) | 3674 | unsigned long len) |
3677 | { | 3675 | { |
3678 | u64 dst_len = dst->len; | 3676 | u64 dst_len = dst->len; |
3679 | size_t cur; | 3677 | size_t cur; |
3680 | size_t offset; | 3678 | size_t offset; |
3681 | struct page *page; | 3679 | struct page *page; |
3682 | char *kaddr; | 3680 | char *kaddr; |
3683 | size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); | 3681 | size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); |
3684 | unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; | 3682 | unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; |
3685 | 3683 | ||
3686 | WARN_ON(src->len != dst_len); | 3684 | WARN_ON(src->len != dst_len); |
3687 | 3685 | ||
3688 | offset = (start_offset + dst_offset) & | 3686 | offset = (start_offset + dst_offset) & |
3689 | ((unsigned long)PAGE_CACHE_SIZE - 1); | 3687 | ((unsigned long)PAGE_CACHE_SIZE - 1); |
3690 | 3688 | ||
3691 | while (len > 0) { | 3689 | while (len > 0) { |
3692 | page = extent_buffer_page(dst, i); | 3690 | page = extent_buffer_page(dst, i); |
3693 | WARN_ON(!PageUptodate(page)); | 3691 | WARN_ON(!PageUptodate(page)); |
3694 | 3692 | ||
3695 | cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); | 3693 | cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); |
3696 | 3694 | ||
3697 | kaddr = kmap_atomic(page, KM_USER0); | 3695 | kaddr = kmap_atomic(page, KM_USER0); |
3698 | read_extent_buffer(src, kaddr + offset, src_offset, cur); | 3696 | read_extent_buffer(src, kaddr + offset, src_offset, cur); |
3699 | kunmap_atomic(kaddr, KM_USER0); | 3697 | kunmap_atomic(kaddr, KM_USER0); |
3700 | 3698 | ||
3701 | src_offset += cur; | 3699 | src_offset += cur; |
3702 | len -= cur; | 3700 | len -= cur; |
3703 | offset = 0; | 3701 | offset = 0; |
3704 | i++; | 3702 | i++; |
3705 | } | 3703 | } |
3706 | } | 3704 | } |
3707 | 3705 | ||
3708 | static void move_pages(struct page *dst_page, struct page *src_page, | 3706 | static void move_pages(struct page *dst_page, struct page *src_page, |
3709 | unsigned long dst_off, unsigned long src_off, | 3707 | unsigned long dst_off, unsigned long src_off, |
3710 | unsigned long len) | 3708 | unsigned long len) |
3711 | { | 3709 | { |
3712 | char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); | 3710 | char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); |
3713 | if (dst_page == src_page) { | 3711 | if (dst_page == src_page) { |
3714 | memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); | 3712 | memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); |
3715 | } else { | 3713 | } else { |
3716 | char *src_kaddr = kmap_atomic(src_page, KM_USER1); | 3714 | char *src_kaddr = kmap_atomic(src_page, KM_USER1); |
3717 | char *p = dst_kaddr + dst_off + len; | 3715 | char *p = dst_kaddr + dst_off + len; |
3718 | char *s = src_kaddr + src_off + len; | 3716 | char *s = src_kaddr + src_off + len; |
3719 | 3717 | ||
3720 | while (len--) | 3718 | while (len--) |
3721 | *--p = *--s; | 3719 | *--p = *--s; |
3722 | 3720 | ||
3723 | kunmap_atomic(src_kaddr, KM_USER1); | 3721 | kunmap_atomic(src_kaddr, KM_USER1); |
3724 | } | 3722 | } |
3725 | kunmap_atomic(dst_kaddr, KM_USER0); | 3723 | kunmap_atomic(dst_kaddr, KM_USER0); |
3726 | } | 3724 | } |
3727 | 3725 | ||
3728 | static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) | 3726 | static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) |
3729 | { | 3727 | { |
3730 | unsigned long distance = (src > dst) ? src - dst : dst - src; | 3728 | unsigned long distance = (src > dst) ? src - dst : dst - src; |
3731 | return distance < len; | 3729 | return distance < len; |
3732 | } | 3730 | } |
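
Editor's note: areas_overlap() declares two equal-length ranges overlapping exactly when their starts are closer together than len. For src = 100, dst = 150, len = 60, the distance is 50 < 60, so a forward memcpy() would read bytes it has already overwritten; copy_pages() below BUG()s when asked to do such an overlapping same-page copy, while move_pages() handles overlap via memmove() or its backwards byte walk. The predicate in isolation:

    #include <assert.h>
    #include <stdbool.h>

    static bool areas_overlap(unsigned long src, unsigned long dst,
                              unsigned long len)
    {
        unsigned long distance = src > dst ? src - dst : dst - src;
        return distance < len;
    }

    int main(void)
    {
        assert(areas_overlap(100, 150, 60));    /* 50 < 60: overlap   */
        assert(!areas_overlap(100, 160, 60));   /* 60 >= 60: disjoint */
        return 0;
    }
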
3733 | 3731 | ||
3734 | static void copy_pages(struct page *dst_page, struct page *src_page, | 3732 | static void copy_pages(struct page *dst_page, struct page *src_page, |
3735 | unsigned long dst_off, unsigned long src_off, | 3733 | unsigned long dst_off, unsigned long src_off, |
3736 | unsigned long len) | 3734 | unsigned long len) |
3737 | { | 3735 | { |
3738 | char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); | 3736 | char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); |
3739 | char *src_kaddr; | 3737 | char *src_kaddr; |
3740 | 3738 | ||
3741 | if (dst_page != src_page) { | 3739 | if (dst_page != src_page) { |
3742 | src_kaddr = kmap_atomic(src_page, KM_USER1); | 3740 | src_kaddr = kmap_atomic(src_page, KM_USER1); |
3743 | } else { | 3741 | } else { |
3744 | src_kaddr = dst_kaddr; | 3742 | src_kaddr = dst_kaddr; |
3745 | BUG_ON(areas_overlap(src_off, dst_off, len)); | 3743 | BUG_ON(areas_overlap(src_off, dst_off, len)); |
3746 | } | 3744 | } |
3747 | 3745 | ||
3748 | memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); | 3746 | memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); |
3749 | kunmap_atomic(dst_kaddr, KM_USER0); | 3747 | kunmap_atomic(dst_kaddr, KM_USER0); |
3750 | if (dst_page != src_page) | 3748 | if (dst_page != src_page) |
3751 | kunmap_atomic(src_kaddr, KM_USER1); | 3749 | kunmap_atomic(src_kaddr, KM_USER1); |
3752 | } | 3750 | } |
3753 | 3751 | ||
3754 | void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, | 3752 | void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, |
3755 | unsigned long src_offset, unsigned long len) | 3753 | unsigned long src_offset, unsigned long len) |
3756 | { | 3754 | { |
3757 | size_t cur; | 3755 | size_t cur; |
3758 | size_t dst_off_in_page; | 3756 | size_t dst_off_in_page; |
3759 | size_t src_off_in_page; | 3757 | size_t src_off_in_page; |
3760 | size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); | 3758 | size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); |
3761 | unsigned long dst_i; | 3759 | unsigned long dst_i; |
3762 | unsigned long src_i; | 3760 | unsigned long src_i; |
3763 | 3761 | ||
3764 | if (src_offset + len > dst->len) { | 3762 | if (src_offset + len > dst->len) { |
3765 | printk(KERN_ERR "btrfs memcpy bogus src_offset %lu move " | 3763 | printk(KERN_ERR "btrfs memcpy bogus src_offset %lu move " |
3766 | "len %lu dst len %lu\n", src_offset, len, dst->len); | 3764 | "len %lu dst len %lu\n", src_offset, len, dst->len); |
3767 | BUG_ON(1); | 3765 | BUG_ON(1); |
3768 | } | 3766 | } |
3769 | if (dst_offset + len > dst->len) { | 3767 | if (dst_offset + len > dst->len) { |
3770 | printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu move " | 3768 | printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu move " |
3771 | "len %lu dst len %lu\n", dst_offset, len, dst->len); | 3769 | "len %lu dst len %lu\n", dst_offset, len, dst->len); |
3772 | BUG_ON(1); | 3770 | BUG_ON(1); |
3773 | } | 3771 | } |
3774 | 3772 | ||
3775 | while (len > 0) { | 3773 | while (len > 0) { |
3776 | dst_off_in_page = (start_offset + dst_offset) & | 3774 | dst_off_in_page = (start_offset + dst_offset) & |
3777 | ((unsigned long)PAGE_CACHE_SIZE - 1); | 3775 | ((unsigned long)PAGE_CACHE_SIZE - 1); |
3778 | src_off_in_page = (start_offset + src_offset) & | 3776 | src_off_in_page = (start_offset + src_offset) & |
3779 | ((unsigned long)PAGE_CACHE_SIZE - 1); | 3777 | ((unsigned long)PAGE_CACHE_SIZE - 1); |
3780 | 3778 | ||
3781 | dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; | 3779 | dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; |
3782 | src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; | 3780 | src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; |
3783 | 3781 | ||
3784 | cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - | 3782 | cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - |
3785 | src_off_in_page)); | 3783 | src_off_in_page)); |
3786 | cur = min_t(unsigned long, cur, | 3784 | cur = min_t(unsigned long, cur, |
3787 | (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); | 3785 | (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); |
3788 | 3786 | ||
3789 | copy_pages(extent_buffer_page(dst, dst_i), | 3787 | copy_pages(extent_buffer_page(dst, dst_i), |
3790 | extent_buffer_page(dst, src_i), | 3788 | extent_buffer_page(dst, src_i), |
3791 | dst_off_in_page, src_off_in_page, cur); | 3789 | dst_off_in_page, src_off_in_page, cur); |
3792 | 3790 | ||
3793 | src_offset += cur; | 3791 | src_offset += cur; |
3794 | dst_offset += cur; | 3792 | dst_offset += cur; |
3795 | len -= cur; | 3793 | len -= cur; |
3796 | } | 3794 | } |
3797 | } | 3795 | } |
3798 | 3796 | ||
3799 | void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, | 3797 | void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, |
3800 | unsigned long src_offset, unsigned long len) | 3798 | unsigned long src_offset, unsigned long len) |
3801 | { | 3799 | { |
3802 | size_t cur; | 3800 | size_t cur; |
3803 | size_t dst_off_in_page; | 3801 | size_t dst_off_in_page; |
3804 | size_t src_off_in_page; | 3802 | size_t src_off_in_page; |
3805 | unsigned long dst_end = dst_offset + len - 1; | 3803 | unsigned long dst_end = dst_offset + len - 1; |
3806 | unsigned long src_end = src_offset + len - 1; | 3804 | unsigned long src_end = src_offset + len - 1; |
3807 | size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); | 3805 | size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); |
3808 | unsigned long dst_i; | 3806 | unsigned long dst_i; |
3809 | unsigned long src_i; | 3807 | unsigned long src_i; |
3810 | 3808 | ||
3811 | if (src_offset + len > dst->len) { | 3809 | if (src_offset + len > dst->len) { |
3812 | printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " | 3810 | printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " |
3813 | "len %lu len %lu\n", src_offset, len, dst->len); | 3811 | "len %lu len %lu\n", src_offset, len, dst->len); |
3814 | BUG_ON(1); | 3812 | BUG_ON(1); |
3815 | } | 3813 | } |
3816 | if (dst_offset + len > dst->len) { | 3814 | if (dst_offset + len > dst->len) { |
3817 | printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " | 3815 | printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " |
3818 | "len %lu len %lu\n", dst_offset, len, dst->len); | 3816 | "len %lu len %lu\n", dst_offset, len, dst->len); |
3819 | BUG_ON(1); | 3817 | BUG_ON(1); |
3820 | } | 3818 | } |
3821 | if (!areas_overlap(src_offset, dst_offset, len)) { | 3819 | if (!areas_overlap(src_offset, dst_offset, len)) { |
3822 | memcpy_extent_buffer(dst, dst_offset, src_offset, len); | 3820 | memcpy_extent_buffer(dst, dst_offset, src_offset, len); |
3823 | return; | 3821 | return; |
3824 | } | 3822 | } |
3825 | while (len > 0) { | 3823 | while (len > 0) { |
3826 | dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; | 3824 | dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; |
3827 | src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; | 3825 | src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; |
3828 | 3826 | ||
3829 | dst_off_in_page = (start_offset + dst_end) & | 3827 | dst_off_in_page = (start_offset + dst_end) & |
3830 | ((unsigned long)PAGE_CACHE_SIZE - 1); | 3828 | ((unsigned long)PAGE_CACHE_SIZE - 1); |
3831 | src_off_in_page = (start_offset + src_end) & | 3829 | src_off_in_page = (start_offset + src_end) & |
3832 | ((unsigned long)PAGE_CACHE_SIZE - 1); | 3830 | ((unsigned long)PAGE_CACHE_SIZE - 1); |
3833 | 3831 | ||
3834 | cur = min_t(unsigned long, len, src_off_in_page + 1); | 3832 | cur = min_t(unsigned long, len, src_off_in_page + 1); |
3835 | cur = min(cur, dst_off_in_page + 1); | 3833 | cur = min(cur, dst_off_in_page + 1); |
3836 | move_pages(extent_buffer_page(dst, dst_i), | 3834 | move_pages(extent_buffer_page(dst, dst_i), |
3837 | extent_buffer_page(dst, src_i), | 3835 | extent_buffer_page(dst, src_i), |
3838 | dst_off_in_page - cur + 1, | 3836 | dst_off_in_page - cur + 1, |
3839 | src_off_in_page - cur + 1, cur); | 3837 | src_off_in_page - cur + 1, cur); |
3840 | 3838 | ||
3841 | dst_end -= cur; | 3839 | dst_end -= cur; |
3842 | src_end -= cur; | 3840 | src_end -= cur; |
3843 | len -= cur; | 3841 | len -= cur; |
3844 | } | 3842 | } |
3845 | } | 3843 | } |
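The loop above walks both ranges from their tail ends, clamping each step so it never crosses a page boundary on either side; because chunks are processed back-to-front (and move_pages() copies bytes backwards within a page), a destination that overlaps the source from the right stays correct. A rough userspace model of the same chunking on a flat buffer, with a made-up 16-byte "page" (FAKE_PAGE_SIZE and model_memmove are invented names for this sketch, not kernel code):

    #include <assert.h>
    #include <string.h>

    #define FAKE_PAGE_SIZE 16UL    /* stand-in for PAGE_CACHE_SIZE */

    /* Copy backwards from the ends of both ranges, never letting a
     * single step cross a (fake) page boundary on either side. */
    static void model_memmove(char *buf, unsigned long dst_offset,
                              unsigned long src_offset, unsigned long len)
    {
            unsigned long dst_end = dst_offset + len - 1;
            unsigned long src_end = src_offset + len - 1;

            while (len > 0) {
                    unsigned long dst_off = dst_end & (FAKE_PAGE_SIZE - 1);
                    unsigned long src_off = src_end & (FAKE_PAGE_SIZE - 1);
                    unsigned long cur = len < src_off + 1 ? len : src_off + 1;

                    if (cur > dst_off + 1)
                            cur = dst_off + 1;
                    memmove(buf + dst_end - cur + 1,
                            buf + src_end - cur + 1, cur);
                    dst_end -= cur;
                    src_end -= cur;
                    len -= cur;
            }
    }

    int main(void)
    {
            char buf[64] = "abcdefghijklmnopqrstuvwxyz";

            model_memmove(buf, 4, 0, 20);  /* overlapping shift right */
            assert(memcmp(buf + 4, "abcdefghijklmnopqrst", 20) == 0);
            return 0;
    }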
3846 | 3844 | ||
3847 | static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) | 3845 | static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) |
3848 | { | 3846 | { |
3849 | struct extent_buffer *eb = | 3847 | struct extent_buffer *eb = |
3850 | container_of(head, struct extent_buffer, rcu_head); | 3848 | container_of(head, struct extent_buffer, rcu_head); |
3851 | 3849 | ||
3852 | btrfs_release_extent_buffer(eb); | 3850 | btrfs_release_extent_buffer(eb); |
3853 | } | 3851 | } |
3854 | 3852 | ||
3855 | int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) | 3853 | int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) |
3856 | { | 3854 | { |
3857 | u64 start = page_offset(page); | 3855 | u64 start = page_offset(page); |
3858 | struct extent_buffer *eb; | 3856 | struct extent_buffer *eb; |
3859 | int ret = 1; | 3857 | int ret = 1; |
3860 | 3858 | ||
3861 | spin_lock(&tree->buffer_lock); | 3859 | spin_lock(&tree->buffer_lock); |
3862 | eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); | 3860 | eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); |
3863 | if (!eb) { | 3861 | if (!eb) { |
3864 | spin_unlock(&tree->buffer_lock); | 3862 | spin_unlock(&tree->buffer_lock); |
3865 | return ret; | 3863 | return ret; |
3866 | } | 3864 | } |
3867 | 3865 | ||
3868 | if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { | 3866 | if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { |
3869 | ret = 0; | 3867 | ret = 0; |
3870 | goto out; | 3868 | goto out; |
3871 | } | 3869 | } |
3872 | 3870 | ||
3873 | /* | 3871 | /* |
3874 | * Set @eb->refs to 0 if it is still 1, and then release the @eb. | 3872 | * Set @eb->refs to 0 if it is still 1, and then release the @eb. |
3875 | * Otherwise someone else still holds a reference, so back off. | 3873 | * Otherwise someone else still holds a reference, so back off. |
3876 | */ | 3874 | */ |
3877 | if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { | 3875 | if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { |
3878 | ret = 0; | 3876 | ret = 0; |
3879 | goto out; | 3877 | goto out; |
3880 | } | 3878 | } |
3881 | 3879 | ||
3882 | radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); | 3880 | radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); |
3883 | out: | 3881 | out: |
3884 | spin_unlock(&tree->buffer_lock); | 3882 | spin_unlock(&tree->buffer_lock); |
3885 | 3883 | ||
3886 | /* at this point we can safely release the extent buffer */ | 3884 | /* at this point we can safely release the extent buffer */ |
3887 | if (atomic_read(&eb->refs) == 0) | 3885 | if (atomic_read(&eb->refs) == 0) |
3888 | call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); | 3886 | call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); |
3889 | return ret; | 3887 | return ret; |
3890 | } | 3888 | } |
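The atomic_cmpxchg() is the heart of the release protocol: the buffer may only be freed by whoever atomically moves refs from 1 to 0, so a concurrent lookup that bumped the count forces this path to back off. A tiny C11-atomics model of that handoff (standalone sketch with invented names, not the kernel API):

    #include <assert.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    /* The release path may free the object only if it can atomically
     * move refs from 1 to 0; any other value means another holder. */
    static bool try_release(atomic_int *refs)
    {
            int expected = 1;
            return atomic_compare_exchange_strong(refs, &expected, 0);
    }

    int main(void)
    {
            atomic_int refs = 1;

            assert(try_release(&refs));   /* sole holder: 1 -> 0, may free */
            atomic_store(&refs, 2);
            assert(!try_release(&refs));  /* still referenced: back off    */
            return 0;
    }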
3891 | 3889 |
fs/ext4/inode.c
1 | /* | 1 | /* |
2 | * linux/fs/ext4/inode.c | 2 | * linux/fs/ext4/inode.c |
3 | * | 3 | * |
4 | * Copyright (C) 1992, 1993, 1994, 1995 | 4 | * Copyright (C) 1992, 1993, 1994, 1995 |
5 | * Remy Card (card@masi.ibp.fr) | 5 | * Remy Card (card@masi.ibp.fr) |
6 | * Laboratoire MASI - Institut Blaise Pascal | 6 | * Laboratoire MASI - Institut Blaise Pascal |
7 | * Universite Pierre et Marie Curie (Paris VI) | 7 | * Universite Pierre et Marie Curie (Paris VI) |
8 | * | 8 | * |
9 | * from | 9 | * from |
10 | * | 10 | * |
11 | * linux/fs/minix/inode.c | 11 | * linux/fs/minix/inode.c |
12 | * | 12 | * |
13 | * Copyright (C) 1991, 1992 Linus Torvalds | 13 | * Copyright (C) 1991, 1992 Linus Torvalds |
14 | * | 14 | * |
15 | * Goal-directed block allocation by Stephen Tweedie | 15 | * Goal-directed block allocation by Stephen Tweedie |
16 | * (sct@redhat.com), 1993, 1998 | 16 | * (sct@redhat.com), 1993, 1998 |
17 | * Big-endian to little-endian byte-swapping/bitmaps by | 17 | * Big-endian to little-endian byte-swapping/bitmaps by |
18 | * David S. Miller (davem@caip.rutgers.edu), 1995 | 18 | * David S. Miller (davem@caip.rutgers.edu), 1995 |
19 | * 64-bit file support on 64-bit platforms by Jakub Jelinek | 19 | * 64-bit file support on 64-bit platforms by Jakub Jelinek |
20 | * (jj@sunsite.ms.mff.cuni.cz) | 20 | * (jj@sunsite.ms.mff.cuni.cz) |
21 | * | 21 | * |
22 | * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 | 22 | * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/module.h> | 25 | #include <linux/module.h> |
26 | #include <linux/fs.h> | 26 | #include <linux/fs.h> |
27 | #include <linux/time.h> | 27 | #include <linux/time.h> |
28 | #include <linux/jbd2.h> | 28 | #include <linux/jbd2.h> |
29 | #include <linux/highuid.h> | 29 | #include <linux/highuid.h> |
30 | #include <linux/pagemap.h> | 30 | #include <linux/pagemap.h> |
31 | #include <linux/quotaops.h> | 31 | #include <linux/quotaops.h> |
32 | #include <linux/string.h> | 32 | #include <linux/string.h> |
33 | #include <linux/buffer_head.h> | 33 | #include <linux/buffer_head.h> |
34 | #include <linux/writeback.h> | 34 | #include <linux/writeback.h> |
35 | #include <linux/pagevec.h> | 35 | #include <linux/pagevec.h> |
36 | #include <linux/mpage.h> | 36 | #include <linux/mpage.h> |
37 | #include <linux/namei.h> | 37 | #include <linux/namei.h> |
38 | #include <linux/uio.h> | 38 | #include <linux/uio.h> |
39 | #include <linux/bio.h> | 39 | #include <linux/bio.h> |
40 | #include <linux/workqueue.h> | 40 | #include <linux/workqueue.h> |
41 | #include <linux/kernel.h> | 41 | #include <linux/kernel.h> |
42 | #include <linux/printk.h> | 42 | #include <linux/printk.h> |
43 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
44 | #include <linux/ratelimit.h> | 44 | #include <linux/ratelimit.h> |
45 | 45 | ||
46 | #include "ext4_jbd2.h" | 46 | #include "ext4_jbd2.h" |
47 | #include "xattr.h" | 47 | #include "xattr.h" |
48 | #include "acl.h" | 48 | #include "acl.h" |
49 | #include "ext4_extents.h" | 49 | #include "ext4_extents.h" |
50 | 50 | ||
51 | #include <trace/events/ext4.h> | 51 | #include <trace/events/ext4.h> |
52 | 52 | ||
53 | #define MPAGE_DA_EXTENT_TAIL 0x01 | 53 | #define MPAGE_DA_EXTENT_TAIL 0x01 |
54 | 54 | ||
55 | static inline int ext4_begin_ordered_truncate(struct inode *inode, | 55 | static inline int ext4_begin_ordered_truncate(struct inode *inode, |
56 | loff_t new_size) | 56 | loff_t new_size) |
57 | { | 57 | { |
58 | trace_ext4_begin_ordered_truncate(inode, new_size); | 58 | trace_ext4_begin_ordered_truncate(inode, new_size); |
59 | /* | 59 | /* |
60 | * If jinode is zero, then we never opened the file for | 60 | * If jinode is zero, then we never opened the file for |
61 | * writing, so there's no need to call | 61 | * writing, so there's no need to call |
62 | * jbd2_journal_begin_ordered_truncate() since there's no | 62 | * jbd2_journal_begin_ordered_truncate() since there's no |
63 | * outstanding writes we need to flush. | 63 | * outstanding writes we need to flush. |
64 | */ | 64 | */ |
65 | if (!EXT4_I(inode)->jinode) | 65 | if (!EXT4_I(inode)->jinode) |
66 | return 0; | 66 | return 0; |
67 | return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), | 67 | return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), |
68 | EXT4_I(inode)->jinode, | 68 | EXT4_I(inode)->jinode, |
69 | new_size); | 69 | new_size); |
70 | } | 70 | } |
71 | 71 | ||
72 | static void ext4_invalidatepage(struct page *page, unsigned long offset); | 72 | static void ext4_invalidatepage(struct page *page, unsigned long offset); |
73 | static int noalloc_get_block_write(struct inode *inode, sector_t iblock, | 73 | static int noalloc_get_block_write(struct inode *inode, sector_t iblock, |
74 | struct buffer_head *bh_result, int create); | 74 | struct buffer_head *bh_result, int create); |
75 | static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); | 75 | static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); |
76 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); | 76 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); |
77 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); | 77 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); |
78 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); | 78 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); |
79 | 79 | ||
80 | /* | 80 | /* |
81 | * Test whether an inode is a fast symlink. | 81 | * Test whether an inode is a fast symlink. |
82 | */ | 82 | */ |
83 | static int ext4_inode_is_fast_symlink(struct inode *inode) | 83 | static int ext4_inode_is_fast_symlink(struct inode *inode) |
84 | { | 84 | { |
85 | int ea_blocks = EXT4_I(inode)->i_file_acl ? | 85 | int ea_blocks = EXT4_I(inode)->i_file_acl ? |
86 | (inode->i_sb->s_blocksize >> 9) : 0; | 86 | (inode->i_sb->s_blocksize >> 9) : 0; |
87 | 87 | ||
88 | return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); | 88 | return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); |
89 | } | 89 | } |
90 | 90 | ||
91 | /* | 91 | /* |
92 | * Work out how many blocks we need to proceed with the next chunk of a | 92 | * Work out how many blocks we need to proceed with the next chunk of a |
93 | * truncate transaction. | 93 | * truncate transaction. |
94 | */ | 94 | */ |
95 | static unsigned long blocks_for_truncate(struct inode *inode) | 95 | static unsigned long blocks_for_truncate(struct inode *inode) |
96 | { | 96 | { |
97 | ext4_lblk_t needed; | 97 | ext4_lblk_t needed; |
98 | 98 | ||
99 | needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); | 99 | needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); |
100 | 100 | ||
101 | /* Give ourselves just enough room to cope with inodes in which | 101 | /* Give ourselves just enough room to cope with inodes in which |
102 | * i_blocks is corrupt: we've seen disk corruptions in the past | 102 | * i_blocks is corrupt: we've seen disk corruptions in the past |
103 | * which resulted in random data in an inode which looked enough | 103 | * which resulted in random data in an inode which looked enough |
104 | * like a regular file for ext4 to try to delete it. Things | 104 | * like a regular file for ext4 to try to delete it. Things |
105 | * will go a bit crazy if that happens, but at least we should | 105 | * will go a bit crazy if that happens, but at least we should |
106 | * try not to panic the whole kernel. */ | 106 | * try not to panic the whole kernel. */ |
107 | if (needed < 2) | 107 | if (needed < 2) |
108 | needed = 2; | 108 | needed = 2; |
109 | 109 | ||
110 | /* But we need to bound the transaction so we don't overflow the | 110 | /* But we need to bound the transaction so we don't overflow the |
111 | * journal. */ | 111 | * journal. */ |
112 | if (needed > EXT4_MAX_TRANS_DATA) | 112 | if (needed > EXT4_MAX_TRANS_DATA) |
113 | needed = EXT4_MAX_TRANS_DATA; | 113 | needed = EXT4_MAX_TRANS_DATA; |
114 | 114 | ||
115 | return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; | 115 | return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; |
116 | } | 116 | } |
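In other words: convert i_blocks (which counts 512-byte sectors) to filesystem blocks, floor the result at 2 to survive corrupt inodes, cap it so the transaction fits in the journal, then add the per-transaction base cost. A standalone model with stand-in constants (the FAKE_* values are made up for this sketch; the kernel uses EXT4_MAX_TRANS_DATA and EXT4_DATA_TRANS_BLOCKS(sb)):

    #include <assert.h>

    #define FAKE_MAX_TRANS_DATA    64UL  /* journal-imposed cap       */
    #define FAKE_DATA_TRANS_BLOCKS  8UL  /* per-transaction base cost */

    static unsigned long blocks_for_truncate(unsigned long i_blocks,
                                             unsigned int blocksize_bits)
    {
            /* i_blocks counts 512-byte sectors; convert to fs blocks. */
            unsigned long needed = i_blocks >> (blocksize_bits - 9);

            if (needed < 2)                    /* cope with corrupt i_blocks */
                    needed = 2;
            if (needed > FAKE_MAX_TRANS_DATA)  /* don't overflow the journal */
                    needed = FAKE_MAX_TRANS_DATA;
            return FAKE_DATA_TRANS_BLOCKS + needed;
    }

    int main(void)
    {
            assert(blocks_for_truncate(0, 12) == 8 + 2);
            assert(blocks_for_truncate(1UL << 20, 12) == 8 + 64);
            return 0;
    }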
117 | 117 | ||
118 | /* | 118 | /* |
119 | * Truncate transactions can be complex and absolutely huge. So we need to | 119 | * Truncate transactions can be complex and absolutely huge. So we need to |
120 | * be able to restart the transaction at a convenient checkpoint to make | 120 | * be able to restart the transaction at a convenient checkpoint to make |
121 | * sure we don't overflow the journal. | 121 | * sure we don't overflow the journal. |
122 | * | 122 | * |
123 | * start_transaction gets us a new handle for a truncate transaction, | 123 | * start_transaction gets us a new handle for a truncate transaction, |
124 | * and extend_transaction tries to extend the existing one a bit. If | 124 | * and extend_transaction tries to extend the existing one a bit. If |
125 | * extend fails, we need to propagate the failure up and restart the | 125 | * extend fails, we need to propagate the failure up and restart the |
126 | * transaction in the top-level truncate loop. --sct | 126 | * transaction in the top-level truncate loop. --sct |
127 | */ | 127 | */ |
128 | static handle_t *start_transaction(struct inode *inode) | 128 | static handle_t *start_transaction(struct inode *inode) |
129 | { | 129 | { |
130 | handle_t *result; | 130 | handle_t *result; |
131 | 131 | ||
132 | result = ext4_journal_start(inode, blocks_for_truncate(inode)); | 132 | result = ext4_journal_start(inode, blocks_for_truncate(inode)); |
133 | if (!IS_ERR(result)) | 133 | if (!IS_ERR(result)) |
134 | return result; | 134 | return result; |
135 | 135 | ||
136 | ext4_std_error(inode->i_sb, PTR_ERR(result)); | 136 | ext4_std_error(inode->i_sb, PTR_ERR(result)); |
137 | return result; | 137 | return result; |
138 | } | 138 | } |
139 | 139 | ||
140 | /* | 140 | /* |
141 | * Try to extend this transaction for the purposes of truncation. | 141 | * Try to extend this transaction for the purposes of truncation. |
142 | * | 142 | * |
143 | * Returns 0 if we managed to create more room. If we can't create more | 143 | * Returns 0 if we managed to create more room. If we can't create more |
144 | * room, we return 1 and the transaction must be restarted. | 144 | * room, we return 1 and the transaction must be restarted. |
145 | */ | 145 | */ |
146 | static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | 146 | static int try_to_extend_transaction(handle_t *handle, struct inode *inode) |
147 | { | 147 | { |
148 | if (!ext4_handle_valid(handle)) | 148 | if (!ext4_handle_valid(handle)) |
149 | return 0; | 149 | return 0; |
150 | if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) | 150 | if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) |
151 | return 0; | 151 | return 0; |
152 | if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) | 152 | if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) |
153 | return 0; | 153 | return 0; |
154 | return 1; | 154 | return 1; |
155 | } | 155 | } |
156 | 156 | ||
157 | /* | 157 | /* |
158 | * Restart the transaction associated with *handle. This does a commit, | 158 | * Restart the transaction associated with *handle. This does a commit, |
159 | * so before we call here everything must be consistently dirtied against | 159 | * so before we call here everything must be consistently dirtied against |
160 | * this transaction. | 160 | * this transaction. |
161 | */ | 161 | */ |
162 | int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, | 162 | int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, |
163 | int nblocks) | 163 | int nblocks) |
164 | { | 164 | { |
165 | int ret; | 165 | int ret; |
166 | 166 | ||
167 | /* | 167 | /* |
168 | * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this | 168 | * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this |
169 | * moment, get_block can be called only for blocks inside i_size since | 169 | * moment, get_block can be called only for blocks inside i_size since |
170 | * page cache has been already dropped and writes are blocked by | 170 | * page cache has been already dropped and writes are blocked by |
171 | * i_mutex. So we can safely drop the i_data_sem here. | 171 | * i_mutex. So we can safely drop the i_data_sem here. |
172 | */ | 172 | */ |
173 | BUG_ON(EXT4_JOURNAL(inode) == NULL); | 173 | BUG_ON(EXT4_JOURNAL(inode) == NULL); |
174 | jbd_debug(2, "restarting handle %p\n", handle); | 174 | jbd_debug(2, "restarting handle %p\n", handle); |
175 | up_write(&EXT4_I(inode)->i_data_sem); | 175 | up_write(&EXT4_I(inode)->i_data_sem); |
176 | ret = ext4_journal_restart(handle, nblocks); | 176 | ret = ext4_journal_restart(handle, nblocks); |
177 | down_write(&EXT4_I(inode)->i_data_sem); | 177 | down_write(&EXT4_I(inode)->i_data_sem); |
178 | ext4_discard_preallocations(inode); | 178 | ext4_discard_preallocations(inode); |
179 | 179 | ||
180 | return ret; | 180 | return ret; |
181 | } | 181 | } |
182 | 182 | ||
183 | /* | 183 | /* |
184 | * Called at the last iput() if i_nlink is zero. | 184 | * Called at the last iput() if i_nlink is zero. |
185 | */ | 185 | */ |
186 | void ext4_evict_inode(struct inode *inode) | 186 | void ext4_evict_inode(struct inode *inode) |
187 | { | 187 | { |
188 | handle_t *handle; | 188 | handle_t *handle; |
189 | int err; | 189 | int err; |
190 | 190 | ||
191 | trace_ext4_evict_inode(inode); | 191 | trace_ext4_evict_inode(inode); |
192 | if (inode->i_nlink) { | 192 | if (inode->i_nlink) { |
193 | truncate_inode_pages(&inode->i_data, 0); | 193 | truncate_inode_pages(&inode->i_data, 0); |
194 | goto no_delete; | 194 | goto no_delete; |
195 | } | 195 | } |
196 | 196 | ||
197 | if (!is_bad_inode(inode)) | 197 | if (!is_bad_inode(inode)) |
198 | dquot_initialize(inode); | 198 | dquot_initialize(inode); |
199 | 199 | ||
200 | if (ext4_should_order_data(inode)) | 200 | if (ext4_should_order_data(inode)) |
201 | ext4_begin_ordered_truncate(inode, 0); | 201 | ext4_begin_ordered_truncate(inode, 0); |
202 | truncate_inode_pages(&inode->i_data, 0); | 202 | truncate_inode_pages(&inode->i_data, 0); |
203 | 203 | ||
204 | if (is_bad_inode(inode)) | 204 | if (is_bad_inode(inode)) |
205 | goto no_delete; | 205 | goto no_delete; |
206 | 206 | ||
207 | handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); | 207 | handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); |
208 | if (IS_ERR(handle)) { | 208 | if (IS_ERR(handle)) { |
209 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); | 209 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); |
210 | /* | 210 | /* |
211 | * If we're going to skip the normal cleanup, we still need to | 211 | * If we're going to skip the normal cleanup, we still need to |
212 | * make sure that the in-core orphan linked list is properly | 212 | * make sure that the in-core orphan linked list is properly |
213 | * cleaned up. | 213 | * cleaned up. |
214 | */ | 214 | */ |
215 | ext4_orphan_del(NULL, inode); | 215 | ext4_orphan_del(NULL, inode); |
216 | goto no_delete; | 216 | goto no_delete; |
217 | } | 217 | } |
218 | 218 | ||
219 | if (IS_SYNC(inode)) | 219 | if (IS_SYNC(inode)) |
220 | ext4_handle_sync(handle); | 220 | ext4_handle_sync(handle); |
221 | inode->i_size = 0; | 221 | inode->i_size = 0; |
222 | err = ext4_mark_inode_dirty(handle, inode); | 222 | err = ext4_mark_inode_dirty(handle, inode); |
223 | if (err) { | 223 | if (err) { |
224 | ext4_warning(inode->i_sb, | 224 | ext4_warning(inode->i_sb, |
225 | "couldn't mark inode dirty (err %d)", err); | 225 | "couldn't mark inode dirty (err %d)", err); |
226 | goto stop_handle; | 226 | goto stop_handle; |
227 | } | 227 | } |
228 | if (inode->i_blocks) | 228 | if (inode->i_blocks) |
229 | ext4_truncate(inode); | 229 | ext4_truncate(inode); |
230 | 230 | ||
231 | /* | 231 | /* |
232 | * ext4_ext_truncate() doesn't reserve any slop when it | 232 | * ext4_ext_truncate() doesn't reserve any slop when it |
233 | * restarts journal transactions; therefore there may not be | 233 | * restarts journal transactions; therefore there may not be |
234 | * enough credits left in the handle to remove the inode from | 234 | * enough credits left in the handle to remove the inode from |
235 | * the orphan list and set the dtime field. | 235 | * the orphan list and set the dtime field. |
236 | */ | 236 | */ |
237 | if (!ext4_handle_has_enough_credits(handle, 3)) { | 237 | if (!ext4_handle_has_enough_credits(handle, 3)) { |
238 | err = ext4_journal_extend(handle, 3); | 238 | err = ext4_journal_extend(handle, 3); |
239 | if (err > 0) | 239 | if (err > 0) |
240 | err = ext4_journal_restart(handle, 3); | 240 | err = ext4_journal_restart(handle, 3); |
241 | if (err != 0) { | 241 | if (err != 0) { |
242 | ext4_warning(inode->i_sb, | 242 | ext4_warning(inode->i_sb, |
243 | "couldn't extend journal (err %d)", err); | 243 | "couldn't extend journal (err %d)", err); |
244 | stop_handle: | 244 | stop_handle: |
245 | ext4_journal_stop(handle); | 245 | ext4_journal_stop(handle); |
246 | ext4_orphan_del(NULL, inode); | 246 | ext4_orphan_del(NULL, inode); |
247 | goto no_delete; | 247 | goto no_delete; |
248 | } | 248 | } |
249 | } | 249 | } |
250 | 250 | ||
251 | /* | 251 | /* |
252 | * Kill off the orphan record which ext4_truncate created. | 252 | * Kill off the orphan record which ext4_truncate created. |
253 | * AKPM: I think this can be inside the above `if'. | 253 | * AKPM: I think this can be inside the above `if'. |
254 | * Note that ext4_orphan_del() has to be able to cope with the | 254 | * Note that ext4_orphan_del() has to be able to cope with the |
255 | * deletion of a non-existent orphan - this is because we don't | 255 | * deletion of a non-existent orphan - this is because we don't |
256 | * know if ext4_truncate() actually created an orphan record. | 256 | * know if ext4_truncate() actually created an orphan record. |
257 | * (Well, we could do this if we need to, but heck - it works) | 257 | * (Well, we could do this if we need to, but heck - it works) |
258 | */ | 258 | */ |
259 | ext4_orphan_del(handle, inode); | 259 | ext4_orphan_del(handle, inode); |
260 | EXT4_I(inode)->i_dtime = get_seconds(); | 260 | EXT4_I(inode)->i_dtime = get_seconds(); |
261 | 261 | ||
262 | /* | 262 | /* |
263 | * One subtle ordering requirement: if anything has gone wrong | 263 | * One subtle ordering requirement: if anything has gone wrong |
264 | * (transaction abort, IO errors, whatever), then we can still | 264 | * (transaction abort, IO errors, whatever), then we can still |
265 | * do these next steps (the fs will already have been marked as | 265 | * do these next steps (the fs will already have been marked as |
266 | * having errors), but we can't free the inode if the mark_dirty | 266 | * having errors), but we can't free the inode if the mark_dirty |
267 | * fails. | 267 | * fails. |
268 | */ | 268 | */ |
269 | if (ext4_mark_inode_dirty(handle, inode)) | 269 | if (ext4_mark_inode_dirty(handle, inode)) |
270 | /* If that failed, just do the required in-core inode clear. */ | 270 | /* If that failed, just do the required in-core inode clear. */ |
271 | ext4_clear_inode(inode); | 271 | ext4_clear_inode(inode); |
272 | else | 272 | else |
273 | ext4_free_inode(handle, inode); | 273 | ext4_free_inode(handle, inode); |
274 | ext4_journal_stop(handle); | 274 | ext4_journal_stop(handle); |
275 | return; | 275 | return; |
276 | no_delete: | 276 | no_delete: |
277 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ | 277 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ |
278 | } | 278 | } |
279 | 279 | ||
280 | typedef struct { | 280 | typedef struct { |
281 | __le32 *p; | 281 | __le32 *p; |
282 | __le32 key; | 282 | __le32 key; |
283 | struct buffer_head *bh; | 283 | struct buffer_head *bh; |
284 | } Indirect; | 284 | } Indirect; |
285 | 285 | ||
286 | static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) | 286 | static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) |
287 | { | 287 | { |
288 | p->key = *(p->p = v); | 288 | p->key = *(p->p = v); |
289 | p->bh = bh; | 289 | p->bh = bh; |
290 | } | 290 | } |
291 | 291 | ||
292 | /** | 292 | /** |
293 | * ext4_block_to_path - parse the block number into array of offsets | 293 | * ext4_block_to_path - parse the block number into array of offsets |
294 | * @inode: inode in question (we are only interested in its superblock) | 294 | * @inode: inode in question (we are only interested in its superblock) |
295 | * @i_block: block number to be parsed | 295 | * @i_block: block number to be parsed |
296 | * @offsets: array to store the offsets in | 296 | * @offsets: array to store the offsets in |
297 | * @boundary: set this non-zero if the referred-to block is likely to be | 297 | * @boundary: set this non-zero if the referred-to block is likely to be |
298 | * followed (on disk) by an indirect block. | 298 | * followed (on disk) by an indirect block. |
299 | * | 299 | * |
300 | * To store the locations of file's data ext4 uses a data structure common | 300 | * To store the locations of file's data ext4 uses a data structure common |
301 | * for UNIX filesystems - tree of pointers anchored in the inode, with | 301 | * for UNIX filesystems - tree of pointers anchored in the inode, with |
302 | * data blocks at leaves and indirect blocks in intermediate nodes. | 302 | * data blocks at leaves and indirect blocks in intermediate nodes. |
303 | * This function translates the block number into path in that tree - | 303 | * This function translates the block number into path in that tree - |
304 | * return value is the path length and @offsets[n] is the offset of | 304 | * return value is the path length and @offsets[n] is the offset of |
305 | * pointer to (n+1)th node in the nth one. If @i_block is out of range | 305 | * pointer to (n+1)th node in the nth one. If @i_block is out of range |
306 | * (negative or too large) warning is printed and zero returned. | 306 | * (negative or too large) warning is printed and zero returned. |
307 | * | 307 | * |
308 | * Note: function doesn't find node addresses, so no IO is needed. All | 308 | * Note: function doesn't find node addresses, so no IO is needed. All |
309 | * we need to know is the capacity of indirect blocks (taken from the | 309 | * we need to know is the capacity of indirect blocks (taken from the |
310 | * inode->i_sb). | 310 | * inode->i_sb). |
311 | */ | 311 | */ |
312 | 312 | ||
313 | /* | 313 | /* |
314 | * Portability note: the last comparison (check that we fit into triple | 314 | * Portability note: the last comparison (check that we fit into triple |
315 | * indirect block) is spelled differently, because otherwise on an | 315 | * indirect block) is spelled differently, because otherwise on an |
316 | * architecture with 32-bit longs and 8Kb pages we might get into trouble | 316 | * architecture with 32-bit longs and 8Kb pages we might get into trouble |
317 | * if our filesystem had 8Kb blocks. We might use long long, but that would | 317 | * if our filesystem had 8Kb blocks. We might use long long, but that would |
318 | * kill us on x86. Oh, well, at least the sign propagation does not matter - | 318 | * kill us on x86. Oh, well, at least the sign propagation does not matter - |
319 | * i_block would have to be negative in the very beginning, so we would not | 319 | * i_block would have to be negative in the very beginning, so we would not |
320 | * get there at all. | 320 | * get there at all. |
321 | */ | 321 | */ |
322 | 322 | ||
323 | static int ext4_block_to_path(struct inode *inode, | 323 | static int ext4_block_to_path(struct inode *inode, |
324 | ext4_lblk_t i_block, | 324 | ext4_lblk_t i_block, |
325 | ext4_lblk_t offsets[4], int *boundary) | 325 | ext4_lblk_t offsets[4], int *boundary) |
326 | { | 326 | { |
327 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); | 327 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); |
328 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); | 328 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); |
329 | const long direct_blocks = EXT4_NDIR_BLOCKS, | 329 | const long direct_blocks = EXT4_NDIR_BLOCKS, |
330 | indirect_blocks = ptrs, | 330 | indirect_blocks = ptrs, |
331 | double_blocks = (1 << (ptrs_bits * 2)); | 331 | double_blocks = (1 << (ptrs_bits * 2)); |
332 | int n = 0; | 332 | int n = 0; |
333 | int final = 0; | 333 | int final = 0; |
334 | 334 | ||
335 | if (i_block < direct_blocks) { | 335 | if (i_block < direct_blocks) { |
336 | offsets[n++] = i_block; | 336 | offsets[n++] = i_block; |
337 | final = direct_blocks; | 337 | final = direct_blocks; |
338 | } else if ((i_block -= direct_blocks) < indirect_blocks) { | 338 | } else if ((i_block -= direct_blocks) < indirect_blocks) { |
339 | offsets[n++] = EXT4_IND_BLOCK; | 339 | offsets[n++] = EXT4_IND_BLOCK; |
340 | offsets[n++] = i_block; | 340 | offsets[n++] = i_block; |
341 | final = ptrs; | 341 | final = ptrs; |
342 | } else if ((i_block -= indirect_blocks) < double_blocks) { | 342 | } else if ((i_block -= indirect_blocks) < double_blocks) { |
343 | offsets[n++] = EXT4_DIND_BLOCK; | 343 | offsets[n++] = EXT4_DIND_BLOCK; |
344 | offsets[n++] = i_block >> ptrs_bits; | 344 | offsets[n++] = i_block >> ptrs_bits; |
345 | offsets[n++] = i_block & (ptrs - 1); | 345 | offsets[n++] = i_block & (ptrs - 1); |
346 | final = ptrs; | 346 | final = ptrs; |
347 | } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { | 347 | } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { |
348 | offsets[n++] = EXT4_TIND_BLOCK; | 348 | offsets[n++] = EXT4_TIND_BLOCK; |
349 | offsets[n++] = i_block >> (ptrs_bits * 2); | 349 | offsets[n++] = i_block >> (ptrs_bits * 2); |
350 | offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); | 350 | offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); |
351 | offsets[n++] = i_block & (ptrs - 1); | 351 | offsets[n++] = i_block & (ptrs - 1); |
352 | final = ptrs; | 352 | final = ptrs; |
353 | } else { | 353 | } else { |
354 | ext4_warning(inode->i_sb, "block %lu > max in inode %lu", | 354 | ext4_warning(inode->i_sb, "block %lu > max in inode %lu", |
355 | i_block + direct_blocks + | 355 | i_block + direct_blocks + |
356 | indirect_blocks + double_blocks, inode->i_ino); | 356 | indirect_blocks + double_blocks, inode->i_ino); |
357 | } | 357 | } |
358 | if (boundary) | 358 | if (boundary) |
359 | *boundary = final - 1 - (i_block & (ptrs - 1)); | 359 | *boundary = final - 1 - (i_block & (ptrs - 1)); |
360 | return n; | 360 | return n; |
361 | } | 361 | } |
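Concretely, for a 4KiB block size the tree has 12 direct slots and 1024 pointers per indirect block, so logical block 12 resolves to the path [EXT4_IND_BLOCK, 0] and block 1036 to [EXT4_DIND_BLOCK, 0, 0]. A standalone model of the same arithmetic (constants hardcoded for the sketch rather than derived from a superblock):

    #include <assert.h>

    #define NDIR       12UL   /* direct slots in the inode        */
    #define PTRS     1024UL   /* pointers per 4KiB indirect block */
    #define PTRS_BITS    10

    static int block_to_path(unsigned long i_block, unsigned long offsets[4])
    {
            int n = 0;

            if (i_block < NDIR) {
                    offsets[n++] = i_block;
            } else if ((i_block -= NDIR) < PTRS) {
                    offsets[n++] = 12;                 /* EXT4_IND_BLOCK  */
                    offsets[n++] = i_block;
            } else if ((i_block -= PTRS) < PTRS * PTRS) {
                    offsets[n++] = 13;                 /* EXT4_DIND_BLOCK */
                    offsets[n++] = i_block >> PTRS_BITS;
                    offsets[n++] = i_block & (PTRS - 1);
            } else if (((i_block -= PTRS * PTRS) >> (PTRS_BITS * 2)) < PTRS) {
                    offsets[n++] = 14;                 /* EXT4_TIND_BLOCK */
                    offsets[n++] = i_block >> (PTRS_BITS * 2);
                    offsets[n++] = (i_block >> PTRS_BITS) & (PTRS - 1);
                    offsets[n++] = i_block & (PTRS - 1);
            }
            return n;                                  /* 0: out of range */
    }

    int main(void)
    {
            unsigned long off[4];

            assert(block_to_path(5, off) == 1 && off[0] == 5);
            assert(block_to_path(12, off) == 2 && off[1] == 0);
            assert(block_to_path(12 + 1024, off) == 3 && off[1] == 0);
            return 0;
    }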
362 | 362 | ||
363 | static int __ext4_check_blockref(const char *function, unsigned int line, | 363 | static int __ext4_check_blockref(const char *function, unsigned int line, |
364 | struct inode *inode, | 364 | struct inode *inode, |
365 | __le32 *p, unsigned int max) | 365 | __le32 *p, unsigned int max) |
366 | { | 366 | { |
367 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | 367 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; |
368 | __le32 *bref = p; | 368 | __le32 *bref = p; |
369 | unsigned int blk; | 369 | unsigned int blk; |
370 | 370 | ||
371 | while (bref < p+max) { | 371 | while (bref < p+max) { |
372 | blk = le32_to_cpu(*bref++); | 372 | blk = le32_to_cpu(*bref++); |
373 | if (blk && | 373 | if (blk && |
374 | unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), | 374 | unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), |
375 | blk, 1))) { | 375 | blk, 1))) { |
376 | es->s_last_error_block = cpu_to_le64(blk); | 376 | es->s_last_error_block = cpu_to_le64(blk); |
377 | ext4_error_inode(inode, function, line, blk, | 377 | ext4_error_inode(inode, function, line, blk, |
378 | "invalid block"); | 378 | "invalid block"); |
379 | return -EIO; | 379 | return -EIO; |
380 | } | 380 | } |
381 | } | 381 | } |
382 | return 0; | 382 | return 0; |
383 | } | 383 | } |
384 | 384 | ||
385 | 385 | ||
386 | #define ext4_check_indirect_blockref(inode, bh) \ | 386 | #define ext4_check_indirect_blockref(inode, bh) \ |
387 | __ext4_check_blockref(__func__, __LINE__, inode, \ | 387 | __ext4_check_blockref(__func__, __LINE__, inode, \ |
388 | (__le32 *)(bh)->b_data, \ | 388 | (__le32 *)(bh)->b_data, \ |
389 | EXT4_ADDR_PER_BLOCK((inode)->i_sb)) | 389 | EXT4_ADDR_PER_BLOCK((inode)->i_sb)) |
390 | 390 | ||
391 | #define ext4_check_inode_blockref(inode) \ | 391 | #define ext4_check_inode_blockref(inode) \ |
392 | __ext4_check_blockref(__func__, __LINE__, inode, \ | 392 | __ext4_check_blockref(__func__, __LINE__, inode, \ |
393 | EXT4_I(inode)->i_data, \ | 393 | EXT4_I(inode)->i_data, \ |
394 | EXT4_NDIR_BLOCKS) | 394 | EXT4_NDIR_BLOCKS) |
395 | 395 | ||
396 | /** | 396 | /** |
397 | * ext4_get_branch - read the chain of indirect blocks leading to data | 397 | * ext4_get_branch - read the chain of indirect blocks leading to data |
398 | * @inode: inode in question | 398 | * @inode: inode in question |
399 | * @depth: depth of the chain (1 - direct pointer, etc.) | 399 | * @depth: depth of the chain (1 - direct pointer, etc.) |
400 | * @offsets: offsets of pointers in inode/indirect blocks | 400 | * @offsets: offsets of pointers in inode/indirect blocks |
401 | * @chain: place to store the result | 401 | * @chain: place to store the result |
402 | * @err: here we store the error value | 402 | * @err: here we store the error value |
403 | * | 403 | * |
404 | * Function fills the array of triples <key, p, bh> and returns %NULL | 404 | * Function fills the array of triples <key, p, bh> and returns %NULL |
405 | * if everything went OK or the pointer to the last filled triple | 405 | * if everything went OK or the pointer to the last filled triple |
406 | * (incomplete one) otherwise. Upon the return chain[i].key contains | 406 | * (incomplete one) otherwise. Upon the return chain[i].key contains |
407 | * the number of (i+1)-th block in the chain (as it is stored in memory, | 407 | * the number of (i+1)-th block in the chain (as it is stored in memory, |
408 | * i.e. little-endian 32-bit), chain[i].p contains the address of that | 408 | * i.e. little-endian 32-bit), chain[i].p contains the address of that |
409 | * number (it points into struct inode for i==0 and into the bh->b_data | 409 | * number (it points into struct inode for i==0 and into the bh->b_data |
410 | * for i>0) and chain[i].bh points to the buffer_head of i-th indirect | 410 | * for i>0) and chain[i].bh points to the buffer_head of i-th indirect |
411 | * block for i>0 and NULL for i==0. In other words, it holds the block | 411 | * block for i>0 and NULL for i==0. In other words, it holds the block |
412 | * numbers of the chain, addresses they were taken from (and where we can | 412 | * numbers of the chain, addresses they were taken from (and where we can |
413 | * verify that chain did not change) and buffer_heads hosting these | 413 | * verify that chain did not change) and buffer_heads hosting these |
414 | * numbers. | 414 | * numbers. |
415 | * | 415 | * |
416 | * Function stops when it stumbles upon zero pointer (absent block) | 416 | * Function stops when it stumbles upon zero pointer (absent block) |
417 | * (pointer to last triple returned, *@err == 0) | 417 | * (pointer to last triple returned, *@err == 0) |
418 | * or when it gets an IO error reading an indirect block | 418 | * or when it gets an IO error reading an indirect block |
419 | * (ditto, *@err == -EIO) | 419 | * (ditto, *@err == -EIO) |
420 | * or when it reads all @depth-1 indirect blocks successfully and finds | 420 | * or when it reads all @depth-1 indirect blocks successfully and finds |
421 | * the whole chain, all way to the data (returns %NULL, *err == 0). | 421 | * the whole chain, all way to the data (returns %NULL, *err == 0). |
422 | * | 422 | * |
423 | * Need to be called with | 423 | * Need to be called with |
424 | * down_read(&EXT4_I(inode)->i_data_sem) | 424 | * down_read(&EXT4_I(inode)->i_data_sem) |
425 | */ | 425 | */ |
426 | static Indirect *ext4_get_branch(struct inode *inode, int depth, | 426 | static Indirect *ext4_get_branch(struct inode *inode, int depth, |
427 | ext4_lblk_t *offsets, | 427 | ext4_lblk_t *offsets, |
428 | Indirect chain[4], int *err) | 428 | Indirect chain[4], int *err) |
429 | { | 429 | { |
430 | struct super_block *sb = inode->i_sb; | 430 | struct super_block *sb = inode->i_sb; |
431 | Indirect *p = chain; | 431 | Indirect *p = chain; |
432 | struct buffer_head *bh; | 432 | struct buffer_head *bh; |
433 | 433 | ||
434 | *err = 0; | 434 | *err = 0; |
435 | /* i_data is not going away, no lock needed */ | 435 | /* i_data is not going away, no lock needed */ |
436 | add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); | 436 | add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); |
437 | if (!p->key) | 437 | if (!p->key) |
438 | goto no_block; | 438 | goto no_block; |
439 | while (--depth) { | 439 | while (--depth) { |
440 | bh = sb_getblk(sb, le32_to_cpu(p->key)); | 440 | bh = sb_getblk(sb, le32_to_cpu(p->key)); |
441 | if (unlikely(!bh)) | 441 | if (unlikely(!bh)) |
442 | goto failure; | 442 | goto failure; |
443 | 443 | ||
444 | if (!bh_uptodate_or_lock(bh)) { | 444 | if (!bh_uptodate_or_lock(bh)) { |
445 | if (bh_submit_read(bh) < 0) { | 445 | if (bh_submit_read(bh) < 0) { |
446 | put_bh(bh); | 446 | put_bh(bh); |
447 | goto failure; | 447 | goto failure; |
448 | } | 448 | } |
449 | /* validate block references */ | 449 | /* validate block references */ |
450 | if (ext4_check_indirect_blockref(inode, bh)) { | 450 | if (ext4_check_indirect_blockref(inode, bh)) { |
451 | put_bh(bh); | 451 | put_bh(bh); |
452 | goto failure; | 452 | goto failure; |
453 | } | 453 | } |
454 | } | 454 | } |
455 | 455 | ||
456 | add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); | 456 | add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); |
457 | /* Reader: end */ | 457 | /* Reader: end */ |
458 | if (!p->key) | 458 | if (!p->key) |
459 | goto no_block; | 459 | goto no_block; |
460 | } | 460 | } |
461 | return NULL; | 461 | return NULL; |
462 | 462 | ||
463 | failure: | 463 | failure: |
464 | *err = -EIO; | 464 | *err = -EIO; |
465 | no_block: | 465 | no_block: |
466 | return p; | 466 | return p; |
467 | } | 467 | } |
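The walk itself is straightforward: read the key at offsets[i], then either descend into the block that key names or stop at the first zero (absent) pointer and return the partial chain. A toy model of the same walk on in-memory arrays, with invented types standing in for buffer_heads and block numbers doubling as array indices:

    #include <assert.h>

    struct triple { unsigned int key; };

    /* Follow offsets[] depth levels deep; return 0 when the whole
     * chain is found, or the number of triples filled when the walk
     * stops at a zero (absent) pointer. */
    static int get_branch(unsigned int blocks[][4], unsigned int root[4],
                          int depth, const int *offsets,
                          struct triple chain[4])
    {
            unsigned int *cur = root;
            int i;

            for (i = 0; i < depth; i++) {
                    chain[i].key = cur[offsets[i]];
                    if (!chain[i].key)
                            return i + 1;      /* partial chain     */
                    cur = blocks[chain[i].key];
            }
            return 0;                          /* whole chain found */
    }

    int main(void)
    {
            unsigned int blocks[3][4] = { {0}, {9, 0, 0, 0}, {0} };
            unsigned int root[4] = { 1, 0, 0, 0 };
            int hit[2] = { 0, 0 }, miss[2] = { 0, 1 };
            struct triple chain[4];

            assert(get_branch(blocks, root, 2, hit, chain) == 0);
            assert(chain[1].key == 9);
            assert(get_branch(blocks, root, 2, miss, chain) == 2);
            return 0;
    }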
468 | 468 | ||
469 | /** | 469 | /** |
470 | * ext4_find_near - find a place for allocation with sufficient locality | 470 | * ext4_find_near - find a place for allocation with sufficient locality |
471 | * @inode: owner | 471 | * @inode: owner |
472 | * @ind: descriptor of indirect block. | 472 | * @ind: descriptor of indirect block. |
473 | * | 473 | * |
474 | * This function returns the preferred place for block allocation. | 474 | * This function returns the preferred place for block allocation. |
475 | * It is used when heuristic for sequential allocation fails. | 475 | * It is used when heuristic for sequential allocation fails. |
476 | * Rules are: | 476 | * Rules are: |
477 | * + if there is a block to the left of our position - allocate near it. | 477 | * + if there is a block to the left of our position - allocate near it. |
478 | * + if pointer will live in indirect block - allocate near that block. | 478 | * + if pointer will live in indirect block - allocate near that block. |
479 | * + if pointer will live in inode - allocate in the same | 479 | * + if pointer will live in inode - allocate in the same |
480 | * cylinder group. | 480 | * cylinder group. |
481 | * | 481 | * |
482 | * In the latter case we colour the starting block by the caller's PID to | 482 | * In the latter case we colour the starting block by the caller's PID to |
483 | * prevent it from clashing with concurrent allocations for a different inode | 483 | * prevent it from clashing with concurrent allocations for a different inode |
484 | * in the same block group. The PID is used here so that functionally related | 484 | * in the same block group. The PID is used here so that functionally related |
485 | * files will be close-by on-disk. | 485 | * files will be close-by on-disk. |
486 | * | 486 | * |
487 | * Caller must make sure that @ind is valid and will stay that way. | 487 | * Caller must make sure that @ind is valid and will stay that way. |
488 | */ | 488 | */ |
489 | static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | 489 | static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) |
490 | { | 490 | { |
491 | struct ext4_inode_info *ei = EXT4_I(inode); | 491 | struct ext4_inode_info *ei = EXT4_I(inode); |
492 | __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; | 492 | __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; |
493 | __le32 *p; | 493 | __le32 *p; |
494 | ext4_fsblk_t bg_start; | 494 | ext4_fsblk_t bg_start; |
495 | ext4_fsblk_t last_block; | 495 | ext4_fsblk_t last_block; |
496 | ext4_grpblk_t colour; | 496 | ext4_grpblk_t colour; |
497 | ext4_group_t block_group; | 497 | ext4_group_t block_group; |
498 | int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); | 498 | int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); |
499 | 499 | ||
500 | /* Try to find previous block */ | 500 | /* Try to find previous block */ |
501 | for (p = ind->p - 1; p >= start; p--) { | 501 | for (p = ind->p - 1; p >= start; p--) { |
502 | if (*p) | 502 | if (*p) |
503 | return le32_to_cpu(*p); | 503 | return le32_to_cpu(*p); |
504 | } | 504 | } |
505 | 505 | ||
506 | /* No such thing, so let's try location of indirect block */ | 506 | /* No such thing, so let's try location of indirect block */ |
507 | if (ind->bh) | 507 | if (ind->bh) |
508 | return ind->bh->b_blocknr; | 508 | return ind->bh->b_blocknr; |
509 | 509 | ||
510 | /* | 510 | /* |
511 | * It is going to be referred to from the inode itself? OK, just put it | 511 | * It is going to be referred to from the inode itself? OK, just put it |
512 | * into the same cylinder group then. | 512 | * into the same cylinder group then. |
513 | */ | 513 | */ |
514 | block_group = ei->i_block_group; | 514 | block_group = ei->i_block_group; |
515 | if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { | 515 | if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { |
516 | block_group &= ~(flex_size-1); | 516 | block_group &= ~(flex_size-1); |
517 | if (S_ISREG(inode->i_mode)) | 517 | if (S_ISREG(inode->i_mode)) |
518 | block_group++; | 518 | block_group++; |
519 | } | 519 | } |
520 | bg_start = ext4_group_first_block_no(inode->i_sb, block_group); | 520 | bg_start = ext4_group_first_block_no(inode->i_sb, block_group); |
521 | last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; | 521 | last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; |
522 | 522 | ||
523 | /* | 523 | /* |
524 | * If we are doing delayed allocation, we don't need to take | 524 | * If we are doing delayed allocation, we don't need to take |
525 | * colour into account. | 525 | * colour into account. |
526 | */ | 526 | */ |
527 | if (test_opt(inode->i_sb, DELALLOC)) | 527 | if (test_opt(inode->i_sb, DELALLOC)) |
528 | return bg_start; | 528 | return bg_start; |
529 | 529 | ||
530 | if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) | 530 | if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) |
531 | colour = (current->pid % 16) * | 531 | colour = (current->pid % 16) * |
532 | (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); | 532 | (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); |
533 | else | 533 | else |
534 | colour = (current->pid % 16) * ((last_block - bg_start) / 16); | 534 | colour = (current->pid % 16) * ((last_block - bg_start) / 16); |
535 | return bg_start + colour; | 535 | return bg_start + colour; |
536 | } | 536 | } |
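The colouring arithmetic splits the block group into 16 stripes and picks one by PID, so unrelated processes allocating into the same group tend to land in different stripes. A minimal model (BLOCKS_PER_GROUP is a typical value hardcoded for illustration, not read from a superblock):

    #include <assert.h>

    #define BLOCKS_PER_GROUP 32768UL

    /* Colour the starting block by PID: 16 stripes per group. */
    static unsigned long colour_start(unsigned long bg_start,
                                      unsigned long pid)
    {
            return bg_start + (pid % 16) * (BLOCKS_PER_GROUP / 16);
    }

    int main(void)
    {
            /* PIDs differing mod 16 land in different stripes... */
            assert(colour_start(0, 100) != colour_start(0, 101));
            /* ...while PIDs equal mod 16 share a stripe. */
            assert(colour_start(0, 100) == colour_start(0, 116));
            return 0;
    }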
537 | 537 | ||
538 | /** | 538 | /** |
539 | * ext4_find_goal - find a preferred place for allocation. | 539 | * ext4_find_goal - find a preferred place for allocation. |
540 | * @inode: owner | 540 | * @inode: owner |
541 | * @block: block we want | 541 | * @block: block we want |
542 | * @partial: pointer to the last triple within a chain | 542 | * @partial: pointer to the last triple within a chain |
543 | * | 543 | * |
544 | * Normally this function finds the preferred place for block allocation | 544 | * Normally this function finds the preferred place for block allocation |
545 | * and returns it. | 545 | * and returns it. |
546 | * Because this is only used for non-extent files, we limit the block nr | 546 | * Because this is only used for non-extent files, we limit the block nr |
547 | * to 32 bits. | 547 | * to 32 bits. |
548 | */ | 548 | */ |
549 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | 549 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, |
550 | Indirect *partial) | 550 | Indirect *partial) |
551 | { | 551 | { |
552 | ext4_fsblk_t goal; | 552 | ext4_fsblk_t goal; |
553 | 553 | ||
554 | /* | 554 | /* |
555 | * XXX need to get goal block from mballoc's data structures | 555 | * XXX need to get goal block from mballoc's data structures |
556 | */ | 556 | */ |
557 | 557 | ||
558 | goal = ext4_find_near(inode, partial); | 558 | goal = ext4_find_near(inode, partial); |
559 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | 559 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; |
560 | return goal; | 560 | return goal; |
561 | } | 561 | } |
562 | 562 | ||
563 | /** | 563 | /** |
564 | * ext4_blks_to_allocate - Look up the block map and count the number | 564 | * ext4_blks_to_allocate - Look up the block map and count the number |
565 | * of direct blocks that need to be allocated for the given branch. | 565 | * of direct blocks that need to be allocated for the given branch. |
566 | * | 566 | * |
567 | * @branch: chain of indirect blocks | 567 | * @branch: chain of indirect blocks |
568 | * @k: number of blocks need for indirect blocks | 568 | * @k: number of blocks need for indirect blocks |
569 | * @blks: number of data blocks to be mapped. | 569 | * @blks: number of data blocks to be mapped. |
570 | * @blocks_to_boundary: the offset in the indirect block | 570 | * @blocks_to_boundary: the offset in the indirect block |
571 | * | 571 | * |
572 | * return the total number of blocks to be allocated, including the | 572 | * return the total number of blocks to be allocated, including the |
573 | * direct and indirect blocks. | 573 | * direct and indirect blocks. |
574 | */ | 574 | */ |
575 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | 575 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, |
576 | int blocks_to_boundary) | 576 | int blocks_to_boundary) |
577 | { | 577 | { |
578 | unsigned int count = 0; | 578 | unsigned int count = 0; |
579 | 579 | ||
580 | /* | 580 | /* |
581 | * Simple case: the [t,d]indirect block(s) have not been allocated yet, | 581 | * Simple case: the [t,d]indirect block(s) have not been allocated yet, |
582 | * so it's clear the blocks on that path have not been allocated either | 582 | * so it's clear the blocks on that path have not been allocated either |
583 | */ | 583 | */ |
584 | if (k > 0) { | 584 | if (k > 0) { |
585 | /* right now we don't handle cross boundary allocation */ | 585 | /* right now we don't handle cross boundary allocation */ |
586 | if (blks < blocks_to_boundary + 1) | 586 | if (blks < blocks_to_boundary + 1) |
587 | count += blks; | 587 | count += blks; |
588 | else | 588 | else |
589 | count += blocks_to_boundary + 1; | 589 | count += blocks_to_boundary + 1; |
590 | return count; | 590 | return count; |
591 | } | 591 | } |
592 | 592 | ||
593 | count++; | 593 | count++; |
594 | while (count < blks && count <= blocks_to_boundary && | 594 | while (count < blks && count <= blocks_to_boundary && |
595 | le32_to_cpu(*(branch[0].p + count)) == 0) { | 595 | le32_to_cpu(*(branch[0].p + count)) == 0) { |
596 | count++; | 596 | count++; |
597 | } | 597 | } |
598 | return count; | 598 | return count; |
599 | } | 599 | } |
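For the k == 0 case the function counts how many consecutive direct slots starting at branch[0].p are still zero, stopping at the request size or the indirect-block boundary, whichever comes first. A standalone model on a plain array (no __le32 handling; names invented for the sketch):

    #include <assert.h>

    static unsigned int blks_to_allocate(const unsigned int *slots,
                                         unsigned int blks,
                                         unsigned int blocks_to_boundary)
    {
            unsigned int count = 1;

            /* Count contiguous unallocated (zero) slots after the first. */
            while (count < blks && count <= blocks_to_boundary &&
                   slots[count] == 0)
                    count++;
            return count;
    }

    int main(void)
    {
            unsigned int slots[8] = { 0, 0, 0, 7, 0, 0, 0, 0 };

            assert(blks_to_allocate(slots, 8, 7) == 3);  /* hits slot 3    */
            assert(blks_to_allocate(slots, 2, 7) == 2);  /* capped by blks */
            return 0;
    }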
600 | 600 | ||
601 | /** | 601 | /** |
602 | * ext4_alloc_blocks: allocate multiple blocks needed for a branch | 602 | * ext4_alloc_blocks: allocate multiple blocks needed for a branch |
603 | * @handle: handle for this transaction | 603 | * @handle: handle for this transaction |
604 | * @inode: inode which needs allocated blocks | 604 | * @inode: inode which needs allocated blocks |
605 | * @iblock: the logical block to start allocating at | 605 | * @iblock: the logical block to start allocating at |
606 | * @goal: preferred physical block of allocation | 606 | * @goal: preferred physical block of allocation |
607 | * @indirect_blks: the number of blocks needed to allocate for indirect | 607 | * @indirect_blks: the number of blocks needed to allocate for indirect |
608 | * blocks | 608 | * blocks |
609 | * @blks: number of desired blocks | 609 | * @blks: number of desired blocks |
610 | * @new_blocks: on return it will store the new block numbers for | 610 | * @new_blocks: on return it will store the new block numbers for |
611 | * the indirect blocks(if needed) and the first direct block, | 611 | * the indirect blocks(if needed) and the first direct block, |
612 | * @err: on return it will store the error code | 612 | * @err: on return it will store the error code |
613 | * | 613 | * |
614 | * This function will return the number of blocks allocated as | 614 | * This function will return the number of blocks allocated as |
615 | * requested by the passed-in parameters. | 615 | * requested by the passed-in parameters. |
616 | */ | 616 | */ |
617 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | 617 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, |
618 | ext4_lblk_t iblock, ext4_fsblk_t goal, | 618 | ext4_lblk_t iblock, ext4_fsblk_t goal, |
619 | int indirect_blks, int blks, | 619 | int indirect_blks, int blks, |
620 | ext4_fsblk_t new_blocks[4], int *err) | 620 | ext4_fsblk_t new_blocks[4], int *err) |
621 | { | 621 | { |
622 | struct ext4_allocation_request ar; | 622 | struct ext4_allocation_request ar; |
623 | int target, i; | 623 | int target, i; |
624 | unsigned long count = 0, blk_allocated = 0; | 624 | unsigned long count = 0, blk_allocated = 0; |
625 | int index = 0; | 625 | int index = 0; |
626 | ext4_fsblk_t current_block = 0; | 626 | ext4_fsblk_t current_block = 0; |
627 | int ret = 0; | 627 | int ret = 0; |
628 | 628 | ||
629 | /* | 629 | /* |
630 | * Here we try to allocate the requested multiple blocks at once, | 630 | * Here we try to allocate the requested multiple blocks at once, |
631 | * on a best-effort basis. | 631 | * on a best-effort basis. |
632 | * To build a branch, we should allocate blocks for | 632 | * To build a branch, we should allocate blocks for |
633 | * the indirect blocks (if not allocated yet) and at least | 633 | * the indirect blocks (if not allocated yet) and at least
634 | * the first direct block of this branch. That is the | 634 | * the first direct block of this branch. That is the
635 | * minimum (required) number of blocks that must be allocated. | 635 | * minimum (required) number of blocks that must be allocated.
636 | */ | 636 | */ |
637 | /* first we try to allocate the indirect blocks */ | 637 | /* first we try to allocate the indirect blocks */ |
638 | target = indirect_blks; | 638 | target = indirect_blks; |
639 | while (target > 0) { | 639 | while (target > 0) { |
640 | count = target; | 640 | count = target; |
641 | /* allocating blocks for indirect blocks and direct blocks */ | 641 | /* allocating blocks for indirect blocks and direct blocks */ |
642 | current_block = ext4_new_meta_blocks(handle, inode, goal, | 642 | current_block = ext4_new_meta_blocks(handle, inode, goal, |
643 | 0, &count, err); | 643 | 0, &count, err); |
644 | if (*err) | 644 | if (*err) |
645 | goto failed_out; | 645 | goto failed_out; |
646 | 646 | ||
647 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { | 647 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { |
648 | EXT4_ERROR_INODE(inode, | 648 | EXT4_ERROR_INODE(inode, |
649 | "current_block %llu + count %lu > %d!", | 649 | "current_block %llu + count %lu > %d!", |
650 | current_block, count, | 650 | current_block, count, |
651 | EXT4_MAX_BLOCK_FILE_PHYS); | 651 | EXT4_MAX_BLOCK_FILE_PHYS); |
652 | *err = -EIO; | 652 | *err = -EIO; |
653 | goto failed_out; | 653 | goto failed_out; |
654 | } | 654 | } |
655 | 655 | ||
656 | target -= count; | 656 | target -= count; |
657 | /* allocate blocks for indirect blocks */ | 657 | /* allocate blocks for indirect blocks */ |
658 | while (index < indirect_blks && count) { | 658 | while (index < indirect_blks && count) { |
659 | new_blocks[index++] = current_block++; | 659 | new_blocks[index++] = current_block++; |
660 | count--; | 660 | count--; |
661 | } | 661 | } |
662 | if (count > 0) { | 662 | if (count > 0) { |
663 | /* | 663 | /* |
664 | * save the new block number | 664 | * save the new block number |
665 | * for the first direct block | 665 | * for the first direct block |
666 | */ | 666 | */ |
667 | new_blocks[index] = current_block; | 667 | new_blocks[index] = current_block; |
668 | printk(KERN_INFO "%s returned more blocks than " | 668 | printk(KERN_INFO "%s returned more blocks than " |
669 | "requested\n", __func__); | 669 | "requested\n", __func__); |
670 | WARN_ON(1); | 670 | WARN_ON(1); |
671 | break; | 671 | break; |
672 | } | 672 | } |
673 | } | 673 | } |
674 | 674 | ||
675 | target = blks - count; | 675 | target = blks - count;
676 | blk_allocated = count; | 676 | blk_allocated = count; |
677 | if (!target) | 677 | if (!target) |
678 | goto allocated; | 678 | goto allocated; |
679 | /* Now allocate data blocks */ | 679 | /* Now allocate data blocks */ |
680 | memset(&ar, 0, sizeof(ar)); | 680 | memset(&ar, 0, sizeof(ar)); |
681 | ar.inode = inode; | 681 | ar.inode = inode; |
682 | ar.goal = goal; | 682 | ar.goal = goal; |
683 | ar.len = target; | 683 | ar.len = target; |
684 | ar.logical = iblock; | 684 | ar.logical = iblock; |
685 | if (S_ISREG(inode->i_mode)) | 685 | if (S_ISREG(inode->i_mode)) |
686 | /* enable in-core preallocation only for regular files */ | 686 | /* enable in-core preallocation only for regular files */ |
687 | ar.flags = EXT4_MB_HINT_DATA; | 687 | ar.flags = EXT4_MB_HINT_DATA; |
688 | 688 | ||
689 | current_block = ext4_mb_new_blocks(handle, &ar, err); | 689 | current_block = ext4_mb_new_blocks(handle, &ar, err); |
690 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { | 690 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { |
691 | EXT4_ERROR_INODE(inode, | 691 | EXT4_ERROR_INODE(inode, |
692 | "current_block %llu + ar.len %d > %d!", | 692 | "current_block %llu + ar.len %d > %d!", |
693 | current_block, ar.len, | 693 | current_block, ar.len, |
694 | EXT4_MAX_BLOCK_FILE_PHYS); | 694 | EXT4_MAX_BLOCK_FILE_PHYS); |
695 | *err = -EIO; | 695 | *err = -EIO; |
696 | goto failed_out; | 696 | goto failed_out; |
697 | } | 697 | } |
698 | 698 | ||
699 | if (*err && (target == blks)) { | 699 | if (*err && (target == blks)) { |
700 | /* | 700 | /* |
701 | * if the allocation failed and we didn't allocate | 701 | * if the allocation failed and we didn't allocate |
702 | * any blocks before | 702 | * any blocks before |
703 | */ | 703 | */ |
704 | goto failed_out; | 704 | goto failed_out; |
705 | } | 705 | } |
706 | if (!*err) { | 706 | if (!*err) { |
707 | if (target == blks) { | 707 | if (target == blks) { |
708 | /* | 708 | /* |
709 | * save the new block number | 709 | * save the new block number |
710 | * for the first direct block | 710 | * for the first direct block |
711 | */ | 711 | */ |
712 | new_blocks[index] = current_block; | 712 | new_blocks[index] = current_block; |
713 | } | 713 | } |
714 | blk_allocated += ar.len; | 714 | blk_allocated += ar.len; |
715 | } | 715 | } |
716 | allocated: | 716 | allocated: |
717 | /* total number of blocks allocated for direct blocks */ | 717 | /* total number of blocks allocated for direct blocks */ |
718 | ret = blk_allocated; | 718 | ret = blk_allocated; |
719 | *err = 0; | 719 | *err = 0; |
720 | return ret; | 720 | return ret; |
721 | failed_out: | 721 | failed_out: |
722 | for (i = 0; i < index; i++) | 722 | for (i = 0; i < index; i++) |
723 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | 723 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); |
724 | return ret; | 724 | return ret; |
725 | } | 725 | } |
726 | 726 | ||
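Editor's note: the layout of new_blocks[] that ext4_alloc_blocks() hands back is easy to get wrong, so the sketch below illustrates it with made-up block numbers and an assumed indirect_blks == 2 (double indirect plus indirect missing). Hypothetical values only.

#include <stdio.h>

typedef unsigned long long ext4_fsblk_t;	/* stand-in for the kernel type */

int main(void)
{
	ext4_fsblk_t new_blocks[4];

	new_blocks[0] = 1001;	/* double-indirect block (hypothetical) */
	new_blocks[1] = 1002;	/* indirect block (hypothetical) */
	new_blocks[2] = 1003;	/* first direct (data) block (hypothetical) */
	/* the function's return value is the number of direct blocks
	 * allocated, which is at least 1 on success */
	printf("first data block: %llu\n", new_blocks[2]);
	return 0;
}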
727 | /** | 727 | /** |
728 | * ext4_alloc_branch - allocate and set up a chain of blocks. | 728 | * ext4_alloc_branch - allocate and set up a chain of blocks. |
729 | * @handle: handle for this transaction | 729 | * @handle: handle for this transaction |
730 | * @inode: owner | 730 | * @inode: owner |
731 | * @indirect_blks: number of allocated indirect blocks | 731 | * @indirect_blks: number of allocated indirect blocks |
732 | * @blks: number of allocated direct blocks | 732 | * @blks: number of allocated direct blocks |
733 | * @goal: preferred place for allocation | 733 | * @goal: preferred place for allocation |
734 | * @offsets: offsets (in the blocks) to store the pointers to next. | 734 | * @offsets: offsets (in the blocks) to store the pointers to next. |
735 | * @branch: place to store the chain in. | 735 | * @branch: place to store the chain in. |
736 | * | 736 | * |
737 | * This function allocates blocks, zeroes out all but the last one, | 737 | * This function allocates blocks, zeroes out all but the last one, |
738 | * links them into a chain and (if we are synchronous) writes them to disk. | 738 | * links them into a chain and (if we are synchronous) writes them to disk.
739 | * In other words, it prepares a branch that can be spliced onto the | 739 | * In other words, it prepares a branch that can be spliced onto the |
740 | * inode. It stores the information about that chain in the branch[], in | 740 | * inode. It stores the information about that chain in the branch[], in |
741 | * the same format as ext4_get_branch() would do. We are calling it after | 741 | * the same format as ext4_get_branch() would do. We are calling it after |
742 | * we had read the existing part of chain and partial points to the last | 742 | * we had read the existing part of chain and partial points to the last |
743 | * triple of that (one with zero ->key). Upon the exit we have the same | 743 | * triple of that (one with zero ->key). Upon the exit we have the same |
744 | * picture as after the successful ext4_get_block(), except that in one | 744 | * picture as after the successful ext4_get_block(), except that in one |
745 | * place chain is disconnected - *branch->p is still zero (we did not | 745 | * place chain is disconnected - *branch->p is still zero (we did not |
746 | * set the last link), but branch->key contains the number that should | 746 | * set the last link), but branch->key contains the number that should |
747 | * be placed into *branch->p to fill that gap. | 747 | * be placed into *branch->p to fill that gap. |
748 | * | 748 | * |
749 | * If allocation fails we free all blocks we've allocated (and forget | 749 | * If allocation fails we free all blocks we've allocated (and forget |
750 | * their buffer_heads) and return the error value from the failed | 750 | * their buffer_heads) and return the error value from the failed
751 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain | 751 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain |
752 | * as described above and return 0. | 752 | * as described above and return 0. |
753 | */ | 753 | */ |
754 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | 754 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, |
755 | ext4_lblk_t iblock, int indirect_blks, | 755 | ext4_lblk_t iblock, int indirect_blks, |
756 | int *blks, ext4_fsblk_t goal, | 756 | int *blks, ext4_fsblk_t goal, |
757 | ext4_lblk_t *offsets, Indirect *branch) | 757 | ext4_lblk_t *offsets, Indirect *branch) |
758 | { | 758 | { |
759 | int blocksize = inode->i_sb->s_blocksize; | 759 | int blocksize = inode->i_sb->s_blocksize; |
760 | int i, n = 0; | 760 | int i, n = 0; |
761 | int err = 0; | 761 | int err = 0; |
762 | struct buffer_head *bh; | 762 | struct buffer_head *bh; |
763 | int num; | 763 | int num; |
764 | ext4_fsblk_t new_blocks[4]; | 764 | ext4_fsblk_t new_blocks[4]; |
765 | ext4_fsblk_t current_block; | 765 | ext4_fsblk_t current_block; |
766 | 766 | ||
767 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, | 767 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, |
768 | *blks, new_blocks, &err); | 768 | *blks, new_blocks, &err); |
769 | if (err) | 769 | if (err) |
770 | return err; | 770 | return err; |
771 | 771 | ||
772 | branch[0].key = cpu_to_le32(new_blocks[0]); | 772 | branch[0].key = cpu_to_le32(new_blocks[0]); |
773 | /* | 773 | /* |
774 | * metadata blocks and data blocks are allocated. | 774 | * metadata blocks and data blocks are allocated. |
775 | */ | 775 | */ |
776 | for (n = 1; n <= indirect_blks; n++) { | 776 | for (n = 1; n <= indirect_blks; n++) { |
777 | /* | 777 | /* |
778 | * Get buffer_head for parent block, zero it out | 778 | * Get buffer_head for parent block, zero it out |
779 | * and set the pointer to new one, then send | 779 | * and set the pointer to new one, then send |
780 | * parent to disk. | 780 | * parent to disk. |
781 | */ | 781 | */ |
782 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); | 782 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); |
783 | if (unlikely(!bh)) { | 783 | if (unlikely(!bh)) { |
784 | err = -EIO; | 784 | err = -EIO; |
785 | goto failed; | 785 | goto failed; |
786 | } | 786 | } |
787 | 787 | ||
788 | branch[n].bh = bh; | 788 | branch[n].bh = bh; |
789 | lock_buffer(bh); | 789 | lock_buffer(bh); |
790 | BUFFER_TRACE(bh, "call get_create_access"); | 790 | BUFFER_TRACE(bh, "call get_create_access"); |
791 | err = ext4_journal_get_create_access(handle, bh); | 791 | err = ext4_journal_get_create_access(handle, bh); |
792 | if (err) { | 792 | if (err) { |
793 | /* Don't brelse(bh) here; it's done in | 793 | /* Don't brelse(bh) here; it's done in |
794 | * ext4_journal_forget() below */ | 794 | * ext4_journal_forget() below */ |
795 | unlock_buffer(bh); | 795 | unlock_buffer(bh); |
796 | goto failed; | 796 | goto failed; |
797 | } | 797 | } |
798 | 798 | ||
799 | memset(bh->b_data, 0, blocksize); | 799 | memset(bh->b_data, 0, blocksize); |
800 | branch[n].p = (__le32 *) bh->b_data + offsets[n]; | 800 | branch[n].p = (__le32 *) bh->b_data + offsets[n]; |
801 | branch[n].key = cpu_to_le32(new_blocks[n]); | 801 | branch[n].key = cpu_to_le32(new_blocks[n]); |
802 | *branch[n].p = branch[n].key; | 802 | *branch[n].p = branch[n].key; |
803 | if (n == indirect_blks) { | 803 | if (n == indirect_blks) { |
804 | current_block = new_blocks[n]; | 804 | current_block = new_blocks[n]; |
805 | /* | 805 | /* |
806 | * End of chain, update the last new metablock of | 806 | * End of chain, update the last new metablock of |
807 | * the chain to point to the newly allocated | 807 | * the chain to point to the newly allocated
808 | * data block numbers | 808 | * data block numbers
809 | */ | 809 | */ |
810 | for (i = 1; i < num; i++) | 810 | for (i = 1; i < num; i++) |
811 | *(branch[n].p + i) = cpu_to_le32(++current_block); | 811 | *(branch[n].p + i) = cpu_to_le32(++current_block); |
812 | } | 812 | } |
813 | BUFFER_TRACE(bh, "marking uptodate"); | 813 | BUFFER_TRACE(bh, "marking uptodate"); |
814 | set_buffer_uptodate(bh); | 814 | set_buffer_uptodate(bh); |
815 | unlock_buffer(bh); | 815 | unlock_buffer(bh); |
816 | 816 | ||
817 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 817 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
818 | err = ext4_handle_dirty_metadata(handle, inode, bh); | 818 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
819 | if (err) | 819 | if (err) |
820 | goto failed; | 820 | goto failed; |
821 | } | 821 | } |
822 | *blks = num; | 822 | *blks = num; |
823 | return err; | 823 | return err; |
824 | failed: | 824 | failed: |
825 | /* Allocation failed, free what we already allocated */ | 825 | /* Allocation failed, free what we already allocated */ |
826 | ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); | 826 | ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); |
827 | for (i = 1; i <= n; i++) { | 827 | for (i = 1; i <= n; i++) {
828 | /* | 828 | /* |
829 | * branch[i].bh is newly allocated, so there is no | 829 | * branch[i].bh is newly allocated, so there is no |
830 | * need to revoke the block, which is why we don't | 830 | * need to revoke the block, which is why we don't |
831 | * need to set EXT4_FREE_BLOCKS_METADATA. | 831 | * need to set EXT4_FREE_BLOCKS_METADATA. |
832 | */ | 832 | */ |
833 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, | 833 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, |
834 | EXT4_FREE_BLOCKS_FORGET); | 834 | EXT4_FREE_BLOCKS_FORGET); |
835 | } | 835 | } |
836 | for (i = n+1; i < indirect_blks; i++) | 836 | for (i = n+1; i < indirect_blks; i++) |
837 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | 837 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); |
838 | 838 | ||
839 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); | 839 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); |
840 | 840 | ||
841 | return err; | 841 | return err; |
842 | } | 842 | } |
843 | 843 | ||
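Editor's note: the "chain is disconnected - *branch->p is still zero" state described in the ext4_alloc_branch() comment, and the single store that ext4_splice_branch() later performs, can be modelled in a few lines of userspace C. The sketch below uses a simplified Indirect without the buffer_head and plain uint32_t instead of __le32; the block numbers are invented.

#include <stdio.h>
#include <stdint.h>

struct indirect { uint32_t *p; uint32_t key; };

int main(void)
{
	uint32_t inode_slot = 0;		/* stands in for a slot of inode->i_data[] */
	uint32_t ind_block[256] = { 0 };	/* freshly zeroed indirect block */
	struct indirect branch[2] = {
		{ &inode_slot,   1001 },	/* number of the new indirect block */
		{ &ind_block[5], 1002 },	/* number of the new data block */
	};

	*branch[1].p = branch[1].key;	/* done inside ext4_alloc_branch() */
	printf("before splice: %u\n", inode_slot);	/* 0: chain disconnected */
	*branch[0].p = branch[0].key;	/* ext4_splice_branch()'s one store */
	printf("after splice:  %u\n", inode_slot);	/* 1001: branch visible */
	return 0;
}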
844 | /** | 844 | /** |
845 | * ext4_splice_branch - splice the allocated branch onto inode. | 845 | * ext4_splice_branch - splice the allocated branch onto inode. |
846 | * @handle: handle for this transaction | 846 | * @handle: handle for this transaction |
847 | * @inode: owner | 847 | * @inode: owner |
848 | * @block: (logical) number of block we are adding | 848 | * @block: (logical) number of block we are adding |
849 | * @chain: chain of indirect blocks (with a missing link - see | 849 | * @chain: chain of indirect blocks (with a missing link - see |
850 | * ext4_alloc_branch) | 850 | * ext4_alloc_branch) |
851 | * @where: location of missing link | 851 | * @where: location of missing link |
852 | * @num: number of indirect blocks we are adding | 852 | * @num: number of indirect blocks we are adding |
853 | * @blks: number of direct blocks we are adding | 853 | * @blks: number of direct blocks we are adding |
854 | * | 854 | * |
855 | * This function fills the missing link and does all housekeeping needed in | 855 | * This function fills the missing link and does all housekeeping needed in |
856 | * inode (->i_blocks, etc.). In case of success we end up with the full | 856 | * inode (->i_blocks, etc.). In case of success we end up with the full |
857 | * chain to new block and return 0. | 857 | * chain to new block and return 0. |
858 | */ | 858 | */ |
859 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, | 859 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, |
860 | ext4_lblk_t block, Indirect *where, int num, | 860 | ext4_lblk_t block, Indirect *where, int num, |
861 | int blks) | 861 | int blks) |
862 | { | 862 | { |
863 | int i; | 863 | int i; |
864 | int err = 0; | 864 | int err = 0; |
865 | ext4_fsblk_t current_block; | 865 | ext4_fsblk_t current_block; |
866 | 866 | ||
867 | /* | 867 | /* |
868 | * If we're splicing into a [td]indirect block (as opposed to the | 868 | * If we're splicing into a [td]indirect block (as opposed to the |
869 | * inode) then we need to get write access to the [td]indirect block | 869 | * inode) then we need to get write access to the [td]indirect block |
870 | * before the splice. | 870 | * before the splice. |
871 | */ | 871 | */ |
872 | if (where->bh) { | 872 | if (where->bh) { |
873 | BUFFER_TRACE(where->bh, "get_write_access"); | 873 | BUFFER_TRACE(where->bh, "get_write_access"); |
874 | err = ext4_journal_get_write_access(handle, where->bh); | 874 | err = ext4_journal_get_write_access(handle, where->bh); |
875 | if (err) | 875 | if (err) |
876 | goto err_out; | 876 | goto err_out; |
877 | } | 877 | } |
878 | /* That's it */ | 878 | /* That's it */ |
879 | 879 | ||
880 | *where->p = where->key; | 880 | *where->p = where->key; |
881 | 881 | ||
882 | /* | 882 | /* |
883 | * Update the host buffer_head or inode to point to the just-allocated | 883 | * Update the host buffer_head or inode to point to the just-allocated
884 | * direct blocks | 884 | * direct blocks
885 | */ | 885 | */ |
886 | if (num == 0 && blks > 1) { | 886 | if (num == 0 && blks > 1) { |
887 | current_block = le32_to_cpu(where->key) + 1; | 887 | current_block = le32_to_cpu(where->key) + 1; |
888 | for (i = 1; i < blks; i++) | 888 | for (i = 1; i < blks; i++) |
889 | *(where->p + i) = cpu_to_le32(current_block++); | 889 | *(where->p + i) = cpu_to_le32(current_block++); |
890 | } | 890 | } |
891 | 891 | ||
892 | /* We are done with atomic stuff, now do the rest of housekeeping */ | 892 | /* We are done with atomic stuff, now do the rest of housekeeping */ |
893 | /* had we spliced it onto indirect block? */ | 893 | /* had we spliced it onto indirect block? */ |
894 | if (where->bh) { | 894 | if (where->bh) { |
895 | /* | 895 | /* |
896 | * If we spliced it onto an indirect block, we haven't | 896 | * If we spliced it onto an indirect block, we haven't |
897 | * altered the inode. Note however that if it is being spliced | 897 | * altered the inode. Note however that if it is being spliced |
898 | * onto an indirect block at the very end of the file (the | 898 | * onto an indirect block at the very end of the file (the |
899 | * file is growing) then we *will* alter the inode to reflect | 899 | * file is growing) then we *will* alter the inode to reflect |
900 | * the new i_size. But that is not done here - it is done in | 900 | * the new i_size. But that is not done here - it is done in |
901 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. | 901 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. |
902 | */ | 902 | */ |
903 | jbd_debug(5, "splicing indirect only\n"); | 903 | jbd_debug(5, "splicing indirect only\n"); |
904 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); | 904 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); |
905 | err = ext4_handle_dirty_metadata(handle, inode, where->bh); | 905 | err = ext4_handle_dirty_metadata(handle, inode, where->bh); |
906 | if (err) | 906 | if (err) |
907 | goto err_out; | 907 | goto err_out; |
908 | } else { | 908 | } else { |
909 | /* | 909 | /* |
910 | * OK, we spliced it into the inode itself on a direct block. | 910 | * OK, we spliced it into the inode itself on a direct block. |
911 | */ | 911 | */ |
912 | ext4_mark_inode_dirty(handle, inode); | 912 | ext4_mark_inode_dirty(handle, inode); |
913 | jbd_debug(5, "splicing direct\n"); | 913 | jbd_debug(5, "splicing direct\n"); |
914 | } | 914 | } |
915 | return err; | 915 | return err; |
916 | 916 | ||
917 | err_out: | 917 | err_out: |
918 | for (i = 1; i <= num; i++) { | 918 | for (i = 1; i <= num; i++) { |
919 | /* | 919 | /* |
920 | * branch[i].bh is newly allocated, so there is no | 920 | * branch[i].bh is newly allocated, so there is no |
921 | * need to revoke the block, which is why we don't | 921 | * need to revoke the block, which is why we don't |
922 | * need to set EXT4_FREE_BLOCKS_METADATA. | 922 | * need to set EXT4_FREE_BLOCKS_METADATA. |
923 | */ | 923 | */ |
924 | ext4_free_blocks(handle, inode, where[i].bh, 0, 1, | 924 | ext4_free_blocks(handle, inode, where[i].bh, 0, 1, |
925 | EXT4_FREE_BLOCKS_FORGET); | 925 | EXT4_FREE_BLOCKS_FORGET); |
926 | } | 926 | } |
927 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), | 927 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), |
928 | blks, 0); | 928 | blks, 0); |
929 | 929 | ||
930 | return err; | 930 | return err; |
931 | } | 931 | } |
932 | 932 | ||
933 | /* | 933 | /* |
934 | * The ext4_ind_map_blocks() function handles non-extents inodes | 934 | * The ext4_ind_map_blocks() function handles non-extents inodes |
935 | * (i.e., using the traditional indirect/double-indirect i_blocks | 935 | * (i.e., using the traditional indirect/double-indirect i_blocks |
936 | * scheme) for ext4_map_blocks(). | 936 | * scheme) for ext4_map_blocks(). |
937 | * | 937 | * |
938 | * Allocation strategy is simple: if we have to allocate something, we will | 938 | * Allocation strategy is simple: if we have to allocate something, we will |
939 | * have to go the whole way to leaf. So let's do it before attaching anything | 939 | * have to go the whole way to leaf. So let's do it before attaching anything |
940 | * to tree, set linkage between the newborn blocks, write them if sync is | 940 | * to tree, set linkage between the newborn blocks, write them if sync is |
941 | * required, recheck the path, free and repeat if check fails, otherwise | 941 | * required, recheck the path, free and repeat if check fails, otherwise |
942 | * set the last missing link (that will protect us from any truncate-generated | 942 | * set the last missing link (that will protect us from any truncate-generated |
943 | * removals - all blocks on the path are immune now) and possibly force the | 943 | * removals - all blocks on the path are immune now) and possibly force the |
944 | * write on the parent block. | 944 | * write on the parent block. |
945 | * That has a nice additional property: no special recovery from the failed | 945 | * That has a nice additional property: no special recovery from the failed |
946 | * allocations is needed - we simply release blocks and do not touch anything | 946 | * allocations is needed - we simply release blocks and do not touch anything |
947 | * reachable from inode. | 947 | * reachable from inode. |
948 | * | 948 | * |
949 | * `handle' can be NULL if create == 0. | 949 | * `handle' can be NULL if create == 0. |
950 | * | 950 | * |
951 | * return > 0, # of blocks mapped or allocated. | 951 | * return > 0, # of blocks mapped or allocated. |
952 | * return = 0, if plain lookup failed. | 952 | * return = 0, if plain lookup failed. |
953 | * return < 0, error case. | 953 | * return < 0, error case. |
954 | * | 954 | * |
955 | * The ext4_ind_map_blocks() function should be called with | 955 | * The ext4_ind_map_blocks() function should be called with
956 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem | 956 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem |
957 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or | 957 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or |
958 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system | 958 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system |
959 | * blocks. | 959 | * blocks. |
960 | */ | 960 | */ |
961 | static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | 961 | static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, |
962 | struct ext4_map_blocks *map, | 962 | struct ext4_map_blocks *map, |
963 | int flags) | 963 | int flags) |
964 | { | 964 | { |
965 | int err = -EIO; | 965 | int err = -EIO; |
966 | ext4_lblk_t offsets[4]; | 966 | ext4_lblk_t offsets[4]; |
967 | Indirect chain[4]; | 967 | Indirect chain[4]; |
968 | Indirect *partial; | 968 | Indirect *partial; |
969 | ext4_fsblk_t goal; | 969 | ext4_fsblk_t goal; |
970 | int indirect_blks; | 970 | int indirect_blks; |
971 | int blocks_to_boundary = 0; | 971 | int blocks_to_boundary = 0; |
972 | int depth; | 972 | int depth; |
973 | int count = 0; | 973 | int count = 0; |
974 | ext4_fsblk_t first_block = 0; | 974 | ext4_fsblk_t first_block = 0; |
975 | 975 | ||
976 | trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | 976 | trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); |
977 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); | 977 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); |
978 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); | 978 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); |
979 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, | 979 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, |
980 | &blocks_to_boundary); | 980 | &blocks_to_boundary); |
981 | 981 | ||
982 | if (depth == 0) | 982 | if (depth == 0) |
983 | goto out; | 983 | goto out; |
984 | 984 | ||
985 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); | 985 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); |
986 | 986 | ||
987 | /* Simplest case - block found, no allocation needed */ | 987 | /* Simplest case - block found, no allocation needed */ |
988 | if (!partial) { | 988 | if (!partial) { |
989 | first_block = le32_to_cpu(chain[depth - 1].key); | 989 | first_block = le32_to_cpu(chain[depth - 1].key); |
990 | count++; | 990 | count++; |
991 | /*map more blocks*/ | 991 | /*map more blocks*/ |
992 | while (count < map->m_len && count <= blocks_to_boundary) { | 992 | while (count < map->m_len && count <= blocks_to_boundary) { |
993 | ext4_fsblk_t blk; | 993 | ext4_fsblk_t blk; |
994 | 994 | ||
995 | blk = le32_to_cpu(*(chain[depth-1].p + count)); | 995 | blk = le32_to_cpu(*(chain[depth-1].p + count)); |
996 | 996 | ||
997 | if (blk == first_block + count) | 997 | if (blk == first_block + count) |
998 | count++; | 998 | count++; |
999 | else | 999 | else |
1000 | break; | 1000 | break; |
1001 | } | 1001 | } |
1002 | goto got_it; | 1002 | goto got_it; |
1003 | } | 1003 | } |
1004 | 1004 | ||
1005 | /* Next simple case - plain lookup or failed read of indirect block */ | 1005 | /* Next simple case - plain lookup or failed read of indirect block */ |
1006 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) | 1006 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) |
1007 | goto cleanup; | 1007 | goto cleanup; |
1008 | 1008 | ||
1009 | /* | 1009 | /* |
1010 | * Okay, we need to do block allocation. | 1010 | * Okay, we need to do block allocation. |
1011 | */ | 1011 | */ |
1012 | goal = ext4_find_goal(inode, map->m_lblk, partial); | 1012 | goal = ext4_find_goal(inode, map->m_lblk, partial); |
1013 | 1013 | ||
1014 | /* the number of blocks that need to be allocated for [d,t]indirect blocks */ | 1014 | /* the number of blocks that need to be allocated for [d,t]indirect blocks */
1015 | indirect_blks = (chain + depth) - partial - 1; | 1015 | indirect_blks = (chain + depth) - partial - 1; |
1016 | 1016 | ||
1017 | /* | 1017 | /* |
1018 | * Next look up the indirect map to count the total number of | 1018 | * Next look up the indirect map to count the total number of
1019 | * direct blocks to allocate for this branch. | 1019 | * direct blocks to allocate for this branch. |
1020 | */ | 1020 | */ |
1021 | count = ext4_blks_to_allocate(partial, indirect_blks, | 1021 | count = ext4_blks_to_allocate(partial, indirect_blks, |
1022 | map->m_len, blocks_to_boundary); | 1022 | map->m_len, blocks_to_boundary); |
1023 | /* | 1023 | /* |
1024 | * Block out ext4_truncate while we alter the tree | 1024 | * Block out ext4_truncate while we alter the tree |
1025 | */ | 1025 | */ |
1026 | err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, | 1026 | err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, |
1027 | &count, goal, | 1027 | &count, goal, |
1028 | offsets + (partial - chain), partial); | 1028 | offsets + (partial - chain), partial); |
1029 | 1029 | ||
1030 | /* | 1030 | /* |
1031 | * The ext4_splice_branch call will free and forget any buffers | 1031 | * The ext4_splice_branch call will free and forget any buffers |
1032 | * on the new chain if there is a failure, but that risks using | 1032 | * on the new chain if there is a failure, but that risks using |
1033 | * up transaction credits, especially for bitmaps where the | 1033 | * up transaction credits, especially for bitmaps where the |
1034 | * credits cannot be returned. Can we handle this somehow? We | 1034 | * credits cannot be returned. Can we handle this somehow? We |
1035 | * may need to return -EAGAIN upwards in the worst case. --sct | 1035 | * may need to return -EAGAIN upwards in the worst case. --sct |
1036 | */ | 1036 | */ |
1037 | if (!err) | 1037 | if (!err) |
1038 | err = ext4_splice_branch(handle, inode, map->m_lblk, | 1038 | err = ext4_splice_branch(handle, inode, map->m_lblk, |
1039 | partial, indirect_blks, count); | 1039 | partial, indirect_blks, count); |
1040 | if (err) | 1040 | if (err) |
1041 | goto cleanup; | 1041 | goto cleanup; |
1042 | 1042 | ||
1043 | map->m_flags |= EXT4_MAP_NEW; | 1043 | map->m_flags |= EXT4_MAP_NEW; |
1044 | 1044 | ||
1045 | ext4_update_inode_fsync_trans(handle, inode, 1); | 1045 | ext4_update_inode_fsync_trans(handle, inode, 1); |
1046 | got_it: | 1046 | got_it: |
1047 | map->m_flags |= EXT4_MAP_MAPPED; | 1047 | map->m_flags |= EXT4_MAP_MAPPED; |
1048 | map->m_pblk = le32_to_cpu(chain[depth-1].key); | 1048 | map->m_pblk = le32_to_cpu(chain[depth-1].key); |
1049 | map->m_len = count; | 1049 | map->m_len = count; |
1050 | if (count > blocks_to_boundary) | 1050 | if (count > blocks_to_boundary) |
1051 | map->m_flags |= EXT4_MAP_BOUNDARY; | 1051 | map->m_flags |= EXT4_MAP_BOUNDARY; |
1052 | err = count; | 1052 | err = count; |
1053 | /* Clean up and exit */ | 1053 | /* Clean up and exit */ |
1054 | partial = chain + depth - 1; /* the whole chain */ | 1054 | partial = chain + depth - 1; /* the whole chain */ |
1055 | cleanup: | 1055 | cleanup: |
1056 | while (partial > chain) { | 1056 | while (partial > chain) { |
1057 | BUFFER_TRACE(partial->bh, "call brelse"); | 1057 | BUFFER_TRACE(partial->bh, "call brelse"); |
1058 | brelse(partial->bh); | 1058 | brelse(partial->bh); |
1059 | partial--; | 1059 | partial--; |
1060 | } | 1060 | } |
1061 | out: | 1061 | out: |
1062 | trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, | 1062 | trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, |
1063 | map->m_pblk, map->m_len, err); | 1063 | map->m_pblk, map->m_len, err); |
1064 | return err; | 1064 | return err; |
1065 | } | 1065 | } |
1066 | 1066 | ||
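Editor's note: ext4_ind_map_blocks() leans on an offsets[]/depth decomposition computed by ext4_block_to_path(), which is not shown in this hunk. The sketch below reconstructs the classic ext2/3/4-style decomposition under stated assumptions: a 4K block size (1024 pointers per block) and 12 direct slots; the slot constants mirror EXT4_IND_BLOCK/EXT4_DIND_BLOCK/EXT4_TIND_BLOCK but the function itself is an illustration, not the kernel's.

#include <stdio.h>

#define NDIR	12
#define PTRS	1024UL

static int block_to_path(unsigned long b, unsigned long off[4])
{
	int n = 0;

	if (b < NDIR) {
		off[n++] = b;			/* direct */
	} else if ((b -= NDIR) < PTRS) {
		off[n++] = NDIR;		/* indirect slot */
		off[n++] = b;
	} else if ((b -= PTRS) < PTRS * PTRS) {
		off[n++] = NDIR + 1;		/* double-indirect slot */
		off[n++] = b / PTRS;
		off[n++] = b % PTRS;
	} else {
		b -= PTRS * PTRS;
		off[n++] = NDIR + 2;		/* triple-indirect slot */
		off[n++] = b / (PTRS * PTRS);
		off[n++] = (b / PTRS) % PTRS;
		off[n++] = b % PTRS;
	}
	return n;				/* the depth */
}

int main(void)
{
	unsigned long off[4];
	int i, depth = block_to_path(20000, off);

	printf("depth=%d offsets:", depth);	/* -> depth=3 offsets: 13 18 532 */
	for (i = 0; i < depth; i++)
		printf(" %lu", off[i]);
	printf("\n");
	return 0;
}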
1067 | #ifdef CONFIG_QUOTA | 1067 | #ifdef CONFIG_QUOTA |
1068 | qsize_t *ext4_get_reserved_space(struct inode *inode) | 1068 | qsize_t *ext4_get_reserved_space(struct inode *inode) |
1069 | { | 1069 | { |
1070 | return &EXT4_I(inode)->i_reserved_quota; | 1070 | return &EXT4_I(inode)->i_reserved_quota; |
1071 | } | 1071 | } |
1072 | #endif | 1072 | #endif |
1073 | 1073 | ||
1074 | /* | 1074 | /* |
1075 | * Calculate the number of metadata blocks that need to be reserved | 1075 | * Calculate the number of metadata blocks that need to be reserved
1076 | * to allocate a new block at @lblock for a non-extent-based file | 1076 | * to allocate a new block at @lblock for a non-extent-based file
1077 | */ | 1077 | */ |
1078 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, | 1078 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, |
1079 | sector_t lblock) | 1079 | sector_t lblock) |
1080 | { | 1080 | { |
1081 | struct ext4_inode_info *ei = EXT4_I(inode); | 1081 | struct ext4_inode_info *ei = EXT4_I(inode); |
1082 | sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); | 1082 | sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); |
1083 | int blk_bits; | 1083 | int blk_bits; |
1084 | 1084 | ||
1085 | if (lblock < EXT4_NDIR_BLOCKS) | 1085 | if (lblock < EXT4_NDIR_BLOCKS) |
1086 | return 0; | 1086 | return 0; |
1087 | 1087 | ||
1088 | lblock -= EXT4_NDIR_BLOCKS; | 1088 | lblock -= EXT4_NDIR_BLOCKS; |
1089 | 1089 | ||
1090 | if (ei->i_da_metadata_calc_len && | 1090 | if (ei->i_da_metadata_calc_len && |
1091 | (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { | 1091 | (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { |
1092 | ei->i_da_metadata_calc_len++; | 1092 | ei->i_da_metadata_calc_len++; |
1093 | return 0; | 1093 | return 0; |
1094 | } | 1094 | } |
1095 | ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; | 1095 | ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; |
1096 | ei->i_da_metadata_calc_len = 1; | 1096 | ei->i_da_metadata_calc_len = 1; |
1097 | blk_bits = order_base_2(lblock); | 1097 | blk_bits = order_base_2(lblock); |
1098 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; | 1098 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; |
1099 | } | 1099 | } |
1100 | 1100 | ||
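Editor's note: the worst-case result of the function above is order_base_2(lblock) / EXT4_ADDR_PER_BLOCK_BITS + 1. A hedged worked example, assuming a 4K block size so the per-block address bits are 10 (1024 pointers per block):

#include <stdio.h>

static int order_base_2(unsigned long n)	/* log2 of next power of two */
{
	int o = 0;

	while ((1UL << o) < n)
		o++;
	return o;
}

int main(void)
{
	unsigned long lblock = 5000 - 12;	/* past the 12 direct blocks */
	int addr_bits = 10;			/* assumed: 4K blocks */

	/* order_base_2(4988) == 13, 13 / 10 + 1 == 2: one indirect plus
	 * one double-indirect block may need to be reserved */
	printf("reserve %d metadata block(s)\n",
	       order_base_2(lblock) / addr_bits + 1);
	return 0;
}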
1101 | /* | 1101 | /* |
1102 | * Calculate the number of metadata blocks that need to be reserved | 1102 | * Calculate the number of metadata blocks that need to be reserved
1103 | * to allocate a block located at @lblock | 1103 | * to allocate a block located at @lblock |
1104 | */ | 1104 | */ |
1105 | static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) | 1105 | static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) |
1106 | { | 1106 | { |
1107 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 1107 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
1108 | return ext4_ext_calc_metadata_amount(inode, lblock); | 1108 | return ext4_ext_calc_metadata_amount(inode, lblock); |
1109 | 1109 | ||
1110 | return ext4_indirect_calc_metadata_amount(inode, lblock); | 1110 | return ext4_indirect_calc_metadata_amount(inode, lblock); |
1111 | } | 1111 | } |
1112 | 1112 | ||
1113 | /* | 1113 | /* |
1114 | * Called with i_data_sem down, which is important since we can call | 1114 | * Called with i_data_sem down, which is important since we can call |
1115 | * ext4_discard_preallocations() from here. | 1115 | * ext4_discard_preallocations() from here. |
1116 | */ | 1116 | */ |
1117 | void ext4_da_update_reserve_space(struct inode *inode, | 1117 | void ext4_da_update_reserve_space(struct inode *inode, |
1118 | int used, int quota_claim) | 1118 | int used, int quota_claim) |
1119 | { | 1119 | { |
1120 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1120 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1121 | struct ext4_inode_info *ei = EXT4_I(inode); | 1121 | struct ext4_inode_info *ei = EXT4_I(inode); |
1122 | 1122 | ||
1123 | spin_lock(&ei->i_block_reservation_lock); | 1123 | spin_lock(&ei->i_block_reservation_lock); |
1124 | trace_ext4_da_update_reserve_space(inode, used); | 1124 | trace_ext4_da_update_reserve_space(inode, used); |
1125 | if (unlikely(used > ei->i_reserved_data_blocks)) { | 1125 | if (unlikely(used > ei->i_reserved_data_blocks)) { |
1126 | ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " | 1126 | ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " |
1127 | "with only %d reserved data blocks\n", | 1127 | "with only %d reserved data blocks\n", |
1128 | __func__, inode->i_ino, used, | 1128 | __func__, inode->i_ino, used, |
1129 | ei->i_reserved_data_blocks); | 1129 | ei->i_reserved_data_blocks); |
1130 | WARN_ON(1); | 1130 | WARN_ON(1); |
1131 | used = ei->i_reserved_data_blocks; | 1131 | used = ei->i_reserved_data_blocks; |
1132 | } | 1132 | } |
1133 | 1133 | ||
1134 | /* Update per-inode reservations */ | 1134 | /* Update per-inode reservations */ |
1135 | ei->i_reserved_data_blocks -= used; | 1135 | ei->i_reserved_data_blocks -= used; |
1136 | ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; | 1136 | ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; |
1137 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, | 1137 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, |
1138 | used + ei->i_allocated_meta_blocks); | 1138 | used + ei->i_allocated_meta_blocks); |
1139 | ei->i_allocated_meta_blocks = 0; | 1139 | ei->i_allocated_meta_blocks = 0; |
1140 | 1140 | ||
1141 | if (ei->i_reserved_data_blocks == 0) { | 1141 | if (ei->i_reserved_data_blocks == 0) { |
1142 | /* | 1142 | /* |
1143 | * We can release all of the reserved metadata blocks | 1143 | * We can release all of the reserved metadata blocks |
1144 | * only when we have written all of the delayed | 1144 | * only when we have written all of the delayed |
1145 | * allocation blocks. | 1145 | * allocation blocks. |
1146 | */ | 1146 | */ |
1147 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, | 1147 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, |
1148 | ei->i_reserved_meta_blocks); | 1148 | ei->i_reserved_meta_blocks); |
1149 | ei->i_reserved_meta_blocks = 0; | 1149 | ei->i_reserved_meta_blocks = 0; |
1150 | ei->i_da_metadata_calc_len = 0; | 1150 | ei->i_da_metadata_calc_len = 0; |
1151 | } | 1151 | } |
1152 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1152 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1153 | 1153 | ||
1154 | /* Update quota subsystem for data blocks */ | 1154 | /* Update quota subsystem for data blocks */ |
1155 | if (quota_claim) | 1155 | if (quota_claim) |
1156 | dquot_claim_block(inode, used); | 1156 | dquot_claim_block(inode, used); |
1157 | else { | 1157 | else { |
1158 | /* | 1158 | /* |
1159 | * We did fallocate with an offset that is already delayed | 1159 | * We did fallocate with an offset that is already delayed |
1160 | * allocated. So on delayed allocated writeback we should | 1160 | * allocated. So on delayed allocated writeback we should |
1161 | * not re-claim the quota for fallocated blocks. | 1161 | * not re-claim the quota for fallocated blocks. |
1162 | */ | 1162 | */ |
1163 | dquot_release_reservation_block(inode, used); | 1163 | dquot_release_reservation_block(inode, used); |
1164 | } | 1164 | } |
1165 | 1165 | ||
1166 | /* | 1166 | /* |
1167 | * If we have done all the pending block allocations and if | 1167 | * If we have done all the pending block allocations and if |
1168 | * there aren't any writers on the inode, we can discard the | 1168 | * there aren't any writers on the inode, we can discard the |
1169 | * inode's preallocations. | 1169 | * inode's preallocations. |
1170 | */ | 1170 | */ |
1171 | if ((ei->i_reserved_data_blocks == 0) && | 1171 | if ((ei->i_reserved_data_blocks == 0) && |
1172 | (atomic_read(&inode->i_writecount) == 0)) | 1172 | (atomic_read(&inode->i_writecount) == 0)) |
1173 | ext4_discard_preallocations(inode); | 1173 | ext4_discard_preallocations(inode); |
1174 | } | 1174 | } |
1175 | 1175 | ||
1176 | static int __check_block_validity(struct inode *inode, const char *func, | 1176 | static int __check_block_validity(struct inode *inode, const char *func, |
1177 | unsigned int line, | 1177 | unsigned int line, |
1178 | struct ext4_map_blocks *map) | 1178 | struct ext4_map_blocks *map) |
1179 | { | 1179 | { |
1180 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, | 1180 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, |
1181 | map->m_len)) { | 1181 | map->m_len)) { |
1182 | ext4_error_inode(inode, func, line, map->m_pblk, | 1182 | ext4_error_inode(inode, func, line, map->m_pblk, |
1183 | "lblock %lu mapped to illegal pblock " | 1183 | "lblock %lu mapped to illegal pblock " |
1184 | "(length %d)", (unsigned long) map->m_lblk, | 1184 | "(length %d)", (unsigned long) map->m_lblk, |
1185 | map->m_len); | 1185 | map->m_len); |
1186 | return -EIO; | 1186 | return -EIO; |
1187 | } | 1187 | } |
1188 | return 0; | 1188 | return 0; |
1189 | } | 1189 | } |
1190 | 1190 | ||
1191 | #define check_block_validity(inode, map) \ | 1191 | #define check_block_validity(inode, map) \ |
1192 | __check_block_validity((inode), __func__, __LINE__, (map)) | 1192 | __check_block_validity((inode), __func__, __LINE__, (map)) |
1193 | 1193 | ||
1194 | /* | 1194 | /* |
1195 | * Return the number of contiguous dirty pages in a given inode | 1195 | * Return the number of contiguous dirty pages in a given inode |
1196 | * starting at page frame idx. | 1196 | * starting at page frame idx. |
1197 | */ | 1197 | */ |
1198 | static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, | 1198 | static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, |
1199 | unsigned int max_pages) | 1199 | unsigned int max_pages) |
1200 | { | 1200 | { |
1201 | struct address_space *mapping = inode->i_mapping; | 1201 | struct address_space *mapping = inode->i_mapping; |
1202 | pgoff_t index; | 1202 | pgoff_t index; |
1203 | struct pagevec pvec; | 1203 | struct pagevec pvec; |
1204 | pgoff_t num = 0; | 1204 | pgoff_t num = 0; |
1205 | int i, nr_pages, done = 0; | 1205 | int i, nr_pages, done = 0; |
1206 | 1206 | ||
1207 | if (max_pages == 0) | 1207 | if (max_pages == 0) |
1208 | return 0; | 1208 | return 0; |
1209 | pagevec_init(&pvec, 0); | 1209 | pagevec_init(&pvec, 0); |
1210 | while (!done) { | 1210 | while (!done) { |
1211 | index = idx; | 1211 | index = idx; |
1212 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 1212 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
1213 | PAGECACHE_TAG_DIRTY, | 1213 | PAGECACHE_TAG_DIRTY, |
1214 | (pgoff_t)PAGEVEC_SIZE); | 1214 | (pgoff_t)PAGEVEC_SIZE); |
1215 | if (nr_pages == 0) | 1215 | if (nr_pages == 0) |
1216 | break; | 1216 | break; |
1217 | for (i = 0; i < nr_pages; i++) { | 1217 | for (i = 0; i < nr_pages; i++) { |
1218 | struct page *page = pvec.pages[i]; | 1218 | struct page *page = pvec.pages[i]; |
1219 | struct buffer_head *bh, *head; | 1219 | struct buffer_head *bh, *head; |
1220 | 1220 | ||
1221 | lock_page(page); | 1221 | lock_page(page); |
1222 | if (unlikely(page->mapping != mapping) || | 1222 | if (unlikely(page->mapping != mapping) || |
1223 | !PageDirty(page) || | 1223 | !PageDirty(page) || |
1224 | PageWriteback(page) || | 1224 | PageWriteback(page) || |
1225 | page->index != idx) { | 1225 | page->index != idx) { |
1226 | done = 1; | 1226 | done = 1; |
1227 | unlock_page(page); | 1227 | unlock_page(page); |
1228 | break; | 1228 | break; |
1229 | } | 1229 | } |
1230 | if (page_has_buffers(page)) { | 1230 | if (page_has_buffers(page)) { |
1231 | bh = head = page_buffers(page); | 1231 | bh = head = page_buffers(page); |
1232 | do { | 1232 | do { |
1233 | if (!buffer_delay(bh) && | 1233 | if (!buffer_delay(bh) && |
1234 | !buffer_unwritten(bh)) | 1234 | !buffer_unwritten(bh)) |
1235 | done = 1; | 1235 | done = 1; |
1236 | bh = bh->b_this_page; | 1236 | bh = bh->b_this_page; |
1237 | } while (!done && (bh != head)); | 1237 | } while (!done && (bh != head)); |
1238 | } | 1238 | } |
1239 | unlock_page(page); | 1239 | unlock_page(page); |
1240 | if (done) | 1240 | if (done) |
1241 | break; | 1241 | break; |
1242 | idx++; | 1242 | idx++; |
1243 | num++; | 1243 | num++; |
1244 | if (num >= max_pages) { | 1244 | if (num >= max_pages) { |
1245 | done = 1; | 1245 | done = 1; |
1246 | break; | 1246 | break; |
1247 | } | 1247 | } |
1248 | } | 1248 | } |
1249 | pagevec_release(&pvec); | 1249 | pagevec_release(&pvec); |
1250 | } | 1250 | } |
1251 | return num; | 1251 | return num; |
1252 | } | 1252 | } |
1253 | 1253 | ||
1254 | /* | 1254 | /* |
1255 | * The ext4_map_blocks() function tries to look up the requested blocks, | 1255 | * The ext4_map_blocks() function tries to look up the requested blocks, |
1256 | * and returns if the blocks are already mapped. | 1256 | * and returns if the blocks are already mapped. |
1257 | * | 1257 | * |
1258 | * Otherwise it takes the write lock of the i_data_sem, allocates blocks, | 1258 | * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
1259 | * stores the allocated blocks in the result buffer head and marks it | 1259 | * stores the allocated blocks in the result buffer head and marks it
1260 | * mapped. | 1260 | * mapped. |
1261 | * | 1261 | * |
1262 | * If the file is extent based, it will call ext4_ext_map_blocks(); | 1262 | * If the file is extent based, it will call ext4_ext_map_blocks();
1263 | * otherwise it calls ext4_ind_map_blocks() to handle indirect-mapped | 1263 | * otherwise it calls ext4_ind_map_blocks() to handle indirect-mapped
1264 | * files | 1264 | * files
1265 | * | 1265 | * |
1266 | * On success, it returns the number of blocks mapped or allocated. | 1266 | * On success, it returns the number of blocks mapped or allocated.
1267 | * If create == 0 and the blocks are pre-allocated and uninitialized, | 1267 | * If create == 0 and the blocks are pre-allocated and uninitialized,
1268 | * the result buffer head is unmapped. If create == 1, it will make sure | 1268 | * the result buffer head is unmapped. If create == 1, it will make sure
1269 | * the buffer head is mapped. | 1269 | * the buffer head is mapped. |
1270 | * | 1270 | * |
1271 | * It returns 0 if a plain lookup failed (blocks have not been allocated); in | 1271 | * It returns 0 if a plain lookup failed (blocks have not been allocated); in
1272 | * that case, the buffer head is unmapped | 1272 | * that case, the buffer head is unmapped
1273 | * | 1273 | * |
1274 | * It returns the error in case of allocation failure. | 1274 | * It returns the error in case of allocation failure. |
1275 | */ | 1275 | */ |
1276 | int ext4_map_blocks(handle_t *handle, struct inode *inode, | 1276 | int ext4_map_blocks(handle_t *handle, struct inode *inode, |
1277 | struct ext4_map_blocks *map, int flags) | 1277 | struct ext4_map_blocks *map, int flags) |
1278 | { | 1278 | { |
1279 | int retval; | 1279 | int retval; |
1280 | 1280 | ||
1281 | map->m_flags = 0; | 1281 | map->m_flags = 0; |
1282 | ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," | 1282 | ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," |
1283 | "logical block %lu\n", inode->i_ino, flags, map->m_len, | 1283 | "logical block %lu\n", inode->i_ino, flags, map->m_len, |
1284 | (unsigned long) map->m_lblk); | 1284 | (unsigned long) map->m_lblk); |
1285 | /* | 1285 | /* |
1286 | * Try to see if we can get the block without requesting a new | 1286 | * Try to see if we can get the block without requesting a new |
1287 | * file system block. | 1287 | * file system block. |
1288 | */ | 1288 | */ |
1289 | down_read((&EXT4_I(inode)->i_data_sem)); | 1289 | down_read((&EXT4_I(inode)->i_data_sem)); |
1290 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 1290 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
1291 | retval = ext4_ext_map_blocks(handle, inode, map, 0); | 1291 | retval = ext4_ext_map_blocks(handle, inode, map, 0); |
1292 | } else { | 1292 | } else { |
1293 | retval = ext4_ind_map_blocks(handle, inode, map, 0); | 1293 | retval = ext4_ind_map_blocks(handle, inode, map, 0); |
1294 | } | 1294 | } |
1295 | up_read((&EXT4_I(inode)->i_data_sem)); | 1295 | up_read((&EXT4_I(inode)->i_data_sem)); |
1296 | 1296 | ||
1297 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { | 1297 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { |
1298 | int ret = check_block_validity(inode, map); | 1298 | int ret = check_block_validity(inode, map); |
1299 | if (ret != 0) | 1299 | if (ret != 0) |
1300 | return ret; | 1300 | return ret; |
1301 | } | 1301 | } |
1302 | 1302 | ||
1303 | /* If it is only a block(s) look up */ | 1303 | /* If it is only a block(s) look up */ |
1304 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) | 1304 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) |
1305 | return retval; | 1305 | return retval; |
1306 | 1306 | ||
1307 | /* | 1307 | /* |
1308 | * Return if the blocks have already been allocated. | 1308 | * Return if the blocks have already been allocated.
1309 | * | 1309 | *
1310 | * Note that if blocks have been preallocated, | 1310 | * Note that if blocks have been preallocated,
1311 | * ext4_ext_get_block() returns with create == 0 | 1311 | * ext4_ext_get_block() returns with create == 0
1312 | * and the buffer head unmapped. | 1312 | * and the buffer head unmapped.
1313 | */ | 1313 | */ |
1314 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) | 1314 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) |
1315 | return retval; | 1315 | return retval; |
1316 | 1316 | ||
1317 | /* | 1317 | /* |
1318 | * When we call get_blocks without the create flag, the | 1318 | * When we call get_blocks without the create flag, the |
1319 | * BH_Unwritten flag could have gotten set if the blocks | 1319 | * BH_Unwritten flag could have gotten set if the blocks |
1320 | * requested were part of an uninitialized extent. We need to | 1320 | * requested were part of an uninitialized extent. We need to
1321 | * clear this flag now that we are committed to convert all or | 1321 | * clear this flag now that we are committed to convert all or |
1322 | * part of the uninitialized extent to be an initialized | 1322 | * part of the uninitialized extent to be an initialized |
1323 | * extent. This is because we need to avoid the combination | 1323 | * extent. This is because we need to avoid the combination |
1324 | * of BH_Unwritten and BH_Mapped flags being simultaneously | 1324 | * of BH_Unwritten and BH_Mapped flags being simultaneously |
1325 | * set on the buffer_head. | 1325 | * set on the buffer_head. |
1326 | */ | 1326 | */ |
1327 | map->m_flags &= ~EXT4_MAP_UNWRITTEN; | 1327 | map->m_flags &= ~EXT4_MAP_UNWRITTEN; |
1328 | 1328 | ||
1329 | /* | 1329 | /* |
1330 | * Allocating new blocks and/or writing to an uninitialized extent | 1330 | * Allocating new blocks and/or writing to an uninitialized extent
1331 | * will possibly result in updating i_data, so we take | 1331 | * will possibly result in updating i_data, so we take
1332 | * the write lock of i_data_sem and call get_blocks() | 1332 | * the write lock of i_data_sem and call get_blocks()
1333 | * with the create == 1 flag. | 1333 | * with the create == 1 flag.
1334 | */ | 1334 | */ |
1335 | down_write((&EXT4_I(inode)->i_data_sem)); | 1335 | down_write((&EXT4_I(inode)->i_data_sem)); |
1336 | 1336 | ||
1337 | /* | 1337 | /* |
1338 | * If the caller is from the delayed allocation writeout path, | 1338 | * If the caller is from the delayed allocation writeout path,
1339 | * we have already reserved fs blocks for allocation; | 1339 | * we have already reserved fs blocks for allocation;
1340 | * let the underlying get_block() function know to | 1340 | * let the underlying get_block() function know to
1341 | * avoid double accounting | 1341 | * avoid double accounting
1342 | */ | 1342 | */ |
1343 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | 1343 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) |
1344 | ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); | 1344 | ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); |
1345 | /* | 1345 | /* |
1346 | * We need to check for EXT4 here because migrate | 1346 | * We need to check for EXT4 here because migrate |
1347 | * could have changed the inode type in between | 1347 | * could have changed the inode type in between |
1348 | */ | 1348 | */ |
1349 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 1349 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
1350 | retval = ext4_ext_map_blocks(handle, inode, map, flags); | 1350 | retval = ext4_ext_map_blocks(handle, inode, map, flags); |
1351 | } else { | 1351 | } else { |
1352 | retval = ext4_ind_map_blocks(handle, inode, map, flags); | 1352 | retval = ext4_ind_map_blocks(handle, inode, map, flags); |
1353 | 1353 | ||
1354 | if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { | 1354 | if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { |
1355 | /* | 1355 | /* |
1356 | * We allocated new blocks which will result in | 1356 | * We allocated new blocks which will result in |
1357 | * i_data's format changing. Force the migrate | 1357 | * i_data's format changing. Force the migrate |
1358 | * to fail by clearing migrate flags | 1358 | * to fail by clearing migrate flags |
1359 | */ | 1359 | */ |
1360 | ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); | 1360 | ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); |
1361 | } | 1361 | } |
1362 | 1362 | ||
1363 | /* | 1363 | /* |
1364 | * Update reserved blocks/metadata blocks after successful | 1364 | * Update reserved blocks/metadata blocks after successful |
1365 | * block allocation which had been deferred till now. We don't | 1365 | * block allocation which had been deferred till now. We don't |
1366 | * support fallocate for non-extent files, so we can update | 1366 | * support fallocate for non-extent files, so we can update
1367 | * reserve space here. | 1367 | * reserve space here. |
1368 | */ | 1368 | */ |
1369 | if ((retval > 0) && | 1369 | if ((retval > 0) && |
1370 | (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) | 1370 | (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) |
1371 | ext4_da_update_reserve_space(inode, retval, 1); | 1371 | ext4_da_update_reserve_space(inode, retval, 1); |
1372 | } | 1372 | } |
1373 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | 1373 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) |
1374 | ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); | 1374 | ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); |
1375 | 1375 | ||
1376 | up_write((&EXT4_I(inode)->i_data_sem)); | 1376 | up_write((&EXT4_I(inode)->i_data_sem)); |
1377 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { | 1377 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { |
1378 | int ret = check_block_validity(inode, map); | 1378 | int ret = check_block_validity(inode, map); |
1379 | if (ret != 0) | 1379 | if (ret != 0) |
1380 | return ret; | 1380 | return ret; |
1381 | } | 1381 | } |
1382 | return retval; | 1382 | return retval; |
1383 | } | 1383 | } |
1384 | 1384 | ||
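Editor's note: a hedged sketch of a lookup-only caller of ext4_map_blocks(), mirroring how _ext4_get_block() below drives the API. Kernel context is assumed (struct inode, ext4_lblk_t, ext4_fsblk_t, EXT4_MAP_MAPPED); this fragment is not standalone-compilable outside fs/ext4, and the helper name is invented for illustration.

/* hypothetical helper: map one logical block without allocating */
static ext4_fsblk_t lookup_one_block(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_map_blocks map;
	int ret;

	map.m_lblk = lblk;	/* logical block to look up */
	map.m_len = 1;		/* just one block */

	/* handle == NULL is allowed since no create flag is passed */
	ret = ext4_map_blocks(NULL, inode, &map, 0);
	if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
		return map.m_pblk;	/* physical block number */
	return 0;			/* hole (ret == 0) or error (ret < 0) */
}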
1385 | /* Maximum number of blocks we map for direct IO at once. */ | 1385 | /* Maximum number of blocks we map for direct IO at once. */ |
1386 | #define DIO_MAX_BLOCKS 4096 | 1386 | #define DIO_MAX_BLOCKS 4096 |
1387 | 1387 | ||
1388 | static int _ext4_get_block(struct inode *inode, sector_t iblock, | 1388 | static int _ext4_get_block(struct inode *inode, sector_t iblock, |
1389 | struct buffer_head *bh, int flags) | 1389 | struct buffer_head *bh, int flags) |
1390 | { | 1390 | { |
1391 | handle_t *handle = ext4_journal_current_handle(); | 1391 | handle_t *handle = ext4_journal_current_handle(); |
1392 | struct ext4_map_blocks map; | 1392 | struct ext4_map_blocks map; |
1393 | int ret = 0, started = 0; | 1393 | int ret = 0, started = 0; |
1394 | int dio_credits; | 1394 | int dio_credits; |
1395 | 1395 | ||
1396 | map.m_lblk = iblock; | 1396 | map.m_lblk = iblock; |
1397 | map.m_len = bh->b_size >> inode->i_blkbits; | 1397 | map.m_len = bh->b_size >> inode->i_blkbits; |
1398 | 1398 | ||
1399 | if (flags && !handle) { | 1399 | if (flags && !handle) { |
1400 | /* Direct IO write... */ | 1400 | /* Direct IO write... */ |
1401 | if (map.m_len > DIO_MAX_BLOCKS) | 1401 | if (map.m_len > DIO_MAX_BLOCKS) |
1402 | map.m_len = DIO_MAX_BLOCKS; | 1402 | map.m_len = DIO_MAX_BLOCKS; |
1403 | dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); | 1403 | dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); |
1404 | handle = ext4_journal_start(inode, dio_credits); | 1404 | handle = ext4_journal_start(inode, dio_credits); |
1405 | if (IS_ERR(handle)) { | 1405 | if (IS_ERR(handle)) { |
1406 | ret = PTR_ERR(handle); | 1406 | ret = PTR_ERR(handle); |
1407 | return ret; | 1407 | return ret; |
1408 | } | 1408 | } |
1409 | started = 1; | 1409 | started = 1; |
1410 | } | 1410 | } |
1411 | 1411 | ||
1412 | ret = ext4_map_blocks(handle, inode, &map, flags); | 1412 | ret = ext4_map_blocks(handle, inode, &map, flags); |
1413 | if (ret > 0) { | 1413 | if (ret > 0) { |
1414 | map_bh(bh, inode->i_sb, map.m_pblk); | 1414 | map_bh(bh, inode->i_sb, map.m_pblk); |
1415 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; | 1415 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; |
1416 | bh->b_size = inode->i_sb->s_blocksize * map.m_len; | 1416 | bh->b_size = inode->i_sb->s_blocksize * map.m_len; |
1417 | ret = 0; | 1417 | ret = 0; |
1418 | } | 1418 | } |
1419 | if (started) | 1419 | if (started) |
1420 | ext4_journal_stop(handle); | 1420 | ext4_journal_stop(handle); |
1421 | return ret; | 1421 | return ret; |
1422 | } | 1422 | } |
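
Worth noting: the buffer_head doubles as the request/response structure in the get_block contract. The caller encodes how many blocks it wants in bh->b_size; on success, b_size comes back trimmed to the contiguous span actually mapped. A minimal sketch of a lookup-only caller (map_one_chunk() is hypothetical, not part of this file):

	/* Illustrative sketch: drive the get_block callback for a
	 * read-only lookup of up to nr_blocks starting at lblk. */
	static int map_one_chunk(struct inode *inode, sector_t lblk,
				 unsigned int nr_blocks)
	{
		struct buffer_head bh;
		int err;

		memset(&bh, 0, sizeof(bh));
		bh.b_size = (size_t)nr_blocks << inode->i_blkbits;
		err = ext4_get_block(inode, lblk, &bh, 0); /* create == 0 */
		if (err)
			return err;
		if (buffer_mapped(&bh))
			pr_info("lblk %llu -> pblk %llu, %zu bytes mapped\n",
				(unsigned long long)lblk,
				(unsigned long long)bh.b_blocknr, bh.b_size);
		return 0;
	}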

int ext4_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh, int create)
{
	return _ext4_get_block(inode, iblock, bh,
			       create ? EXT4_GET_BLOCKS_CREATE : 0);
}

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
				ext4_lblk_t block, int create, int *errp)
{
	struct ext4_map_blocks map;
	struct buffer_head *bh;
	int fatal = 0, err;

	J_ASSERT(handle != NULL || create == 0);

	map.m_lblk = block;
	map.m_len = 1;
	err = ext4_map_blocks(handle, inode, &map,
			      create ? EXT4_GET_BLOCKS_CREATE : 0);

	if (err < 0)
		*errp = err;
	if (err <= 0)
		return NULL;
	*errp = 0;

	bh = sb_getblk(inode->i_sb, map.m_pblk);
	if (!bh) {
		*errp = -EIO;
		return NULL;
	}
	if (map.m_flags & EXT4_MAP_NEW) {
		J_ASSERT(create != 0);
		J_ASSERT(handle != NULL);

		/*
		 * Now that we do not always journal data, we should
		 * keep in mind whether this should always journal the
		 * new buffer as metadata.  For now, regular file
		 * writes use ext4_get_block instead, so it's not a
		 * problem.
		 */
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		fatal = ext4_journal_get_create_access(handle, bh);
		if (!fatal && !buffer_uptodate(bh)) {
			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
			set_buffer_uptodate(bh);
		}
		unlock_buffer(bh);
		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (!fatal)
			fatal = err;
	} else {
		BUFFER_TRACE(bh, "not a new buffer");
	}
	if (fatal) {
		*errp = fatal;
		brelse(bh);
		bh = NULL;
	}
	return bh;
}

struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
			       ext4_lblk_t block, int create, int *err)
{
	struct buffer_head *bh;

	bh = ext4_getblk(handle, inode, block, create, err);
	if (!bh)
		return bh;
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ_META, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	put_bh(bh);
	*err = -EIO;
	return NULL;
}
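
ext4_bread() is the classic getblk-then-read idiom: grab the buffer, and only touch the disk if it is not already uptodate. The same pattern in isolation, against a generic superblock (a minimal sketch; read_block_sync() is a hypothetical helper):

	/* Illustrative only: read one block synchronously. */
	static struct buffer_head *read_block_sync(struct super_block *sb,
						   sector_t blocknr)
	{
		struct buffer_head *bh = sb_getblk(sb, blocknr);

		if (!bh)
			return NULL;		/* out of memory */
		if (buffer_uptodate(bh))
			return bh;		/* cache hit, no I/O */
		ll_rw_block(READ, 1, &bh);	/* submit the read ... */
		wait_on_buffer(bh);		/* ... wait for completion */
		if (buffer_uptodate(bh))
			return bh;
		put_bh(bh);			/* I/O error: drop our ref */
		return NULL;
	}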

static int walk_page_buffers(handle_t *handle,
			     struct buffer_head *head,
			     unsigned from,
			     unsigned to,
			     int *partial,
			     int (*fn)(handle_t *handle,
				       struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next) {
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}
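
walk_page_buffers() is the generic "apply fn to every buffer overlapping [from, to)" helper used throughout this file; *partial is set when a buffer outside the range is not uptodate, so the caller knows the page as a whole cannot be marked uptodate. A hypothetical callback, just to show the calling convention (not code from this file):

	/* Hypothetical callback: mark a buffer valid and dirty. */
	static int touch_buffer_fn(handle_t *handle, struct buffer_head *bh)
	{
		set_buffer_uptodate(bh);	/* contents are now valid */
		mark_buffer_dirty(bh);		/* schedule for writeback */
		return 0;
	}

	/* Applied to the byte range [from, to) of a locked page: */
	ret = walk_page_buffers(NULL, page_buffers(page), from, to,
				&partial, touch_buffer_fn);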

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext4_writepage() ->
 * block_write_full_page().  In that case, we *know* that ext4_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext4 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */
static int do_journal_get_write_access(handle_t *handle,
				       struct buffer_head *bh)
{
	int dirty = buffer_dirty(bh);
	int ret;

	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	/*
	 * __block_write_begin() could have dirtied some buffers.  Clean
	 * the dirty bit as jbd2_journal_get_write_access() could complain
	 * otherwise about fs integrity issues.  Setting of the dirty bit
	 * by __block_write_begin() isn't a real problem here as we clear
	 * the bit before releasing a page lock and thus writeback cannot
	 * ever write the buffer.
	 */
	if (dirty)
		clear_buffer_dirty(bh);
	ret = ext4_journal_get_write_access(handle, bh);
	if (!ret && dirty)
		ret = ext4_handle_dirty_metadata(handle, NULL, bh);
	return ret;
}
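
The handle->h_ref remark above refers to jbd2's handle nesting: when a task already has a handle open, starting a "new" one just bumps the reference count and returns the same handle, and the matching stop only truly completes the handle when the count drops back to zero, so no commit can run underneath the reentrant quota write. Roughly (a sketch of the idea, not jbd2's actual code):

	/* Sketch of nested-handle semantics (illustrative only). */
	handle_t *start_nested(journal_t *journal, int nblocks)
	{
		handle_t *handle = journal_current_handle();

		if (handle) {
			handle->h_ref++;	/* reentry: reuse handle */
			return handle;		/* commit cannot run now */
		}
		return jbd2_journal_start(journal, nblocks);
	}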

/*
 * Truncate blocks that were not used by write.  We have to truncate the
 * pagecache as well so that corresponding buffers get properly unmapped.
 */
static void ext4_truncate_failed_write(struct inode *inode)
{
	truncate_inode_pages(inode->i_mapping, inode->i_size);
	ext4_truncate(inode);
}

static int ext4_get_block_write(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	int ret, needed_blocks;
	handle_t *handle;
	int retries = 0;
	struct page *page;
	pgoff_t index;
	unsigned from, to;

	trace_ext4_write_begin(inode, pos, len, flags);
	/*
	 * Reserve one block more for addition to orphan list in case
	 * we allocate blocks but write fails for some reason
	 */
	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
	index = pos >> PAGE_CACHE_SHIFT;
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

retry:
	handle = ext4_journal_start(inode, needed_blocks);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	/* We cannot recurse into the filesystem as the transaction is already
	 * started */
	flags |= AOP_FLAG_NOFS;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page) {
		ext4_journal_stop(handle);
		ret = -ENOMEM;
		goto out;
	}
	*pagep = page;

	if (ext4_should_dioread_nolock(inode))
		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
	else
		ret = __block_write_begin(page, pos, len, ext4_get_block);

	if (!ret && ext4_should_journal_data(inode)) {
		ret = walk_page_buffers(handle, page_buffers(page),
				from, to, NULL, do_journal_get_write_access);
	}

	if (ret) {
		unlock_page(page);
		page_cache_release(page);
		/*
		 * __block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again.  Don't need
		 * i_size_read because we hold i_mutex.
		 *
		 * Add inode to orphan list in case we crash before
		 * truncate finishes
		 */
		if (pos + len > inode->i_size && ext4_can_truncate(inode))
			ext4_orphan_add(handle, inode);

		ext4_journal_stop(handle);
		if (pos + len > inode->i_size) {
			ext4_truncate_failed_write(inode);
			/*
			 * If truncate failed early the inode might
			 * still be on the orphan list; we need to
			 * make sure the inode is removed from the
			 * orphan list in that case.
			 */
			if (inode->i_nlink)
				ext4_orphan_del(NULL, inode);
		}
	}

	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
out:
	return ret;
}
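
The retry label implements ext4's standard ENOSPC backoff: ext4_should_retry_alloc() allows a bounded number of retries (forcing a journal commit so freshly released blocks become usable) before the failure is surfaced to the caller. The same idiom in isolation (a sketch; do_alloc() is a stand-in for any allocating operation):

	int retries = 0;
	int err;

	do {
		err = do_alloc(inode);	/* hypothetical allocating op */
	} while (err == -ENOSPC &&
		 ext4_should_retry_alloc(inode->i_sb, &retries));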

/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	set_buffer_uptodate(bh);
	return ext4_handle_dirty_metadata(handle, NULL, bh);
}

static int ext4_generic_write_end(struct file *file,
				  struct address_space *mapping,
				  loff_t pos, unsigned len, unsigned copied,
				  struct page *page, void *fsdata)
{
	int i_size_changed = 0;
	struct inode *inode = mapping->host;
	handle_t *handle = ext4_journal_current_handle();

	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 *
	 * But it's important to update i_size while still holding page lock:
	 * page writeout could otherwise come in and zero beyond i_size.
	 */
	if (pos + copied > inode->i_size) {
		i_size_write(inode, pos + copied);
		i_size_changed = 1;
	}

	if (pos + copied > EXT4_I(inode)->i_disksize) {
		/* We need to mark the inode dirty even if
		 * new_i_size is less than inode->i_size
		 * but greater than i_disksize (hint: delalloc)
		 */
		ext4_update_i_disksize(inode, (pos + copied));
		i_size_changed = 1;
	}
	unlock_page(page);
	page_cache_release(page);

	/*
	 * Don't mark the inode dirty under page lock.  First, it unnecessarily
	 * makes the holding time of page lock longer.  Second, it forces lock
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	if (i_size_changed)
		ext4_mark_inode_dirty(handle, inode);

	return copied;
}
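
The two sizes serve different masters: i_size is the in-memory file size, while i_disksize is the size the on-disk inode may claim, and it must never cover blocks that are not yet allocated. With delayed allocation a write can extend i_size while the blocks are still only reserved, so i_disksize lags and a later writeout catches it up. A small worked example (numbers assumed, not from this patch):

	/* blocksize 4096; file has i_size = i_disksize = 8192.       */
	/* Buffered write of 4096 bytes at offset 8192 (delalloc):    */
	/*   write_end:  i_size -> 12288; the block is only reserved, */
	/*               so i_disksize stays 8192.                     */
	/* Writeback allocates the block, then                          */
	/* ext4_update_i_disksize() moves i_disksize -> 12288 and the  */
	/* inode is marked dirty in the writeback transaction.         */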

/*
 * We need to pick up the new inode size which generic_commit_write gave us
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->private_list.  Metadata
 * buffers are managed internally.
 */
static int ext4_ordered_write_end(struct file *file,
				  struct address_space *mapping,
				  loff_t pos, unsigned len, unsigned copied,
				  struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;

	trace_ext4_ordered_write_end(inode, pos, len, copied);
	ret = ext4_jbd2_file_inode(handle, inode);

	if (ret == 0) {
		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
							page, fsdata);
		copied = ret2;
		if (pos + len > inode->i_size && ext4_can_truncate(inode))
			/* If we have allocated more blocks than we have
			 * copied, we will have blocks allocated outside
			 * inode->i_size, so truncate them.
			 */
			ext4_orphan_add(handle, inode);
		if (ret2 < 0)
			ret = ret2;
	}
	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	if (pos + len > inode->i_size) {
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might still be
		 * on the orphan list; we need to make sure the inode
		 * is removed from the orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return ret ? ret : copied;
}

static int ext4_writeback_write_end(struct file *file,
				    struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned copied,
				    struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;

	trace_ext4_writeback_write_end(inode, pos, len, copied);
	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
							page, fsdata);
	copied = ret2;
	if (pos + len > inode->i_size && ext4_can_truncate(inode))
		/* If we have allocated more blocks than we have
		 * copied, we will have blocks allocated outside
		 * inode->i_size, so truncate them.
		 */
		ext4_orphan_add(handle, inode);

	if (ret2 < 0)
		ret = ret2;

	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	if (pos + len > inode->i_size) {
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might still be
		 * on the orphan list; we need to make sure the inode
		 * is removed from the orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return ret ? ret : copied;
}

static int ext4_journalled_write_end(struct file *file,
				     struct address_space *mapping,
				     loff_t pos, unsigned len, unsigned copied,
				     struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;
	int partial = 0;
	unsigned from, to;
	loff_t new_i_size;

	trace_ext4_journalled_write_end(inode, pos, len, copied);
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

	if (copied < len) {
		if (!PageUptodate(page))
			copied = 0;
		page_zero_new_buffers(page, from + copied, to);
	}

	ret = walk_page_buffers(handle, page_buffers(page), from,
				to, &partial, write_end_fn);
	if (!partial)
		SetPageUptodate(page);
	new_i_size = pos + copied;
	if (new_i_size > inode->i_size)
		i_size_write(inode, pos + copied);
	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
	if (new_i_size > EXT4_I(inode)->i_disksize) {
		ext4_update_i_disksize(inode, new_i_size);
		ret2 = ext4_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
	}

	unlock_page(page);
	page_cache_release(page);
	if (pos + len > inode->i_size && ext4_can_truncate(inode))
		/* If we have allocated more blocks than we have
		 * copied, we will have blocks allocated outside
		 * inode->i_size, so truncate them.
		 */
		ext4_orphan_add(handle, inode);

	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;
	if (pos + len > inode->i_size) {
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might still be
		 * on the orphan list; we need to make sure the inode
		 * is removed from the orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return ret ? ret : copied;
}
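
These three write_end variants correspond to ext4's data journaling modes (data=ordered, data=writeback, data=journal), and the right one is wired in through the inode's address_space_operations. Schematically (a hedged sketch of the dispatch; the real ext4_set_aops() elsewhere in this file also has to pick the delalloc table):

	/* Sketch: per-mode aops selection, simplified. */
	if (ext4_should_order_data(inode))
		inode->i_mapping->a_ops = &ext4_ordered_aops;
	else if (ext4_should_writeback_data(inode))
		inode->i_mapping->a_ops = &ext4_writeback_aops;
	else
		inode->i_mapping->a_ops = &ext4_journalled_aops;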

/*
 * Reserve a single block located at lblock
 */
static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
{
	int retries = 0;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned long md_needed;
	int ret;

	/*
	 * Recalculate the amount of metadata blocks to reserve
	 * in order to allocate nrblocks; the worst case is one
	 * extent per block.
	 */
repeat:
	spin_lock(&ei->i_block_reservation_lock);
	md_needed = ext4_calc_metadata_amount(inode, lblock);
	trace_ext4_da_reserve_space(inode, md_needed);
	spin_unlock(&ei->i_block_reservation_lock);

	/*
	 * We will charge metadata quota at writeout time; this saves
	 * us from metadata over-estimation, though we may go over by
	 * a small amount in the end.  Here we just reserve for data.
	 */
	ret = dquot_reserve_block(inode, 1);
	if (ret)
		return ret;
	/*
	 * We do still charge estimated metadata to the sb though;
	 * we cannot afford to run out of free blocks.
	 */
	if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
		dquot_release_reservation_block(inode, 1);
		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
			yield();
			goto repeat;
		}
		return -ENOSPC;
	}
	spin_lock(&ei->i_block_reservation_lock);
	ei->i_reserved_data_blocks++;
	ei->i_reserved_meta_blocks += md_needed;
	spin_unlock(&ei->i_block_reservation_lock);

	return 0;	/* success */
}
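
Each reservation therefore covers md_needed + 1 blocks: one data block charged to the quota immediately, plus the metadata estimate charged only against the superblock's free-block count, with metadata quota settled later at writeout. A concrete illustration (the figure of 2 is assumed, not from the patch):

	/* Suppose ext4_calc_metadata_amount() returns 2 for this lblock: */
	dquot_reserve_block(inode, 1);		/* quota: 1 data block   */
	ext4_claim_free_blocks(sbi, 2 + 1, 0);	/* sb:    3 blocks total */
	/* on success:                                                   */
	/*   ei->i_reserved_data_blocks += 1;                            */
	/*   ei->i_reserved_meta_blocks += 2;                            */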

static void ext4_da_release_space(struct inode *inode, int to_free)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!to_free)
		return;		/* Nothing to release, exit */

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);

	trace_ext4_da_release_space(inode, to_free);
	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
		/*
		 * If there aren't enough reserved blocks, then the
		 * counter is messed up somewhere.  Since this
		 * function is called from the invalidatepage path,
		 * it's harmless to return without any action.
		 */
		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
			 "ino %lu, to_free %d with only %d reserved "
			 "data blocks\n", inode->i_ino, to_free,
			 ei->i_reserved_data_blocks);
		WARN_ON(1);
		to_free = ei->i_reserved_data_blocks;
	}
	ei->i_reserved_data_blocks -= to_free;

	if (ei->i_reserved_data_blocks == 0) {
		/*
		 * We can release all of the reserved metadata blocks
		 * only when we have written all of the delayed
		 * allocation blocks.
		 */
		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
				   ei->i_reserved_meta_blocks);
		ei->i_reserved_meta_blocks = 0;
		ei->i_da_metadata_calc_len = 0;
	}

	/* update fs dirty data blocks counter */
	percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);

	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

	dquot_release_reservation_block(inode, to_free);
}

static void ext4_da_page_release_reservation(struct page *page,
					     unsigned long offset)
{
	int to_release = 0;
	struct buffer_head *head, *bh;
	unsigned int curr_off = 0;

	head = page_buffers(page);
	bh = head;
	do {
		unsigned int next_off = curr_off + bh->b_size;

		if ((offset <= curr_off) && (buffer_delay(bh))) {
			to_release++;
			clear_buffer_delay(bh);
		}
		curr_off = next_off;
	} while ((bh = bh->b_this_page) != head);
	ext4_da_release_space(page->mapping->host, to_release);
}
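
The loop counts, among the buffers at or past offset, those still carrying the delayed bit, and hands that count to ext4_da_release_space(). A quick worked example with assumed geometry:

	/* page size 4096, blocksize 1024, invalidating from offset 2048:
	 *   buffer offsets within the page: 0, 1024, 2048, 3072
	 *   (offset <= curr_off) holds for the buffers at 2048 and 3072
	 *   -> to_release == number of those two with buffer_delay() set,
	 *      i.e. up to two data-block reservations are returned.
	 */
	ext4_da_page_release_reservation(page, 2048);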

/*
 * Delayed allocation stuff
 */

/*
 * mpage_da_submit_io - walks through the extent of pages and tries to
 * write them out with the writepage() callback
 *
 * @mpd->inode: inode
 * @mpd->first_page: first page of the extent
 * @mpd->next_page: page after the last page of the extent
 *
 * By the time mpage_da_submit_io() is called we expect all blocks
 * to be allocated.  This may be wrong if allocation failed.
 *
 * As pages are already locked by write_cache_pages(), we can't use it
 */
static int mpage_da_submit_io(struct mpage_da_data *mpd,
			      struct ext4_map_blocks *map)
{
	struct pagevec pvec;
	unsigned long index, end;
	int ret = 0, err, nr_pages, i;
	struct inode *inode = mpd->inode;
	struct address_space *mapping = inode->i_mapping;
	loff_t size = i_size_read(inode);
	unsigned int len, block_start;
	struct buffer_head *bh, *page_bufs = NULL;
	int journal_data = ext4_should_journal_data(inode);
	sector_t pblock = 0, cur_logical = 0;
	struct ext4_io_submit io_submit;

	BUG_ON(mpd->next_page <= mpd->first_page);
	memset(&io_submit, 0, sizeof(io_submit));
	/*
	 * We need to start from the first_page to the next_page - 1
	 * to make sure we also write the mapped dirty buffer_heads.
	 * If we look at mpd->b_blocknr we would only be looking
	 * at the currently mapped buffer_heads.
	 */
	index = mpd->first_page;
	end = mpd->next_page - 1;

	pagevec_init(&pvec, 0);
	while (index <= end) {
		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			int commit_write = 0, skip_page = 0;
			struct page *page = pvec.pages[i];

			index = page->index;
			if (index > end)
				break;

			if (index == size >> PAGE_CACHE_SHIFT)
				len = size & ~PAGE_CACHE_MASK;
			else
				len = PAGE_CACHE_SIZE;
			if (map) {
				cur_logical = index << (PAGE_CACHE_SHIFT -
							inode->i_blkbits);
				pblock = map->m_pblk + (cur_logical -
							map->m_lblk);
			}
			index++;

			BUG_ON(!PageLocked(page));
			BUG_ON(PageWriteback(page));

			/*
			 * If the page does not have buffers (for
			 * whatever reason), try to create them using
			 * __block_write_begin.  If this fails,
			 * skip the page and move on.
			 */
			if (!page_has_buffers(page)) {
				if (__block_write_begin(page, 0, len,
						noalloc_get_block_write)) {
				skip_page:
					unlock_page(page);
					continue;
				}
				commit_write = 1;
			}

			bh = page_bufs = page_buffers(page);
			block_start = 0;
			do {
				if (!bh)
					goto skip_page;
				if (map && (cur_logical >= map->m_lblk) &&
				    (cur_logical <= (map->m_lblk +
						     (map->m_len - 1)))) {
					if (buffer_delay(bh)) {
						clear_buffer_delay(bh);
						bh->b_blocknr = pblock;
					}
					if (buffer_unwritten(bh) ||
					    buffer_mapped(bh))
						BUG_ON(bh->b_blocknr != pblock);
					if (map->m_flags & EXT4_MAP_UNINIT)
						set_buffer_uninit(bh);
					clear_buffer_unwritten(bh);
				}

				/* skip page if block allocation undone */
				if (buffer_delay(bh) || buffer_unwritten(bh))
					skip_page = 1;
				bh = bh->b_this_page;
				block_start += bh->b_size;
				cur_logical++;
				pblock++;
			} while (bh != page_bufs);

			if (skip_page)
				goto skip_page;

			if (commit_write)
				/* mark the buffer_heads as dirty & uptodate */
				block_commit_write(page, 0, len);

			clear_page_dirty_for_io(page);
			/*
			 * Delalloc doesn't support data journalling,
			 * but eventually maybe we'll lift this
			 * restriction.
			 */
			if (unlikely(journal_data && PageChecked(page)))
				err = __ext4_journalled_writepage(page, len);
			else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
				err = ext4_bio_write_page(&io_submit, page,
							  len, mpd->wbc);
			else
				err = block_write_full_page(page,
					noalloc_get_block_write, mpd->wbc);

			if (!err)
				mpd->pages_written++;
			/*
			 * In error case, we have to continue because
			 * remaining pages are still locked
			 */
			if (ret == 0)
				ret = err;
		}
		pagevec_release(&pvec);
	}
	ext4_io_submit(&io_submit);
	return ret;
}
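
The outer loop is the standard pagevec walk over a page-index range: look up at most PAGEVEC_SIZE pages at a time, advance the index past the last page seen, and stop once the range is exhausted. Stripped of the ext4 specifics (a sketch; process_page() is a placeholder):

	pgoff_t index = first, end = last;	/* assumed range bounds */
	struct pagevec pvec;
	int i, nr_pages;

	pagevec_init(&pvec, 0);
	while (index <= end) {
		nr_pages = pagevec_lookup(&pvec, mapping, index,
					  PAGEVEC_SIZE);
		if (nr_pages == 0)
			break;			/* no more pages in range */
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			if (page->index > end)
				break;
			index = page->index + 1;	/* resume point */
			process_page(page);		/* placeholder */
		}
		pagevec_release(&pvec);
	}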
2169 | 2169 | ||
2170 | static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) | 2170 | static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) |
2171 | { | 2171 | { |
2172 | int nr_pages, i; | 2172 | int nr_pages, i; |
2173 | pgoff_t index, end; | 2173 | pgoff_t index, end; |
2174 | struct pagevec pvec; | 2174 | struct pagevec pvec; |
2175 | struct inode *inode = mpd->inode; | 2175 | struct inode *inode = mpd->inode; |
2176 | struct address_space *mapping = inode->i_mapping; | 2176 | struct address_space *mapping = inode->i_mapping; |
2177 | 2177 | ||
2178 | index = mpd->first_page; | 2178 | index = mpd->first_page; |
2179 | end = mpd->next_page - 1; | 2179 | end = mpd->next_page - 1; |
2180 | while (index <= end) { | 2180 | while (index <= end) { |
2181 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | 2181 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); |
2182 | if (nr_pages == 0) | 2182 | if (nr_pages == 0) |
2183 | break; | 2183 | break; |
2184 | for (i = 0; i < nr_pages; i++) { | 2184 | for (i = 0; i < nr_pages; i++) { |
2185 | struct page *page = pvec.pages[i]; | 2185 | struct page *page = pvec.pages[i]; |
2186 | if (page->index > end) | 2186 | if (page->index > end) |
2187 | break; | 2187 | break; |
2188 | BUG_ON(!PageLocked(page)); | 2188 | BUG_ON(!PageLocked(page)); |
2189 | BUG_ON(PageWriteback(page)); | 2189 | BUG_ON(PageWriteback(page)); |
2190 | block_invalidatepage(page, 0); | 2190 | block_invalidatepage(page, 0); |
2191 | ClearPageUptodate(page); | 2191 | ClearPageUptodate(page); |
2192 | unlock_page(page); | 2192 | unlock_page(page); |
2193 | } | 2193 | } |
2194 | index = pvec.pages[nr_pages - 1]->index + 1; | 2194 | index = pvec.pages[nr_pages - 1]->index + 1; |
2195 | pagevec_release(&pvec); | 2195 | pagevec_release(&pvec); |
2196 | } | 2196 | } |
2197 | return; | 2197 | return; |
2198 | } | 2198 | } |
2199 | 2199 | ||
2200 | static void ext4_print_free_blocks(struct inode *inode) | 2200 | static void ext4_print_free_blocks(struct inode *inode) |
2201 | { | 2201 | { |
2202 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2202 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
2203 | printk(KERN_CRIT "Total free blocks count %lld\n", | 2203 | printk(KERN_CRIT "Total free blocks count %lld\n", |
2204 | ext4_count_free_blocks(inode->i_sb)); | 2204 | ext4_count_free_blocks(inode->i_sb)); |
2205 | printk(KERN_CRIT "Free/Dirty block details\n"); | 2205 | printk(KERN_CRIT "Free/Dirty block details\n"); |
2206 | printk(KERN_CRIT "free_blocks=%lld\n", | 2206 | printk(KERN_CRIT "free_blocks=%lld\n", |
2207 | (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); | 2207 | (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); |
2208 | printk(KERN_CRIT "dirty_blocks=%lld\n", | 2208 | printk(KERN_CRIT "dirty_blocks=%lld\n", |
2209 | (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); | 2209 | (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); |
2210 | printk(KERN_CRIT "Block reservation details\n"); | 2210 | printk(KERN_CRIT "Block reservation details\n"); |
2211 | printk(KERN_CRIT "i_reserved_data_blocks=%u\n", | 2211 | printk(KERN_CRIT "i_reserved_data_blocks=%u\n", |
2212 | EXT4_I(inode)->i_reserved_data_blocks); | 2212 | EXT4_I(inode)->i_reserved_data_blocks); |
2213 | printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", | 2213 | printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", |
2214 | EXT4_I(inode)->i_reserved_meta_blocks); | 2214 | EXT4_I(inode)->i_reserved_meta_blocks); |
2215 | return; | 2215 | return; |
2216 | } | 2216 | } |
2217 | 2217 | ||
2218 | /* | 2218 | /* |
2219 | * mpage_da_map_and_submit - go through given space, map them | 2219 | * mpage_da_map_and_submit - go through given space, map them |
2220 | * if necessary, and then submit them for I/O | 2220 | * if necessary, and then submit them for I/O |
2221 | * | 2221 | * |
2222 | * @mpd - bh describing space | 2222 | * @mpd - bh describing space |
2223 | * | 2223 | * |
2224 | * The function skips space we know is already mapped to disk blocks. | 2224 | * The function skips space we know is already mapped to disk blocks. |
2225 | * | 2225 | * |
2226 | */ | 2226 | */ |
2227 | static void mpage_da_map_and_submit(struct mpage_da_data *mpd) | 2227 | static void mpage_da_map_and_submit(struct mpage_da_data *mpd) |
2228 | { | 2228 | { |
2229 | int err, blks, get_blocks_flags; | 2229 | int err, blks, get_blocks_flags; |
2230 | struct ext4_map_blocks map, *mapp = NULL; | 2230 | struct ext4_map_blocks map, *mapp = NULL; |
2231 | sector_t next = mpd->b_blocknr; | 2231 | sector_t next = mpd->b_blocknr; |
2232 | unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; | 2232 | unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; |
2233 | loff_t disksize = EXT4_I(mpd->inode)->i_disksize; | 2233 | loff_t disksize = EXT4_I(mpd->inode)->i_disksize; |
2234 | handle_t *handle = NULL; | 2234 | handle_t *handle = NULL; |
2235 | 2235 | ||
2236 | /* | 2236 | /* |
2237 | * If the blocks are mapped already, or we couldn't accumulate | 2237 | * If the blocks are mapped already, or we couldn't accumulate |
2238 | * any blocks, then proceed immediately to the submission stage. | 2238 | * any blocks, then proceed immediately to the submission stage. |
2239 | */ | 2239 | */ |
2240 | if ((mpd->b_size == 0) || | 2240 | if ((mpd->b_size == 0) || |
2241 | ((mpd->b_state & (1 << BH_Mapped)) && | 2241 | ((mpd->b_state & (1 << BH_Mapped)) && |
2242 | !(mpd->b_state & (1 << BH_Delay)) && | 2242 | !(mpd->b_state & (1 << BH_Delay)) && |
2243 | !(mpd->b_state & (1 << BH_Unwritten)))) | 2243 | !(mpd->b_state & (1 << BH_Unwritten)))) |
2244 | goto submit_io; | 2244 | goto submit_io; |
2245 | 2245 | ||
2246 | handle = ext4_journal_current_handle(); | 2246 | handle = ext4_journal_current_handle(); |
2247 | BUG_ON(!handle); | 2247 | BUG_ON(!handle); |
2248 | 2248 | ||
2249 | /* | 2249 | /* |
2250 | * Call ext4_map_blocks() to allocate any delayed allocation | 2250 | * Call ext4_map_blocks() to allocate any delayed allocation |
2251 | * blocks, or to convert an uninitialized extent to be | 2251 | * blocks, or to convert an uninitialized extent to be |
2252 | * initialized (in the case where we have written into | 2252 | * initialized (in the case where we have written into |
2253 | * one or more preallocated blocks). | 2253 | * one or more preallocated blocks). |
2254 | * | 2254 | * |
2255 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to | 2255 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to |
2256 | * indicate that we are on the delayed allocation path. This | 2256 | * indicate that we are on the delayed allocation path. This |
2257 | * affects functions in many different parts of the allocation | 2257 | * affects functions in many different parts of the allocation |
2258 | * call path. This flag exists primarily because we don't | 2258 | * call path. This flag exists primarily because we don't |
2259 | * want to change *many* call functions, so ext4_map_blocks() | 2259 | * want to change *many* call functions, so ext4_map_blocks() |
2260 | * will set the EXT4_STATE_DELALLOC_RESERVED flag once the | 2260 | * will set the EXT4_STATE_DELALLOC_RESERVED flag once the |
2261 | * inode's allocation semaphore is taken. | 2261 | * inode's allocation semaphore is taken. |
2262 | * | 2262 | * |
2263 | * If the blocks in question were delalloc blocks, set | 2263 | * If the blocks in question were delalloc blocks, set |
2264 | * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting | 2264 | * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting |
2265 | * variables are updated after the blocks have been allocated. | 2265 | * variables are updated after the blocks have been allocated. |
2266 | */ | 2266 | */ |
2267 | map.m_lblk = next; | 2267 | map.m_lblk = next; |
2268 | map.m_len = max_blocks; | 2268 | map.m_len = max_blocks; |
2269 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE; | 2269 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE; |
2270 | if (ext4_should_dioread_nolock(mpd->inode)) | 2270 | if (ext4_should_dioread_nolock(mpd->inode)) |
2271 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | 2271 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; |
2272 | if (mpd->b_state & (1 << BH_Delay)) | 2272 | if (mpd->b_state & (1 << BH_Delay)) |
2273 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; | 2273 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; |
2274 | 2274 | ||
2275 | blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); | 2275 | blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); |
2276 | if (blks < 0) { | 2276 | if (blks < 0) { |
2277 | struct super_block *sb = mpd->inode->i_sb; | 2277 | struct super_block *sb = mpd->inode->i_sb; |
2278 | 2278 | ||
2279 | err = blks; | 2279 | err = blks; |
2280 | /* | 2280 | /* |
2281 | * If get_block returns EAGAIN or ENOSPC and there | 2281 | * If get_block returns EAGAIN or ENOSPC and there |
2282 | * appear to be free blocks, we just let | 2282 | * appear to be free blocks, we just let |
2283 | * mpage_da_submit_io() unlock all of the pages. | 2283 | * mpage_da_submit_io() unlock all of the pages. |
2284 | */ | 2284 | */ |
2285 | if (err == -EAGAIN) | 2285 | if (err == -EAGAIN) |
2286 | goto submit_io; | 2286 | goto submit_io; |
2287 | 2287 | ||
2288 | if (err == -ENOSPC && | 2288 | if (err == -ENOSPC && |
2289 | ext4_count_free_blocks(sb)) { | 2289 | ext4_count_free_blocks(sb)) { |
2290 | mpd->retval = err; | 2290 | mpd->retval = err; |
2291 | goto submit_io; | 2291 | goto submit_io; |
2292 | } | 2292 | } |
2293 | 2293 | ||
2294 | /* | 2294 | /* |
2295 | * A get_block failure will cause us to loop in | 2295 | * A get_block failure will cause us to loop in |
2296 | * writepages, because a_ops->writepage won't be able | 2296 | * writepages, because a_ops->writepage won't be able |
2297 | * to make progress. The page will be redirtied by | 2297 | * to make progress. The page will be redirtied by |
2298 | * writepage and writepages will again try to write | 2298 | * writepage and writepages will again try to write |
2299 | * the same pages. | 2299 | * the same pages. |
2300 | */ | 2300 | */ |
2301 | if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { | 2301 | if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { |
2302 | ext4_msg(sb, KERN_CRIT, | 2302 | ext4_msg(sb, KERN_CRIT, |
2303 | "delayed block allocation failed for inode %lu " | 2303 | "delayed block allocation failed for inode %lu " |
2304 | "at logical offset %llu with max blocks %zd " | 2304 | "at logical offset %llu with max blocks %zd " |
2305 | "with error %d", mpd->inode->i_ino, | 2305 | "with error %d", mpd->inode->i_ino, |
2306 | (unsigned long long) next, | 2306 | (unsigned long long) next, |
2307 | mpd->b_size >> mpd->inode->i_blkbits, err); | 2307 | mpd->b_size >> mpd->inode->i_blkbits, err); |
2308 | ext4_msg(sb, KERN_CRIT, | 2308 | ext4_msg(sb, KERN_CRIT, |
2309 | "This should not happen!! Data will be lost\n"); | 2309 | "This should not happen!! Data will be lost\n"); |
2310 | if (err == -ENOSPC) | 2310 | if (err == -ENOSPC) |
2311 | ext4_print_free_blocks(mpd->inode); | 2311 | ext4_print_free_blocks(mpd->inode); |
2312 | } | 2312 | } |
2313 | /* invalidate all the pages */ | 2313 | /* invalidate all the pages */ |
2314 | ext4_da_block_invalidatepages(mpd); | 2314 | ext4_da_block_invalidatepages(mpd); |
2315 | 2315 | ||
2316 | /* Mark this page range as having been completed */ | 2316 | /* Mark this page range as having been completed */ |
2317 | mpd->io_done = 1; | 2317 | mpd->io_done = 1; |
2318 | return; | 2318 | return; |
2319 | } | 2319 | } |
2320 | BUG_ON(blks == 0); | 2320 | BUG_ON(blks == 0); |
2321 | 2321 | ||
2322 | mapp = &map; | 2322 | mapp = &map; |
2323 | if (map.m_flags & EXT4_MAP_NEW) { | 2323 | if (map.m_flags & EXT4_MAP_NEW) { |
2324 | struct block_device *bdev = mpd->inode->i_sb->s_bdev; | 2324 | struct block_device *bdev = mpd->inode->i_sb->s_bdev; |
2325 | int i; | 2325 | int i; |
2326 | 2326 | ||
2327 | for (i = 0; i < map.m_len; i++) | 2327 | for (i = 0; i < map.m_len; i++) |
2328 | unmap_underlying_metadata(bdev, map.m_pblk + i); | 2328 | unmap_underlying_metadata(bdev, map.m_pblk + i); |
2329 | } | 2329 | } |
2330 | 2330 | ||
2331 | if (ext4_should_order_data(mpd->inode)) { | 2331 | if (ext4_should_order_data(mpd->inode)) { |
2332 | err = ext4_jbd2_file_inode(handle, mpd->inode); | 2332 | err = ext4_jbd2_file_inode(handle, mpd->inode); |
2333 | if (err) | 2333 | if (err) |
2334 | /* This only happens if the journal is aborted */ | 2334 | /* This only happens if the journal is aborted */ |
2335 | return; | 2335 | return; |
2336 | } | 2336 | } |
2337 | 2337 | ||
2338 | /* | 2338 | /* |
2339 | * Update on-disk size along with block allocation. | 2339 | * Update on-disk size along with block allocation. |
2340 | */ | 2340 | */ |
2341 | disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; | 2341 | disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; |
2342 | if (disksize > i_size_read(mpd->inode)) | 2342 | if (disksize > i_size_read(mpd->inode)) |
2343 | disksize = i_size_read(mpd->inode); | 2343 | disksize = i_size_read(mpd->inode); |
2344 | if (disksize > EXT4_I(mpd->inode)->i_disksize) { | 2344 | if (disksize > EXT4_I(mpd->inode)->i_disksize) { |
2345 | ext4_update_i_disksize(mpd->inode, disksize); | 2345 | ext4_update_i_disksize(mpd->inode, disksize); |
2346 | err = ext4_mark_inode_dirty(handle, mpd->inode); | 2346 | err = ext4_mark_inode_dirty(handle, mpd->inode); |
2347 | if (err) | 2347 | if (err) |
2348 | ext4_error(mpd->inode->i_sb, | 2348 | ext4_error(mpd->inode->i_sb, |
2349 | "Failed to mark inode %lu dirty", | 2349 | "Failed to mark inode %lu dirty", |
2350 | mpd->inode->i_ino); | 2350 | mpd->inode->i_ino); |
2351 | } | 2351 | } |
2352 | 2352 | ||
2353 | submit_io: | 2353 | submit_io: |
2354 | mpage_da_submit_io(mpd, mapp); | 2354 | mpage_da_submit_io(mpd, mapp); |
2355 | mpd->io_done = 1; | 2355 | mpd->io_done = 1; |
2356 | } | 2356 | } |
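
A minimal userspace sketch of the i_disksize update above: the on-disk size is advanced to the end of the newly mapped range but never past the in-core i_size. The block size and sizes below are made-up values for illustration (i_blkbits == 12, i.e. assumed 4K filesystem blocks).

#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;	/* assumed 4K filesystem blocks */
	unsigned long long next = 100;	/* first logical block just mapped */
	int blks = 8;			/* number of blocks mapped */
	long long i_size = 420000;	/* in-core inode size */

	/* same arithmetic as the disksize update in mpage_da_map_and_submit() */
	long long disksize = (long long)(next + blks) << blkbits;
	if (disksize > i_size)
		disksize = i_size;	/* never advance past i_size */
	printf("i_disksize would become %lld\n", disksize);
	return 0;
}
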
2357 | 2357 | ||
2358 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ | 2358 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ |
2359 | (1 << BH_Delay) | (1 << BH_Unwritten)) | 2359 | (1 << BH_Delay) | (1 << BH_Unwritten)) |
2360 | 2360 | ||
2361 | /* | 2361 | /* |
2362 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks | 2362 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks |
2363 | * | 2363 | * |
2364 | * @mpd->lbh - extent of blocks | 2364 | * @mpd->lbh - extent of blocks |
2365 | * @logical - logical number of the block in the file | 2365 | * @logical - logical number of the block in the file |
2366 | * @bh - bh of the block (used to access block's state) | 2366 | * @bh - bh of the block (used to access block's state) |
2367 | * | 2367 | * |
2368 | * the function is used to collect contiguous blocks in the same state | 2368 | * the function is used to collect contiguous blocks in the same state |
2369 | */ | 2369 | */ |
2370 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, | 2370 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, |
2371 | sector_t logical, size_t b_size, | 2371 | sector_t logical, size_t b_size, |
2372 | unsigned long b_state) | 2372 | unsigned long b_state) |
2373 | { | 2373 | { |
2374 | sector_t next; | 2374 | sector_t next; |
2375 | int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; | 2375 | int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; |
2376 | 2376 | ||
2377 | /* | 2377 | /* |
2378 | * XXX Don't go larger than mballoc is willing to allocate | 2378 | * XXX Don't go larger than mballoc is willing to allocate |
2379 | * This is a stopgap solution. We eventually need to fold | 2379 | * This is a stopgap solution. We eventually need to fold |
2380 | * mpage_da_submit_io() into this function and then call | 2380 | * mpage_da_submit_io() into this function and then call |
2381 | * ext4_map_blocks() multiple times in a loop | 2381 | * ext4_map_blocks() multiple times in a loop |
2382 | */ | 2382 | */ |
2383 | if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) | 2383 | if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) |
2384 | goto flush_it; | 2384 | goto flush_it; |
2385 | 2385 | ||
2386 | /* check if the reserved journal credits might overflow */ | 2386 | /* check if the reserved journal credits might overflow */ |
2387 | if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { | 2387 | if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { |
2388 | if (nrblocks >= EXT4_MAX_TRANS_DATA) { | 2388 | if (nrblocks >= EXT4_MAX_TRANS_DATA) { |
2389 | /* | 2389 | /* |
2390 | * With non-extent format we are limited by the journal | 2390 | * With non-extent format we are limited by the journal |
2391 | * credits available. The total credit needed to insert | 2391 | * credits available. The total credit needed to insert |
2392 | * nrblocks contiguous blocks depends on | 2392 | * nrblocks contiguous blocks depends on |
2393 | * nrblocks. So limit nrblocks. | 2393 | * nrblocks. So limit nrblocks. |
2394 | */ | 2394 | */ |
2395 | goto flush_it; | 2395 | goto flush_it; |
2396 | } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > | 2396 | } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > |
2397 | EXT4_MAX_TRANS_DATA) { | 2397 | EXT4_MAX_TRANS_DATA) { |
2398 | /* | 2398 | /* |
2399 | * Adding the new buffer_head would make it cross the | 2399 | * Adding the new buffer_head would make it cross the |
2400 | * allowed limit for which we have journal credit | 2400 | * allowed limit for which we have journal credit |
2401 | * reserved. So limit the new bh->b_size | 2401 | * reserved. So limit the new bh->b_size |
2402 | */ | 2402 | */ |
2403 | b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << | 2403 | b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << |
2404 | mpd->inode->i_blkbits; | 2404 | mpd->inode->i_blkbits; |
2405 | /* we will do mpage_da_submit_io in the next loop */ | 2405 | /* we will do mpage_da_submit_io in the next loop */ |
2406 | } | 2406 | } |
2407 | } | 2407 | } |
2408 | /* | 2408 | /* |
2409 | * First block in the extent | 2409 | * First block in the extent |
2410 | */ | 2410 | */ |
2411 | if (mpd->b_size == 0) { | 2411 | if (mpd->b_size == 0) { |
2412 | mpd->b_blocknr = logical; | 2412 | mpd->b_blocknr = logical; |
2413 | mpd->b_size = b_size; | 2413 | mpd->b_size = b_size; |
2414 | mpd->b_state = b_state & BH_FLAGS; | 2414 | mpd->b_state = b_state & BH_FLAGS; |
2415 | return; | 2415 | return; |
2416 | } | 2416 | } |
2417 | 2417 | ||
2418 | next = mpd->b_blocknr + nrblocks; | 2418 | next = mpd->b_blocknr + nrblocks; |
2419 | /* | 2419 | /* |
2420 | * Can we merge the block to our big extent? | 2420 | * Can we merge the block to our big extent? |
2421 | */ | 2421 | */ |
2422 | if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { | 2422 | if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { |
2423 | mpd->b_size += b_size; | 2423 | mpd->b_size += b_size; |
2424 | return; | 2424 | return; |
2425 | } | 2425 | } |
2426 | 2426 | ||
2427 | flush_it: | 2427 | flush_it: |
2428 | /* | 2428 | /* |
2429 | * We couldn't merge the block to our extent, so we | 2429 | * We couldn't merge the block to our extent, so we |
2430 | * need to flush current extent and start new one | 2430 | * need to flush current extent and start new one |
2431 | */ | 2431 | */ |
2432 | mpage_da_map_and_submit(mpd); | 2432 | mpage_da_map_and_submit(mpd); |
2433 | return; | 2433 | return; |
2434 | } | 2434 | } |
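
The merge test above reduces to "the candidate block starts right after the current extent and has the same masked BH_FLAGS state"; anything else flushes the extent first (the extent is also capped at 8 MiB worth of blocks). A standalone sketch of the test, with stand-in bit positions for the demo (the real BH_* numbers live in the kernel headers):

#include <stdio.h>

#define DEMO_BH_MAPPED	(1 << 0)	/* stand-ins for the real BH_* bits */
#define DEMO_BH_DELAY	(1 << 1)
#define DEMO_BH_FLAGS	(DEMO_BH_MAPPED | DEMO_BH_DELAY)

int main(void)
{
	unsigned long long ext_start = 64, ext_len = 16;	/* current extent, in blocks */
	unsigned long ext_state = DEMO_BH_DELAY;		/* its masked b_state */
	unsigned long long logical = 80;			/* candidate block */
	unsigned long b_state = DEMO_BH_DELAY | (1 << 5);	/* stray bit is masked off */

	if (logical == ext_start + ext_len &&
	    (b_state & DEMO_BH_FLAGS) == ext_state)
		printf("block %llu merges into the extent\n", logical);
	else
		printf("block %llu forces a flush\n", logical);
	return 0;
}
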
2435 | 2435 | ||
2436 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) | 2436 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) |
2437 | { | 2437 | { |
2438 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); | 2438 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); |
2439 | } | 2439 | } |
2440 | 2440 | ||
2441 | /* | 2441 | /* |
2442 | * This is a special get_blocks_t callback which is used by | 2442 | * This is a special get_blocks_t callback which is used by |
2443 | * ext4_da_write_begin(). It will either return a mapped block or | 2443 | * ext4_da_write_begin(). It will either return a mapped block or |
2444 | * reserve space for a single block. | 2444 | * reserve space for a single block. |
2445 | * | 2445 | * |
2446 | * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. | 2446 | * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. |
2447 | * We also have b_blocknr = -1 and b_bdev initialized properly | 2447 | * We also have b_blocknr = -1 and b_bdev initialized properly |
2448 | * | 2448 | * |
2449 | * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. | 2449 | * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. |
2450 | * We also have b_blocknr = the physical block mapping the unwritten extent and b_bdev | 2450 | * We also have b_blocknr = the physical block mapping the unwritten extent and b_bdev |
2451 | * initialized properly. | 2451 | * initialized properly. |
2452 | */ | 2452 | */ |
2453 | static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | 2453 | static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, |
2454 | struct buffer_head *bh, int create) | 2454 | struct buffer_head *bh, int create) |
2455 | { | 2455 | { |
2456 | struct ext4_map_blocks map; | 2456 | struct ext4_map_blocks map; |
2457 | int ret = 0; | 2457 | int ret = 0; |
2458 | sector_t invalid_block = ~((sector_t) 0xffff); | 2458 | sector_t invalid_block = ~((sector_t) 0xffff); |
2459 | 2459 | ||
2460 | if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) | 2460 | if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) |
2461 | invalid_block = ~0; | 2461 | invalid_block = ~0; |
2462 | 2462 | ||
2463 | BUG_ON(create == 0); | 2463 | BUG_ON(create == 0); |
2464 | BUG_ON(bh->b_size != inode->i_sb->s_blocksize); | 2464 | BUG_ON(bh->b_size != inode->i_sb->s_blocksize); |
2465 | 2465 | ||
2466 | map.m_lblk = iblock; | 2466 | map.m_lblk = iblock; |
2467 | map.m_len = 1; | 2467 | map.m_len = 1; |
2468 | 2468 | ||
2469 | /* | 2469 | /* |
2470 | * first, we need to know whether the block is allocated already | 2470 | * first, we need to know whether the block is allocated already |
2471 | * preallocated blocks are unmapped but should be treated | 2471 | * preallocated blocks are unmapped but should be treated |
2472 | * the same as allocated blocks. | 2472 | * the same as allocated blocks. |
2473 | */ | 2473 | */ |
2474 | ret = ext4_map_blocks(NULL, inode, &map, 0); | 2474 | ret = ext4_map_blocks(NULL, inode, &map, 0); |
2475 | if (ret < 0) | 2475 | if (ret < 0) |
2476 | return ret; | 2476 | return ret; |
2477 | if (ret == 0) { | 2477 | if (ret == 0) { |
2478 | if (buffer_delay(bh)) | 2478 | if (buffer_delay(bh)) |
2479 | return 0; /* Not sure this could or should happen */ | 2479 | return 0; /* Not sure this could or should happen */ |
2480 | /* | 2480 | /* |
2481 | * XXX: __block_write_begin() unmaps passed block, is it OK? | 2481 | * XXX: __block_write_begin() unmaps passed block, is it OK? |
2482 | */ | 2482 | */ |
2483 | ret = ext4_da_reserve_space(inode, iblock); | 2483 | ret = ext4_da_reserve_space(inode, iblock); |
2484 | if (ret) | 2484 | if (ret) |
2485 | /* not enough space to reserve */ | 2485 | /* not enough space to reserve */ |
2486 | return ret; | 2486 | return ret; |
2487 | 2487 | ||
2488 | map_bh(bh, inode->i_sb, invalid_block); | 2488 | map_bh(bh, inode->i_sb, invalid_block); |
2489 | set_buffer_new(bh); | 2489 | set_buffer_new(bh); |
2490 | set_buffer_delay(bh); | 2490 | set_buffer_delay(bh); |
2491 | return 0; | 2491 | return 0; |
2492 | } | 2492 | } |
2493 | 2493 | ||
2494 | map_bh(bh, inode->i_sb, map.m_pblk); | 2494 | map_bh(bh, inode->i_sb, map.m_pblk); |
2495 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; | 2495 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; |
2496 | 2496 | ||
2497 | if (buffer_unwritten(bh)) { | 2497 | if (buffer_unwritten(bh)) { |
2498 | /* A delayed write to unwritten bh should be marked | 2498 | /* A delayed write to unwritten bh should be marked |
2499 | * new and mapped. Mapped ensures that we don't do | 2499 | * new and mapped. Mapped ensures that we don't do |
2500 | * get_block multiple times when we write to the same | 2500 | * get_block multiple times when we write to the same |
2501 | * offset and new ensures that we do proper zero out | 2501 | * offset and new ensures that we do proper zero out |
2502 | * for partial write. | 2502 | * for partial write. |
2503 | */ | 2503 | */ |
2504 | set_buffer_new(bh); | 2504 | set_buffer_new(bh); |
2505 | set_buffer_mapped(bh); | 2505 | set_buffer_mapped(bh); |
2506 | } | 2506 | } |
2507 | return 0; | 2507 | return 0; |
2508 | } | 2508 | } |
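
For delayed buffers the function above maps the bh to a sentinel block number so later code can spot it: ~((sector_t) 0xffff), falling back to ~0 on a filesystem large enough for the first value to collide with a real block. A sketch of that arithmetic, assuming a 64-bit sector_t and a hypothetical filesystem size:

#include <stdio.h>

int main(void)
{
	unsigned long long invalid_block = ~0xffffULL;	/* ~((sector_t) 0xffff) */
	unsigned long long blocks_count = 1ULL << 32;	/* hypothetical fs size */

	if (invalid_block < blocks_count)
		invalid_block = ~0ULL;	/* sentinel must lie outside the fs */
	printf("delayed buffers map to block %#llx\n", invalid_block);
	return 0;
}
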
2509 | 2509 | ||
2510 | /* | 2510 | /* |
2511 | * This function is used as a standard get_block_t callback function | 2511 | * This function is used as a standard get_block_t callback function |
2512 | * when there is no desire to allocate any blocks. It is used as a | 2512 | * when there is no desire to allocate any blocks. It is used as a |
2513 | * callback function for block_write_begin() and block_write_full_page(). | 2513 | * callback function for block_write_begin() and block_write_full_page(). |
2514 | * These functions should only try to map a single block at a time. | 2514 | * These functions should only try to map a single block at a time. |
2515 | * | 2515 | * |
2516 | * Since this function doesn't do block allocations even if the caller | 2516 | * Since this function doesn't do block allocations even if the caller |
2517 | * requests it by passing in create=1, it is critically important that | 2517 | * requests it by passing in create=1, it is critically important that |
2518 | * any caller checks to make sure that any buffer heads returned | 2518 | * any caller checks to make sure that any buffer heads returned |
2519 | * by this function are either all already mapped or marked for | 2519 | * by this function are either all already mapped or marked for |
2520 | * delayed allocation before calling block_write_full_page(). Otherwise, | 2520 | * delayed allocation before calling block_write_full_page(). Otherwise, |
2521 | * b_blocknr could be left uninitialized, and the page write functions will | 2521 | * b_blocknr could be left uninitialized, and the page write functions will |
2522 | * be taken by surprise. | 2522 | * be taken by surprise. |
2523 | */ | 2523 | */ |
2524 | static int noalloc_get_block_write(struct inode *inode, sector_t iblock, | 2524 | static int noalloc_get_block_write(struct inode *inode, sector_t iblock, |
2525 | struct buffer_head *bh_result, int create) | 2525 | struct buffer_head *bh_result, int create) |
2526 | { | 2526 | { |
2527 | BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); | 2527 | BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); |
2528 | return _ext4_get_block(inode, iblock, bh_result, 0); | 2528 | return _ext4_get_block(inode, iblock, bh_result, 0); |
2529 | } | 2529 | } |
2530 | 2530 | ||
2531 | static int bget_one(handle_t *handle, struct buffer_head *bh) | 2531 | static int bget_one(handle_t *handle, struct buffer_head *bh) |
2532 | { | 2532 | { |
2533 | get_bh(bh); | 2533 | get_bh(bh); |
2534 | return 0; | 2534 | return 0; |
2535 | } | 2535 | } |
2536 | 2536 | ||
2537 | static int bput_one(handle_t *handle, struct buffer_head *bh) | 2537 | static int bput_one(handle_t *handle, struct buffer_head *bh) |
2538 | { | 2538 | { |
2539 | put_bh(bh); | 2539 | put_bh(bh); |
2540 | return 0; | 2540 | return 0; |
2541 | } | 2541 | } |
2542 | 2542 | ||
2543 | static int __ext4_journalled_writepage(struct page *page, | 2543 | static int __ext4_journalled_writepage(struct page *page, |
2544 | unsigned int len) | 2544 | unsigned int len) |
2545 | { | 2545 | { |
2546 | struct address_space *mapping = page->mapping; | 2546 | struct address_space *mapping = page->mapping; |
2547 | struct inode *inode = mapping->host; | 2547 | struct inode *inode = mapping->host; |
2548 | struct buffer_head *page_bufs; | 2548 | struct buffer_head *page_bufs; |
2549 | handle_t *handle = NULL; | 2549 | handle_t *handle = NULL; |
2550 | int ret = 0; | 2550 | int ret = 0; |
2551 | int err; | 2551 | int err; |
2552 | 2552 | ||
2553 | ClearPageChecked(page); | 2553 | ClearPageChecked(page); |
2554 | page_bufs = page_buffers(page); | 2554 | page_bufs = page_buffers(page); |
2555 | BUG_ON(!page_bufs); | 2555 | BUG_ON(!page_bufs); |
2556 | walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); | 2556 | walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); |
2557 | /* As soon as we unlock the page, it can go away, but we have | 2557 | /* As soon as we unlock the page, it can go away, but we have |
2558 | * references to buffers so we are safe */ | 2558 | * references to buffers so we are safe */ |
2559 | unlock_page(page); | 2559 | unlock_page(page); |
2560 | 2560 | ||
2561 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | 2561 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); |
2562 | if (IS_ERR(handle)) { | 2562 | if (IS_ERR(handle)) { |
2563 | ret = PTR_ERR(handle); | 2563 | ret = PTR_ERR(handle); |
2564 | goto out; | 2564 | goto out; |
2565 | } | 2565 | } |
2566 | 2566 | ||
2567 | ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, | 2567 | ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, |
2568 | do_journal_get_write_access); | 2568 | do_journal_get_write_access); |
2569 | 2569 | ||
2570 | err = walk_page_buffers(handle, page_bufs, 0, len, NULL, | 2570 | err = walk_page_buffers(handle, page_bufs, 0, len, NULL, |
2571 | write_end_fn); | 2571 | write_end_fn); |
2572 | if (ret == 0) | 2572 | if (ret == 0) |
2573 | ret = err; | 2573 | ret = err; |
2574 | err = ext4_journal_stop(handle); | 2574 | err = ext4_journal_stop(handle); |
2575 | if (!ret) | 2575 | if (!ret) |
2576 | ret = err; | 2576 | ret = err; |
2577 | 2577 | ||
2578 | walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); | 2578 | walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); |
2579 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 2579 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
2580 | out: | 2580 | out: |
2581 | return ret; | 2581 | return ret; |
2582 | } | 2582 | } |
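
The ret/err handling above is the common "first error wins" pattern: later failures from write_end_fn or ext4_journal_stop() must not overwrite an earlier error. A trivial illustration with made-up errno values:

#include <stdio.h>

int main(void)
{
	int ret = -5;	/* e.g. -EIO from an earlier buffer walk */
	int err = -30;	/* e.g. -EROFS from a later step */

	if (ret == 0)	/* adopt the later error only if nothing failed yet */
		ret = err;
	printf("reported error: %d\n", ret);	/* still -5 */
	return 0;
}
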
2583 | 2583 | ||
2584 | static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); | 2584 | static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); |
2585 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); | 2585 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); |
2586 | 2586 | ||
2587 | /* | 2587 | /* |
2588 | * Note that we don't need to start a transaction unless we're journaling data | 2588 | * Note that we don't need to start a transaction unless we're journaling data |
2589 | * because we should have holes filled from ext4_page_mkwrite(). We even don't | 2589 | * because we should have holes filled from ext4_page_mkwrite(). We even don't |
2590 | * need to file the inode to the transaction's list in ordered mode because if | 2590 | * need to file the inode to the transaction's list in ordered mode because if |
2591 | * we are writing back data added by write(), the inode is already there and if | 2591 | * we are writing back data added by write(), the inode is already there and if |
2592 | * we are writing back data modified via mmap(), no one guarantees in which | 2592 | * we are writing back data modified via mmap(), no one guarantees in which |
2593 | * transaction the data will hit the disk. In case we are journaling data, we | 2593 | * transaction the data will hit the disk. In case we are journaling data, we |
2594 | * cannot start transaction directly because transaction start ranks above page | 2594 | * cannot start transaction directly because transaction start ranks above page |
2595 | * lock so we have to do some magic. | 2595 | * lock so we have to do some magic. |
2596 | * | 2596 | * |
2597 | * This function can get called via... | 2597 | * This function can get called via... |
2598 | * - ext4_da_writepages after taking page lock (have journal handle) | 2598 | * - ext4_da_writepages after taking page lock (have journal handle) |
2599 | * - journal_submit_inode_data_buffers (no journal handle) | 2599 | * - journal_submit_inode_data_buffers (no journal handle) |
2600 | * - shrink_page_list via pdflush (no journal handle) | 2600 | * - shrink_page_list via pdflush (no journal handle) |
2601 | * - grab_page_cache when doing write_begin (have journal handle) | 2601 | * - grab_page_cache when doing write_begin (have journal handle) |
2602 | * | 2602 | * |
2603 | * We don't do any block allocation in this function. If we have a page with | 2603 | * We don't do any block allocation in this function. If we have a page with |
2604 | * multiple blocks we need to write those buffer_heads that are mapped. This | 2604 | * multiple blocks we need to write those buffer_heads that are mapped. This |
2605 | * is important for mmap-based writes. So if we do, with blocksize 1K: | 2605 | * is important for mmap-based writes. So if we do, with blocksize 1K: |
2606 | * truncate(f, 1024); | 2606 | * truncate(f, 1024); |
2607 | * a = mmap(f, 0, 4096); | 2607 | * a = mmap(f, 0, 4096); |
2608 | * a[0] = 'a'; | 2608 | * a[0] = 'a'; |
2609 | * truncate(f, 4096); | 2609 | * truncate(f, 4096); |
2610 | * we have in the page the first buffer_head mapped via the page_mkwrite callback | 2610 | * we have in the page the first buffer_head mapped via the page_mkwrite callback |
2611 | * but the other buffer_heads would be unmapped but dirty (dirtied via | 2611 | * but the other buffer_heads would be unmapped but dirty (dirtied via |
2612 | * do_wp_page). So writepage should write the first block. If we modify | 2612 | * do_wp_page). So writepage should write the first block. If we modify |
2613 | * the mmap area beyond 1024 we will again get a page_fault and the | 2613 | * the mmap area beyond 1024 we will again get a page_fault and the |
2614 | * page_mkwrite callback will do the block allocation and mark the | 2614 | * page_mkwrite callback will do the block allocation and mark the |
2615 | * buffer_heads mapped. | 2615 | * buffer_heads mapped. |
2616 | * | 2616 | * |
2617 | * We redirty the page if we have any buffer_heads that are either delayed or | 2617 | * We redirty the page if we have any buffer_heads that are either delayed or |
2618 | * unwritten in the page. | 2618 | * unwritten in the page. |
2619 | * | 2619 | * |
2620 | * We can get recursively called as shown below. | 2620 | * We can get recursively called as shown below. |
2621 | * | 2621 | * |
2622 | * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> | 2622 | * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> |
2623 | * ext4_writepage() | 2623 | * ext4_writepage() |
2624 | * | 2624 | * |
2625 | * But since we don't do any block allocation we should not deadlock. | 2625 | * But since we don't do any block allocation we should not deadlock. |
2626 | * The page also has the dirty flag cleared, so we don't get a recursive page_lock. | 2626 | * The page also has the dirty flag cleared, so we don't get a recursive page_lock. |
2627 | */ | 2627 | */ |
2628 | static int ext4_writepage(struct page *page, | 2628 | static int ext4_writepage(struct page *page, |
2629 | struct writeback_control *wbc) | 2629 | struct writeback_control *wbc) |
2630 | { | 2630 | { |
2631 | int ret = 0, commit_write = 0; | 2631 | int ret = 0, commit_write = 0; |
2632 | loff_t size; | 2632 | loff_t size; |
2633 | unsigned int len; | 2633 | unsigned int len; |
2634 | struct buffer_head *page_bufs = NULL; | 2634 | struct buffer_head *page_bufs = NULL; |
2635 | struct inode *inode = page->mapping->host; | 2635 | struct inode *inode = page->mapping->host; |
2636 | 2636 | ||
2637 | trace_ext4_writepage(page); | 2637 | trace_ext4_writepage(page); |
2638 | size = i_size_read(inode); | 2638 | size = i_size_read(inode); |
2639 | if (page->index == size >> PAGE_CACHE_SHIFT) | 2639 | if (page->index == size >> PAGE_CACHE_SHIFT) |
2640 | len = size & ~PAGE_CACHE_MASK; | 2640 | len = size & ~PAGE_CACHE_MASK; |
2641 | else | 2641 | else |
2642 | len = PAGE_CACHE_SIZE; | 2642 | len = PAGE_CACHE_SIZE; |
2643 | 2643 | ||
2644 | /* | 2644 | /* |
2645 | * If the page does not have buffers (for whatever reason), | 2645 | * If the page does not have buffers (for whatever reason), |
2646 | * try to create them using __block_write_begin. If this | 2646 | * try to create them using __block_write_begin. If this |
2647 | * fails, redirty the page and move on. | 2647 | * fails, redirty the page and move on. |
2648 | */ | 2648 | */ |
2649 | if (!page_has_buffers(page)) { | 2649 | if (!page_has_buffers(page)) { |
2650 | if (__block_write_begin(page, 0, len, | 2650 | if (__block_write_begin(page, 0, len, |
2651 | noalloc_get_block_write)) { | 2651 | noalloc_get_block_write)) { |
2652 | redirty_page: | 2652 | redirty_page: |
2653 | redirty_page_for_writepage(wbc, page); | 2653 | redirty_page_for_writepage(wbc, page); |
2654 | unlock_page(page); | 2654 | unlock_page(page); |
2655 | return 0; | 2655 | return 0; |
2656 | } | 2656 | } |
2657 | commit_write = 1; | 2657 | commit_write = 1; |
2658 | } | 2658 | } |
2659 | page_bufs = page_buffers(page); | 2659 | page_bufs = page_buffers(page); |
2660 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, | 2660 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, |
2661 | ext4_bh_delay_or_unwritten)) { | 2661 | ext4_bh_delay_or_unwritten)) { |
2662 | /* | 2662 | /* |
2663 | * We don't want to do block allocation, so redirty | 2663 | * We don't want to do block allocation, so redirty |
2664 | * the page and return. We may reach here when we do | 2664 | * the page and return. We may reach here when we do |
2665 | * a journal commit via journal_submit_inode_data_buffers. | 2665 | * a journal commit via journal_submit_inode_data_buffers. |
2666 | * We can also reach here via shrink_page_list | 2666 | * We can also reach here via shrink_page_list |
2667 | */ | 2667 | */ |
2668 | goto redirty_page; | 2668 | goto redirty_page; |
2669 | } | 2669 | } |
2670 | if (commit_write) | 2670 | if (commit_write) |
2671 | /* now mark the buffer_heads as dirty and uptodate */ | 2671 | /* now mark the buffer_heads as dirty and uptodate */ |
2672 | block_commit_write(page, 0, len); | 2672 | block_commit_write(page, 0, len); |
2673 | 2673 | ||
2674 | if (PageChecked(page) && ext4_should_journal_data(inode)) | 2674 | if (PageChecked(page) && ext4_should_journal_data(inode)) |
2675 | /* | 2675 | /* |
2676 | * It's mmapped pagecache. Add buffers and journal it. There | 2676 | * It's mmapped pagecache. Add buffers and journal it. There |
2677 | * doesn't seem much point in redirtying the page here. | 2677 | * doesn't seem much point in redirtying the page here. |
2678 | */ | 2678 | */ |
2679 | return __ext4_journalled_writepage(page, len); | 2679 | return __ext4_journalled_writepage(page, len); |
2680 | 2680 | ||
2681 | if (buffer_uninit(page_bufs)) { | 2681 | if (buffer_uninit(page_bufs)) { |
2682 | ext4_set_bh_endio(page_bufs, inode); | 2682 | ext4_set_bh_endio(page_bufs, inode); |
2683 | ret = block_write_full_page_endio(page, noalloc_get_block_write, | 2683 | ret = block_write_full_page_endio(page, noalloc_get_block_write, |
2684 | wbc, ext4_end_io_buffer_write); | 2684 | wbc, ext4_end_io_buffer_write); |
2685 | } else | 2685 | } else |
2686 | ret = block_write_full_page(page, noalloc_get_block_write, | 2686 | ret = block_write_full_page(page, noalloc_get_block_write, |
2687 | wbc); | 2687 | wbc); |
2688 | 2688 | ||
2689 | return ret; | 2689 | return ret; |
2690 | } | 2690 | } |
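
A userspace sketch of the length computation at the top of ext4_writepage(): only the part of the final page that lies inside i_size is written, every other page is written in full. Page size and file size below are assumptions for the demo (4K pages, PAGE_CACHE_SHIFT == 12):

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12, page_size = 1u << 12;
	long long i_size = 10000;	/* file is 10000 bytes long */
	unsigned long index = 2;	/* third page: bytes 8192..12287 */

	unsigned int len;
	if (index == (unsigned long)(i_size >> page_shift))
		len = (unsigned int)(i_size & (page_size - 1));	/* partial last page */
	else
		len = page_size;	/* page fully inside i_size */
	printf("page %lu: write %u bytes\n", index, len);
	return 0;
}
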
2691 | 2691 | ||
2692 | /* | 2692 | /* |
2693 | * This is called via ext4_da_writepages() to | 2693 | * This is called via ext4_da_writepages() to |
2694 | * calculate the total number of credits to reserve to fit | 2694 | * calculate the total number of credits to reserve to fit |
2695 | * a single extent allocation into a single transaction, | 2695 | * a single extent allocation into a single transaction, |
2696 | * ext4_da_writepages() will loop calling this before | 2696 | * ext4_da_writepages() will loop calling this before |
2697 | * the block allocation. | 2697 | * the block allocation. |
2698 | */ | 2698 | */ |
2699 | 2699 | ||
2700 | static int ext4_da_writepages_trans_blocks(struct inode *inode) | 2700 | static int ext4_da_writepages_trans_blocks(struct inode *inode) |
2701 | { | 2701 | { |
2702 | int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; | 2702 | int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; |
2703 | 2703 | ||
2704 | /* | 2704 | /* |
2705 | * With non-extent format the journal credit needed to | 2705 | * With non-extent format the journal credit needed to |
2706 | * insert nrblocks contiguous blocks depends on the | 2706 | * insert nrblocks contiguous blocks depends on the |
2707 | * number of contiguous blocks. So we limit the | 2707 | * number of contiguous blocks. So we limit the |
2708 | * number of contiguous blocks to a sane value | 2708 | * number of contiguous blocks to a sane value |
2709 | */ | 2709 | */ |
2710 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && | 2710 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && |
2711 | (max_blocks > EXT4_MAX_TRANS_DATA)) | 2711 | (max_blocks > EXT4_MAX_TRANS_DATA)) |
2712 | max_blocks = EXT4_MAX_TRANS_DATA; | 2712 | max_blocks = EXT4_MAX_TRANS_DATA; |
2713 | 2713 | ||
2714 | return ext4_chunk_trans_blocks(inode, max_blocks); | 2714 | return ext4_chunk_trans_blocks(inode, max_blocks); |
2715 | } | 2715 | } |
2716 | 2716 | ||
2717 | /* | 2717 | /* |
2718 | * write_cache_pages_da - walk the list of dirty pages of the given | 2718 | * write_cache_pages_da - walk the list of dirty pages of the given |
2719 | * address space and accumulate pages that need writing, and call | 2719 | * address space and accumulate pages that need writing, and call |
2720 | * mpage_da_map_and_submit to map a single contiguous memory region | 2720 | * mpage_da_map_and_submit to map a single contiguous memory region |
2721 | * and then write them. | 2721 | * and then write them. |
2722 | */ | 2722 | */ |
2723 | static int write_cache_pages_da(struct address_space *mapping, | 2723 | static int write_cache_pages_da(struct address_space *mapping, |
2724 | struct writeback_control *wbc, | 2724 | struct writeback_control *wbc, |
2725 | struct mpage_da_data *mpd, | 2725 | struct mpage_da_data *mpd, |
2726 | pgoff_t *done_index) | 2726 | pgoff_t *done_index) |
2727 | { | 2727 | { |
2728 | struct buffer_head *bh, *head; | 2728 | struct buffer_head *bh, *head; |
2729 | struct inode *inode = mapping->host; | 2729 | struct inode *inode = mapping->host; |
2730 | struct pagevec pvec; | 2730 | struct pagevec pvec; |
2731 | unsigned int nr_pages; | 2731 | unsigned int nr_pages; |
2732 | sector_t logical; | 2732 | sector_t logical; |
2733 | pgoff_t index, end; | 2733 | pgoff_t index, end; |
2734 | long nr_to_write = wbc->nr_to_write; | 2734 | long nr_to_write = wbc->nr_to_write; |
2735 | int i, tag, ret = 0; | 2735 | int i, tag, ret = 0; |
2736 | 2736 | ||
2737 | memset(mpd, 0, sizeof(struct mpage_da_data)); | 2737 | memset(mpd, 0, sizeof(struct mpage_da_data)); |
2738 | mpd->wbc = wbc; | 2738 | mpd->wbc = wbc; |
2739 | mpd->inode = inode; | 2739 | mpd->inode = inode; |
2740 | pagevec_init(&pvec, 0); | 2740 | pagevec_init(&pvec, 0); |
2741 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 2741 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
2742 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 2742 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
2743 | 2743 | ||
2744 | if (wbc->sync_mode == WB_SYNC_ALL) | 2744 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2745 | tag = PAGECACHE_TAG_TOWRITE; | 2745 | tag = PAGECACHE_TAG_TOWRITE; |
2746 | else | 2746 | else |
2747 | tag = PAGECACHE_TAG_DIRTY; | 2747 | tag = PAGECACHE_TAG_DIRTY; |
2748 | 2748 | ||
2749 | *done_index = index; | 2749 | *done_index = index; |
2750 | while (index <= end) { | 2750 | while (index <= end) { |
2751 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, | 2751 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, |
2752 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | 2752 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); |
2753 | if (nr_pages == 0) | 2753 | if (nr_pages == 0) |
2754 | return 0; | 2754 | return 0; |
2755 | 2755 | ||
2756 | for (i = 0; i < nr_pages; i++) { | 2756 | for (i = 0; i < nr_pages; i++) { |
2757 | struct page *page = pvec.pages[i]; | 2757 | struct page *page = pvec.pages[i]; |
2758 | 2758 | ||
2759 | /* | 2759 | /* |
2760 | * At this point, the page may be truncated or | 2760 | * At this point, the page may be truncated or |
2761 | * invalidated (changing page->mapping to NULL), or | 2761 | * invalidated (changing page->mapping to NULL), or |
2762 | * even swizzled back from swapper_space to tmpfs file | 2762 | * even swizzled back from swapper_space to tmpfs file |
2763 | * mapping. However, page->index will not change | 2763 | * mapping. However, page->index will not change |
2764 | * because we have a reference on the page. | 2764 | * because we have a reference on the page. |
2765 | */ | 2765 | */ |
2766 | if (page->index > end) | 2766 | if (page->index > end) |
2767 | goto out; | 2767 | goto out; |
2768 | 2768 | ||
2769 | *done_index = page->index + 1; | 2769 | *done_index = page->index + 1; |
2770 | 2770 | ||
2771 | /* | 2771 | /* |
2772 | * If we can't merge this page, and we have | 2772 | * If we can't merge this page, and we have |
2773 | * accumulated a contiguous region, write it | 2773 | * accumulated a contiguous region, write it |
2774 | */ | 2774 | */ |
2775 | if ((mpd->next_page != page->index) && | 2775 | if ((mpd->next_page != page->index) && |
2776 | (mpd->next_page != mpd->first_page)) { | 2776 | (mpd->next_page != mpd->first_page)) { |
2777 | mpage_da_map_and_submit(mpd); | 2777 | mpage_da_map_and_submit(mpd); |
2778 | goto ret_extent_tail; | 2778 | goto ret_extent_tail; |
2779 | } | 2779 | } |
2780 | 2780 | ||
2781 | lock_page(page); | 2781 | lock_page(page); |
2782 | 2782 | ||
2783 | /* | 2783 | /* |
2784 | * If the page is no longer dirty, or its | 2784 | * If the page is no longer dirty, or its |
2785 | * mapping no longer corresponds to the inode we | 2785 | * mapping no longer corresponds to the inode we |
2786 | * are writing (which means it has been | 2786 | * are writing (which means it has been |
2787 | * truncated or invalidated), or the page is | 2787 | * truncated or invalidated), or the page is |
2788 | * already under writeback and we are not | 2788 | * already under writeback and we are not |
2789 | * doing a data integrity writeback, skip the page | 2789 | * doing a data integrity writeback, skip the page |
2790 | */ | 2790 | */ |
2791 | if (!PageDirty(page) || | 2791 | if (!PageDirty(page) || |
2792 | (PageWriteback(page) && | 2792 | (PageWriteback(page) && |
2793 | (wbc->sync_mode == WB_SYNC_NONE)) || | 2793 | (wbc->sync_mode == WB_SYNC_NONE)) || |
2794 | unlikely(page->mapping != mapping)) { | 2794 | unlikely(page->mapping != mapping)) { |
2795 | unlock_page(page); | 2795 | unlock_page(page); |
2796 | continue; | 2796 | continue; |
2797 | } | 2797 | } |
2798 | 2798 | ||
2799 | wait_on_page_writeback(page); | 2799 | wait_on_page_writeback(page); |
2800 | BUG_ON(PageWriteback(page)); | 2800 | BUG_ON(PageWriteback(page)); |
2801 | 2801 | ||
2802 | if (mpd->next_page != page->index) | 2802 | if (mpd->next_page != page->index) |
2803 | mpd->first_page = page->index; | 2803 | mpd->first_page = page->index; |
2804 | mpd->next_page = page->index + 1; | 2804 | mpd->next_page = page->index + 1; |
2805 | logical = (sector_t) page->index << | 2805 | logical = (sector_t) page->index << |
2806 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | 2806 | (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2807 | 2807 | ||
2808 | if (!page_has_buffers(page)) { | 2808 | if (!page_has_buffers(page)) { |
2809 | mpage_add_bh_to_extent(mpd, logical, | 2809 | mpage_add_bh_to_extent(mpd, logical, |
2810 | PAGE_CACHE_SIZE, | 2810 | PAGE_CACHE_SIZE, |
2811 | (1 << BH_Dirty) | (1 << BH_Uptodate)); | 2811 | (1 << BH_Dirty) | (1 << BH_Uptodate)); |
2812 | if (mpd->io_done) | 2812 | if (mpd->io_done) |
2813 | goto ret_extent_tail; | 2813 | goto ret_extent_tail; |
2814 | } else { | 2814 | } else { |
2815 | /* | 2815 | /* |
2816 | * Page with regular buffer heads, | 2816 | * Page with regular buffer heads, |
2817 | * just add all dirty ones | 2817 | * just add all dirty ones |
2818 | */ | 2818 | */ |
2819 | head = page_buffers(page); | 2819 | head = page_buffers(page); |
2820 | bh = head; | 2820 | bh = head; |
2821 | do { | 2821 | do { |
2822 | BUG_ON(buffer_locked(bh)); | 2822 | BUG_ON(buffer_locked(bh)); |
2823 | /* | 2823 | /* |
2824 | * We need to try to allocate | 2824 | * We need to try to allocate |
2825 | * unmapped blocks in the same page. | 2825 | * unmapped blocks in the same page. |
2826 | * Otherwise we won't make progress | 2826 | * Otherwise we won't make progress |
2827 | * with the page in ext4_writepage | 2827 | * with the page in ext4_writepage |
2828 | */ | 2828 | */ |
2829 | if (ext4_bh_delay_or_unwritten(NULL, bh)) { | 2829 | if (ext4_bh_delay_or_unwritten(NULL, bh)) { |
2830 | mpage_add_bh_to_extent(mpd, logical, | 2830 | mpage_add_bh_to_extent(mpd, logical, |
2831 | bh->b_size, | 2831 | bh->b_size, |
2832 | bh->b_state); | 2832 | bh->b_state); |
2833 | if (mpd->io_done) | 2833 | if (mpd->io_done) |
2834 | goto ret_extent_tail; | 2834 | goto ret_extent_tail; |
2835 | } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { | 2835 | } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { |
2836 | /* | 2836 | /* |
2837 | * mapped dirty buffer. We need | 2837 | * mapped dirty buffer. We need |
2838 | * to update the b_state | 2838 | * to update the b_state |
2839 | * because we look at b_state | 2839 | * because we look at b_state |
2840 | * in mpage_da_map_blocks. We | 2840 | * in mpage_da_map_blocks. We |
2841 | * don't update b_size because | 2841 | * don't update b_size because |
2842 | * if we find an unmapped | 2842 | * if we find an unmapped |
2843 | * buffer_head later we need to | 2843 | * buffer_head later we need to |
2844 | * use the b_state flag of that | 2844 | * use the b_state flag of that |
2845 | * buffer_head. | 2845 | * buffer_head. |
2846 | */ | 2846 | */ |
2847 | if (mpd->b_size == 0) | 2847 | if (mpd->b_size == 0) |
2848 | mpd->b_state = bh->b_state & BH_FLAGS; | 2848 | mpd->b_state = bh->b_state & BH_FLAGS; |
2849 | } | 2849 | } |
2850 | logical++; | 2850 | logical++; |
2851 | } while ((bh = bh->b_this_page) != head); | 2851 | } while ((bh = bh->b_this_page) != head); |
2852 | } | 2852 | } |
2853 | 2853 | ||
2854 | if (nr_to_write > 0) { | 2854 | if (nr_to_write > 0) { |
2855 | nr_to_write--; | 2855 | nr_to_write--; |
2856 | if (nr_to_write == 0 && | 2856 | if (nr_to_write == 0 && |
2857 | wbc->sync_mode == WB_SYNC_NONE) | 2857 | wbc->sync_mode == WB_SYNC_NONE) |
2858 | /* | 2858 | /* |
2859 | * We stop writing back only if we are | 2859 | * We stop writing back only if we are |
2860 | * not doing integrity sync. In case of | 2860 | * not doing integrity sync. In case of |
2861 | * integrity sync we have to keep going | 2861 | * integrity sync we have to keep going |
2862 | * because someone may be concurrently | 2862 | * because someone may be concurrently |
2863 | * dirtying pages, and we might have | 2863 | * dirtying pages, and we might have |
2864 | * synced a lot of newly appeared dirty | 2864 | * synced a lot of newly appeared dirty |
2865 | * pages, but have not synced all of the | 2865 | * pages, but have not synced all of the |
2866 | * old dirty pages. | 2866 | * old dirty pages. |
2867 | */ | 2867 | */ |
2868 | goto out; | 2868 | goto out; |
2869 | } | 2869 | } |
2870 | } | 2870 | } |
2871 | pagevec_release(&pvec); | 2871 | pagevec_release(&pvec); |
2872 | cond_resched(); | 2872 | cond_resched(); |
2873 | } | 2873 | } |
2874 | return 0; | 2874 | return 0; |
2875 | ret_extent_tail: | 2875 | ret_extent_tail: |
2876 | ret = MPAGE_DA_EXTENT_TAIL; | 2876 | ret = MPAGE_DA_EXTENT_TAIL; |
2877 | out: | 2877 | out: |
2878 | pagevec_release(&pvec); | 2878 | pagevec_release(&pvec); |
2879 | cond_resched(); | 2879 | cond_resched(); |
2880 | return ret; | 2880 | return ret; |
2881 | } | 2881 | } |
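
The logical block number handed to mpage_add_bh_to_extent() above is just the page index rescaled from page units to filesystem blocks. A sketch with assumed sizes (4K pages, 1K blocks):

#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;	/* PAGE_CACHE_SHIFT for 4K pages (assumed) */
	unsigned int blkbits = 10;	/* 1K filesystem blocks (assumed) */
	unsigned long index = 7;	/* eighth page of the file */

	unsigned long long logical =
		(unsigned long long)index << (page_shift - blkbits);
	printf("page %lu starts at logical block %llu\n", index, logical);
	return 0;
}
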
2882 | 2882 | ||
2883 | 2883 | ||
2884 | static int ext4_da_writepages(struct address_space *mapping, | 2884 | static int ext4_da_writepages(struct address_space *mapping, |
2885 | struct writeback_control *wbc) | 2885 | struct writeback_control *wbc) |
2886 | { | 2886 | { |
2887 | pgoff_t index; | 2887 | pgoff_t index; |
2888 | int range_whole = 0; | 2888 | int range_whole = 0; |
2889 | handle_t *handle = NULL; | 2889 | handle_t *handle = NULL; |
2890 | struct mpage_da_data mpd; | 2890 | struct mpage_da_data mpd; |
2891 | struct inode *inode = mapping->host; | 2891 | struct inode *inode = mapping->host; |
2892 | int pages_written = 0; | 2892 | int pages_written = 0; |
2893 | unsigned int max_pages; | 2893 | unsigned int max_pages; |
2894 | int range_cyclic, cycled = 1, io_done = 0; | 2894 | int range_cyclic, cycled = 1, io_done = 0; |
2895 | int needed_blocks, ret = 0; | 2895 | int needed_blocks, ret = 0; |
2896 | long desired_nr_to_write, nr_to_writebump = 0; | 2896 | long desired_nr_to_write, nr_to_writebump = 0; |
2897 | loff_t range_start = wbc->range_start; | 2897 | loff_t range_start = wbc->range_start; |
2898 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | 2898 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
2899 | pgoff_t done_index = 0; | 2899 | pgoff_t done_index = 0; |
2900 | pgoff_t end; | 2900 | pgoff_t end; |
2901 | 2901 | ||
2902 | trace_ext4_da_writepages(inode, wbc); | 2902 | trace_ext4_da_writepages(inode, wbc); |
2903 | 2903 | ||
2904 | /* | 2904 | /* |
2905 | * No pages to write? This is mainly a kludge to avoid starting | 2905 | * No pages to write? This is mainly a kludge to avoid starting |
2906 | * a transaction for special inodes like the journal inode on last iput() | 2906 | * a transaction for special inodes like the journal inode on last iput() |
2907 | * because that could violate lock ordering on umount | 2907 | * because that could violate lock ordering on umount |
2908 | */ | 2908 | */ |
2909 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | 2909 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
2910 | return 0; | 2910 | return 0; |
2911 | 2911 | ||
2912 | /* | 2912 | /* |
2913 | * If the filesystem has aborted, it is read-only, so return | 2913 | * If the filesystem has aborted, it is read-only, so return |
2914 | * right away instead of dumping stack traces later on that | 2914 | * right away instead of dumping stack traces later on that |
2915 | * will obscure the real source of the problem. We test | 2915 | * will obscure the real source of the problem. We test |
2916 | * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because | 2916 | * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because |
2917 | * the latter could be true if the filesystem is mounted | 2917 | * the latter could be true if the filesystem is mounted |
2918 | * read-only, and in that case, ext4_da_writepages should | 2918 | * read-only, and in that case, ext4_da_writepages should |
2919 | * *never* be called, so if that ever happens, we would want | 2919 | * *never* be called, so if that ever happens, we would want |
2920 | * the stack trace. | 2920 | * the stack trace. |
2921 | */ | 2921 | */ |
2922 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) | 2922 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) |
2923 | return -EROFS; | 2923 | return -EROFS; |
2924 | 2924 | ||
2925 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 2925 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
2926 | range_whole = 1; | 2926 | range_whole = 1; |
2927 | 2927 | ||
2928 | range_cyclic = wbc->range_cyclic; | 2928 | range_cyclic = wbc->range_cyclic; |
2929 | if (wbc->range_cyclic) { | 2929 | if (wbc->range_cyclic) { |
2930 | index = mapping->writeback_index; | 2930 | index = mapping->writeback_index; |
2931 | if (index) | 2931 | if (index) |
2932 | cycled = 0; | 2932 | cycled = 0; |
2933 | wbc->range_start = index << PAGE_CACHE_SHIFT; | 2933 | wbc->range_start = index << PAGE_CACHE_SHIFT; |
2934 | wbc->range_end = LLONG_MAX; | 2934 | wbc->range_end = LLONG_MAX; |
2935 | wbc->range_cyclic = 0; | 2935 | wbc->range_cyclic = 0; |
2936 | end = -1; | 2936 | end = -1; |
2937 | } else { | 2937 | } else { |
2938 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 2938 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
2939 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 2939 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
2940 | } | 2940 | } |
2941 | 2941 | ||
2942 | /* | 2942 | /* |
2943 | * This works around two forms of stupidity. The first is in | 2943 | * This works around two forms of stupidity. The first is in |
2944 | * the writeback code, which caps the maximum number of pages | 2944 | * the writeback code, which caps the maximum number of pages |
2945 | * written to be 1024 pages. This is wrong on multiple | 2945 | * written to be 1024 pages. This is wrong on multiple |
2946 | * levels; different architectures have different page sizes, | 2946 | * levels; different architectures have different page sizes, |
2947 | * which changes the maximum amount of data which gets | 2947 | * which changes the maximum amount of data which gets |
2948 | * written. Secondly, 4 megabytes is way too small. XFS | 2948 | * written. Secondly, 4 megabytes is way too small. XFS |
2949 | * forces this value to be 16 megabytes by multiplying | 2949 | * forces this value to be 16 megabytes by multiplying |
2950 | * nr_to_write parameter by four, and then relies on its | 2950 | * nr_to_write parameter by four, and then relies on its |
2951 | * allocator to allocate larger extents to make them | 2951 | * allocator to allocate larger extents to make them |
2952 | * contiguous. Unfortunately this brings us to the second | 2952 | * contiguous. Unfortunately this brings us to the second |
2953 | * stupidity, which is that ext4's mballoc code only allocates | 2953 | * stupidity, which is that ext4's mballoc code only allocates |
2954 | * at most 2048 blocks. So we force contiguous writes up to | 2954 | * at most 2048 blocks. So we force contiguous writes up to |
2955 | * the number of dirty blocks in the inode, or | 2955 | * the number of dirty blocks in the inode, or |
2956 | * sbi->s_max_writeback_mb_bump, whichever is smaller. | 2956 | * sbi->s_max_writeback_mb_bump, whichever is smaller. |
2957 | */ | 2957 | */ |
2958 | max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); | 2958 | max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); |
2959 | if (!range_cyclic && range_whole) { | 2959 | if (!range_cyclic && range_whole) { |
2960 | if (wbc->nr_to_write == LONG_MAX) | 2960 | if (wbc->nr_to_write == LONG_MAX) |
2961 | desired_nr_to_write = wbc->nr_to_write; | 2961 | desired_nr_to_write = wbc->nr_to_write; |
2962 | else | 2962 | else |
2963 | desired_nr_to_write = wbc->nr_to_write * 8; | 2963 | desired_nr_to_write = wbc->nr_to_write * 8; |
2964 | } else | 2964 | } else |
2965 | desired_nr_to_write = ext4_num_dirty_pages(inode, index, | 2965 | desired_nr_to_write = ext4_num_dirty_pages(inode, index, |
2966 | max_pages); | 2966 | max_pages); |
2967 | if (desired_nr_to_write > max_pages) | 2967 | if (desired_nr_to_write > max_pages) |
2968 | desired_nr_to_write = max_pages; | 2968 | desired_nr_to_write = max_pages; |
2969 | 2969 | ||
2970 | if (wbc->nr_to_write < desired_nr_to_write) { | 2970 | if (wbc->nr_to_write < desired_nr_to_write) { |
2971 | nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; | 2971 | nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; |
2972 | wbc->nr_to_write = desired_nr_to_write; | 2972 | wbc->nr_to_write = desired_nr_to_write; |
2973 | } | 2973 | } |
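
A sketch of the nr_to_write bump above with made-up numbers (4K pages; a max_writeback_mb_bump of 128 MB is assumed for the demo):

#include <stdio.h>

int main(void)
{
	long max_pages = 128L << (20 - 12);	/* 128 MB expressed in 4K pages */
	long nr_to_write = 1024;		/* what the writeback code asked for */
	long desired = nr_to_write * 8;		/* whole-file, bounded sync case */

	if (desired > max_pages)
		desired = max_pages;
	if (nr_to_write < desired)
		printf("bumping nr_to_write from %ld to %ld\n",
		       nr_to_write, desired);
	return 0;
}
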
2974 | 2974 | ||
2975 | retry: | 2975 | retry: |
2976 | if (wbc->sync_mode == WB_SYNC_ALL) | 2976 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2977 | tag_pages_for_writeback(mapping, index, end); | 2977 | tag_pages_for_writeback(mapping, index, end); |
2978 | 2978 | ||
2979 | while (!ret && wbc->nr_to_write > 0) { | 2979 | while (!ret && wbc->nr_to_write > 0) { |
2980 | 2980 | ||
2981 | /* | 2981 | /* |
2982 | * we insert one extent at a time. So we need | 2982 | * we insert one extent at a time. So we need |
2983 | * the credits for a single extent allocation. | 2983 | * the credits for a single extent allocation. |
2984 | * journalled mode is currently not supported | 2984 | * journalled mode is currently not supported |
2985 | * by delalloc | 2985 | * by delalloc |
2986 | */ | 2986 | */ |
2987 | BUG_ON(ext4_should_journal_data(inode)); | 2987 | BUG_ON(ext4_should_journal_data(inode)); |
2988 | needed_blocks = ext4_da_writepages_trans_blocks(inode); | 2988 | needed_blocks = ext4_da_writepages_trans_blocks(inode); |
2989 | 2989 | ||
2990 | /* start a new transaction */ | 2990 | /* start a new transaction */ |
2991 | handle = ext4_journal_start(inode, needed_blocks); | 2991 | handle = ext4_journal_start(inode, needed_blocks); |
2992 | if (IS_ERR(handle)) { | 2992 | if (IS_ERR(handle)) { |
2993 | ret = PTR_ERR(handle); | 2993 | ret = PTR_ERR(handle); |
2994 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " | 2994 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " |
2995 | "%ld pages, ino %lu; err %d", __func__, | 2995 | "%ld pages, ino %lu; err %d", __func__, |
2996 | wbc->nr_to_write, inode->i_ino, ret); | 2996 | wbc->nr_to_write, inode->i_ino, ret); |
2997 | goto out_writepages; | 2997 | goto out_writepages; |
2998 | } | 2998 | } |
2999 | 2999 | ||
3000 | /* | 3000 | /* |
3001 | * Now call write_cache_pages_da() to find the next | 3001 | * Now call write_cache_pages_da() to find the next |
3002 | * contiguous region of logical blocks that need | 3002 | * contiguous region of logical blocks that need |
3003 | * blocks to be allocated by ext4 and submit them. | 3003 | * blocks to be allocated by ext4 and submit them. |
3004 | */ | 3004 | */ |
3005 | ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); | 3005 | ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); |
3006 | /* | 3006 | /* |
3007 | * If we have a contiguous extent of pages and we | 3007 | * If we have a contiguous extent of pages and we |
3008 | * haven't done the I/O yet, map the blocks and submit | 3008 | * haven't done the I/O yet, map the blocks and submit |
3009 | * them for I/O. | 3009 | * them for I/O. |
3010 | */ | 3010 | */ |
3011 | if (!mpd.io_done && mpd.next_page != mpd.first_page) { | 3011 | if (!mpd.io_done && mpd.next_page != mpd.first_page) { |
3012 | mpage_da_map_and_submit(&mpd); | 3012 | mpage_da_map_and_submit(&mpd); |
3013 | ret = MPAGE_DA_EXTENT_TAIL; | 3013 | ret = MPAGE_DA_EXTENT_TAIL; |
3014 | } | 3014 | } |
3015 | trace_ext4_da_write_pages(inode, &mpd); | 3015 | trace_ext4_da_write_pages(inode, &mpd); |
3016 | wbc->nr_to_write -= mpd.pages_written; | 3016 | wbc->nr_to_write -= mpd.pages_written; |
3017 | 3017 | ||
3018 | ext4_journal_stop(handle); | 3018 | ext4_journal_stop(handle); |
3019 | 3019 | ||
3020 | if ((mpd.retval == -ENOSPC) && sbi->s_journal) { | 3020 | if ((mpd.retval == -ENOSPC) && sbi->s_journal) { |
3021 | /* commit the transaction which would | 3021 | /* commit the transaction which would |
3022 | * free blocks released in the transaction | 3022 | * free blocks released in the transaction |
3023 | * and try again | 3023 | * and try again |
3024 | */ | 3024 | */ |
3025 | jbd2_journal_force_commit_nested(sbi->s_journal); | 3025 | jbd2_journal_force_commit_nested(sbi->s_journal); |
3026 | ret = 0; | 3026 | ret = 0; |
3027 | } else if (ret == MPAGE_DA_EXTENT_TAIL) { | 3027 | } else if (ret == MPAGE_DA_EXTENT_TAIL) { |
3028 | /* | 3028 | /* |
3029 | * got one extent; now try with the | 3029 | * got one extent; now try with the |
3030 | * rest of the pages | 3030 | * rest of the pages |
3031 | */ | 3031 | */ |
3032 | pages_written += mpd.pages_written; | 3032 | pages_written += mpd.pages_written; |
3033 | ret = 0; | 3033 | ret = 0; |
3034 | io_done = 1; | 3034 | io_done = 1; |
3035 | } else if (wbc->nr_to_write) | 3035 | } else if (wbc->nr_to_write) |
3036 | /* | 3036 | /* |
3037 | * There is no more writeout needed, | 3037 | * There is no more writeout needed, |
3038 | * or we requested a nonblocking writeout | 3038 | * or we requested a nonblocking writeout |
3039 | * and we found the device congested. | 3039 | * and we found the device congested. |
3040 | */ | 3040 | */ |
3041 | break; | 3041 | break; |
3042 | } | 3042 | } |
3043 | if (!io_done && !cycled) { | 3043 | if (!io_done && !cycled) { |
3044 | cycled = 1; | 3044 | cycled = 1; |
3045 | index = 0; | 3045 | index = 0; |
3046 | wbc->range_start = index << PAGE_CACHE_SHIFT; | 3046 | wbc->range_start = index << PAGE_CACHE_SHIFT; |
3047 | wbc->range_end = mapping->writeback_index - 1; | 3047 | wbc->range_end = mapping->writeback_index - 1; |
3048 | goto retry; | 3048 | goto retry; |
3049 | } | 3049 | } |
3050 | 3050 | ||
3051 | /* Update index */ | 3051 | /* Update index */ |
3052 | wbc->range_cyclic = range_cyclic; | 3052 | wbc->range_cyclic = range_cyclic; |
3053 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 3053 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
3054 | /* | 3054 | /* |
3055 | * set the writeback_index so that range_cyclic | 3055 | * set the writeback_index so that range_cyclic |
3056 | * mode will write it back later | 3056 | * mode will write it back later |
3057 | */ | 3057 | */ |
3058 | mapping->writeback_index = done_index; | 3058 | mapping->writeback_index = done_index; |
3059 | 3059 | ||
3060 | out_writepages: | 3060 | out_writepages: |
3061 | wbc->nr_to_write -= nr_to_writebump; | 3061 | wbc->nr_to_write -= nr_to_writebump; |
3062 | wbc->range_start = range_start; | 3062 | wbc->range_start = range_start; |
3063 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); | 3063 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); |
3064 | return ret; | 3064 | return ret; |
3065 | } | 3065 | } |
3066 | 3066 | ||
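A side note on the sizing heuristic discussed in the long comment above: the cap is simply s_max_writeback_mb_bump converted from megabytes to pages, and the 8x multiplier only applies to a bounded whole-file sync. A minimal userspace sketch of the arithmetic, assuming 4 KiB pages and plain long integers (neither is ext4 API; ext4_num_dirty_pages() is stood in for by the caller's own estimate):

    #include <stdio.h>
    #include <limits.h>

    #define PAGE_SHIFT 12 /* assumed 4 KiB page size */

    /* Sketch of the desired_nr_to_write computation above. */
    static long desired_pages(long nr_to_write, int whole_file, long bump_mb)
    {
        long max_pages = bump_mb << (20 - PAGE_SHIFT); /* MB -> pages */
        long desired;

        if (whole_file)
            desired = (nr_to_write == LONG_MAX) ? nr_to_write
                                                : nr_to_write * 8;
        else
            desired = nr_to_write; /* stand-in for ext4_num_dirty_pages() */

        return desired > max_pages ? max_pages : desired;
    }

    int main(void)
    {
        /* VM asks for 1024 pages on a whole-file sync, bump = 128 MB */
        printf("%ld\n", desired_pages(1024, 1, 128)); /* 8192, under the cap */
        return 0;
    }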
3067 | #define FALL_BACK_TO_NONDELALLOC 1 | 3067 | #define FALL_BACK_TO_NONDELALLOC 1 |
3068 | static int ext4_nonda_switch(struct super_block *sb) | 3068 | static int ext4_nonda_switch(struct super_block *sb) |
3069 | { | 3069 | { |
3070 | s64 free_blocks, dirty_blocks; | 3070 | s64 free_blocks, dirty_blocks; |
3071 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 3071 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
3072 | 3072 | ||
3073 | /* | 3073 | /* |
3074 | * switch to non-delalloc mode if we are running low | 3074 | * switch to non-delalloc mode if we are running low |
3075 | * on free blocks. The free block accounting via percpu | 3075 | * on free blocks. The free block accounting via percpu |
3076 | * counters can get slightly wrong with percpu_counter_batch getting | 3076 | * counters can get slightly wrong with percpu_counter_batch getting |
3077 | * accumulated on each CPU without updating global counters. | 3077 | * accumulated on each CPU without updating global counters. |
3078 | * Delalloc needs accurate free block accounting, so switch | 3078 | * Delalloc needs accurate free block accounting, so switch |
3079 | * to non-delalloc when we are near the error range. | 3079 | * to non-delalloc when we are near the error range. |
3080 | */ | 3080 | */ |
3081 | free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); | 3081 | free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); |
3082 | dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); | 3082 | dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); |
3083 | if (2 * free_blocks < 3 * dirty_blocks || | 3083 | if (2 * free_blocks < 3 * dirty_blocks || |
3084 | free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { | 3084 | free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { |
3085 | /* | 3085 | /* |
3086 | * free block count is less than 150% of dirty blocks | 3086 | * free block count is less than 150% of dirty blocks |
3087 | * or free blocks is less than watermark | 3087 | * or free blocks is less than watermark |
3088 | */ | 3088 | */ |
3089 | return 1; | 3089 | return 1; |
3090 | } | 3090 | } |
3091 | /* | 3091 | /* |
3092 | * Even if we don't switch but are nearing capacity, | 3092 | * Even if we don't switch but are nearing capacity, |
3093 | * start pushing delalloc when 1/2 of free blocks are dirty. | 3093 | * start pushing delalloc when 1/2 of free blocks are dirty. |
3094 | */ | 3094 | */ |
3095 | if (free_blocks < 2 * dirty_blocks) | 3095 | if (free_blocks < 2 * dirty_blocks) |
3096 | writeback_inodes_sb_if_idle(sb); | 3096 | writeback_inodes_sb_if_idle(sb); |
3097 | 3097 | ||
3098 | return 0; | 3098 | return 0; |
3099 | } | 3099 | } |
3100 | 3100 | ||
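The two thresholds above are easy to misread; stripped of the percpu plumbing they are just "free < 150% of dirty" and "free < dirty + watermark". A self-contained sketch with a made-up WATERMARK constant (the real EXT4_FREEBLOCKS_WATERMARK scales with the CPU count and percpu_counter_batch):

    #include <stdio.h>

    /* Hypothetical watermark; the real EXT4_FREEBLOCKS_WATERMARK is
     * derived from the number of CPUs and percpu_counter_batch. */
    #define WATERMARK 4096

    /* Return 1 when delalloc should be switched off, mirroring the
     * two conditions in ext4_nonda_switch() above. */
    static int nonda_switch(long long free_blocks, long long dirty_blocks)
    {
        if (2 * free_blocks < 3 * dirty_blocks ||   /* free < 150% of dirty */
            free_blocks < dirty_blocks + WATERMARK) /* near the watermark */
            return 1;
        return 0;
    }

    int main(void)
    {
        printf("%d\n", nonda_switch(10000, 8000));  /* 20000 < 24000 -> 1 */
        printf("%d\n", nonda_switch(100000, 1000)); /* plenty free   -> 0 */
        return 0;
    }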
3101 | static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | 3101 | static int ext4_da_write_begin(struct file *file, struct address_space *mapping, |
3102 | loff_t pos, unsigned len, unsigned flags, | 3102 | loff_t pos, unsigned len, unsigned flags, |
3103 | struct page **pagep, void **fsdata) | 3103 | struct page **pagep, void **fsdata) |
3104 | { | 3104 | { |
3105 | int ret, retries = 0; | 3105 | int ret, retries = 0; |
3106 | struct page *page; | 3106 | struct page *page; |
3107 | pgoff_t index; | 3107 | pgoff_t index; |
3108 | struct inode *inode = mapping->host; | 3108 | struct inode *inode = mapping->host; |
3109 | handle_t *handle; | 3109 | handle_t *handle; |
3110 | 3110 | ||
3111 | index = pos >> PAGE_CACHE_SHIFT; | 3111 | index = pos >> PAGE_CACHE_SHIFT; |
3112 | 3112 | ||
3113 | if (ext4_nonda_switch(inode->i_sb)) { | 3113 | if (ext4_nonda_switch(inode->i_sb)) { |
3114 | *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; | 3114 | *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; |
3115 | return ext4_write_begin(file, mapping, pos, | 3115 | return ext4_write_begin(file, mapping, pos, |
3116 | len, flags, pagep, fsdata); | 3116 | len, flags, pagep, fsdata); |
3117 | } | 3117 | } |
3118 | *fsdata = (void *)0; | 3118 | *fsdata = (void *)0; |
3119 | trace_ext4_da_write_begin(inode, pos, len, flags); | 3119 | trace_ext4_da_write_begin(inode, pos, len, flags); |
3120 | retry: | 3120 | retry: |
3121 | /* | 3121 | /* |
3122 | * With delayed allocation, we don't log the i_disksize update | 3122 | * With delayed allocation, we don't log the i_disksize update |
3123 | * if there is delayed block allocation. But we still need | 3123 | * if there is delayed block allocation. But we still need |
3124 | * to journal the i_disksize update if we write to the end | 3124 | * to journal the i_disksize update if we write to the end |
3125 | * of a file which has an already mapped buffer. | 3125 | * of a file which has an already mapped buffer. |
3126 | */ | 3126 | */ |
3127 | handle = ext4_journal_start(inode, 1); | 3127 | handle = ext4_journal_start(inode, 1); |
3128 | if (IS_ERR(handle)) { | 3128 | if (IS_ERR(handle)) { |
3129 | ret = PTR_ERR(handle); | 3129 | ret = PTR_ERR(handle); |
3130 | goto out; | 3130 | goto out; |
3131 | } | 3131 | } |
3132 | /* We cannot recurse into the filesystem as the transaction is already | 3132 | /* We cannot recurse into the filesystem as the transaction is already |
3133 | * started */ | 3133 | * started */ |
3134 | flags |= AOP_FLAG_NOFS; | 3134 | flags |= AOP_FLAG_NOFS; |
3135 | 3135 | ||
3136 | page = grab_cache_page_write_begin(mapping, index, flags); | 3136 | page = grab_cache_page_write_begin(mapping, index, flags); |
3137 | if (!page) { | 3137 | if (!page) { |
3138 | ext4_journal_stop(handle); | 3138 | ext4_journal_stop(handle); |
3139 | ret = -ENOMEM; | 3139 | ret = -ENOMEM; |
3140 | goto out; | 3140 | goto out; |
3141 | } | 3141 | } |
3142 | *pagep = page; | 3142 | *pagep = page; |
3143 | 3143 | ||
3144 | ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); | 3144 | ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); |
3145 | if (ret < 0) { | 3145 | if (ret < 0) { |
3146 | unlock_page(page); | 3146 | unlock_page(page); |
3147 | ext4_journal_stop(handle); | 3147 | ext4_journal_stop(handle); |
3148 | page_cache_release(page); | 3148 | page_cache_release(page); |
3149 | /* | 3149 | /* |
3150 | * block_write_begin may have instantiated a few blocks | 3150 | * block_write_begin may have instantiated a few blocks |
3151 | * outside i_size. Trim these off again. Don't need | 3151 | * outside i_size. Trim these off again. Don't need |
3152 | * i_size_read because we hold i_mutex. | 3152 | * i_size_read because we hold i_mutex. |
3153 | */ | 3153 | */ |
3154 | if (pos + len > inode->i_size) | 3154 | if (pos + len > inode->i_size) |
3155 | ext4_truncate_failed_write(inode); | 3155 | ext4_truncate_failed_write(inode); |
3156 | } | 3156 | } |
3157 | 3157 | ||
3158 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 3158 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
3159 | goto retry; | 3159 | goto retry; |
3160 | out: | 3160 | out: |
3161 | return ret; | 3161 | return ret; |
3162 | } | 3162 | } |
3163 | 3163 | ||
3164 | /* | 3164 | /* |
3165 | * Check if we should update i_disksize | 3165 | * Check if we should update i_disksize |
3166 | * when write to the end of file but not require block allocation | 3166 | * when write to the end of file but not require block allocation |
3167 | */ | 3167 | */ |
3168 | static int ext4_da_should_update_i_disksize(struct page *page, | 3168 | static int ext4_da_should_update_i_disksize(struct page *page, |
3169 | unsigned long offset) | 3169 | unsigned long offset) |
3170 | { | 3170 | { |
3171 | struct buffer_head *bh; | 3171 | struct buffer_head *bh; |
3172 | struct inode *inode = page->mapping->host; | 3172 | struct inode *inode = page->mapping->host; |
3173 | unsigned int idx; | 3173 | unsigned int idx; |
3174 | int i; | 3174 | int i; |
3175 | 3175 | ||
3176 | bh = page_buffers(page); | 3176 | bh = page_buffers(page); |
3177 | idx = offset >> inode->i_blkbits; | 3177 | idx = offset >> inode->i_blkbits; |
3178 | 3178 | ||
3179 | for (i = 0; i < idx; i++) | 3179 | for (i = 0; i < idx; i++) |
3180 | bh = bh->b_this_page; | 3180 | bh = bh->b_this_page; |
3181 | 3181 | ||
3182 | if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) | 3182 | if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) |
3183 | return 0; | 3183 | return 0; |
3184 | return 1; | 3184 | return 1; |
3185 | } | 3185 | } |
3186 | 3186 | ||
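The loop above just advances idx = offset >> i_blkbits steps around the page's circular buffer_head ring and then inspects that buffer's mapped/delay/unwritten state. The index arithmetic in isolation, with assumed values (1 KiB blocks on a 4 KiB page; not kernel types):

    #include <stdio.h>

    int main(void)
    {
        unsigned int blkbits = 10;   /* assumed 1 KiB block size */
        unsigned long offset = 2600; /* byte offset of the write end */
        unsigned int idx = offset >> blkbits;

        /* idx is the number of b_this_page hops the loop makes */
        printf("offset %lu lands in buffer %u of the page\n", offset, idx);
        return 0; /* prints buffer 2 (bytes 2048-3071) */
    }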
3187 | static int ext4_da_write_end(struct file *file, | 3187 | static int ext4_da_write_end(struct file *file, |
3188 | struct address_space *mapping, | 3188 | struct address_space *mapping, |
3189 | loff_t pos, unsigned len, unsigned copied, | 3189 | loff_t pos, unsigned len, unsigned copied, |
3190 | struct page *page, void *fsdata) | 3190 | struct page *page, void *fsdata) |
3191 | { | 3191 | { |
3192 | struct inode *inode = mapping->host; | 3192 | struct inode *inode = mapping->host; |
3193 | int ret = 0, ret2; | 3193 | int ret = 0, ret2; |
3194 | handle_t *handle = ext4_journal_current_handle(); | 3194 | handle_t *handle = ext4_journal_current_handle(); |
3195 | loff_t new_i_size; | 3195 | loff_t new_i_size; |
3196 | unsigned long start, end; | 3196 | unsigned long start, end; |
3197 | int write_mode = (int)(unsigned long)fsdata; | 3197 | int write_mode = (int)(unsigned long)fsdata; |
3198 | 3198 | ||
3199 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { | 3199 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { |
3200 | if (ext4_should_order_data(inode)) { | 3200 | if (ext4_should_order_data(inode)) { |
3201 | return ext4_ordered_write_end(file, mapping, pos, | 3201 | return ext4_ordered_write_end(file, mapping, pos, |
3202 | len, copied, page, fsdata); | 3202 | len, copied, page, fsdata); |
3203 | } else if (ext4_should_writeback_data(inode)) { | 3203 | } else if (ext4_should_writeback_data(inode)) { |
3204 | return ext4_writeback_write_end(file, mapping, pos, | 3204 | return ext4_writeback_write_end(file, mapping, pos, |
3205 | len, copied, page, fsdata); | 3205 | len, copied, page, fsdata); |
3206 | } else { | 3206 | } else { |
3207 | BUG(); | 3207 | BUG(); |
3208 | } | 3208 | } |
3209 | } | 3209 | } |
3210 | 3210 | ||
3211 | trace_ext4_da_write_end(inode, pos, len, copied); | 3211 | trace_ext4_da_write_end(inode, pos, len, copied); |
3212 | start = pos & (PAGE_CACHE_SIZE - 1); | 3212 | start = pos & (PAGE_CACHE_SIZE - 1); |
3213 | end = start + copied - 1; | 3213 | end = start + copied - 1; |
3214 | 3214 | ||
3215 | /* | 3215 | /* |
3216 | * generic_write_end() will run mark_inode_dirty() if i_size | 3216 | * generic_write_end() will run mark_inode_dirty() if i_size |
3217 | * changes. So let's piggyback the i_disksize mark_inode_dirty | 3217 | * changes. So let's piggyback the i_disksize mark_inode_dirty |
3218 | * into that. | 3218 | * into that. |
3219 | */ | 3219 | */ |
3220 | 3220 | ||
3221 | new_i_size = pos + copied; | 3221 | new_i_size = pos + copied; |
3222 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 3222 | if (new_i_size > EXT4_I(inode)->i_disksize) { |
3223 | if (ext4_da_should_update_i_disksize(page, end)) { | 3223 | if (ext4_da_should_update_i_disksize(page, end)) { |
3224 | down_write(&EXT4_I(inode)->i_data_sem); | 3224 | down_write(&EXT4_I(inode)->i_data_sem); |
3225 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 3225 | if (new_i_size > EXT4_I(inode)->i_disksize) { |
3226 | /* | 3226 | /* |
3227 | * Updating i_disksize when extending file | 3227 | * Updating i_disksize when extending file |
3228 | * without needing block allocation | 3228 | * without needing block allocation |
3229 | */ | 3229 | */ |
3230 | if (ext4_should_order_data(inode)) | 3230 | if (ext4_should_order_data(inode)) |
3231 | ret = ext4_jbd2_file_inode(handle, | 3231 | ret = ext4_jbd2_file_inode(handle, |
3232 | inode); | 3232 | inode); |
3233 | 3233 | ||
3234 | EXT4_I(inode)->i_disksize = new_i_size; | 3234 | EXT4_I(inode)->i_disksize = new_i_size; |
3235 | } | 3235 | } |
3236 | up_write(&EXT4_I(inode)->i_data_sem); | 3236 | up_write(&EXT4_I(inode)->i_data_sem); |
3237 | /* We need to mark inode dirty even if | 3237 | /* We need to mark inode dirty even if |
3238 | * new_i_size is less than inode->i_size | 3238 | * new_i_size is less than inode->i_size |
3239 | * but greater than i_disksize (hint: delalloc). | 3239 | * but greater than i_disksize (hint: delalloc). |
3240 | */ | 3240 | */ |
3241 | ext4_mark_inode_dirty(handle, inode); | 3241 | ext4_mark_inode_dirty(handle, inode); |
3242 | } | 3242 | } |
3243 | } | 3243 | } |
3244 | ret2 = generic_write_end(file, mapping, pos, len, copied, | 3244 | ret2 = generic_write_end(file, mapping, pos, len, copied, |
3245 | page, fsdata); | 3245 | page, fsdata); |
3246 | copied = ret2; | 3246 | copied = ret2; |
3247 | if (ret2 < 0) | 3247 | if (ret2 < 0) |
3248 | ret = ret2; | 3248 | ret = ret2; |
3249 | ret2 = ext4_journal_stop(handle); | 3249 | ret2 = ext4_journal_stop(handle); |
3250 | if (!ret) | 3250 | if (!ret) |
3251 | ret = ret2; | 3251 | ret = ret2; |
3252 | 3252 | ||
3253 | return ret ? ret : copied; | 3253 | return ret ? ret : copied; |
3254 | } | 3254 | } |
3255 | 3255 | ||
3256 | static void ext4_da_invalidatepage(struct page *page, unsigned long offset) | 3256 | static void ext4_da_invalidatepage(struct page *page, unsigned long offset) |
3257 | { | 3257 | { |
3258 | /* | 3258 | /* |
3259 | * Drop reserved blocks | 3259 | * Drop reserved blocks |
3260 | */ | 3260 | */ |
3261 | BUG_ON(!PageLocked(page)); | 3261 | BUG_ON(!PageLocked(page)); |
3262 | if (!page_has_buffers(page)) | 3262 | if (!page_has_buffers(page)) |
3263 | goto out; | 3263 | goto out; |
3264 | 3264 | ||
3265 | ext4_da_page_release_reservation(page, offset); | 3265 | ext4_da_page_release_reservation(page, offset); |
3266 | 3266 | ||
3267 | out: | 3267 | out: |
3268 | ext4_invalidatepage(page, offset); | 3268 | ext4_invalidatepage(page, offset); |
3269 | 3269 | ||
3270 | return; | 3270 | return; |
3271 | } | 3271 | } |
3272 | 3272 | ||
3273 | /* | 3273 | /* |
3274 | * Force all delayed allocation blocks to be allocated for a given inode. | 3274 | * Force all delayed allocation blocks to be allocated for a given inode. |
3275 | */ | 3275 | */ |
3276 | int ext4_alloc_da_blocks(struct inode *inode) | 3276 | int ext4_alloc_da_blocks(struct inode *inode) |
3277 | { | 3277 | { |
3278 | trace_ext4_alloc_da_blocks(inode); | 3278 | trace_ext4_alloc_da_blocks(inode); |
3279 | 3279 | ||
3280 | if (!EXT4_I(inode)->i_reserved_data_blocks && | 3280 | if (!EXT4_I(inode)->i_reserved_data_blocks && |
3281 | !EXT4_I(inode)->i_reserved_meta_blocks) | 3281 | !EXT4_I(inode)->i_reserved_meta_blocks) |
3282 | return 0; | 3282 | return 0; |
3283 | 3283 | ||
3284 | /* | 3284 | /* |
3285 | * We do something simple for now. The filemap_flush() will | 3285 | * We do something simple for now. The filemap_flush() will |
3286 | * also start triggering a write of the data blocks, which is | 3286 | * also start triggering a write of the data blocks, which is |
3287 | * not strictly speaking necessary (and for users of | 3287 | * not strictly speaking necessary (and for users of |
3288 | * laptop_mode, not even desirable). However, to do otherwise | 3288 | * laptop_mode, not even desirable). However, to do otherwise |
3289 | * would require replicating code paths in: | 3289 | * would require replicating code paths in: |
3290 | * | 3290 | * |
3291 | * ext4_da_writepages() -> | 3291 | * ext4_da_writepages() -> |
3292 | * write_cache_pages() ---> (via passed in callback function) | 3292 | * write_cache_pages() ---> (via passed in callback function) |
3293 | * __mpage_da_writepage() --> | 3293 | * __mpage_da_writepage() --> |
3294 | * mpage_add_bh_to_extent() | 3294 | * mpage_add_bh_to_extent() |
3295 | * mpage_da_map_blocks() | 3295 | * mpage_da_map_blocks() |
3296 | * | 3296 | * |
3297 | * The problem is that write_cache_pages(), located in | 3297 | * The problem is that write_cache_pages(), located in |
3298 | * mm/page-writeback.c, marks pages clean in preparation for | 3298 | * mm/page-writeback.c, marks pages clean in preparation for |
3299 | * doing I/O, which is not desirable if we're not planning on | 3299 | * doing I/O, which is not desirable if we're not planning on |
3300 | * doing I/O at all. | 3300 | * doing I/O at all. |
3301 | * | 3301 | * |
3302 | * We could call write_cache_pages(), and then redirty all of | 3302 | * We could call write_cache_pages(), and then redirty all of |
3303 | * the pages by calling redirty_page_for_writepage() but that | 3303 | * the pages by calling redirty_page_for_writepage() but that |
3304 | * would be ugly in the extreme. So instead we would need to | 3304 | * would be ugly in the extreme. So instead we would need to |
3305 | * replicate parts of the code in the above functions, | 3305 | * replicate parts of the code in the above functions, |
3306 | * simplifying them because we wouldn't actually intend to | 3306 | * simplifying them because we wouldn't actually intend to |
3307 | * write out the pages, but rather only collect contiguous | 3307 | * write out the pages, but rather only collect contiguous |
3308 | * logical block extents, call the multi-block allocator, and | 3308 | * logical block extents, call the multi-block allocator, and |
3309 | * then update the buffer heads with the block allocations. | 3309 | * then update the buffer heads with the block allocations. |
3310 | * | 3310 | * |
3311 | * For now, though, we'll cheat by calling filemap_flush(), | 3311 | * For now, though, we'll cheat by calling filemap_flush(), |
3312 | * which will map the blocks, and start the I/O, but not | 3312 | * which will map the blocks, and start the I/O, but not |
3313 | * actually wait for the I/O to complete. | 3313 | * actually wait for the I/O to complete. |
3314 | */ | 3314 | */ |
3315 | return filemap_flush(inode->i_mapping); | 3315 | return filemap_flush(inode->i_mapping); |
3316 | } | 3316 | } |
3317 | 3317 | ||
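filemap_flush() starts writeback on the mapping without waiting for it to finish. For readers who want to poke the same behaviour from userspace on a delalloc file, the closest analogue is sync_file_range() with only the WRITE flag; a sketch (Linux-specific, and only an illustration of "start I/O, don't wait", not what ext4 itself calls here):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("somefile", O_WRONLY);
        if (fd < 0) { perror("open"); return 1; }

        /* Kick off writeback of the whole file (nbytes == 0 means to EOF)
         * without waiting for it, roughly what filemap_flush() does. */
        if (sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE) < 0)
            perror("sync_file_range");

        close(fd);
        return 0;
    }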
3318 | /* | 3318 | /* |
3319 | * bmap() is special. It gets used by applications such as lilo and by | 3319 | * bmap() is special. It gets used by applications such as lilo and by |
3320 | * the swapper to find the on-disk block of a specific piece of data. | 3320 | * the swapper to find the on-disk block of a specific piece of data. |
3321 | * | 3321 | * |
3322 | * Naturally, this is dangerous if the block concerned is still in the | 3322 | * Naturally, this is dangerous if the block concerned is still in the |
3323 | * journal. If somebody makes a swapfile on an ext4 data-journaling | 3323 | * journal. If somebody makes a swapfile on an ext4 data-journaling |
3324 | * filesystem and enables swap, then they may get a nasty shock when the | 3324 | * filesystem and enables swap, then they may get a nasty shock when the |
3325 | * data getting swapped to that swapfile suddenly gets overwritten by | 3325 | * data getting swapped to that swapfile suddenly gets overwritten by |
3326 | * the original zeros written out previously to the journal and | 3326 | * the original zeros written out previously to the journal and |
3327 | * awaiting writeback in the kernel's buffer cache. | 3327 | * awaiting writeback in the kernel's buffer cache. |
3328 | * | 3328 | * |
3329 | * So, if we see any bmap calls here on a modified, data-journaled file, | 3329 | * So, if we see any bmap calls here on a modified, data-journaled file, |
3330 | * take extra steps to flush any blocks which might be in the cache. | 3330 | * take extra steps to flush any blocks which might be in the cache. |
3331 | */ | 3331 | */ |
3332 | static sector_t ext4_bmap(struct address_space *mapping, sector_t block) | 3332 | static sector_t ext4_bmap(struct address_space *mapping, sector_t block) |
3333 | { | 3333 | { |
3334 | struct inode *inode = mapping->host; | 3334 | struct inode *inode = mapping->host; |
3335 | journal_t *journal; | 3335 | journal_t *journal; |
3336 | int err; | 3336 | int err; |
3337 | 3337 | ||
3338 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && | 3338 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && |
3339 | test_opt(inode->i_sb, DELALLOC)) { | 3339 | test_opt(inode->i_sb, DELALLOC)) { |
3340 | /* | 3340 | /* |
3341 | * With delalloc we want to sync the file | 3341 | * With delalloc we want to sync the file |
3342 | * so that we can make sure we allocate | 3342 | * so that we can make sure we allocate |
3343 | * blocks for the file | 3343 | * blocks for the file |
3344 | */ | 3344 | */ |
3345 | filemap_write_and_wait(mapping); | 3345 | filemap_write_and_wait(mapping); |
3346 | } | 3346 | } |
3347 | 3347 | ||
3348 | if (EXT4_JOURNAL(inode) && | 3348 | if (EXT4_JOURNAL(inode) && |
3349 | ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { | 3349 | ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { |
3350 | /* | 3350 | /* |
3351 | * This is a REALLY heavyweight approach, but the use of | 3351 | * This is a REALLY heavyweight approach, but the use of |
3352 | * bmap on dirty files is expected to be extremely rare: | 3352 | * bmap on dirty files is expected to be extremely rare: |
3353 | * only if we run lilo or swapon on a freshly made file | 3353 | * only if we run lilo or swapon on a freshly made file |
3354 | * do we expect this to happen. | 3354 | * do we expect this to happen. |
3355 | * | 3355 | * |
3356 | * (bmap requires CAP_SYS_RAWIO so this does not | 3356 | * (bmap requires CAP_SYS_RAWIO so this does not |
3357 | * represent an unprivileged user DOS attack --- we'd be | 3357 | * represent an unprivileged user DOS attack --- we'd be |
3358 | * in trouble if mortal users could trigger this path at | 3358 | * in trouble if mortal users could trigger this path at |
3359 | * will.) | 3359 | * will.) |
3360 | * | 3360 | * |
3361 | * NB. EXT4_STATE_JDATA is not set on files other than | 3361 | * NB. EXT4_STATE_JDATA is not set on files other than |
3362 | * regular files. If somebody wants to bmap a directory | 3362 | * regular files. If somebody wants to bmap a directory |
3363 | * or symlink and gets confused because the buffer | 3363 | * or symlink and gets confused because the buffer |
3364 | * hasn't yet been flushed to disk, they deserve | 3364 | * hasn't yet been flushed to disk, they deserve |
3365 | * everything they get. | 3365 | * everything they get. |
3366 | */ | 3366 | */ |
3367 | 3367 | ||
3368 | ext4_clear_inode_state(inode, EXT4_STATE_JDATA); | 3368 | ext4_clear_inode_state(inode, EXT4_STATE_JDATA); |
3369 | journal = EXT4_JOURNAL(inode); | 3369 | journal = EXT4_JOURNAL(inode); |
3370 | jbd2_journal_lock_updates(journal); | 3370 | jbd2_journal_lock_updates(journal); |
3371 | err = jbd2_journal_flush(journal); | 3371 | err = jbd2_journal_flush(journal); |
3372 | jbd2_journal_unlock_updates(journal); | 3372 | jbd2_journal_unlock_updates(journal); |
3373 | 3373 | ||
3374 | if (err) | 3374 | if (err) |
3375 | return 0; | 3375 | return 0; |
3376 | } | 3376 | } |
3377 | 3377 | ||
3378 | return generic_block_bmap(mapping, block, ext4_get_block); | 3378 | return generic_block_bmap(mapping, block, ext4_get_block); |
3379 | } | 3379 | } |
3380 | 3380 | ||
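For reference, this is how callers like lilo actually reach ->bmap from userspace: the FIBMAP ioctl maps a file-relative block number to an on-disk block. A minimal sketch (error handling kept simple; requires CAP_SYS_RAWIO, as the comment above notes):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>     /* FIBMAP */

    int main(int argc, char **argv)
    {
        int fd, block = 0;    /* logical block 0; FIBMAP rewrites it in place */

        if (argc != 2) return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) { perror("open"); return 1; }

        /* Ask the filesystem's ->bmap where logical block 0 lives on disk. */
        if (ioctl(fd, FIBMAP, &block) < 0) {
            perror("FIBMAP");
            close(fd);
            return 1;
        }

        printf("logical block 0 -> physical block %d\n", block);
        close(fd);
        return 0;
    }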
3381 | static int ext4_readpage(struct file *file, struct page *page) | 3381 | static int ext4_readpage(struct file *file, struct page *page) |
3382 | { | 3382 | { |
3383 | trace_ext4_readpage(page); | 3383 | trace_ext4_readpage(page); |
3384 | return mpage_readpage(page, ext4_get_block); | 3384 | return mpage_readpage(page, ext4_get_block); |
3385 | } | 3385 | } |
3386 | 3386 | ||
3387 | static int | 3387 | static int |
3388 | ext4_readpages(struct file *file, struct address_space *mapping, | 3388 | ext4_readpages(struct file *file, struct address_space *mapping, |
3389 | struct list_head *pages, unsigned nr_pages) | 3389 | struct list_head *pages, unsigned nr_pages) |
3390 | { | 3390 | { |
3391 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); | 3391 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); |
3392 | } | 3392 | } |
3393 | 3393 | ||
3394 | static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) | 3394 | static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) |
3395 | { | 3395 | { |
3396 | struct buffer_head *head, *bh; | 3396 | struct buffer_head *head, *bh; |
3397 | unsigned int curr_off = 0; | 3397 | unsigned int curr_off = 0; |
3398 | 3398 | ||
3399 | if (!page_has_buffers(page)) | 3399 | if (!page_has_buffers(page)) |
3400 | return; | 3400 | return; |
3401 | head = bh = page_buffers(page); | 3401 | head = bh = page_buffers(page); |
3402 | do { | 3402 | do { |
3403 | if (offset <= curr_off && test_clear_buffer_uninit(bh) | 3403 | if (offset <= curr_off && test_clear_buffer_uninit(bh) |
3404 | && bh->b_private) { | 3404 | && bh->b_private) { |
3405 | ext4_free_io_end(bh->b_private); | 3405 | ext4_free_io_end(bh->b_private); |
3406 | bh->b_private = NULL; | 3406 | bh->b_private = NULL; |
3407 | bh->b_end_io = NULL; | 3407 | bh->b_end_io = NULL; |
3408 | } | 3408 | } |
3409 | curr_off = curr_off + bh->b_size; | 3409 | curr_off = curr_off + bh->b_size; |
3410 | bh = bh->b_this_page; | 3410 | bh = bh->b_this_page; |
3411 | } while (bh != head); | 3411 | } while (bh != head); |
3412 | } | 3412 | } |
3413 | 3413 | ||
3414 | static void ext4_invalidatepage(struct page *page, unsigned long offset) | 3414 | static void ext4_invalidatepage(struct page *page, unsigned long offset) |
3415 | { | 3415 | { |
3416 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); | 3416 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); |
3417 | 3417 | ||
3418 | trace_ext4_invalidatepage(page, offset); | 3418 | trace_ext4_invalidatepage(page, offset); |
3419 | 3419 | ||
3420 | /* | 3420 | /* |
3421 | * free any io_end structure allocated for buffers to be discarded | 3421 | * free any io_end structure allocated for buffers to be discarded |
3422 | */ | 3422 | */ |
3423 | if (ext4_should_dioread_nolock(page->mapping->host)) | 3423 | if (ext4_should_dioread_nolock(page->mapping->host)) |
3424 | ext4_invalidatepage_free_endio(page, offset); | 3424 | ext4_invalidatepage_free_endio(page, offset); |
3425 | /* | 3425 | /* |
3426 | * If it's a full truncate we just forget about the pending dirtying | 3426 | * If it's a full truncate we just forget about the pending dirtying |
3427 | */ | 3427 | */ |
3428 | if (offset == 0) | 3428 | if (offset == 0) |
3429 | ClearPageChecked(page); | 3429 | ClearPageChecked(page); |
3430 | 3430 | ||
3431 | if (journal) | 3431 | if (journal) |
3432 | jbd2_journal_invalidatepage(journal, page, offset); | 3432 | jbd2_journal_invalidatepage(journal, page, offset); |
3433 | else | 3433 | else |
3434 | block_invalidatepage(page, offset); | 3434 | block_invalidatepage(page, offset); |
3435 | } | 3435 | } |
3436 | 3436 | ||
3437 | static int ext4_releasepage(struct page *page, gfp_t wait) | 3437 | static int ext4_releasepage(struct page *page, gfp_t wait) |
3438 | { | 3438 | { |
3439 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); | 3439 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); |
3440 | 3440 | ||
3441 | trace_ext4_releasepage(page); | 3441 | trace_ext4_releasepage(page); |
3442 | 3442 | ||
3443 | WARN_ON(PageChecked(page)); | 3443 | WARN_ON(PageChecked(page)); |
3444 | if (!page_has_buffers(page)) | 3444 | if (!page_has_buffers(page)) |
3445 | return 0; | 3445 | return 0; |
3446 | if (journal) | 3446 | if (journal) |
3447 | return jbd2_journal_try_to_free_buffers(journal, page, wait); | 3447 | return jbd2_journal_try_to_free_buffers(journal, page, wait); |
3448 | else | 3448 | else |
3449 | return try_to_free_buffers(page); | 3449 | return try_to_free_buffers(page); |
3450 | } | 3450 | } |
3451 | 3451 | ||
3452 | /* | 3452 | /* |
3453 | * O_DIRECT for ext3 (or indirect map) based files | 3453 | * O_DIRECT for ext3 (or indirect map) based files |
3454 | * | 3454 | * |
3455 | * If the O_DIRECT write will extend the file then add this inode to the | 3455 | * If the O_DIRECT write will extend the file then add this inode to the |
3456 | * orphan list. So recovery will truncate it back to the original size | 3456 | * orphan list. So recovery will truncate it back to the original size |
3457 | * if the machine crashes during the write. | 3457 | * if the machine crashes during the write. |
3458 | * | 3458 | * |
3459 | * If the O_DIRECT write is instantiating holes inside i_size and the machine | 3459 | * If the O_DIRECT write is instantiating holes inside i_size and the machine |
3460 | * crashes then stale disk data _may_ be exposed inside the file. But current | 3460 | * crashes then stale disk data _may_ be exposed inside the file. But current |
3461 | * VFS code falls back into buffered path in that case so we are safe. | 3461 | * VFS code falls back into buffered path in that case so we are safe. |
3462 | */ | 3462 | */ |
3463 | static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | 3463 | static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, |
3464 | const struct iovec *iov, loff_t offset, | 3464 | const struct iovec *iov, loff_t offset, |
3465 | unsigned long nr_segs) | 3465 | unsigned long nr_segs) |
3466 | { | 3466 | { |
3467 | struct file *file = iocb->ki_filp; | 3467 | struct file *file = iocb->ki_filp; |
3468 | struct inode *inode = file->f_mapping->host; | 3468 | struct inode *inode = file->f_mapping->host; |
3469 | struct ext4_inode_info *ei = EXT4_I(inode); | 3469 | struct ext4_inode_info *ei = EXT4_I(inode); |
3470 | handle_t *handle; | 3470 | handle_t *handle; |
3471 | ssize_t ret; | 3471 | ssize_t ret; |
3472 | int orphan = 0; | 3472 | int orphan = 0; |
3473 | size_t count = iov_length(iov, nr_segs); | 3473 | size_t count = iov_length(iov, nr_segs); |
3474 | int retries = 0; | 3474 | int retries = 0; |
3475 | 3475 | ||
3476 | if (rw == WRITE) { | 3476 | if (rw == WRITE) { |
3477 | loff_t final_size = offset + count; | 3477 | loff_t final_size = offset + count; |
3478 | 3478 | ||
3479 | if (final_size > inode->i_size) { | 3479 | if (final_size > inode->i_size) { |
3480 | /* Credits for sb + inode write */ | 3480 | /* Credits for sb + inode write */ |
3481 | handle = ext4_journal_start(inode, 2); | 3481 | handle = ext4_journal_start(inode, 2); |
3482 | if (IS_ERR(handle)) { | 3482 | if (IS_ERR(handle)) { |
3483 | ret = PTR_ERR(handle); | 3483 | ret = PTR_ERR(handle); |
3484 | goto out; | 3484 | goto out; |
3485 | } | 3485 | } |
3486 | ret = ext4_orphan_add(handle, inode); | 3486 | ret = ext4_orphan_add(handle, inode); |
3487 | if (ret) { | 3487 | if (ret) { |
3488 | ext4_journal_stop(handle); | 3488 | ext4_journal_stop(handle); |
3489 | goto out; | 3489 | goto out; |
3490 | } | 3490 | } |
3491 | orphan = 1; | 3491 | orphan = 1; |
3492 | ei->i_disksize = inode->i_size; | 3492 | ei->i_disksize = inode->i_size; |
3493 | ext4_journal_stop(handle); | 3493 | ext4_journal_stop(handle); |
3494 | } | 3494 | } |
3495 | } | 3495 | } |
3496 | 3496 | ||
3497 | retry: | 3497 | retry: |
3498 | if (rw == READ && ext4_should_dioread_nolock(inode)) | 3498 | if (rw == READ && ext4_should_dioread_nolock(inode)) |
3499 | ret = __blockdev_direct_IO(rw, iocb, inode, | 3499 | ret = __blockdev_direct_IO(rw, iocb, inode, |
3500 | inode->i_sb->s_bdev, iov, | 3500 | inode->i_sb->s_bdev, iov, |
3501 | offset, nr_segs, | 3501 | offset, nr_segs, |
3502 | ext4_get_block, NULL, NULL, 0); | 3502 | ext4_get_block, NULL, NULL, 0); |
3503 | else { | 3503 | else { |
3504 | ret = blockdev_direct_IO(rw, iocb, inode, iov, | 3504 | ret = blockdev_direct_IO(rw, iocb, inode, iov, |
3505 | offset, nr_segs, ext4_get_block); | 3505 | offset, nr_segs, ext4_get_block); |
3506 | 3506 | ||
3507 | if (unlikely((rw & WRITE) && ret < 0)) { | 3507 | if (unlikely((rw & WRITE) && ret < 0)) { |
3508 | loff_t isize = i_size_read(inode); | 3508 | loff_t isize = i_size_read(inode); |
3509 | loff_t end = offset + iov_length(iov, nr_segs); | 3509 | loff_t end = offset + iov_length(iov, nr_segs); |
3510 | 3510 | ||
3511 | if (end > isize) | 3511 | if (end > isize) |
3512 | ext4_truncate_failed_write(inode); | 3512 | ext4_truncate_failed_write(inode); |
3513 | } | 3513 | } |
3514 | } | 3514 | } |
3515 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 3515 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
3516 | goto retry; | 3516 | goto retry; |
3517 | 3517 | ||
3518 | if (orphan) { | 3518 | if (orphan) { |
3519 | int err; | 3519 | int err; |
3520 | 3520 | ||
3521 | /* Credits for sb + inode write */ | 3521 | /* Credits for sb + inode write */ |
3522 | handle = ext4_journal_start(inode, 2); | 3522 | handle = ext4_journal_start(inode, 2); |
3523 | if (IS_ERR(handle)) { | 3523 | if (IS_ERR(handle)) { |
3524 | /* This is really bad luck. We've written the data | 3524 | /* This is really bad luck. We've written the data |
3525 | * but cannot extend i_size. Bail out and pretend | 3525 | * but cannot extend i_size. Bail out and pretend |
3526 | * the write failed... */ | 3526 | * the write failed... */ |
3527 | ret = PTR_ERR(handle); | 3527 | ret = PTR_ERR(handle); |
3528 | if (inode->i_nlink) | 3528 | if (inode->i_nlink) |
3529 | ext4_orphan_del(NULL, inode); | 3529 | ext4_orphan_del(NULL, inode); |
3530 | 3530 | ||
3531 | goto out; | 3531 | goto out; |
3532 | } | 3532 | } |
3533 | if (inode->i_nlink) | 3533 | if (inode->i_nlink) |
3534 | ext4_orphan_del(handle, inode); | 3534 | ext4_orphan_del(handle, inode); |
3535 | if (ret > 0) { | 3535 | if (ret > 0) { |
3536 | loff_t end = offset + ret; | 3536 | loff_t end = offset + ret; |
3537 | if (end > inode->i_size) { | 3537 | if (end > inode->i_size) { |
3538 | ei->i_disksize = end; | 3538 | ei->i_disksize = end; |
3539 | i_size_write(inode, end); | 3539 | i_size_write(inode, end); |
3540 | /* | 3540 | /* |
3541 | * We're going to return a positive `ret' | 3541 | * We're going to return a positive `ret' |
3542 | * here due to non-zero-length I/O, so there's | 3542 | * here due to non-zero-length I/O, so there's |
3543 | * no way of reporting error returns from | 3543 | * no way of reporting error returns from |
3544 | * ext4_mark_inode_dirty() to userspace. So | 3544 | * ext4_mark_inode_dirty() to userspace. So |
3545 | * ignore it. | 3545 | * ignore it. |
3546 | */ | 3546 | */ |
3547 | ext4_mark_inode_dirty(handle, inode); | 3547 | ext4_mark_inode_dirty(handle, inode); |
3548 | } | 3548 | } |
3549 | } | 3549 | } |
3550 | err = ext4_journal_stop(handle); | 3550 | err = ext4_journal_stop(handle); |
3551 | if (ret == 0) | 3551 | if (ret == 0) |
3552 | ret = err; | 3552 | ret = err; |
3553 | } | 3553 | } |
3554 | out: | 3554 | out: |
3555 | return ret; | 3555 | return ret; |
3556 | } | 3556 | } |
3557 | 3557 | ||
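The orphan-list dance above exists to protect file-extending O_DIRECT writes against crashes. From userspace, that path is exercised like this; a minimal sketch (the 4096-byte alignment is an assumption matching common block and page sizes, not something guaranteed for every device):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        void *buf;
        int fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
        if (fd < 0) { perror("open"); return 1; }

        /* O_DIRECT needs buffer, offset and length aligned; 4096 is a
         * common safe value but device/filesystem dependent. */
        if (posix_memalign(&buf, 4096, 4096)) { close(fd); return 1; }
        memset(buf, 'x', 4096);

        /* This write extends the file, so ext4 puts the inode on the
         * orphan list first; a crash mid-write can then be cleaned up. */
        if (pwrite(fd, buf, 4096, 0) != 4096)
            perror("pwrite");

        free(buf);
        close(fd);
        return 0;
    }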
3558 | /* | 3558 | /* |
3559 | * ext4_get_block used when preparing for a DIO write or buffer write. | 3559 | * ext4_get_block used when preparing for a DIO write or buffer write. |
3560 | * We allocate an uninitialized extent if blocks haven't been allocated. | 3560 | * We allocate an uninitialized extent if blocks haven't been allocated. |
3561 | * The extent will be converted to initialized after the IO is complete. | 3561 | * The extent will be converted to initialized after the IO is complete. |
3562 | */ | 3562 | */ |
3563 | static int ext4_get_block_write(struct inode *inode, sector_t iblock, | 3563 | static int ext4_get_block_write(struct inode *inode, sector_t iblock, |
3564 | struct buffer_head *bh_result, int create) | 3564 | struct buffer_head *bh_result, int create) |
3565 | { | 3565 | { |
3566 | ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", | 3566 | ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", |
3567 | inode->i_ino, create); | 3567 | inode->i_ino, create); |
3568 | return _ext4_get_block(inode, iblock, bh_result, | 3568 | return _ext4_get_block(inode, iblock, bh_result, |
3569 | EXT4_GET_BLOCKS_IO_CREATE_EXT); | 3569 | EXT4_GET_BLOCKS_IO_CREATE_EXT); |
3570 | } | 3570 | } |
3571 | 3571 | ||
3572 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | 3572 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, |
3573 | ssize_t size, void *private, int ret, | 3573 | ssize_t size, void *private, int ret, |
3574 | bool is_async) | 3574 | bool is_async) |
3575 | { | 3575 | { |
3576 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; | 3576 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
3577 | ext4_io_end_t *io_end = iocb->private; | 3577 | ext4_io_end_t *io_end = iocb->private; |
3578 | struct workqueue_struct *wq; | 3578 | struct workqueue_struct *wq; |
3579 | unsigned long flags; | 3579 | unsigned long flags; |
3580 | struct ext4_inode_info *ei; | 3580 | struct ext4_inode_info *ei; |
3581 | 3581 | ||
3582 | /* if not async direct IO or a DIO with a 0 byte write, just return */ | 3582 | /* if not async direct IO or a DIO with a 0 byte write, just return */ |
3583 | if (!io_end || !size) | 3583 | if (!io_end || !size) |
3584 | goto out; | 3584 | goto out; |
3585 | 3585 | ||
3586 | ext_debug("ext4_end_io_dio(): io_end 0x%p " | 3586 | ext_debug("ext4_end_io_dio(): io_end 0x%p " |
3587 | "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", | 3587 | "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", |
3588 | iocb->private, io_end->inode->i_ino, iocb, offset, | 3588 | iocb->private, io_end->inode->i_ino, iocb, offset, |
3589 | size); | 3589 | size); |
3590 | 3590 | ||
3591 | /* if not aio dio with unwritten extents, just free io and return */ | 3591 | /* if not aio dio with unwritten extents, just free io and return */ |
3592 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | 3592 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { |
3593 | ext4_free_io_end(io_end); | 3593 | ext4_free_io_end(io_end); |
3594 | iocb->private = NULL; | 3594 | iocb->private = NULL; |
3595 | out: | 3595 | out: |
3596 | if (is_async) | 3596 | if (is_async) |
3597 | aio_complete(iocb, ret, 0); | 3597 | aio_complete(iocb, ret, 0); |
3598 | inode_dio_done(inode); | 3598 | inode_dio_done(inode); |
3599 | return; | 3599 | return; |
3600 | } | 3600 | } |
3601 | 3601 | ||
3602 | io_end->offset = offset; | 3602 | io_end->offset = offset; |
3603 | io_end->size = size; | 3603 | io_end->size = size; |
3604 | if (is_async) { | 3604 | if (is_async) { |
3605 | io_end->iocb = iocb; | 3605 | io_end->iocb = iocb; |
3606 | io_end->result = ret; | 3606 | io_end->result = ret; |
3607 | } | 3607 | } |
3608 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | 3608 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; |
3609 | 3609 | ||
3610 | /* Add the io_end to the per-inode completed aio dio list */ | 3610 | /* Add the io_end to the per-inode completed aio dio list */ |
3611 | ei = EXT4_I(io_end->inode); | 3611 | ei = EXT4_I(io_end->inode); |
3612 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | 3612 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
3613 | list_add_tail(&io_end->list, &ei->i_completed_io_list); | 3613 | list_add_tail(&io_end->list, &ei->i_completed_io_list); |
3614 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 3614 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
3615 | 3615 | ||
3616 | /* queue the work to convert unwritten extents to written */ | 3616 | /* queue the work to convert unwritten extents to written */ |
3617 | queue_work(wq, &io_end->work); | 3617 | queue_work(wq, &io_end->work); |
3618 | iocb->private = NULL; | 3618 | iocb->private = NULL; |
3619 | 3619 | ||
3620 | /* XXX: probably should move into the real I/O completion handler */ | 3620 | /* XXX: probably should move into the real I/O completion handler */ |
3621 | inode_dio_done(inode); | 3621 | inode_dio_done(inode); |
3622 | } | 3622 | } |
3623 | 3623 | ||
3624 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | 3624 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) |
3625 | { | 3625 | { |
3626 | ext4_io_end_t *io_end = bh->b_private; | 3626 | ext4_io_end_t *io_end = bh->b_private; |
3627 | struct workqueue_struct *wq; | 3627 | struct workqueue_struct *wq; |
3628 | struct inode *inode; | 3628 | struct inode *inode; |
3629 | unsigned long flags; | 3629 | unsigned long flags; |
3630 | 3630 | ||
3631 | if (!test_clear_buffer_uninit(bh) || !io_end) | 3631 | if (!test_clear_buffer_uninit(bh) || !io_end) |
3632 | goto out; | 3632 | goto out; |
3633 | 3633 | ||
3634 | if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { | 3634 | if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { |
3635 | printk("sb umounted, discard end_io request for inode %lu\n", | 3635 | printk("sb umounted, discard end_io request for inode %lu\n", |
3636 | io_end->inode->i_ino); | 3636 | io_end->inode->i_ino); |
3637 | ext4_free_io_end(io_end); | 3637 | ext4_free_io_end(io_end); |
3638 | goto out; | 3638 | goto out; |
3639 | } | 3639 | } |
3640 | 3640 | ||
3641 | io_end->flag = EXT4_IO_END_UNWRITTEN; | 3641 | io_end->flag = EXT4_IO_END_UNWRITTEN; |
3642 | inode = io_end->inode; | 3642 | inode = io_end->inode; |
3643 | 3643 | ||
3644 | /* Add the io_end to the per-inode completed io list */ | 3644 | /* Add the io_end to the per-inode completed io list */ |
3645 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); | 3645 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); |
3646 | list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); | 3646 | list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); |
3647 | spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); | 3647 | spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); |
3648 | 3648 | ||
3649 | wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; | 3649 | wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; |
3650 | /* queue the work to convert unwritten extents to written */ | 3650 | /* queue the work to convert unwritten extents to written */ |
3651 | queue_work(wq, &io_end->work); | 3651 | queue_work(wq, &io_end->work); |
3652 | out: | 3652 | out: |
3653 | bh->b_private = NULL; | 3653 | bh->b_private = NULL; |
3654 | bh->b_end_io = NULL; | 3654 | bh->b_end_io = NULL; |
3655 | clear_buffer_uninit(bh); | 3655 | clear_buffer_uninit(bh); |
3656 | end_buffer_async_write(bh, uptodate); | 3656 | end_buffer_async_write(bh, uptodate); |
3657 | } | 3657 | } |
3658 | 3658 | ||
3659 | static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) | 3659 | static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) |
3660 | { | 3660 | { |
3661 | ext4_io_end_t *io_end; | 3661 | ext4_io_end_t *io_end; |
3662 | struct page *page = bh->b_page; | 3662 | struct page *page = bh->b_page; |
3663 | loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; | 3663 | loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; |
3664 | size_t size = bh->b_size; | 3664 | size_t size = bh->b_size; |
3665 | 3665 | ||
3666 | retry: | 3666 | retry: |
3667 | io_end = ext4_init_io_end(inode, GFP_ATOMIC); | 3667 | io_end = ext4_init_io_end(inode, GFP_ATOMIC); |
3668 | if (!io_end) { | 3668 | if (!io_end) { |
3669 | pr_warn_ratelimited("%s: allocation fail\n", __func__); | 3669 | pr_warn_ratelimited("%s: allocation fail\n", __func__); |
3670 | schedule(); | 3670 | schedule(); |
3671 | goto retry; | 3671 | goto retry; |
3672 | } | 3672 | } |
3673 | io_end->offset = offset; | 3673 | io_end->offset = offset; |
3674 | io_end->size = size; | 3674 | io_end->size = size; |
3675 | /* | 3675 | /* |
3676 | * We need to hold a reference to the page to make sure it | 3676 | * We need to hold a reference to the page to make sure it |
3677 | * doesn't get evicted before ext4_end_io_work() has a chance | 3677 | * doesn't get evicted before ext4_end_io_work() has a chance |
3678 | * to convert the extent from unwritten to written. | 3678 | * to convert the extent from unwritten to written. |
3679 | */ | 3679 | */ |
3680 | io_end->page = page; | 3680 | io_end->page = page; |
3681 | get_page(io_end->page); | 3681 | get_page(io_end->page); |
3682 | 3682 | ||
3683 | bh->b_private = io_end; | 3683 | bh->b_private = io_end; |
3684 | bh->b_end_io = ext4_end_io_buffer_write; | 3684 | bh->b_end_io = ext4_end_io_buffer_write; |
3685 | return 0; | 3685 | return 0; |
3686 | } | 3686 | } |
3687 | 3687 | ||
3688 | /* | 3688 | /* |
3689 | * For ext4 extent files, ext4 will do direct-io write to holes, | 3689 | * For ext4 extent files, ext4 will do direct-io write to holes, |
3690 | * preallocated extents, and writes that extend the file, with no need to | 3690 | * preallocated extents, and writes that extend the file, with no need to |
3691 | * fall back to buffered IO. | 3691 | * fall back to buffered IO. |
3692 | * | 3692 | * |
3693 | * For holes, we fallocate those blocks and mark them as uninitialized. | 3693 | * For holes, we fallocate those blocks and mark them as uninitialized. |
3694 | * If those blocks were preallocated, we make sure they are split, but | 3694 | * If those blocks were preallocated, we make sure they are split, but |
3695 | * still keep the range to write as uninitialized. | 3695 | * still keep the range to write as uninitialized. |
3696 | * | 3696 | * |
3697 | * The unwritten extents will be converted to written when DIO is completed. | 3697 | * The unwritten extents will be converted to written when DIO is completed. |
3698 | * For async direct IO, since the IO may still be pending when we return, we | 3698 | * For async direct IO, since the IO may still be pending when we return, we |
3699 | * set up an end_io callback function, which will do the conversion | 3699 | * set up an end_io callback function, which will do the conversion |
3700 | * when the async direct IO is completed. | 3700 | * when the async direct IO is completed. |
3701 | * | 3701 | * |
3702 | * If the O_DIRECT write will extend the file then add this inode to the | 3702 | * If the O_DIRECT write will extend the file then add this inode to the |
3703 | * orphan list. So recovery will truncate it back to the original size | 3703 | * orphan list. So recovery will truncate it back to the original size |
3704 | * if the machine crashes during the write. | 3704 | * if the machine crashes during the write. |
3705 | * | 3705 | * |
3706 | */ | 3706 | */ |
3707 | static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | 3707 | static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, |
3708 | const struct iovec *iov, loff_t offset, | 3708 | const struct iovec *iov, loff_t offset, |
3709 | unsigned long nr_segs) | 3709 | unsigned long nr_segs) |
3710 | { | 3710 | { |
3711 | struct file *file = iocb->ki_filp; | 3711 | struct file *file = iocb->ki_filp; |
3712 | struct inode *inode = file->f_mapping->host; | 3712 | struct inode *inode = file->f_mapping->host; |
3713 | ssize_t ret; | 3713 | ssize_t ret; |
3714 | size_t count = iov_length(iov, nr_segs); | 3714 | size_t count = iov_length(iov, nr_segs); |
3715 | 3715 | ||
3716 | loff_t final_size = offset + count; | 3716 | loff_t final_size = offset + count; |
3717 | if (rw == WRITE && final_size <= inode->i_size) { | 3717 | if (rw == WRITE && final_size <= inode->i_size) { |
3718 | /* | 3718 | /* |
3719 | * We could direct write to holes and fallocate. | 3719 | * We could direct write to holes and fallocate. |
3720 | * | 3720 | * |
3721 | * Allocated blocks to fill the hole are marked as uninitialized | 3721 | * Allocated blocks to fill the hole are marked as uninitialized |
3722 | * to prevent a parallel buffered read from exposing the stale data | 3722 | * to prevent a parallel buffered read from exposing the stale data |
3723 | * before DIO completes the data IO. | 3723 | * before DIO completes the data IO. |
3724 | * | 3724 | * |
3725 | * For previously fallocated extents, ext4 get_block | 3725 | * For previously fallocated extents, ext4 get_block |
3726 | * will simply mark the buffer mapped but still | 3726 | * will simply mark the buffer mapped but still |
3727 | * keep the extents uninitialized. | 3727 | * keep the extents uninitialized. |
3728 | * | 3728 | * |
3729 | * for the non-AIO case, we will convert those unwritten extents | 3729 | * for the non-AIO case, we will convert those unwritten extents |
3730 | * to written after returning from blockdev_direct_IO. | 3730 | * to written after returning from blockdev_direct_IO. |
3731 | * | 3731 | * |
3732 | * for async DIO, the conversion needs to be deferred until | 3732 | * for async DIO, the conversion needs to be deferred until |
3733 | * the IO is completed. The ext4 end_io callback function | 3733 | * the IO is completed. The ext4 end_io callback function |
3734 | * will be called to take care of the conversion work. | 3734 | * will be called to take care of the conversion work. |
3735 | * Here for async case, we allocate an io_end structure to | 3735 | * Here for async case, we allocate an io_end structure to |
3736 | * hook to the iocb. | 3736 | * hook to the iocb. |
3737 | */ | 3737 | */ |
3738 | iocb->private = NULL; | 3738 | iocb->private = NULL; |
3739 | EXT4_I(inode)->cur_aio_dio = NULL; | 3739 | EXT4_I(inode)->cur_aio_dio = NULL; |
3740 | if (!is_sync_kiocb(iocb)) { | 3740 | if (!is_sync_kiocb(iocb)) { |
3741 | iocb->private = ext4_init_io_end(inode, GFP_NOFS); | 3741 | iocb->private = ext4_init_io_end(inode, GFP_NOFS); |
3742 | if (!iocb->private) | 3742 | if (!iocb->private) |
3743 | return -ENOMEM; | 3743 | return -ENOMEM; |
3744 | /* | 3744 | /* |
3745 | * we save the io structure for the current async | 3745 | * we save the io structure for the current async |
3746 | * direct IO, so that later ext4_map_blocks() | 3746 | * direct IO, so that later ext4_map_blocks() |
3747 | * can flag in the io structure whether there | 3747 | * can flag in the io structure whether there |
3748 | * are unwritten extents that need to be converted | 3748 | * are unwritten extents that need to be converted |
3749 | * when the IO is completed. | 3749 | * when the IO is completed. |
3750 | */ | 3750 | */ |
3751 | EXT4_I(inode)->cur_aio_dio = iocb->private; | 3751 | EXT4_I(inode)->cur_aio_dio = iocb->private; |
3752 | } | 3752 | } |
3753 | 3753 | ||
3754 | ret = __blockdev_direct_IO(rw, iocb, inode, | 3754 | ret = __blockdev_direct_IO(rw, iocb, inode, |
3755 | inode->i_sb->s_bdev, iov, | 3755 | inode->i_sb->s_bdev, iov, |
3756 | offset, nr_segs, | 3756 | offset, nr_segs, |
3757 | ext4_get_block_write, | 3757 | ext4_get_block_write, |
3758 | ext4_end_io_dio, | 3758 | ext4_end_io_dio, |
3759 | NULL, | 3759 | NULL, |
3760 | DIO_LOCKING | DIO_SKIP_HOLES); | 3760 | DIO_LOCKING | DIO_SKIP_HOLES); |
3761 | if (iocb->private) | 3761 | if (iocb->private) |
3762 | EXT4_I(inode)->cur_aio_dio = NULL; | 3762 | EXT4_I(inode)->cur_aio_dio = NULL; |
3763 | /* | 3763 | /* |
3764 | * The io_end structure takes a reference to the inode; | 3764 | * The io_end structure takes a reference to the inode; |
3765 | * that structure needs to be destroyed and the | 3765 | * that structure needs to be destroyed and the |
3766 | * reference to the inode needs to be dropped when IO is | 3766 | * reference to the inode needs to be dropped when IO is |
3767 | * complete, even for a 0 byte write or a failure. | 3767 | * complete, even for a 0 byte write or a failure. |
3768 | * | 3768 | * |
3769 | * In the successful AIO DIO case, the io_end structure will be | 3769 | * In the successful AIO DIO case, the io_end structure will be |
3770 | * destroyed and the reference to the inode will be dropped | 3770 | * destroyed and the reference to the inode will be dropped |
3771 | * after the end_io callback function is called. | 3771 | * after the end_io callback function is called. |
3772 | * | 3772 | * |
3773 | * In the case of a 0 byte write or an error, since | 3773 | * In the case of a 0 byte write or an error, since |
3774 | * VFS direct IO won't invoke the end_io callback function, | 3774 | * VFS direct IO won't invoke the end_io callback function, |
3775 | * we need to free the end_io structure here. | 3775 | * we need to free the end_io structure here. |
3776 | */ | 3776 | */ |
3777 | if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { | 3777 | if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { |
3778 | ext4_free_io_end(iocb->private); | 3778 | ext4_free_io_end(iocb->private); |
3779 | iocb->private = NULL; | 3779 | iocb->private = NULL; |
3780 | } else if (ret > 0 && ext4_test_inode_state(inode, | 3780 | } else if (ret > 0 && ext4_test_inode_state(inode, |
3781 | EXT4_STATE_DIO_UNWRITTEN)) { | 3781 | EXT4_STATE_DIO_UNWRITTEN)) { |
3782 | int err; | 3782 | int err; |
3783 | /* | 3783 | /* |
3784 | * for non AIO case, since the IO is already | 3784 | * for non AIO case, since the IO is already |
3785 | * completed, we could do the conversion right here | 3785 | * completed, we could do the conversion right here |
3786 | */ | 3786 | */ |
3787 | err = ext4_convert_unwritten_extents(inode, | 3787 | err = ext4_convert_unwritten_extents(inode, |
3788 | offset, ret); | 3788 | offset, ret); |
3789 | if (err < 0) | 3789 | if (err < 0) |
3790 | ret = err; | 3790 | ret = err; |
3791 | ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); | 3791 | ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); |
3792 | } | 3792 | } |
3793 | return ret; | 3793 | return ret; |
3794 | } | 3794 | } |
3795 | 3795 | ||
3796 | /* for write the the end of file case, we fall back to old way */ | 3796 | /* for write the the end of file case, we fall back to old way */ |
3797 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | 3797 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); |
3798 | } | 3798 | } |
3799 | 3799 | ||
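/*
 * Dispatcher summary (restating the code below): extent-mapped inodes
 * (EXT4_INODE_EXTENTS) go through ext4_ext_direct_IO(); legacy
 * indirect-mapped inodes go through ext4_ind_direct_IO().
 */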
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
			      const struct iovec *iov, loff_t offset,
			      unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
	else
		ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
	trace_ext4_direct_IO_exit(inode, offset,
				  iov_length(iov, nr_segs), rw, ret);
	return ret;
}

/*
 * Pages can be marked dirty completely asynchronously from ext4's
 * journalling activity, e.g. by filemap_sync_pte(), try_to_unmap_one(),
 * etc. We cannot do much here because ->set_page_dirty is called under
 * VFS locks. The page is not necessarily locked.
 *
 * We cannot just dirty the page and leave attached buffers clean, because the
 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
 * or jbddirty because all the journalling code will explode.
 *
 * So what we do is to mark the page "pending dirty" and next time writepage
 * is called, propagate that into the buffers appropriately.
 */
static int ext4_journalled_set_page_dirty(struct page *page)
{
	SetPageChecked(page);
	return __set_page_dirty_nobuffers(page);
}

static const struct address_space_operations ext4_ordered_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_writepage,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_ordered_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};

static const struct address_space_operations ext4_writeback_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_writepage,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_writeback_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};

static const struct address_space_operations ext4_journalled_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_writepage,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_journalled_write_end,
	.set_page_dirty		= ext4_journalled_set_page_dirty,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};

static const struct address_space_operations ext4_da_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_writepage,
	.writepages		= ext4_da_writepages,
	.write_begin		= ext4_da_write_begin,
	.write_end		= ext4_da_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_da_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};

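/*
 * Summary of the selection logic below (mount options shown for
 * orientation; the predicates in the code are the authoritative test):
 *
 *	data=ordered   + delalloc  -> ext4_da_aops
 *	data=ordered               -> ext4_ordered_aops
 *	data=writeback + delalloc  -> ext4_da_aops
 *	data=writeback             -> ext4_writeback_aops
 *	data=journal               -> ext4_journalled_aops
 */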
void ext4_set_aops(struct inode *inode)
{
	if (ext4_should_order_data(inode) &&
	    test_opt(inode->i_sb, DELALLOC))
		inode->i_mapping->a_ops = &ext4_da_aops;
	else if (ext4_should_order_data(inode))
		inode->i_mapping->a_ops = &ext4_ordered_aops;
	else if (ext4_should_writeback_data(inode) &&
		 test_opt(inode->i_sb, DELALLOC))
		inode->i_mapping->a_ops = &ext4_da_aops;
	else if (ext4_should_writeback_data(inode))
		inode->i_mapping->a_ops = &ext4_writeback_aops;
	else
		inode->i_mapping->a_ops = &ext4_journalled_aops;
}

/*
 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
 * up to the end of the block which corresponds to `from'.
 * This is required during truncate. We need to physically zero the tail end
 * of that block so it doesn't yield old data if the file is later grown.
 */
int ext4_block_truncate_page(handle_t *handle,
		struct address_space *mapping, loff_t from)
{
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned length;
	unsigned blocksize;
	struct inode *inode = mapping->host;

	blocksize = inode->i_sb->s_blocksize;
	length = blocksize - (offset & (blocksize - 1));
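	/*
	 * Worked example (illustrative, assuming 4K pages and 1K blocks):
	 * for from = 6500, offset = 6500 & 4095 = 2404 and
	 * length = 1024 - (2404 & 1023) = 668, i.e. bytes 6500..7167 -
	 * the tail of the containing block - get zeroed.
	 */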

	return ext4_block_zero_page_range(handle, mapping, from, length);
}

/*
 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
 * starting from file offset 'from'. The range to be zeroed must be
 * contained within one block. If the specified range exceeds the end of
 * the block, it will be shortened to the end of the block that
 * corresponds to 'from'.
 */
int ext4_block_zero_page_range(handle_t *handle,
		struct address_space *mapping, loff_t from, loff_t length)
{
	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize, max, pos;
	ext4_lblk_t iblock;
	struct inode *inode = mapping->host;
	struct buffer_head *bh;
	struct page *page;
	int err = 0;

	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
				   mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (!page)
		return -EINVAL;

	blocksize = inode->i_sb->s_blocksize;
	max = blocksize - (offset & (blocksize - 1));

	/*
	 * correct length if it does not fall between
	 * 'from' and the end of the block
	 */
	if (length > max || length < 0)
		length = max;

	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
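	/*
	 * Illustrative: with 4K pages and 1K blocks the shift is
	 * 12 - 10 = 2, so page index N maps to logical block N * 4; the
	 * buffer walk below then advances iblock to the buffer that
	 * actually contains 'offset'.
	 */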

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	err = 0;
	if (buffer_freed(bh)) {
		BUFFER_TRACE(bh, "freed: skip");
		goto unlock;
	}

	if (!buffer_mapped(bh)) {
		BUFFER_TRACE(bh, "unmapped");
		ext4_get_block(inode, iblock, bh, 0);
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh)) {
			BUFFER_TRACE(bh, "still unmapped");
			goto unlock;
		}
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh)) {
		err = -EIO;
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
			goto unlock;
	}

	if (ext4_should_journal_data(inode)) {
		BUFFER_TRACE(bh, "get write access");
		err = ext4_journal_get_write_access(handle, bh);
		if (err)
			goto unlock;
	}

	zero_user(page, offset, length);

	BUFFER_TRACE(bh, "zeroed end of block");

	err = 0;
	if (ext4_should_journal_data(inode)) {
		err = ext4_handle_dirty_metadata(handle, inode, bh);
	} else {
		if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
			err = ext4_jbd2_file_inode(handle, inode);
		mark_buffer_dirty(bh);
	}

unlock:
	unlock_page(page);
	page_cache_release(page);
	return err;
}

/*
 * Probably it should be a library function... search for the first non-zero
 * word or memcmp with zero_page, whichever is better for a particular
 * architecture. Linus?
 */
static inline int all_zeroes(__le32 *p, __le32 *q)
{
	while (p < q)
		if (*p++)
			return 0;
	return 1;
}

/**
 * ext4_find_shared - find the indirect blocks for partial truncation.
 * @inode: inode in question
 * @depth: depth of the affected branch
 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
 * @chain: place to store the pointers to partial indirect blocks
 * @top: place to store the (detached) top of the branch
 *
 * This is a helper function used by ext4_truncate().
 *
 * When we do truncate() we may have to clean the ends of several
 * indirect blocks but leave the blocks themselves alive. A block is
 * partially truncated if some data below the new i_size is referred
 * from it (and it is on the path to the first completely truncated
 * data block, indeed). We have to free the top of that path along
 * with everything to the right of the path. Since no allocation
 * past the truncation point is possible until ext4_truncate()
 * finishes, we may safely do the latter, but the top of the branch may
 * require special attention - pageout below the truncation point
 * might try to populate it.
 *
 * We atomically detach the top of the branch from the tree, store the
 * block number of its root in *@top, pointers to buffer_heads of
 * partially truncated blocks - in @chain[].bh and pointers to
 * their last elements that should not be removed - in
 * @chain[].p. The return value is a pointer to the last filled element
 * of @chain.
 *
 * The caller is left to do the actual freeing of the subtrees:
 * a) free the subtree starting from *@top
 * b) free the subtrees whose roots are stored in
 *	(@chain[i].p+1 .. end of @chain[i].bh->b_data)
 * c) free the subtrees growing from the inode past the @chain[0].
 *	(no partially truncated stuff there).
 */

static Indirect *ext4_find_shared(struct inode *inode, int depth,
				  ext4_lblk_t offsets[4], Indirect chain[4],
				  __le32 *top)
{
	Indirect *partial, *p;
	int k, err;

	*top = 0;
	/* Make k index the deepest non-null offset + 1 */
	for (k = depth; k > 1 && !offsets[k-1]; k--)
		;
	partial = ext4_get_branch(inode, k, offsets, chain, &err);
	/* Writer: pointers */
	if (!partial)
		partial = chain + k-1;
	/*
	 * If the branch acquired continuation since we've looked at it -
	 * fine, it should all survive and (new) top doesn't belong to us.
	 */
	if (!partial->key && *partial->p)
		/* Writer: end */
		goto no_top;
	for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
		;
	/*
	 * OK, we've found the last block that must survive. The rest of our
	 * branch should be detached before unlocking. However, if that rest
	 * of branch is all ours and does not grow immediately from the inode
	 * it's easier to cheat and just decrement partial->p.
	 */
	if (p == chain + k - 1 && p > chain) {
		p->p--;
	} else {
		*top = *p->p;
		/* Nope, don't do this in ext4. Must leave the tree intact */
#if 0
		*p->p = 0;
#endif
	}
	/* Writer: end */

	while (partial > p) {
		brelse(partial->bh);
		partial--;
	}
no_top:
	return partial;
}

/*
 * Zero a number of block pointers in either an inode or an indirect block.
 * If we restart the transaction we must again get write access to the
 * indirect block for further modification.
 *
 * We release `count' blocks on disk, but (last - first) may be greater
 * than `count' because there can be holes in there.
 *
 * Return 0 on success, 1 on invalid block range
 * and < 0 on fatal error.
 */
static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
			     struct buffer_head *bh,
			     ext4_fsblk_t block_to_free,
			     unsigned long count, __le32 *first,
			     __le32 *last)
{
	__le32 *p;
	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
	int	err;

	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
		flags |= EXT4_FREE_BLOCKS_METADATA;

	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
				   count)) {
		EXT4_ERROR_INODE(inode, "attempt to clear invalid "
				 "blocks %llu len %lu",
				 (unsigned long long) block_to_free, count);
		return 1;
	}

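	/*
	 * If the handle is low on journal credits and cannot be extended
	 * in place, flush what we have so far (the indirect block and
	 * the inode), restart the transaction with enough credits for
	 * the remainder of the truncate, and retake write access to the
	 * indirect block before modifying it again.
	 */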
	if (try_to_extend_transaction(handle, inode)) {
		if (bh) {
			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
			err = ext4_handle_dirty_metadata(handle, inode, bh);
			if (unlikely(err))
				goto out_err;
		}
		err = ext4_mark_inode_dirty(handle, inode);
		if (unlikely(err))
			goto out_err;
		err = ext4_truncate_restart_trans(handle, inode,
						  blocks_for_truncate(inode));
		if (unlikely(err))
			goto out_err;
		if (bh) {
			BUFFER_TRACE(bh, "retaking write access");
			err = ext4_journal_get_write_access(handle, bh);
			if (unlikely(err))
				goto out_err;
		}
	}

	for (p = first; p < last; p++)
		*p = 0;

	ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
	return 0;
out_err:
	ext4_std_error(inode->i_sb, err);
	return err;
}

/**
 * ext4_free_data - free a list of data blocks
 * @handle: handle for this transaction
 * @inode: inode we are dealing with
 * @this_bh: indirect buffer_head which contains *@first and *@last
 * @first: array of block numbers
 * @last: points immediately past the end of array
 *
 * We are freeing all blocks referred from that array (numbers are stored as
 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
 *
 * We accumulate contiguous runs of blocks to free. Conveniently, if these
 * blocks are contiguous then releasing them at one time will only affect one
 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
 * actually use a lot of journal space.
 *
 * @this_bh will be %NULL if @first and @last point into the inode's direct
 * block pointers.
 */
static void ext4_free_data(handle_t *handle, struct inode *inode,
			   struct buffer_head *this_bh,
			   __le32 *first, __le32 *last)
{
	ext4_fsblk_t block_to_free = 0;	    /* Starting block # of a run */
	unsigned long count = 0;	    /* Number of blocks in the run */
	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
					       corresponding to
					       block_to_free */
	ext4_fsblk_t nr;		    /* Current block # */
	__le32 *p;			    /* Pointer into inode/ind
					       for current block */
	int err = 0;

	if (this_bh) {			    /* For indirect block */
		BUFFER_TRACE(this_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, this_bh);
		/* Important: if we can't update the indirect pointers
		 * to the blocks, we can't free them. */
		if (err)
			return;
	}

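	/*
	 * Illustrative example of the run accumulation below: if the
	 * array holds blocks {100, 101, 102, 200}, two frees are issued:
	 * ext4_clear_blocks() for the run starting at 100 with count 3,
	 * and again for the run starting at 200 with count 1.
	 */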
	for (p = first; p < last; p++) {
		nr = le32_to_cpu(*p);
		if (nr) {
			/* accumulate blocks to free if they're contiguous */
			if (count == 0) {
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			} else if (nr == block_to_free + count) {
				count++;
			} else {
				err = ext4_clear_blocks(handle, inode, this_bh,
							block_to_free, count,
							block_to_free_p, p);
				if (err)
					break;
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			}
		}
	}

	if (!err && count > 0)
		err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
					count, block_to_free_p, p);
	if (err < 0)
		/* fatal error */
		return;

	if (this_bh) {
		BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");

		/*
		 * The buffer head should have an attached journal head at this
		 * point. However, if the data is corrupted and an indirect
		 * block pointed to itself, it would have been detached when
		 * the block was cleared. Check for this instead of OOPSing.
		 */
		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
			ext4_handle_dirty_metadata(handle, inode, this_bh);
		else
			EXT4_ERROR_INODE(inode,
					 "circular indirect block detected at "
					 "block %llu",
					 (unsigned long long) this_bh->b_blocknr);
	}
}

/**
 * ext4_free_branches - free an array of branches
 * @handle: JBD handle for this transaction
 * @inode: inode we are dealing with
 * @parent_bh: the buffer_head which contains *@first and *@last
 * @first: array of block numbers
 * @last: pointer immediately past the end of array
 * @depth: depth of the branches to free
 *
 * We are freeing all blocks referred from these branches (numbers are
 * stored as little-endian 32-bit) and updating @inode->i_blocks
 * appropriately.
 */
static void ext4_free_branches(handle_t *handle, struct inode *inode,
			       struct buffer_head *parent_bh,
			       __le32 *first, __le32 *last, int depth)
{
	ext4_fsblk_t nr;
	__le32 *p;

	if (ext4_handle_is_aborted(handle))
		return;

	if (depth--) {
		struct buffer_head *bh;
		int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
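		/*
		 * For scale (illustrative): with 4K blocks an indirect
		 * block holds 4096 / 4 = 1024 pointers, so each recursion
		 * level below multiplies the covered range by 1024.
		 */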
		p = last;
		while (--p >= first) {
			nr = le32_to_cpu(*p);
			if (!nr)
				continue;		/* A hole */

			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
						   nr, 1)) {
				EXT4_ERROR_INODE(inode,
						 "invalid indirect mapped "
						 "block %lu (level %d)",
						 (unsigned long) nr, depth);
				break;
			}

			/* Go read the buffer for the next level down */
			bh = sb_bread(inode->i_sb, nr);

			/*
			 * A read failure? Report error and clear slot
			 * (should be rare).
			 */
			if (!bh) {
				EXT4_ERROR_INODE_BLOCK(inode, nr,
						       "Read failure");
				continue;
			}

			/* This zaps the entire block. Bottom up. */
			BUFFER_TRACE(bh, "free child branches");
			ext4_free_branches(handle, inode, bh,
					   (__le32 *) bh->b_data,
					   (__le32 *) bh->b_data + addr_per_block,
					   depth);
			brelse(bh);

			/*
			 * Everything below this pointer has been
			 * released. Now let this top-of-subtree go.
			 *
			 * We want the freeing of this indirect block to be
			 * atomic in the journal with the updating of the
			 * bitmap block which owns it. So make some room in
			 * the journal.
			 *
			 * We zero the parent pointer *after* freeing its
			 * pointee in the bitmaps, so if extend_transaction()
			 * for some reason fails to put the bitmap changes and
			 * the release into the same transaction, recovery
			 * will merely complain about releasing a free block,
			 * rather than leaking blocks.
			 */
			if (ext4_handle_is_aborted(handle))
				return;
			if (try_to_extend_transaction(handle, inode)) {
				ext4_mark_inode_dirty(handle, inode);
				ext4_truncate_restart_trans(handle, inode,
						blocks_for_truncate(inode));
			}

			/*
			 * The forget flag here is critical because if
			 * we are journaling (and not doing data
			 * journaling), we have to make sure a revoke
			 * record is written to prevent the journal
			 * replay from overwriting the (former)
			 * indirect block if it gets reallocated as a
			 * data block. This must happen in the same
			 * transaction where the data blocks are
			 * actually freed.
			 */
			ext4_free_blocks(handle, inode, NULL, nr, 1,
					 EXT4_FREE_BLOCKS_METADATA|
					 EXT4_FREE_BLOCKS_FORGET);

			if (parent_bh) {
				/*
				 * The block which we have just freed is
				 * pointed to by an indirect block: journal it
				 */
				BUFFER_TRACE(parent_bh, "get_write_access");
				if (!ext4_journal_get_write_access(handle,
								   parent_bh)) {
					*p = 0;
					BUFFER_TRACE(parent_bh,
						"call ext4_handle_dirty_metadata");
					ext4_handle_dirty_metadata(handle,
								   inode,
								   parent_bh);
				}
			}
		}
	} else {
		/* We have reached the bottom of the tree. */
		BUFFER_TRACE(parent_bh, "free data blocks");
		ext4_free_data(handle, inode, parent_bh, first, last);
	}
}

int ext4_can_truncate(struct inode *inode)
{
	if (S_ISREG(inode->i_mode))
		return 1;
	if (S_ISDIR(inode->i_mode))
		return 1;
	if (S_ISLNK(inode->i_mode))
		return !ext4_inode_is_fast_symlink(inode);
	return 0;
}

/*
 * ext4_punch_hole: punches a hole in a file by releasing the blocks
 * associated with the given offset and length
 *
 * @inode: File inode
 * @offset: The offset where the hole will begin
 * @len: The length of the hole
 *
 * Returns: 0 on success or negative on failure
 */

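/*
 * Note (assumption, not stated in the code here): this is reached from
 * the fallocate(2) path when the caller passes FALLOC_FL_PUNCH_HOLE.
 */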
int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
	struct inode *inode = file->f_path.dentry->d_inode;

	if (!S_ISREG(inode->i_mode))
		return -ENOTSUPP;

	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		/* TODO: Add support for non extent hole punching */
		return -ENOTSUPP;
	}

	return ext4_ext_punch_hole(file, offset, length);
}

/*
 * ext4_truncate()
 *
 * We block out ext4_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk. We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable. It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext4_truncate() to have another go. So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext4 filesystem. But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext4_truncate() run will find them and release them.
 */
void ext4_truncate(struct inode *inode)
{
	handle_t *handle;
	struct ext4_inode_info *ei = EXT4_I(inode);
	__le32 *i_data = ei->i_data;
	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	struct address_space *mapping = inode->i_mapping;
	ext4_lblk_t offsets[4];
	Indirect chain[4];
	Indirect *partial;
	__le32 nr = 0;
	int n = 0;
	ext4_lblk_t last_block, max_block;
	unsigned blocksize = inode->i_sb->s_blocksize;

	trace_ext4_truncate_enter(inode);

	if (!ext4_can_truncate(inode))
		return;

	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);

	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		ext4_ext_truncate(inode);
		trace_ext4_truncate_exit(inode);
		return;
	}

	handle = start_transaction(inode);
	if (IS_ERR(handle))
		return;		/* AKPM: return what? */

	last_block = (inode->i_size + blocksize-1)
					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
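	/*
	 * Illustrative: with 4K blocks and i_size = 10000, last_block =
	 * (10000 + 4095) >> 12 = 3, i.e. blocks 0..2 stay and freeing
	 * starts at block 3; max_block is the same rounding applied to
	 * the largest file size the indirect format can map.
	 */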

	if (inode->i_size & (blocksize - 1))
		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
			goto out_stop;

	if (last_block != max_block) {
		n = ext4_block_to_path(inode, last_block, offsets, NULL);
		if (n == 0)
			goto out_stop;	/* error */
	}

	/*
	 * OK. This truncate is going to happen. We add the inode to the
	 * orphan list, so that if this truncate spans multiple transactions,
	 * and we crash, we will resume the truncate when the filesystem
	 * recovers. It also marks the inode dirty, to catch the new size.
	 *
	 * Implication: the file must always be in a sane, consistent
	 * truncatable state while each transaction commits.
	 */
	if (ext4_orphan_add(handle, inode))
		goto out_stop;

	/*
	 * From here we block out all ext4_get_block() callers who want to
	 * modify the block allocation tree.
	 */
	down_write(&ei->i_data_sem);

	ext4_discard_preallocations(inode);

	/*
	 * The orphan list entry will now protect us from any crash which
	 * occurs before the truncate completes, so it is now safe to propagate
	 * the new, shorter inode size (held for now in i_size) into the
	 * on-disk inode. We do this via i_disksize, which is the value which
	 * ext4 *really* writes onto the disk inode.
	 */
	ei->i_disksize = inode->i_size;

	if (last_block == max_block) {
		/*
		 * It is unnecessary to free any data blocks if last_block is
		 * equal to the indirect block limit.
		 */
		goto out_unlock;
	} else if (n == 1) {		/* direct blocks */
		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
			       i_data + EXT4_NDIR_BLOCKS);
		goto do_indirects;
	}

	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
	/* Kill the top of shared branch (not detached) */
	if (nr) {
		if (partial == chain) {
			/* Shared branch grows from the inode */
			ext4_free_branches(handle, inode, NULL,
4574 | &nr, &nr+1, (chain+n-1) - partial); | 4574 | &nr, &nr+1, (chain+n-1) - partial); |
4575 | *partial->p = 0; | 4575 | *partial->p = 0; |
4576 | /* | 4576 | /* |
4577 | * We mark the inode dirty prior to restart, | 4577 | * We mark the inode dirty prior to restart, |
4578 | * and prior to stop. No need for it here. | 4578 | * and prior to stop. No need for it here. |
4579 | */ | 4579 | */ |
4580 | } else { | 4580 | } else { |
4581 | /* Shared branch grows from an indirect block */ | 4581 | /* Shared branch grows from an indirect block */ |
4582 | BUFFER_TRACE(partial->bh, "get_write_access"); | 4582 | BUFFER_TRACE(partial->bh, "get_write_access"); |
4583 | ext4_free_branches(handle, inode, partial->bh, | 4583 | ext4_free_branches(handle, inode, partial->bh, |
4584 | partial->p, | 4584 | partial->p, |
4585 | partial->p+1, (chain+n-1) - partial); | 4585 | partial->p+1, (chain+n-1) - partial); |
4586 | } | 4586 | } |
4587 | } | 4587 | } |
4588 | /* Clear the ends of indirect blocks on the shared branch */ | 4588 | /* Clear the ends of indirect blocks on the shared branch */ |
4589 | while (partial > chain) { | 4589 | while (partial > chain) { |
4590 | ext4_free_branches(handle, inode, partial->bh, partial->p + 1, | 4590 | ext4_free_branches(handle, inode, partial->bh, partial->p + 1, |
4591 | (__le32*)partial->bh->b_data+addr_per_block, | 4591 | (__le32*)partial->bh->b_data+addr_per_block, |
4592 | (chain+n-1) - partial); | 4592 | (chain+n-1) - partial); |
4593 | BUFFER_TRACE(partial->bh, "call brelse"); | 4593 | BUFFER_TRACE(partial->bh, "call brelse"); |
4594 | brelse(partial->bh); | 4594 | brelse(partial->bh); |
4595 | partial--; | 4595 | partial--; |
4596 | } | 4596 | } |
4597 | do_indirects: | 4597 | do_indirects: |
4598 | /* Kill the remaining (whole) subtrees */ | 4598 | /* Kill the remaining (whole) subtrees */ |
4599 | switch (offsets[0]) { | 4599 | switch (offsets[0]) { |
4600 | default: | 4600 | default: |
4601 | nr = i_data[EXT4_IND_BLOCK]; | 4601 | nr = i_data[EXT4_IND_BLOCK]; |
4602 | if (nr) { | 4602 | if (nr) { |
4603 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); | 4603 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); |
4604 | i_data[EXT4_IND_BLOCK] = 0; | 4604 | i_data[EXT4_IND_BLOCK] = 0; |
4605 | } | 4605 | } |
4606 | case EXT4_IND_BLOCK: | 4606 | case EXT4_IND_BLOCK: |
4607 | nr = i_data[EXT4_DIND_BLOCK]; | 4607 | nr = i_data[EXT4_DIND_BLOCK]; |
4608 | if (nr) { | 4608 | if (nr) { |
4609 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); | 4609 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); |
4610 | i_data[EXT4_DIND_BLOCK] = 0; | 4610 | i_data[EXT4_DIND_BLOCK] = 0; |
4611 | } | 4611 | } |
4612 | case EXT4_DIND_BLOCK: | 4612 | case EXT4_DIND_BLOCK: |
4613 | nr = i_data[EXT4_TIND_BLOCK]; | 4613 | nr = i_data[EXT4_TIND_BLOCK]; |
4614 | if (nr) { | 4614 | if (nr) { |
4615 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); | 4615 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); |
4616 | i_data[EXT4_TIND_BLOCK] = 0; | 4616 | i_data[EXT4_TIND_BLOCK] = 0; |
4617 | } | 4617 | } |
4618 | case EXT4_TIND_BLOCK: | 4618 | case EXT4_TIND_BLOCK: |
4619 | ; | 4619 | ; |
4620 | } | 4620 | } |
4621 | 4621 | ||
4622 | out_unlock: | 4622 | out_unlock: |
4623 | up_write(&ei->i_data_sem); | 4623 | up_write(&ei->i_data_sem); |
4624 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 4624 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
4625 | ext4_mark_inode_dirty(handle, inode); | 4625 | ext4_mark_inode_dirty(handle, inode); |
4626 | 4626 | ||
4627 | /* | 4627 | /* |
4628 | * In a multi-transaction truncate, we only make the final transaction | 4628 | * In a multi-transaction truncate, we only make the final transaction |
4629 | * synchronous | 4629 | * synchronous |
4630 | */ | 4630 | */ |
4631 | if (IS_SYNC(inode)) | 4631 | if (IS_SYNC(inode)) |
4632 | ext4_handle_sync(handle); | 4632 | ext4_handle_sync(handle); |
4633 | out_stop: | 4633 | out_stop: |
4634 | /* | 4634 | /* |
4635 | * If this was a simple ftruncate(), and the file will remain alive | 4635 | * If this was a simple ftruncate(), and the file will remain alive |
4636 | * then we need to clear up the orphan record which we created above. | 4636 | * then we need to clear up the orphan record which we created above. |
4637 | * However, if this was a real unlink then we were called by | 4637 | * However, if this was a real unlink then we were called by |
4638 | * ext4_delete_inode(), and we allow that function to clean up the | 4638 | * ext4_delete_inode(), and we allow that function to clean up the |
4639 | * orphan info for us. | 4639 | * orphan info for us. |
4640 | */ | 4640 | */ |
4641 | if (inode->i_nlink) | 4641 | if (inode->i_nlink) |
4642 | ext4_orphan_del(handle, inode); | 4642 | ext4_orphan_del(handle, inode); |
4643 | 4643 | ||
4644 | ext4_journal_stop(handle); | 4644 | ext4_journal_stop(handle); |
4645 | trace_ext4_truncate_exit(inode); | 4645 | trace_ext4_truncate_exit(inode); |
4646 | } | 4646 | } |
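The truncate path above works in whole filesystem blocks: i_size and s_bitmap_maxbytes are rounded up before deciding how deep into the indirect tree to descend, and a size that is not block-aligned first has the tail of its final block zeroed. A minimal userspace sketch of that arithmetic; the geometry and sizes here are assumed for illustration, not values taken from this commit:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t blocksize = 4096;             /* assumed 4 KiB blocks */
		unsigned blkbits = 12;                 /* log2(blocksize) */
		uint64_t i_size = 10000;               /* sample file size in bytes */
		uint64_t bitmap_maxbytes = 1ULL << 32; /* assumed indirect-map limit */

		/* round byte counts up to whole blocks, as the truncate path does */
		uint64_t last_block = (i_size + blocksize - 1) >> blkbits;
		uint64_t max_block = (bitmap_maxbytes + blocksize - 1) >> blkbits;

		printf("last_block=%llu max_block=%llu\n",
		       (unsigned long long)last_block,
		       (unsigned long long)max_block);

		/* a non-block-aligned size needs its final partial block zeroed */
		if (i_size & (blocksize - 1))
			printf("zero tail of block %llu\n",
			       (unsigned long long)(i_size >> blkbits));
		return 0;
	}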
4647 | 4647 | ||
4648 | /* | 4648 | /* |
4649 | * ext4_get_inode_loc returns with an extra refcount against the inode's | 4649 | * ext4_get_inode_loc returns with an extra refcount against the inode's |
4650 | * underlying buffer_head on success. If 'in_mem' is true, we have all | 4650 | * underlying buffer_head on success. If 'in_mem' is true, we have all |
4651 | * data in memory that is needed to recreate the on-disk version of this | 4651 | * data in memory that is needed to recreate the on-disk version of this |
4652 | * inode. | 4652 | * inode. |
4653 | */ | 4653 | */ |
4654 | static int __ext4_get_inode_loc(struct inode *inode, | 4654 | static int __ext4_get_inode_loc(struct inode *inode, |
4655 | struct ext4_iloc *iloc, int in_mem) | 4655 | struct ext4_iloc *iloc, int in_mem) |
4656 | { | 4656 | { |
4657 | struct ext4_group_desc *gdp; | 4657 | struct ext4_group_desc *gdp; |
4658 | struct buffer_head *bh; | 4658 | struct buffer_head *bh; |
4659 | struct super_block *sb = inode->i_sb; | 4659 | struct super_block *sb = inode->i_sb; |
4660 | ext4_fsblk_t block; | 4660 | ext4_fsblk_t block; |
4661 | int inodes_per_block, inode_offset; | 4661 | int inodes_per_block, inode_offset; |
4662 | 4662 | ||
4663 | iloc->bh = NULL; | 4663 | iloc->bh = NULL; |
4664 | if (!ext4_valid_inum(sb, inode->i_ino)) | 4664 | if (!ext4_valid_inum(sb, inode->i_ino)) |
4665 | return -EIO; | 4665 | return -EIO; |
4666 | 4666 | ||
4667 | iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); | 4667 | iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); |
4668 | gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); | 4668 | gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); |
4669 | if (!gdp) | 4669 | if (!gdp) |
4670 | return -EIO; | 4670 | return -EIO; |
4671 | 4671 | ||
4672 | /* | 4672 | /* |
4673 | * Figure out the offset within the block group inode table | 4673 | * Figure out the offset within the block group inode table |
4674 | */ | 4674 | */ |
4675 | inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; | 4675 | inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; |
4676 | inode_offset = ((inode->i_ino - 1) % | 4676 | inode_offset = ((inode->i_ino - 1) % |
4677 | EXT4_INODES_PER_GROUP(sb)); | 4677 | EXT4_INODES_PER_GROUP(sb)); |
4678 | block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); | 4678 | block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); |
4679 | iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); | 4679 | iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); |
4680 | 4680 | ||
4681 | bh = sb_getblk(sb, block); | 4681 | bh = sb_getblk(sb, block); |
4682 | if (!bh) { | 4682 | if (!bh) { |
4683 | EXT4_ERROR_INODE_BLOCK(inode, block, | 4683 | EXT4_ERROR_INODE_BLOCK(inode, block, |
4684 | "unable to read itable block"); | 4684 | "unable to read itable block"); |
4685 | return -EIO; | 4685 | return -EIO; |
4686 | } | 4686 | } |
4687 | if (!buffer_uptodate(bh)) { | 4687 | if (!buffer_uptodate(bh)) { |
4688 | lock_buffer(bh); | 4688 | lock_buffer(bh); |
4689 | 4689 | ||
4690 | /* | 4690 | /* |
4691 | * If the buffer has the write error flag, we have failed | 4691 | * If the buffer has the write error flag, we have failed |
4692 | * to write out another inode in the same block. In this | 4692 | * to write out another inode in the same block. In this |
4693 | * case, we don't have to read the block because we may | 4693 | * case, we don't have to read the block because we may |
4694 | * read the old inode data successfully. | 4694 | * read the old inode data successfully. |
4695 | */ | 4695 | */ |
4696 | if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) | 4696 | if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) |
4697 | set_buffer_uptodate(bh); | 4697 | set_buffer_uptodate(bh); |
4698 | 4698 | ||
4699 | if (buffer_uptodate(bh)) { | 4699 | if (buffer_uptodate(bh)) { |
4700 | /* someone brought it uptodate while we waited */ | 4700 | /* someone brought it uptodate while we waited */ |
4701 | unlock_buffer(bh); | 4701 | unlock_buffer(bh); |
4702 | goto has_buffer; | 4702 | goto has_buffer; |
4703 | } | 4703 | } |
4704 | 4704 | ||
4705 | /* | 4705 | /* |
4706 | * If we have all information of the inode in memory and this | 4706 | * If we have all information of the inode in memory and this |
4707 | * is the only valid inode in the block, we need not read the | 4707 | * is the only valid inode in the block, we need not read the |
4708 | * block. | 4708 | * block. |
4709 | */ | 4709 | */ |
4710 | if (in_mem) { | 4710 | if (in_mem) { |
4711 | struct buffer_head *bitmap_bh; | 4711 | struct buffer_head *bitmap_bh; |
4712 | int i, start; | 4712 | int i, start; |
4713 | 4713 | ||
4714 | start = inode_offset & ~(inodes_per_block - 1); | 4714 | start = inode_offset & ~(inodes_per_block - 1); |
4715 | 4715 | ||
4716 | /* Is the inode bitmap in cache? */ | 4716 | /* Is the inode bitmap in cache? */ |
4717 | bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); | 4717 | bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); |
4718 | if (!bitmap_bh) | 4718 | if (!bitmap_bh) |
4719 | goto make_io; | 4719 | goto make_io; |
4720 | 4720 | ||
4721 | /* | 4721 | /* |
4722 | * If the inode bitmap isn't in cache then the | 4722 | * If the inode bitmap isn't in cache then the |
4723 | * optimisation may end up performing two reads instead | 4723 | * optimisation may end up performing two reads instead |
4724 | * of one, so skip it. | 4724 | * of one, so skip it. |
4725 | */ | 4725 | */ |
4726 | if (!buffer_uptodate(bitmap_bh)) { | 4726 | if (!buffer_uptodate(bitmap_bh)) { |
4727 | brelse(bitmap_bh); | 4727 | brelse(bitmap_bh); |
4728 | goto make_io; | 4728 | goto make_io; |
4729 | } | 4729 | } |
4730 | for (i = start; i < start + inodes_per_block; i++) { | 4730 | for (i = start; i < start + inodes_per_block; i++) { |
4731 | if (i == inode_offset) | 4731 | if (i == inode_offset) |
4732 | continue; | 4732 | continue; |
4733 | if (ext4_test_bit(i, bitmap_bh->b_data)) | 4733 | if (ext4_test_bit(i, bitmap_bh->b_data)) |
4734 | break; | 4734 | break; |
4735 | } | 4735 | } |
4736 | brelse(bitmap_bh); | 4736 | brelse(bitmap_bh); |
4737 | if (i == start + inodes_per_block) { | 4737 | if (i == start + inodes_per_block) { |
4738 | /* all other inodes are free, so skip I/O */ | 4738 | /* all other inodes are free, so skip I/O */ |
4739 | memset(bh->b_data, 0, bh->b_size); | 4739 | memset(bh->b_data, 0, bh->b_size); |
4740 | set_buffer_uptodate(bh); | 4740 | set_buffer_uptodate(bh); |
4741 | unlock_buffer(bh); | 4741 | unlock_buffer(bh); |
4742 | goto has_buffer; | 4742 | goto has_buffer; |
4743 | } | 4743 | } |
4744 | } | 4744 | } |
4745 | 4745 | ||
4746 | make_io: | 4746 | make_io: |
4747 | /* | 4747 | /* |
4748 | * If we need to do any I/O, try to pre-readahead extra | 4748 | * If we need to do any I/O, try to pre-readahead extra |
4749 | * blocks from the inode table. | 4749 | * blocks from the inode table. |
4750 | */ | 4750 | */ |
4751 | if (EXT4_SB(sb)->s_inode_readahead_blks) { | 4751 | if (EXT4_SB(sb)->s_inode_readahead_blks) { |
4752 | ext4_fsblk_t b, end, table; | 4752 | ext4_fsblk_t b, end, table; |
4753 | unsigned num; | 4753 | unsigned num; |
4754 | 4754 | ||
4755 | table = ext4_inode_table(sb, gdp); | 4755 | table = ext4_inode_table(sb, gdp); |
4756 | /* s_inode_readahead_blks is always a power of 2 */ | 4756 | /* s_inode_readahead_blks is always a power of 2 */ |
4757 | b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); | 4757 | b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); |
4758 | if (table > b) | 4758 | if (table > b) |
4759 | b = table; | 4759 | b = table; |
4760 | end = b + EXT4_SB(sb)->s_inode_readahead_blks; | 4760 | end = b + EXT4_SB(sb)->s_inode_readahead_blks; |
4761 | num = EXT4_INODES_PER_GROUP(sb); | 4761 | num = EXT4_INODES_PER_GROUP(sb); |
4762 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 4762 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
4763 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) | 4763 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) |
4764 | num -= ext4_itable_unused_count(sb, gdp); | 4764 | num -= ext4_itable_unused_count(sb, gdp); |
4765 | table += num / inodes_per_block; | 4765 | table += num / inodes_per_block; |
4766 | if (end > table) | 4766 | if (end > table) |
4767 | end = table; | 4767 | end = table; |
4768 | while (b <= end) | 4768 | while (b <= end) |
4769 | sb_breadahead(sb, b++); | 4769 | sb_breadahead(sb, b++); |
4770 | } | 4770 | } |
4771 | 4771 | ||
4772 | /* | 4772 | /* |
4773 | * There are other valid inodes in the buffer, this inode | 4773 | * There are other valid inodes in the buffer, this inode |
4774 | * has in-inode xattrs, or we don't have this inode in memory. | 4774 | * has in-inode xattrs, or we don't have this inode in memory. |
4775 | * Read the block from disk. | 4775 | * Read the block from disk. |
4776 | */ | 4776 | */ |
4777 | trace_ext4_load_inode(inode); | 4777 | trace_ext4_load_inode(inode); |
4778 | get_bh(bh); | 4778 | get_bh(bh); |
4779 | bh->b_end_io = end_buffer_read_sync; | 4779 | bh->b_end_io = end_buffer_read_sync; |
4780 | submit_bh(READ_META, bh); | 4780 | submit_bh(READ_META, bh); |
4781 | wait_on_buffer(bh); | 4781 | wait_on_buffer(bh); |
4782 | if (!buffer_uptodate(bh)) { | 4782 | if (!buffer_uptodate(bh)) { |
4783 | EXT4_ERROR_INODE_BLOCK(inode, block, | 4783 | EXT4_ERROR_INODE_BLOCK(inode, block, |
4784 | "unable to read itable block"); | 4784 | "unable to read itable block"); |
4785 | brelse(bh); | 4785 | brelse(bh); |
4786 | return -EIO; | 4786 | return -EIO; |
4787 | } | 4787 | } |
4788 | } | 4788 | } |
4789 | has_buffer: | 4789 | has_buffer: |
4790 | iloc->bh = bh; | 4790 | iloc->bh = bh; |
4791 | return 0; | 4791 | return 0; |
4792 | } | 4792 | } |
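__ext4_get_inode_loc() above locates an inode on disk with nothing but div/mod arithmetic: inode numbers are 1-based, split into block groups, and packed several per inode-table block. A self-contained sketch of the same computation; every geometry constant here is invented for illustration, since the real values come from the superblock and the group descriptor:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t inodes_per_group = 8192; /* assumed */
		uint32_t inode_size = 256;        /* assumed */
		uint32_t block_size = 4096;       /* assumed */
		uint64_t inode_table = 1057;      /* assumed itable start for the group */
		uint64_t ino = 12345;             /* sample inode number (1-based) */

		uint32_t inodes_per_block = block_size / inode_size;
		uint64_t group = (ino - 1) / inodes_per_group;
		uint32_t index = (ino - 1) % inodes_per_group;
		uint64_t block = inode_table + index / inodes_per_block;
		uint32_t offset = (index % inodes_per_block) * inode_size;

		printf("ino %llu: group %llu, itable block %llu, byte offset %u\n",
		       (unsigned long long)ino, (unsigned long long)group,
		       (unsigned long long)block, offset);
		return 0;
	}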
4793 | 4793 | ||
4794 | int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) | 4794 | int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) |
4795 | { | 4795 | { |
4796 | /* We have all inode data except xattrs in memory here. */ | 4796 | /* We have all inode data except xattrs in memory here. */ |
4797 | return __ext4_get_inode_loc(inode, iloc, | 4797 | return __ext4_get_inode_loc(inode, iloc, |
4798 | !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); | 4798 | !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); |
4799 | } | 4799 | } |
4800 | 4800 | ||
4801 | void ext4_set_inode_flags(struct inode *inode) | 4801 | void ext4_set_inode_flags(struct inode *inode) |
4802 | { | 4802 | { |
4803 | unsigned int flags = EXT4_I(inode)->i_flags; | 4803 | unsigned int flags = EXT4_I(inode)->i_flags; |
4804 | 4804 | ||
4805 | inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); | 4805 | inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); |
4806 | if (flags & EXT4_SYNC_FL) | 4806 | if (flags & EXT4_SYNC_FL) |
4807 | inode->i_flags |= S_SYNC; | 4807 | inode->i_flags |= S_SYNC; |
4808 | if (flags & EXT4_APPEND_FL) | 4808 | if (flags & EXT4_APPEND_FL) |
4809 | inode->i_flags |= S_APPEND; | 4809 | inode->i_flags |= S_APPEND; |
4810 | if (flags & EXT4_IMMUTABLE_FL) | 4810 | if (flags & EXT4_IMMUTABLE_FL) |
4811 | inode->i_flags |= S_IMMUTABLE; | 4811 | inode->i_flags |= S_IMMUTABLE; |
4812 | if (flags & EXT4_NOATIME_FL) | 4812 | if (flags & EXT4_NOATIME_FL) |
4813 | inode->i_flags |= S_NOATIME; | 4813 | inode->i_flags |= S_NOATIME; |
4814 | if (flags & EXT4_DIRSYNC_FL) | 4814 | if (flags & EXT4_DIRSYNC_FL) |
4815 | inode->i_flags |= S_DIRSYNC; | 4815 | inode->i_flags |= S_DIRSYNC; |
4816 | } | 4816 | } |
4817 | 4817 | ||
4818 | /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ | 4818 | /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ |
4819 | void ext4_get_inode_flags(struct ext4_inode_info *ei) | 4819 | void ext4_get_inode_flags(struct ext4_inode_info *ei) |
4820 | { | 4820 | { |
4821 | unsigned int vfs_fl; | 4821 | unsigned int vfs_fl; |
4822 | unsigned long old_fl, new_fl; | 4822 | unsigned long old_fl, new_fl; |
4823 | 4823 | ||
4824 | do { | 4824 | do { |
4825 | vfs_fl = ei->vfs_inode.i_flags; | 4825 | vfs_fl = ei->vfs_inode.i_flags; |
4826 | old_fl = ei->i_flags; | 4826 | old_fl = ei->i_flags; |
4827 | new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| | 4827 | new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| |
4828 | EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| | 4828 | EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| |
4829 | EXT4_DIRSYNC_FL); | 4829 | EXT4_DIRSYNC_FL); |
4830 | if (vfs_fl & S_SYNC) | 4830 | if (vfs_fl & S_SYNC) |
4831 | new_fl |= EXT4_SYNC_FL; | 4831 | new_fl |= EXT4_SYNC_FL; |
4832 | if (vfs_fl & S_APPEND) | 4832 | if (vfs_fl & S_APPEND) |
4833 | new_fl |= EXT4_APPEND_FL; | 4833 | new_fl |= EXT4_APPEND_FL; |
4834 | if (vfs_fl & S_IMMUTABLE) | 4834 | if (vfs_fl & S_IMMUTABLE) |
4835 | new_fl |= EXT4_IMMUTABLE_FL; | 4835 | new_fl |= EXT4_IMMUTABLE_FL; |
4836 | if (vfs_fl & S_NOATIME) | 4836 | if (vfs_fl & S_NOATIME) |
4837 | new_fl |= EXT4_NOATIME_FL; | 4837 | new_fl |= EXT4_NOATIME_FL; |
4838 | if (vfs_fl & S_DIRSYNC) | 4838 | if (vfs_fl & S_DIRSYNC) |
4839 | new_fl |= EXT4_DIRSYNC_FL; | 4839 | new_fl |= EXT4_DIRSYNC_FL; |
4840 | } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); | 4840 | } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); |
4841 | } | 4841 | } |
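The do/while-cmpxchg loop in ext4_get_inode_flags() above is the standard lock-free read-modify-write pattern: snapshot the word, compute the new value from the snapshot, and retry if another CPU changed the word in the meantime. A userspace analogue using C11 atomics with made-up flag bits (not ext4's):

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic unsigned long flags;

	#define DEMO_SYNC_FL   0x1UL /* invented flag bits, not ext4's */
	#define DEMO_APPEND_FL 0x2UL

	static void set_demo_flags(unsigned long wanted)
	{
		unsigned long old_fl, new_fl;

		do {
			old_fl = atomic_load(&flags);
			/* recompute the new value from the snapshot on every retry */
			new_fl = (old_fl & ~(DEMO_SYNC_FL | DEMO_APPEND_FL)) | wanted;
			/* retry if another thread changed flags since the snapshot */
		} while (!atomic_compare_exchange_weak(&flags, &old_fl, new_fl));
	}

	int main(void)
	{
		set_demo_flags(DEMO_SYNC_FL);
		printf("flags=%#lx\n", atomic_load(&flags));
		return 0;
	}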
4842 | 4842 | ||
4843 | static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, | 4843 | static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, |
4844 | struct ext4_inode_info *ei) | 4844 | struct ext4_inode_info *ei) |
4845 | { | 4845 | { |
4846 | blkcnt_t i_blocks; | 4846 | blkcnt_t i_blocks; |
4847 | struct inode *inode = &(ei->vfs_inode); | 4847 | struct inode *inode = &(ei->vfs_inode); |
4848 | struct super_block *sb = inode->i_sb; | 4848 | struct super_block *sb = inode->i_sb; |
4849 | 4849 | ||
4850 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 4850 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
4851 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { | 4851 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { |
4852 | /* we are using combined 48 bit field */ | 4852 | /* we are using combined 48 bit field */ |
4853 | i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | | 4853 | i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | |
4854 | le32_to_cpu(raw_inode->i_blocks_lo); | 4854 | le32_to_cpu(raw_inode->i_blocks_lo); |
4855 | if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { | 4855 | if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { |
4856 | /* i_blocks is in units of file system blocks */ | 4856 | /* i_blocks is in units of file system blocks */ |
4857 | return i_blocks << (inode->i_blkbits - 9); | 4857 | return i_blocks << (inode->i_blkbits - 9); |
4858 | } else { | 4858 | } else { |
4859 | return i_blocks; | 4859 | return i_blocks; |
4860 | } | 4860 | } |
4861 | } else { | 4861 | } else { |
4862 | return le32_to_cpu(raw_inode->i_blocks_lo); | 4862 | return le32_to_cpu(raw_inode->i_blocks_lo); |
4863 | } | 4863 | } |
4864 | } | 4864 | } |
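ext4_inode_blocks() above reassembles a 48-bit block count from a 32-bit low and a 16-bit high on-disk field, and when the HUGE_FILE inode flag is set it rescales the result from filesystem blocks to the 512-byte sectors that i_blocks reports. A small sketch with arbitrary sample values:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint16_t hi = 0x0001;     /* sample raw_inode->i_blocks_high */
		uint32_t lo = 0x00000010; /* sample raw_inode->i_blocks_lo */
		unsigned blkbits = 12;    /* assumed 4 KiB filesystem blocks */

		/* combine the on-disk halves into one 48-bit count */
		uint64_t i_blocks = ((uint64_t)hi << 32) | lo;

		/* with HUGE_FILE set the count is in fs blocks, not sectors */
		uint64_t sectors = i_blocks << (blkbits - 9);

		printf("%llu fs blocks = %llu sectors\n",
		       (unsigned long long)i_blocks,
		       (unsigned long long)sectors);
		return 0;
	}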
4865 | 4865 | ||
4866 | struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | 4866 | struct inode *ext4_iget(struct super_block *sb, unsigned long ino) |
4867 | { | 4867 | { |
4868 | struct ext4_iloc iloc; | 4868 | struct ext4_iloc iloc; |
4869 | struct ext4_inode *raw_inode; | 4869 | struct ext4_inode *raw_inode; |
4870 | struct ext4_inode_info *ei; | 4870 | struct ext4_inode_info *ei; |
4871 | struct inode *inode; | 4871 | struct inode *inode; |
4872 | journal_t *journal = EXT4_SB(sb)->s_journal; | 4872 | journal_t *journal = EXT4_SB(sb)->s_journal; |
4873 | long ret; | 4873 | long ret; |
4874 | int block; | 4874 | int block; |
4875 | 4875 | ||
4876 | inode = iget_locked(sb, ino); | 4876 | inode = iget_locked(sb, ino); |
4877 | if (!inode) | 4877 | if (!inode) |
4878 | return ERR_PTR(-ENOMEM); | 4878 | return ERR_PTR(-ENOMEM); |
4879 | if (!(inode->i_state & I_NEW)) | 4879 | if (!(inode->i_state & I_NEW)) |
4880 | return inode; | 4880 | return inode; |
4881 | 4881 | ||
4882 | ei = EXT4_I(inode); | 4882 | ei = EXT4_I(inode); |
4883 | iloc.bh = NULL; | 4883 | iloc.bh = NULL; |
4884 | 4884 | ||
4885 | ret = __ext4_get_inode_loc(inode, &iloc, 0); | 4885 | ret = __ext4_get_inode_loc(inode, &iloc, 0); |
4886 | if (ret < 0) | 4886 | if (ret < 0) |
4887 | goto bad_inode; | 4887 | goto bad_inode; |
4888 | raw_inode = ext4_raw_inode(&iloc); | 4888 | raw_inode = ext4_raw_inode(&iloc); |
4889 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); | 4889 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); |
4890 | inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); | 4890 | inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); |
4891 | inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); | 4891 | inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); |
4892 | if (!(test_opt(inode->i_sb, NO_UID32))) { | 4892 | if (!(test_opt(inode->i_sb, NO_UID32))) { |
4893 | inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; | 4893 | inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; |
4894 | inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; | 4894 | inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; |
4895 | } | 4895 | } |
4896 | inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); | 4896 | inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); |
4897 | 4897 | ||
4898 | ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ | 4898 | ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ |
4899 | ei->i_dir_start_lookup = 0; | 4899 | ei->i_dir_start_lookup = 0; |
4900 | ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); | 4900 | ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); |
4901 | /* We now have enough fields to check if the inode was active or not. | 4901 | /* We now have enough fields to check if the inode was active or not. |
4902 | * This is needed because nfsd might try to access dead inodes | 4902 | * This is needed because nfsd might try to access dead inodes |
4903 | * the test is the same one that e2fsck uses | 4903 | * the test is the same one that e2fsck uses |
4904 | * NeilBrown 1999oct15 | 4904 | * NeilBrown 1999oct15 |
4905 | */ | 4905 | */ |
4906 | if (inode->i_nlink == 0) { | 4906 | if (inode->i_nlink == 0) { |
4907 | if (inode->i_mode == 0 || | 4907 | if (inode->i_mode == 0 || |
4908 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { | 4908 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { |
4909 | /* this inode is deleted */ | 4909 | /* this inode is deleted */ |
4910 | ret = -ESTALE; | 4910 | ret = -ESTALE; |
4911 | goto bad_inode; | 4911 | goto bad_inode; |
4912 | } | 4912 | } |
4913 | /* The only unlinked inodes we let through here have | 4913 | /* The only unlinked inodes we let through here have |
4914 | * valid i_mode and are being read by the orphan | 4914 | * valid i_mode and are being read by the orphan |
4915 | * recovery code: that's fine, we're about to complete | 4915 | * recovery code: that's fine, we're about to complete |
4916 | * the process of deleting those. */ | 4916 | * the process of deleting those. */ |
4917 | } | 4917 | } |
4918 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); | 4918 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); |
4919 | inode->i_blocks = ext4_inode_blocks(raw_inode, ei); | 4919 | inode->i_blocks = ext4_inode_blocks(raw_inode, ei); |
4920 | ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); | 4920 | ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); |
4921 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) | 4921 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) |
4922 | ei->i_file_acl |= | 4922 | ei->i_file_acl |= |
4923 | ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; | 4923 | ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; |
4924 | inode->i_size = ext4_isize(raw_inode); | 4924 | inode->i_size = ext4_isize(raw_inode); |
4925 | ei->i_disksize = inode->i_size; | 4925 | ei->i_disksize = inode->i_size; |
4926 | #ifdef CONFIG_QUOTA | 4926 | #ifdef CONFIG_QUOTA |
4927 | ei->i_reserved_quota = 0; | 4927 | ei->i_reserved_quota = 0; |
4928 | #endif | 4928 | #endif |
4929 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); | 4929 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); |
4930 | ei->i_block_group = iloc.block_group; | 4930 | ei->i_block_group = iloc.block_group; |
4931 | ei->i_last_alloc_group = ~0; | 4931 | ei->i_last_alloc_group = ~0; |
4932 | /* | 4932 | /* |
4933 | * NOTE! The in-memory inode i_data array is in little-endian order | 4933 | * NOTE! The in-memory inode i_data array is in little-endian order |
4934 | * even on big-endian machines: we do NOT byteswap the block numbers! | 4934 | * even on big-endian machines: we do NOT byteswap the block numbers! |
4935 | */ | 4935 | */ |
4936 | for (block = 0; block < EXT4_N_BLOCKS; block++) | 4936 | for (block = 0; block < EXT4_N_BLOCKS; block++) |
4937 | ei->i_data[block] = raw_inode->i_block[block]; | 4937 | ei->i_data[block] = raw_inode->i_block[block]; |
4938 | INIT_LIST_HEAD(&ei->i_orphan); | 4938 | INIT_LIST_HEAD(&ei->i_orphan); |
4939 | 4939 | ||
4940 | /* | 4940 | /* |
4941 | * Set transaction ids of transactions that have to be committed | 4941 | * Set transaction ids of transactions that have to be committed |
4942 | * to finish f[data]sync. We set them to currently running transaction | 4942 | * to finish f[data]sync. We set them to currently running transaction |
4943 | * as we cannot be sure that the inode or some of its metadata isn't | 4943 | * as we cannot be sure that the inode or some of its metadata isn't |
4944 | * part of the transaction - the inode could have been reclaimed and | 4944 | * part of the transaction - the inode could have been reclaimed and |
4945 | * now it is reread from disk. | 4945 | * now it is reread from disk. |
4946 | */ | 4946 | */ |
4947 | if (journal) { | 4947 | if (journal) { |
4948 | transaction_t *transaction; | 4948 | transaction_t *transaction; |
4949 | tid_t tid; | 4949 | tid_t tid; |
4950 | 4950 | ||
4951 | read_lock(&journal->j_state_lock); | 4951 | read_lock(&journal->j_state_lock); |
4952 | if (journal->j_running_transaction) | 4952 | if (journal->j_running_transaction) |
4953 | transaction = journal->j_running_transaction; | 4953 | transaction = journal->j_running_transaction; |
4954 | else | 4954 | else |
4955 | transaction = journal->j_committing_transaction; | 4955 | transaction = journal->j_committing_transaction; |
4956 | if (transaction) | 4956 | if (transaction) |
4957 | tid = transaction->t_tid; | 4957 | tid = transaction->t_tid; |
4958 | else | 4958 | else |
4959 | tid = journal->j_commit_sequence; | 4959 | tid = journal->j_commit_sequence; |
4960 | read_unlock(&journal->j_state_lock); | 4960 | read_unlock(&journal->j_state_lock); |
4961 | ei->i_sync_tid = tid; | 4961 | ei->i_sync_tid = tid; |
4962 | ei->i_datasync_tid = tid; | 4962 | ei->i_datasync_tid = tid; |
4963 | } | 4963 | } |
4964 | 4964 | ||
4965 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { | 4965 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { |
4966 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); | 4966 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); |
4967 | if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > | 4967 | if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > |
4968 | EXT4_INODE_SIZE(inode->i_sb)) { | 4968 | EXT4_INODE_SIZE(inode->i_sb)) { |
4969 | ret = -EIO; | 4969 | ret = -EIO; |
4970 | goto bad_inode; | 4970 | goto bad_inode; |
4971 | } | 4971 | } |
4972 | if (ei->i_extra_isize == 0) { | 4972 | if (ei->i_extra_isize == 0) { |
4973 | /* The extra space is currently unused. Use it. */ | 4973 | /* The extra space is currently unused. Use it. */ |
4974 | ei->i_extra_isize = sizeof(struct ext4_inode) - | 4974 | ei->i_extra_isize = sizeof(struct ext4_inode) - |
4975 | EXT4_GOOD_OLD_INODE_SIZE; | 4975 | EXT4_GOOD_OLD_INODE_SIZE; |
4976 | } else { | 4976 | } else { |
4977 | __le32 *magic = (void *)raw_inode + | 4977 | __le32 *magic = (void *)raw_inode + |
4978 | EXT4_GOOD_OLD_INODE_SIZE + | 4978 | EXT4_GOOD_OLD_INODE_SIZE + |
4979 | ei->i_extra_isize; | 4979 | ei->i_extra_isize; |
4980 | if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) | 4980 | if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) |
4981 | ext4_set_inode_state(inode, EXT4_STATE_XATTR); | 4981 | ext4_set_inode_state(inode, EXT4_STATE_XATTR); |
4982 | } | 4982 | } |
4983 | } else | 4983 | } else |
4984 | ei->i_extra_isize = 0; | 4984 | ei->i_extra_isize = 0; |
4985 | 4985 | ||
4986 | EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); | 4986 | EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); |
4987 | EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); | 4987 | EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); |
4988 | EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); | 4988 | EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); |
4989 | EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); | 4989 | EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); |
4990 | 4990 | ||
4991 | inode->i_version = le32_to_cpu(raw_inode->i_disk_version); | 4991 | inode->i_version = le32_to_cpu(raw_inode->i_disk_version); |
4992 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { | 4992 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { |
4993 | if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) | 4993 | if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) |
4994 | inode->i_version |= | 4994 | inode->i_version |= |
4995 | (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; | 4995 | (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; |
4996 | } | 4996 | } |
4997 | 4997 | ||
4998 | ret = 0; | 4998 | ret = 0; |
4999 | if (ei->i_file_acl && | 4999 | if (ei->i_file_acl && |
5000 | !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { | 5000 | !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { |
5001 | EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", | 5001 | EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", |
5002 | ei->i_file_acl); | 5002 | ei->i_file_acl); |
5003 | ret = -EIO; | 5003 | ret = -EIO; |
5004 | goto bad_inode; | 5004 | goto bad_inode; |
5005 | } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 5005 | } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
5006 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 5006 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
5007 | (S_ISLNK(inode->i_mode) && | 5007 | (S_ISLNK(inode->i_mode) && |
5008 | !ext4_inode_is_fast_symlink(inode))) | 5008 | !ext4_inode_is_fast_symlink(inode))) |
5009 | /* Validate extent which is part of inode */ | 5009 | /* Validate extent which is part of inode */ |
5010 | ret = ext4_ext_check_inode(inode); | 5010 | ret = ext4_ext_check_inode(inode); |
5011 | } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 5011 | } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
5012 | (S_ISLNK(inode->i_mode) && | 5012 | (S_ISLNK(inode->i_mode) && |
5013 | !ext4_inode_is_fast_symlink(inode))) { | 5013 | !ext4_inode_is_fast_symlink(inode))) { |
5014 | /* Validate block references which are part of inode */ | 5014 | /* Validate block references which are part of inode */ |
5015 | ret = ext4_check_inode_blockref(inode); | 5015 | ret = ext4_check_inode_blockref(inode); |
5016 | } | 5016 | } |
5017 | if (ret) | 5017 | if (ret) |
5018 | goto bad_inode; | 5018 | goto bad_inode; |
5019 | 5019 | ||
5020 | if (S_ISREG(inode->i_mode)) { | 5020 | if (S_ISREG(inode->i_mode)) { |
5021 | inode->i_op = &ext4_file_inode_operations; | 5021 | inode->i_op = &ext4_file_inode_operations; |
5022 | inode->i_fop = &ext4_file_operations; | 5022 | inode->i_fop = &ext4_file_operations; |
5023 | ext4_set_aops(inode); | 5023 | ext4_set_aops(inode); |
5024 | } else if (S_ISDIR(inode->i_mode)) { | 5024 | } else if (S_ISDIR(inode->i_mode)) { |
5025 | inode->i_op = &ext4_dir_inode_operations; | 5025 | inode->i_op = &ext4_dir_inode_operations; |
5026 | inode->i_fop = &ext4_dir_operations; | 5026 | inode->i_fop = &ext4_dir_operations; |
5027 | } else if (S_ISLNK(inode->i_mode)) { | 5027 | } else if (S_ISLNK(inode->i_mode)) { |
5028 | if (ext4_inode_is_fast_symlink(inode)) { | 5028 | if (ext4_inode_is_fast_symlink(inode)) { |
5029 | inode->i_op = &ext4_fast_symlink_inode_operations; | 5029 | inode->i_op = &ext4_fast_symlink_inode_operations; |
5030 | nd_terminate_link(ei->i_data, inode->i_size, | 5030 | nd_terminate_link(ei->i_data, inode->i_size, |
5031 | sizeof(ei->i_data) - 1); | 5031 | sizeof(ei->i_data) - 1); |
5032 | } else { | 5032 | } else { |
5033 | inode->i_op = &ext4_symlink_inode_operations; | 5033 | inode->i_op = &ext4_symlink_inode_operations; |
5034 | ext4_set_aops(inode); | 5034 | ext4_set_aops(inode); |
5035 | } | 5035 | } |
5036 | } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || | 5036 | } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || |
5037 | S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { | 5037 | S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { |
5038 | inode->i_op = &ext4_special_inode_operations; | 5038 | inode->i_op = &ext4_special_inode_operations; |
5039 | if (raw_inode->i_block[0]) | 5039 | if (raw_inode->i_block[0]) |
5040 | init_special_inode(inode, inode->i_mode, | 5040 | init_special_inode(inode, inode->i_mode, |
5041 | old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); | 5041 | old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); |
5042 | else | 5042 | else |
5043 | init_special_inode(inode, inode->i_mode, | 5043 | init_special_inode(inode, inode->i_mode, |
5044 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); | 5044 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); |
5045 | } else { | 5045 | } else { |
5046 | ret = -EIO; | 5046 | ret = -EIO; |
5047 | EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); | 5047 | EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); |
5048 | goto bad_inode; | 5048 | goto bad_inode; |
5049 | } | 5049 | } |
5050 | brelse(iloc.bh); | 5050 | brelse(iloc.bh); |
5051 | ext4_set_inode_flags(inode); | 5051 | ext4_set_inode_flags(inode); |
5052 | unlock_new_inode(inode); | 5052 | unlock_new_inode(inode); |
5053 | return inode; | 5053 | return inode; |
5054 | 5054 | ||
5055 | bad_inode: | 5055 | bad_inode: |
5056 | brelse(iloc.bh); | 5056 | brelse(iloc.bh); |
5057 | iget_failed(inode); | 5057 | iget_failed(inode); |
5058 | return ERR_PTR(ret); | 5058 | return ERR_PTR(ret); |
5059 | } | 5059 | } |
5060 | 5060 | ||
5061 | static int ext4_inode_blocks_set(handle_t *handle, | 5061 | static int ext4_inode_blocks_set(handle_t *handle, |
5062 | struct ext4_inode *raw_inode, | 5062 | struct ext4_inode *raw_inode, |
5063 | struct ext4_inode_info *ei) | 5063 | struct ext4_inode_info *ei) |
5064 | { | 5064 | { |
5065 | struct inode *inode = &(ei->vfs_inode); | 5065 | struct inode *inode = &(ei->vfs_inode); |
5066 | u64 i_blocks = inode->i_blocks; | 5066 | u64 i_blocks = inode->i_blocks; |
5067 | struct super_block *sb = inode->i_sb; | 5067 | struct super_block *sb = inode->i_sb; |
5068 | 5068 | ||
5069 | if (i_blocks <= ~0U) { | 5069 | if (i_blocks <= ~0U) { |
5070 | /* | 5070 | /* |
5071 | * i_blocks can be represented in a 32 bit variable | 5071 | * i_blocks can be represented in a 32 bit variable |
5072 | * as multiple of 512 bytes | 5072 | * as multiple of 512 bytes |
5073 | */ | 5073 | */ |
5074 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | 5074 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); |
5075 | raw_inode->i_blocks_high = 0; | 5075 | raw_inode->i_blocks_high = 0; |
5076 | ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); | 5076 | ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); |
5077 | return 0; | 5077 | return 0; |
5078 | } | 5078 | } |
5079 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) | 5079 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) |
5080 | return -EFBIG; | 5080 | return -EFBIG; |
5081 | 5081 | ||
5082 | if (i_blocks <= 0xffffffffffffULL) { | 5082 | if (i_blocks <= 0xffffffffffffULL) { |
5083 | /* | 5083 | /* |
5084 | * i_blocks can be represented in a 48 bit variable | 5084 | * i_blocks can be represented in a 48 bit variable |
5085 | * as multiple of 512 bytes | 5085 | * as multiple of 512 bytes |
5086 | */ | 5086 | */ |
5087 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | 5087 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); |
5088 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); | 5088 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); |
5089 | ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); | 5089 | ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); |
5090 | } else { | 5090 | } else { |
5091 | ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); | 5091 | ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); |
5092 | /* i_blocks is stored in units of file system blocks */ | 5092 | /* i_blocks is stored in units of file system blocks */ |
5093 | i_blocks = i_blocks >> (inode->i_blkbits - 9); | 5093 | i_blocks = i_blocks >> (inode->i_blkbits - 9); |
5094 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | 5094 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); |
5095 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); | 5095 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); |
5096 | } | 5096 | } |
5097 | return 0; | 5097 | return 0; |
5098 | } | 5098 | } |
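ext4_inode_blocks_set() above is the write-side inverse: counts that fit in 32 bits use only i_blocks_lo, counts up to 48 bits also fill i_blocks_high, and anything larger must be stored in filesystem-block units under the HUGE_FILE flag. A sketch of the 48-bit split, again with an arbitrary sample value:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t i_blocks = 0x123456789aULL; /* sample sector count */

		if (i_blocks <= 0xffffffffffffULL) {
			/* split into the two on-disk fields */
			uint32_t lo = (uint32_t)i_blocks;
			uint16_t hi = (uint16_t)(i_blocks >> 32);
			printf("i_blocks_lo=%#x i_blocks_high=%#x\n",
			       (unsigned)lo, (unsigned)hi);
		} else {
			/* would need HUGE_FILE and scaling to fs-block units */
			printf("does not fit in 48 bits of sectors\n");
		}
		return 0;
	}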
5099 | 5099 | ||
5100 | /* | 5100 | /* |
5101 | * Post the struct inode info into an on-disk inode location in the | 5101 | * Post the struct inode info into an on-disk inode location in the |
5102 | * buffer-cache. This gobbles the caller's reference to the | 5102 | * buffer-cache. This gobbles the caller's reference to the |
5103 | * buffer_head in the inode location struct. | 5103 | * buffer_head in the inode location struct. |
5104 | * | 5104 | * |
5105 | * The caller must have write access to iloc->bh. | 5105 | * The caller must have write access to iloc->bh. |
5106 | */ | 5106 | */ |
5107 | static int ext4_do_update_inode(handle_t *handle, | 5107 | static int ext4_do_update_inode(handle_t *handle, |
5108 | struct inode *inode, | 5108 | struct inode *inode, |
5109 | struct ext4_iloc *iloc) | 5109 | struct ext4_iloc *iloc) |
5110 | { | 5110 | { |
5111 | struct ext4_inode *raw_inode = ext4_raw_inode(iloc); | 5111 | struct ext4_inode *raw_inode = ext4_raw_inode(iloc); |
5112 | struct ext4_inode_info *ei = EXT4_I(inode); | 5112 | struct ext4_inode_info *ei = EXT4_I(inode); |
5113 | struct buffer_head *bh = iloc->bh; | 5113 | struct buffer_head *bh = iloc->bh; |
5114 | int err = 0, rc, block; | 5114 | int err = 0, rc, block; |
5115 | 5115 | ||
5116 | /* For fields not tracked in the in-memory inode, | 5116 | /* For fields not tracked in the in-memory inode, |
5117 | * initialise them to zero for new inodes. */ | 5117 | * initialise them to zero for new inodes. */ |
5118 | if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) | 5118 | if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) |
5119 | memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); | 5119 | memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); |
5120 | 5120 | ||
5121 | ext4_get_inode_flags(ei); | 5121 | ext4_get_inode_flags(ei); |
5122 | raw_inode->i_mode = cpu_to_le16(inode->i_mode); | 5122 | raw_inode->i_mode = cpu_to_le16(inode->i_mode); |
5123 | if (!(test_opt(inode->i_sb, NO_UID32))) { | 5123 | if (!(test_opt(inode->i_sb, NO_UID32))) { |
5124 | raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); | 5124 | raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); |
5125 | raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); | 5125 | raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); |
5126 | /* | 5126 | /* |
5127 | * Fix up interoperability with old kernels. Otherwise, old inodes get | 5127 | * Fix up interoperability with old kernels. Otherwise, old inodes get |
5128 | * re-used with the upper 16 bits of the uid/gid intact | 5128 | * re-used with the upper 16 bits of the uid/gid intact |
5129 | */ | 5129 | */ |
5130 | if (!ei->i_dtime) { | 5130 | if (!ei->i_dtime) { |
5131 | raw_inode->i_uid_high = | 5131 | raw_inode->i_uid_high = |
5132 | cpu_to_le16(high_16_bits(inode->i_uid)); | 5132 | cpu_to_le16(high_16_bits(inode->i_uid)); |
5133 | raw_inode->i_gid_high = | 5133 | raw_inode->i_gid_high = |
5134 | cpu_to_le16(high_16_bits(inode->i_gid)); | 5134 | cpu_to_le16(high_16_bits(inode->i_gid)); |
5135 | } else { | 5135 | } else { |
5136 | raw_inode->i_uid_high = 0; | 5136 | raw_inode->i_uid_high = 0; |
5137 | raw_inode->i_gid_high = 0; | 5137 | raw_inode->i_gid_high = 0; |
5138 | } | 5138 | } |
5139 | } else { | 5139 | } else { |
5140 | raw_inode->i_uid_low = | 5140 | raw_inode->i_uid_low = |
5141 | cpu_to_le16(fs_high2lowuid(inode->i_uid)); | 5141 | cpu_to_le16(fs_high2lowuid(inode->i_uid)); |
5142 | raw_inode->i_gid_low = | 5142 | raw_inode->i_gid_low = |
5143 | cpu_to_le16(fs_high2lowgid(inode->i_gid)); | 5143 | cpu_to_le16(fs_high2lowgid(inode->i_gid)); |
5144 | raw_inode->i_uid_high = 0; | 5144 | raw_inode->i_uid_high = 0; |
5145 | raw_inode->i_gid_high = 0; | 5145 | raw_inode->i_gid_high = 0; |
5146 | } | 5146 | } |
5147 | raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); | 5147 | raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); |
5148 | 5148 | ||
5149 | EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); | 5149 | EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); |
5150 | EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); | 5150 | EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); |
5151 | EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); | 5151 | EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); |
5152 | EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); | 5152 | EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); |
5153 | 5153 | ||
5154 | if (ext4_inode_blocks_set(handle, raw_inode, ei)) | 5154 | if (ext4_inode_blocks_set(handle, raw_inode, ei)) |
5155 | goto out_brelse; | 5155 | goto out_brelse; |
5156 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); | 5156 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); |
5157 | raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); | 5157 | raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); |
5158 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != | 5158 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != |
5159 | cpu_to_le32(EXT4_OS_HURD)) | 5159 | cpu_to_le32(EXT4_OS_HURD)) |
5160 | raw_inode->i_file_acl_high = | 5160 | raw_inode->i_file_acl_high = |
5161 | cpu_to_le16(ei->i_file_acl >> 32); | 5161 | cpu_to_le16(ei->i_file_acl >> 32); |
5162 | raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); | 5162 | raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); |
5163 | ext4_isize_set(raw_inode, ei->i_disksize); | 5163 | ext4_isize_set(raw_inode, ei->i_disksize); |
5164 | if (ei->i_disksize > 0x7fffffffULL) { | 5164 | if (ei->i_disksize > 0x7fffffffULL) { |
5165 | struct super_block *sb = inode->i_sb; | 5165 | struct super_block *sb = inode->i_sb; |
5166 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 5166 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, |
5167 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || | 5167 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || |
5168 | EXT4_SB(sb)->s_es->s_rev_level == | 5168 | EXT4_SB(sb)->s_es->s_rev_level == |
5169 | cpu_to_le32(EXT4_GOOD_OLD_REV)) { | 5169 | cpu_to_le32(EXT4_GOOD_OLD_REV)) { |
5170 | /* If this is the first large file | 5170 | /* If this is the first large file |
5171 | * created, add a flag to the superblock. | 5171 | * created, add a flag to the superblock. |
5172 | */ | 5172 | */ |
5173 | err = ext4_journal_get_write_access(handle, | 5173 | err = ext4_journal_get_write_access(handle, |
5174 | EXT4_SB(sb)->s_sbh); | 5174 | EXT4_SB(sb)->s_sbh); |
5175 | if (err) | 5175 | if (err) |
5176 | goto out_brelse; | 5176 | goto out_brelse; |
5177 | ext4_update_dynamic_rev(sb); | 5177 | ext4_update_dynamic_rev(sb); |
5178 | EXT4_SET_RO_COMPAT_FEATURE(sb, | 5178 | EXT4_SET_RO_COMPAT_FEATURE(sb, |
5179 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); | 5179 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); |
5180 | sb->s_dirt = 1; | 5180 | sb->s_dirt = 1; |
5181 | ext4_handle_sync(handle); | 5181 | ext4_handle_sync(handle); |
5182 | err = ext4_handle_dirty_metadata(handle, NULL, | 5182 | err = ext4_handle_dirty_metadata(handle, NULL, |
5183 | EXT4_SB(sb)->s_sbh); | 5183 | EXT4_SB(sb)->s_sbh); |
5184 | } | 5184 | } |
5185 | } | 5185 | } |
5186 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); | 5186 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); |
5187 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { | 5187 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { |
5188 | if (old_valid_dev(inode->i_rdev)) { | 5188 | if (old_valid_dev(inode->i_rdev)) { |
5189 | raw_inode->i_block[0] = | 5189 | raw_inode->i_block[0] = |
5190 | cpu_to_le32(old_encode_dev(inode->i_rdev)); | 5190 | cpu_to_le32(old_encode_dev(inode->i_rdev)); |
5191 | raw_inode->i_block[1] = 0; | 5191 | raw_inode->i_block[1] = 0; |
5192 | } else { | 5192 | } else { |
5193 | raw_inode->i_block[0] = 0; | 5193 | raw_inode->i_block[0] = 0; |
5194 | raw_inode->i_block[1] = | 5194 | raw_inode->i_block[1] = |
5195 | cpu_to_le32(new_encode_dev(inode->i_rdev)); | 5195 | cpu_to_le32(new_encode_dev(inode->i_rdev)); |
5196 | raw_inode->i_block[2] = 0; | 5196 | raw_inode->i_block[2] = 0; |
5197 | } | 5197 | } |
5198 | } else | 5198 | } else |
5199 | for (block = 0; block < EXT4_N_BLOCKS; block++) | 5199 | for (block = 0; block < EXT4_N_BLOCKS; block++) |
5200 | raw_inode->i_block[block] = ei->i_data[block]; | 5200 | raw_inode->i_block[block] = ei->i_data[block]; |
5201 | 5201 | ||
5202 | raw_inode->i_disk_version = cpu_to_le32(inode->i_version); | 5202 | raw_inode->i_disk_version = cpu_to_le32(inode->i_version); |
5203 | if (ei->i_extra_isize) { | 5203 | if (ei->i_extra_isize) { |
5204 | if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) | 5204 | if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) |
5205 | raw_inode->i_version_hi = | 5205 | raw_inode->i_version_hi = |
5206 | cpu_to_le32(inode->i_version >> 32); | 5206 | cpu_to_le32(inode->i_version >> 32); |
5207 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); | 5207 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); |
5208 | } | 5208 | } |
5209 | 5209 | ||
5210 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 5210 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
5211 | rc = ext4_handle_dirty_metadata(handle, NULL, bh); | 5211 | rc = ext4_handle_dirty_metadata(handle, NULL, bh); |
5212 | if (!err) | 5212 | if (!err) |
5213 | err = rc; | 5213 | err = rc; |
5214 | ext4_clear_inode_state(inode, EXT4_STATE_NEW); | 5214 | ext4_clear_inode_state(inode, EXT4_STATE_NEW); |
5215 | 5215 | ||
5216 | ext4_update_inode_fsync_trans(handle, inode, 0); | 5216 | ext4_update_inode_fsync_trans(handle, inode, 0); |
5217 | out_brelse: | 5217 | out_brelse: |
5218 | brelse(bh); | 5218 | brelse(bh); |
5219 | ext4_std_error(inode->i_sb, err); | 5219 | ext4_std_error(inode->i_sb, err); |
5220 | return err; | 5220 | return err; |
5221 | } | 5221 | } |
5222 | 5222 | ||
5223 | /* | 5223 | /* |
5224 | * ext4_write_inode() | 5224 | * ext4_write_inode() |
5225 | * | 5225 | * |
5226 | * We are called from a few places: | 5226 | * We are called from a few places: |
5227 | * | 5227 | * |
5228 | * - Within generic_file_write() for O_SYNC files. | 5228 | * - Within generic_file_write() for O_SYNC files. |
5229 | * Here, there will be no transaction running. We wait for any running | 5229 | * Here, there will be no transaction running. We wait for any running |
5230 | * transaction to commit. | 5230 | * transaction to commit. |
5231 | * | 5231 | * |
5232 | * - Within sys_sync(), kupdate and such. | 5232 | * - Within sys_sync(), kupdate and such. |
5233 | * We wait on commit, if told to. | 5233 | * We wait on commit, if told to. |
5234 | * | 5234 | * |
5235 | * - Within prune_icache() (PF_MEMALLOC == true) | 5235 | * - Within prune_icache() (PF_MEMALLOC == true) |
5236 | * Here we simply return. We can't afford to block kswapd on the | 5236 | * Here we simply return. We can't afford to block kswapd on the |
5237 | * journal commit. | 5237 | * journal commit. |
5238 | * | 5238 | * |
5239 | * In all cases it is actually safe for us to return without doing anything, | 5239 | * In all cases it is actually safe for us to return without doing anything, |
5240 | * because the inode has been copied into a raw inode buffer in | 5240 | * because the inode has been copied into a raw inode buffer in |
5241 | * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for | 5241 | * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for |
5242 | * knfsd. | 5242 | * knfsd. |
5243 | * | 5243 | * |
5244 | * Note that we are absolutely dependent upon all inode dirtiers doing the | 5244 | * Note that we are absolutely dependent upon all inode dirtiers doing the |
5245 | * right thing: they *must* call mark_inode_dirty() after dirtying info in | 5245 | * right thing: they *must* call mark_inode_dirty() after dirtying info in |
5246 | * which we are interested. | 5246 | * which we are interested. |
5247 | * | 5247 | * |
5248 | * It would be a bug for them to not do this. The code: | 5248 | * It would be a bug for them to not do this. The code: |
5249 | * | 5249 | * |
5250 | * mark_inode_dirty(inode) | 5250 | * mark_inode_dirty(inode) |
5251 | * stuff(); | 5251 | * stuff(); |
5252 | * inode->i_size = expr; | 5252 | * inode->i_size = expr; |
5253 | * | 5253 | * |
5254 | * is in error because a kswapd-driven write_inode() could occur while | 5254 | * is in error because a kswapd-driven write_inode() could occur while |
5255 | * `stuff()' is running, and the new i_size will be lost. Plus the inode | 5255 | * `stuff()' is running, and the new i_size will be lost. Plus the inode |
5256 | * will no longer be on the superblock's dirty inode list. | 5256 | * will no longer be on the superblock's dirty inode list. |
5257 | */ | 5257 | */ |
5258 | int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) | 5258 | int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) |
5259 | { | 5259 | { |
5260 | int err; | 5260 | int err; |
5261 | 5261 | ||
5262 | if (current->flags & PF_MEMALLOC) | 5262 | if (current->flags & PF_MEMALLOC) |
5263 | return 0; | 5263 | return 0; |
5264 | 5264 | ||
5265 | if (EXT4_SB(inode->i_sb)->s_journal) { | 5265 | if (EXT4_SB(inode->i_sb)->s_journal) { |
5266 | if (ext4_journal_current_handle()) { | 5266 | if (ext4_journal_current_handle()) { |
5267 | jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); | 5267 | jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); |
5268 | dump_stack(); | 5268 | dump_stack(); |
5269 | return -EIO; | 5269 | return -EIO; |
5270 | } | 5270 | } |
5271 | 5271 | ||
5272 | if (wbc->sync_mode != WB_SYNC_ALL) | 5272 | if (wbc->sync_mode != WB_SYNC_ALL) |
5273 | return 0; | 5273 | return 0; |
5274 | 5274 | ||
5275 | err = ext4_force_commit(inode->i_sb); | 5275 | err = ext4_force_commit(inode->i_sb); |
5276 | } else { | 5276 | } else { |
5277 | struct ext4_iloc iloc; | 5277 | struct ext4_iloc iloc; |
5278 | 5278 | ||
5279 | err = __ext4_get_inode_loc(inode, &iloc, 0); | 5279 | err = __ext4_get_inode_loc(inode, &iloc, 0); |
5280 | if (err) | 5280 | if (err) |
5281 | return err; | 5281 | return err; |
5282 | if (wbc->sync_mode == WB_SYNC_ALL) | 5282 | if (wbc->sync_mode == WB_SYNC_ALL) |
5283 | sync_dirty_buffer(iloc.bh); | 5283 | sync_dirty_buffer(iloc.bh); |
5284 | if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { | 5284 | if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { |
5285 | EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, | 5285 | EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, |
5286 | "IO error syncing inode"); | 5286 | "IO error syncing inode"); |
5287 | err = -EIO; | 5287 | err = -EIO; |
5288 | } | 5288 | } |
5289 | brelse(iloc.bh); | 5289 | brelse(iloc.bh); |
5290 | } | 5290 | } |
5291 | return err; | 5291 | return err; |
5292 | } | 5292 | } |
5293 | 5293 | ||
5294 | /* | 5294 | /* |
5295 | * ext4_setattr() | 5295 | * ext4_setattr() |
5296 | * | 5296 | * |
5297 | * Called from notify_change. | 5297 | * Called from notify_change. |
5298 | * | 5298 | * |
5299 | * We want to trap VFS attempts to truncate the file as soon as | 5299 | * We want to trap VFS attempts to truncate the file as soon as |
5300 | * possible. In particular, we want to make sure that when the VFS | 5300 | * possible. In particular, we want to make sure that when the VFS |
5301 | * shrinks i_size, we put the inode on the orphan list and modify | 5301 | * shrinks i_size, we put the inode on the orphan list and modify |
5302 | * i_disksize immediately, so that during the subsequent flushing of | 5302 | * i_disksize immediately, so that during the subsequent flushing of |
5303 | * dirty pages and freeing of disk blocks, we can guarantee that any | 5303 | * dirty pages and freeing of disk blocks, we can guarantee that any |
5304 | * commit will leave the blocks being flushed in an unused state on | 5304 | * commit will leave the blocks being flushed in an unused state on |
5305 | * disk. (On recovery, the inode will get truncated and the blocks will | 5305 | * disk. (On recovery, the inode will get truncated and the blocks will |
5306 | * be freed, so we have a strong guarantee that no future commit will | 5306 | * be freed, so we have a strong guarantee that no future commit will |
5307 | * leave these blocks visible to the user.) | 5307 | * leave these blocks visible to the user.) |
5308 | * | 5308 | * |
5309 | * Another thing we have to ensure is that if we are in ordered mode | 5309 | * Another thing we have to ensure is that if we are in ordered mode
5310 | * and the inode is still attached to the committing transaction, we | 5310 | * and the inode is still attached to the committing transaction, we
5311 | * must start writeout of all the dirty pages which are being truncated. | 5311 | * must start writeout of all the dirty pages which are being truncated.
5312 | * This way we are sure that all the data written in the previous | 5312 | * This way we are sure that all the data written in the previous |
5313 | * transaction are already on disk (truncate waits for pages under | 5313 | * transaction are already on disk (truncate waits for pages under |
5314 | * writeback). | 5314 | * writeback). |
5315 | * | 5315 | * |
5316 | * Called with inode->i_mutex down. | 5316 | * Called with inode->i_mutex down. |
5317 | */ | 5317 | */ |
5318 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) | 5318 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) |
5319 | { | 5319 | { |
5320 | struct inode *inode = dentry->d_inode; | 5320 | struct inode *inode = dentry->d_inode; |
5321 | int error, rc = 0; | 5321 | int error, rc = 0; |
5322 | int orphan = 0; | 5322 | int orphan = 0; |
5323 | const unsigned int ia_valid = attr->ia_valid; | 5323 | const unsigned int ia_valid = attr->ia_valid; |
5324 | 5324 | ||
5325 | error = inode_change_ok(inode, attr); | 5325 | error = inode_change_ok(inode, attr); |
5326 | if (error) | 5326 | if (error) |
5327 | return error; | 5327 | return error; |
5328 | 5328 | ||
5329 | if (is_quota_modification(inode, attr)) | 5329 | if (is_quota_modification(inode, attr)) |
5330 | dquot_initialize(inode); | 5330 | dquot_initialize(inode); |
5331 | if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || | 5331 | if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || |
5332 | (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { | 5332 | (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { |
5333 | handle_t *handle; | 5333 | handle_t *handle; |
5334 | 5334 | ||
5335 | /* (user+group)*(old+new) structure, inode write (sb, | 5335 | /* (user+group)*(old+new) structure, inode write (sb, |
5336 | * inode block, ? - but truncate inode update has it) */ | 5336 | * inode block, ? - but truncate inode update has it) */ |
5337 | handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ | 5337 | handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ |
5338 | EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); | 5338 | EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); |
5339 | if (IS_ERR(handle)) { | 5339 | if (IS_ERR(handle)) { |
5340 | error = PTR_ERR(handle); | 5340 | error = PTR_ERR(handle); |
5341 | goto err_out; | 5341 | goto err_out; |
5342 | } | 5342 | } |
5343 | error = dquot_transfer(inode, attr); | 5343 | error = dquot_transfer(inode, attr); |
5344 | if (error) { | 5344 | if (error) { |
5345 | ext4_journal_stop(handle); | 5345 | ext4_journal_stop(handle); |
5346 | return error; | 5346 | return error; |
5347 | } | 5347 | } |
5348 | /* Update corresponding info in inode so that everything is in | 5348 | /* Update corresponding info in inode so that everything is in |
5349 | * one transaction */ | 5349 | * one transaction */ |
5350 | if (attr->ia_valid & ATTR_UID) | 5350 | if (attr->ia_valid & ATTR_UID) |
5351 | inode->i_uid = attr->ia_uid; | 5351 | inode->i_uid = attr->ia_uid; |
5352 | if (attr->ia_valid & ATTR_GID) | 5352 | if (attr->ia_valid & ATTR_GID) |
5353 | inode->i_gid = attr->ia_gid; | 5353 | inode->i_gid = attr->ia_gid; |
5354 | error = ext4_mark_inode_dirty(handle, inode); | 5354 | error = ext4_mark_inode_dirty(handle, inode); |
5355 | ext4_journal_stop(handle); | 5355 | ext4_journal_stop(handle); |
5356 | } | 5356 | } |
5357 | 5357 | ||
5358 | if (attr->ia_valid & ATTR_SIZE) { | 5358 | if (attr->ia_valid & ATTR_SIZE) { |
5359 | inode_dio_wait(inode); | 5359 | inode_dio_wait(inode); |
5360 | 5360 | ||
5361 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { | 5361 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { |
5362 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 5362 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
5363 | 5363 | ||
5364 | if (attr->ia_size > sbi->s_bitmap_maxbytes) | 5364 | if (attr->ia_size > sbi->s_bitmap_maxbytes) |
5365 | return -EFBIG; | 5365 | return -EFBIG; |
5366 | } | 5366 | } |
5367 | } | 5367 | } |
5368 | 5368 | ||
5369 | if (S_ISREG(inode->i_mode) && | 5369 | if (S_ISREG(inode->i_mode) && |
5370 | attr->ia_valid & ATTR_SIZE && | 5370 | attr->ia_valid & ATTR_SIZE && |
5371 | (attr->ia_size < inode->i_size)) { | 5371 | (attr->ia_size < inode->i_size)) { |
5372 | handle_t *handle; | 5372 | handle_t *handle; |
5373 | 5373 | ||
5374 | handle = ext4_journal_start(inode, 3); | 5374 | handle = ext4_journal_start(inode, 3); |
5375 | if (IS_ERR(handle)) { | 5375 | if (IS_ERR(handle)) { |
5376 | error = PTR_ERR(handle); | 5376 | error = PTR_ERR(handle); |
5377 | goto err_out; | 5377 | goto err_out; |
5378 | } | 5378 | } |
5379 | if (ext4_handle_valid(handle)) { | 5379 | if (ext4_handle_valid(handle)) { |
5380 | error = ext4_orphan_add(handle, inode); | 5380 | error = ext4_orphan_add(handle, inode); |
5381 | orphan = 1; | 5381 | orphan = 1; |
5382 | } | 5382 | } |
5383 | EXT4_I(inode)->i_disksize = attr->ia_size; | 5383 | EXT4_I(inode)->i_disksize = attr->ia_size; |
5384 | rc = ext4_mark_inode_dirty(handle, inode); | 5384 | rc = ext4_mark_inode_dirty(handle, inode); |
5385 | if (!error) | 5385 | if (!error) |
5386 | error = rc; | 5386 | error = rc; |
5387 | ext4_journal_stop(handle); | 5387 | ext4_journal_stop(handle); |
5388 | 5388 | ||
5389 | if (ext4_should_order_data(inode)) { | 5389 | if (ext4_should_order_data(inode)) { |
5390 | error = ext4_begin_ordered_truncate(inode, | 5390 | error = ext4_begin_ordered_truncate(inode, |
5391 | attr->ia_size); | 5391 | attr->ia_size); |
5392 | if (error) { | 5392 | if (error) { |
5393 | /* Do as much error cleanup as possible */ | 5393 | /* Do as much error cleanup as possible */ |
5394 | handle = ext4_journal_start(inode, 3); | 5394 | handle = ext4_journal_start(inode, 3); |
5395 | if (IS_ERR(handle)) { | 5395 | if (IS_ERR(handle)) { |
5396 | ext4_orphan_del(NULL, inode); | 5396 | ext4_orphan_del(NULL, inode); |
5397 | goto err_out; | 5397 | goto err_out; |
5398 | } | 5398 | } |
5399 | ext4_orphan_del(handle, inode); | 5399 | ext4_orphan_del(handle, inode); |
5400 | orphan = 0; | 5400 | orphan = 0; |
5401 | ext4_journal_stop(handle); | 5401 | ext4_journal_stop(handle); |
5402 | goto err_out; | 5402 | goto err_out; |
5403 | } | 5403 | } |
5404 | } | 5404 | } |
5405 | } | 5405 | } |
5406 | 5406 | ||
5407 | if (attr->ia_valid & ATTR_SIZE) { | 5407 | if (attr->ia_valid & ATTR_SIZE) { |
5408 | if (attr->ia_size != i_size_read(inode)) { | 5408 | if (attr->ia_size != i_size_read(inode)) { |
5409 | truncate_setsize(inode, attr->ia_size); | 5409 | truncate_setsize(inode, attr->ia_size); |
5410 | ext4_truncate(inode); | 5410 | ext4_truncate(inode); |
5411 | } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) | 5411 | } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) |
5412 | ext4_truncate(inode); | 5412 | ext4_truncate(inode); |
5413 | } | 5413 | } |
5414 | 5414 | ||
5415 | if (!rc) { | 5415 | if (!rc) { |
5416 | setattr_copy(inode, attr); | 5416 | setattr_copy(inode, attr); |
5417 | mark_inode_dirty(inode); | 5417 | mark_inode_dirty(inode); |
5418 | } | 5418 | } |
5419 | 5419 | ||
5420 | /* | 5420 | /* |
5421 | * If the call to ext4_truncate failed to get a transaction handle at | 5421 | * If the call to ext4_truncate failed to get a transaction handle at |
5422 | * all, we need to clean up the in-core orphan list manually. | 5422 | * all, we need to clean up the in-core orphan list manually. |
5423 | */ | 5423 | */ |
5424 | if (orphan && inode->i_nlink) | 5424 | if (orphan && inode->i_nlink) |
5425 | ext4_orphan_del(NULL, inode); | 5425 | ext4_orphan_del(NULL, inode); |
5426 | 5426 | ||
5427 | if (!rc && (ia_valid & ATTR_MODE)) | 5427 | if (!rc && (ia_valid & ATTR_MODE)) |
5428 | rc = ext4_acl_chmod(inode); | 5428 | rc = ext4_acl_chmod(inode); |
5429 | 5429 | ||
5430 | err_out: | 5430 | err_out: |
5431 | ext4_std_error(inode->i_sb, error); | 5431 | ext4_std_error(inode->i_sb, error); |
5432 | if (!error) | 5432 | if (!error) |
5433 | error = rc; | 5433 | error = rc; |
5434 | return error; | 5434 | return error; |
5435 | } | 5435 | } |
5436 | 5436 | ||
5437 | int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | 5437 | int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, |
5438 | struct kstat *stat) | 5438 | struct kstat *stat) |
5439 | { | 5439 | { |
5440 | struct inode *inode; | 5440 | struct inode *inode; |
5441 | unsigned long delalloc_blocks; | 5441 | unsigned long delalloc_blocks; |
5442 | 5442 | ||
5443 | inode = dentry->d_inode; | 5443 | inode = dentry->d_inode; |
5444 | generic_fillattr(inode, stat); | 5444 | generic_fillattr(inode, stat); |
5445 | 5445 | ||
5446 | /* | 5446 | /* |
5447 | * We can't update i_blocks if the block allocation is delayed; | 5447 | * We can't update i_blocks if the block allocation is delayed;
5448 | * otherwise, in the case of a system crash before the real block | 5448 | * otherwise, in the case of a system crash before the real block
5449 | * allocation is done, we would have i_blocks inconsistent with | 5449 | * allocation is done, we would have i_blocks inconsistent with
5450 | * the on-disk file blocks. | 5450 | * the on-disk file blocks.
5451 | * We always keep i_blocks updated together with the real | 5451 | * We always keep i_blocks updated together with the real
5452 | * allocation. But so as not to confuse userspace, stat | 5452 | * allocation. But so as not to confuse userspace, stat
5453 | * will return blocks that include the delayed allocation | 5453 | * will return blocks that include the delayed allocation
5454 | * blocks for this file. | 5454 | * blocks for this file.
5455 | */ | 5455 | */ |
5456 | delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; | 5456 | delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; |
5457 | 5457 | ||
5458 | stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; | 5458 | stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; |
5459 | return 0; | 5459 | return 0; |
5460 | } | 5460 | } |
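
The stat adjustment above converts filesystem blocks into the 512-byte units st_blocks is measured in: shift left by s_blocksize_bits to get bytes, then right by 9 to get sectors. A minimal userspace sketch of the same arithmetic (the 4096-byte block size is an assumed value for illustration, not read from a real superblock):

	#include <stdio.h>

	int main(void)
	{
		unsigned long delalloc_blocks = 10;	/* pretend 10 blocks await allocation */
		unsigned int blocksize_bits = 12;	/* assumed 4096-byte filesystem blocks */

		/* blocks -> bytes -> 512-byte sectors, as in ext4_getattr() */
		unsigned long sectors = (delalloc_blocks << blocksize_bits) >> 9;

		/* prints "10 fs blocks = 80 sectors" */
		printf("%lu fs blocks = %lu sectors\n", delalloc_blocks, sectors);
		return 0;
	}
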
5461 | 5461 | ||
5462 | static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, | 5462 | static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, |
5463 | int chunk) | 5463 | int chunk) |
5464 | { | 5464 | { |
5465 | int indirects; | 5465 | int indirects; |
5466 | 5466 | ||
5467 | /* if nrblocks are contiguous */ | 5467 | /* if nrblocks are contiguous */ |
5468 | if (chunk) { | 5468 | if (chunk) { |
5469 | /* | 5469 | /* |
5470 | * With N contiguous data blocks, we need at most | 5470 | * With N contiguous data blocks, we need at most |
5471 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, | 5471 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, |
5472 | * 2 dindirect blocks, and 1 tindirect block | 5472 | * 2 dindirect blocks, and 1 tindirect block |
5473 | */ | 5473 | */ |
5474 | return DIV_ROUND_UP(nrblocks, | 5474 | return DIV_ROUND_UP(nrblocks, |
5475 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; | 5475 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; |
5476 | } | 5476 | } |
5477 | /* | 5477 | /* |
5478 | * If nrblocks are not contiguous then, in the worst case, each block | 5478 | * If nrblocks are not contiguous then, in the worst case, each block
5479 | * touches an indirect block, and each indirect block touches a double | 5479 | * touches an indirect block, and each indirect block touches a double
5480 | * indirect block, plus a triple indirect block. | 5480 | * indirect block, plus a triple indirect block.
5481 | */ | 5481 | */ |
5482 | indirects = nrblocks * 2 + 1; | 5482 | indirects = nrblocks * 2 + 1; |
5483 | return indirects; | 5483 | return indirects; |
5484 | } | 5484 | } |
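
To see the two estimates side by side, here is a self-contained sketch of the same formulas with an assumed 4K block size, under which EXT4_ADDR_PER_BLOCK would come to 1024 four-byte addresses per indirect block (the constant is illustrative, not taken from a live filesystem):

	#include <stdio.h>

	#define ADDR_PER_BLOCK		1024	/* assumed: 4096-byte block / 4-byte address */
	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	static int indirect_trans_blocks(int nrblocks, int chunk)
	{
		if (chunk)	/* contiguous: N/1024 + 1 indirects, 2 dindirects, 1 tindirect */
			return DIV_ROUND_UP(nrblocks, ADDR_PER_BLOCK) + 4;
		/* discontiguous worst case: an indirect and a dindirect per block,
		 * plus one tindirect */
		return nrblocks * 2 + 1;
	}

	int main(void)
	{
		/* prints 6 for the contiguous case, 4097 for the scattered one */
		printf("contiguous 2048 blocks -> %d index blocks\n",
		       indirect_trans_blocks(2048, 1));
		printf("scattered  2048 blocks -> %d index blocks\n",
		       indirect_trans_blocks(2048, 0));
		return 0;
	}
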
5485 | 5485 | ||
5486 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 5486 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) |
5487 | { | 5487 | { |
5488 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 5488 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
5489 | return ext4_indirect_trans_blocks(inode, nrblocks, chunk); | 5489 | return ext4_indirect_trans_blocks(inode, nrblocks, chunk); |
5490 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); | 5490 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); |
5491 | } | 5491 | } |
5492 | 5492 | ||
5493 | /* | 5493 | /* |
5494 | * Account for index blocks, block group bitmaps and block group | 5494 | * Account for index blocks, block group bitmaps and block group
5495 | * descriptor blocks if we modify data blocks and index blocks. | 5495 | * descriptor blocks if we modify data blocks and index blocks.
5496 | * In the worst case, the index blocks spread over different block groups. | 5496 | * In the worst case, the index blocks spread over different block groups.
5497 | * | 5497 | *
5498 | * If data blocks are discontiguous, they may spread over | 5498 | * If data blocks are discontiguous, they may spread over
5499 | * different block groups too. If they are contiguous, with flexbg | 5499 | * different block groups too. If they are contiguous, with flexbg
5500 | * they could still cross a block group boundary. | 5500 | * they could still cross a block group boundary.
5501 | * | 5501 | * |
5502 | * Also account for superblock, inode, quota and xattr blocks | 5502 | * Also account for superblock, inode, quota and xattr blocks |
5503 | */ | 5503 | */ |
5504 | static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 5504 | static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) |
5505 | { | 5505 | { |
5506 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); | 5506 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); |
5507 | int gdpblocks; | 5507 | int gdpblocks; |
5508 | int idxblocks; | 5508 | int idxblocks; |
5509 | int ret = 0; | 5509 | int ret = 0; |
5510 | 5510 | ||
5511 | /* | 5511 | /* |
5512 | * How many index blocks do we need to touch to modify nrblocks? | 5512 | * How many index blocks do we need to touch to modify nrblocks?
5513 | * The "chunk" flag indicates whether nrblocks is | 5513 | * The "chunk" flag indicates whether nrblocks is
5514 | * physically contiguous on disk. | 5514 | * physically contiguous on disk.
5515 | * | 5515 | *
5516 | * Direct IO and fallocate call get_block to allocate | 5516 | * Direct IO and fallocate call get_block to allocate
5517 | * a single extent at a time, so they can set the "chunk" flag. | 5517 | * a single extent at a time, so they can set the "chunk" flag.
5518 | */ | 5518 | */ |
5519 | idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); | 5519 | idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); |
5520 | 5520 | ||
5521 | ret = idxblocks; | 5521 | ret = idxblocks; |
5522 | 5522 | ||
5523 | /* | 5523 | /* |
5524 | * Now let's see how many group bitmaps and group descriptors need | 5524 | * Now let's see how many group bitmaps and group descriptors need
5525 | * to be accounted for. | 5525 | * to be accounted for.
5526 | */ | 5526 | */ |
5527 | groups = idxblocks; | 5527 | groups = idxblocks; |
5528 | if (chunk) | 5528 | if (chunk) |
5529 | groups += 1; | 5529 | groups += 1; |
5530 | else | 5530 | else |
5531 | groups += nrblocks; | 5531 | groups += nrblocks; |
5532 | 5532 | ||
5533 | gdpblocks = groups; | 5533 | gdpblocks = groups; |
5534 | if (groups > ngroups) | 5534 | if (groups > ngroups) |
5535 | groups = ngroups; | 5535 | groups = ngroups; |
5536 | if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) | 5536 | if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) |
5537 | gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; | 5537 | gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; |
5538 | 5538 | ||
5539 | /* bitmaps and block group descriptor blocks */ | 5539 | /* bitmaps and block group descriptor blocks */ |
5540 | ret += groups + gdpblocks; | 5540 | ret += groups + gdpblocks; |
5541 | 5541 | ||
5542 | /* Blocks for super block, inode, quota and xattr blocks */ | 5542 | /* Blocks for super block, inode, quota and xattr blocks */ |
5543 | ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); | 5543 | ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); |
5544 | 5544 | ||
5545 | return ret; | 5545 | return ret; |
5546 | } | 5546 | } |
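
The clamping at the end caps both counts at what the filesystem actually has: never more modified bitmaps than there are block groups, and never more descriptor blocks than s_gdb_count. A userspace walkthrough of just that clamping, with made-up geometry (16 block groups, 1 group descriptor block):

	#include <stdio.h>

	int main(void)
	{
		int idxblocks = 5, nrblocks = 20, chunk = 0;
		int ngroups = 16, gdb_count = 1;	/* assumed, illustrative geometry */

		/* each index block and, if discontiguous, each data block may
		 * land in its own block group */
		int groups = idxblocks + (chunk ? 1 : nrblocks);	/* 25 */

		int gdpblocks = groups;
		if (groups > ngroups)
			groups = ngroups;		/* clamp bitmaps to 16 */
		if (groups > gdb_count)
			gdpblocks = gdb_count;		/* clamp descriptors to 1 */

		/* prints "bitmaps=16 gdpblocks=1" */
		printf("bitmaps=%d gdpblocks=%d\n", groups, gdpblocks);
		return 0;
	}
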
5547 | 5547 | ||
5548 | /* | 5548 | /* |
5549 | * Calculate the total number of credits to reserve to fit | 5549 | * Calculate the total number of credits to reserve to fit |
5550 | * the modification of a single page into a single transaction, | 5550 | * the modification of a single page into a single transaction,
5551 | * which may include multiple chunks of block allocations. | 5551 | * which may include multiple chunks of block allocations. |
5552 | * | 5552 | * |
5553 | * This could be called via ext4_write_begin() | 5553 | * This could be called via ext4_write_begin() |
5554 | * | 5554 | * |
5555 | * We need to consider the worst case, when there is | 5555 | * We need to consider the worst case, when there is
5556 | * one new block per extent. | 5556 | * one new block per extent.
5557 | */ | 5557 | */ |
5558 | int ext4_writepage_trans_blocks(struct inode *inode) | 5558 | int ext4_writepage_trans_blocks(struct inode *inode) |
5559 | { | 5559 | { |
5560 | int bpp = ext4_journal_blocks_per_page(inode); | 5560 | int bpp = ext4_journal_blocks_per_page(inode); |
5561 | int ret; | 5561 | int ret; |
5562 | 5562 | ||
5563 | ret = ext4_meta_trans_blocks(inode, bpp, 0); | 5563 | ret = ext4_meta_trans_blocks(inode, bpp, 0); |
5564 | 5564 | ||
5565 | /* Account for data blocks for journalled mode */ | 5565 | /* Account for data blocks for journalled mode */ |
5566 | if (ext4_should_journal_data(inode)) | 5566 | if (ext4_should_journal_data(inode)) |
5567 | ret += bpp; | 5567 | ret += bpp; |
5568 | return ret; | 5568 | return ret; |
5569 | } | 5569 | } |
5570 | 5570 | ||
5571 | /* | 5571 | /* |
5572 | * Calculate the journal credits for a chunk of data modification. | 5572 | * Calculate the journal credits for a chunk of data modification. |
5573 | * | 5573 | * |
5574 | * This is called from DIO, fallocate or whoever else calls | 5574 | * This is called from DIO, fallocate or whoever else calls
5575 | * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. | 5575 | * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. |
5576 | * | 5576 | * |
5577 | * Journal buffers for data blocks are not included here, as DIO | 5577 | * Journal buffers for data blocks are not included here, as DIO
5578 | * and fallocate do not need to journal data buffers. | 5578 | * and fallocate do not need to journal data buffers.
5579 | */ | 5579 | */ |
5580 | int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) | 5580 | int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) |
5581 | { | 5581 | { |
5582 | return ext4_meta_trans_blocks(inode, nrblocks, 1); | 5582 | return ext4_meta_trans_blocks(inode, nrblocks, 1); |
5583 | } | 5583 | } |
5584 | 5584 | ||
5585 | /* | 5585 | /* |
5586 | * The caller must have previously called ext4_reserve_inode_write(). | 5586 | * The caller must have previously called ext4_reserve_inode_write(). |
5587 | * Given this, we know that the caller already has write access to iloc->bh. | 5587 | * Given this, we know that the caller already has write access to iloc->bh.
5588 | */ | 5588 | */ |
5589 | int ext4_mark_iloc_dirty(handle_t *handle, | 5589 | int ext4_mark_iloc_dirty(handle_t *handle, |
5590 | struct inode *inode, struct ext4_iloc *iloc) | 5590 | struct inode *inode, struct ext4_iloc *iloc) |
5591 | { | 5591 | { |
5592 | int err = 0; | 5592 | int err = 0; |
5593 | 5593 | ||
5594 | if (test_opt(inode->i_sb, I_VERSION)) | 5594 | if (test_opt(inode->i_sb, I_VERSION)) |
5595 | inode_inc_iversion(inode); | 5595 | inode_inc_iversion(inode); |
5596 | 5596 | ||
5597 | /* the do_update_inode consumes one bh->b_count */ | 5597 | /* the do_update_inode consumes one bh->b_count */ |
5598 | get_bh(iloc->bh); | 5598 | get_bh(iloc->bh); |
5599 | 5599 | ||
5600 | /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ | 5600 | /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ |
5601 | err = ext4_do_update_inode(handle, inode, iloc); | 5601 | err = ext4_do_update_inode(handle, inode, iloc); |
5602 | put_bh(iloc->bh); | 5602 | put_bh(iloc->bh); |
5603 | return err; | 5603 | return err; |
5604 | } | 5604 | } |
5605 | 5605 | ||
5606 | /* | 5606 | /* |
5607 | * On success, we end up with an outstanding reference count against | 5607 | * On success, we end up with an outstanding reference count against
5608 | * iloc->bh. This _must_ be cleaned up later. | 5608 | * iloc->bh. This _must_ be cleaned up later. |
5609 | */ | 5609 | */ |
5610 | 5610 | ||
5611 | int | 5611 | int |
5612 | ext4_reserve_inode_write(handle_t *handle, struct inode *inode, | 5612 | ext4_reserve_inode_write(handle_t *handle, struct inode *inode, |
5613 | struct ext4_iloc *iloc) | 5613 | struct ext4_iloc *iloc) |
5614 | { | 5614 | { |
5615 | int err; | 5615 | int err; |
5616 | 5616 | ||
5617 | err = ext4_get_inode_loc(inode, iloc); | 5617 | err = ext4_get_inode_loc(inode, iloc); |
5618 | if (!err) { | 5618 | if (!err) { |
5619 | BUFFER_TRACE(iloc->bh, "get_write_access"); | 5619 | BUFFER_TRACE(iloc->bh, "get_write_access"); |
5620 | err = ext4_journal_get_write_access(handle, iloc->bh); | 5620 | err = ext4_journal_get_write_access(handle, iloc->bh); |
5621 | if (err) { | 5621 | if (err) { |
5622 | brelse(iloc->bh); | 5622 | brelse(iloc->bh); |
5623 | iloc->bh = NULL; | 5623 | iloc->bh = NULL; |
5624 | } | 5624 | } |
5625 | } | 5625 | } |
5626 | ext4_std_error(inode->i_sb, err); | 5626 | ext4_std_error(inode->i_sb, err); |
5627 | return err; | 5627 | return err; |
5628 | } | 5628 | } |
5629 | 5629 | ||
5630 | /* | 5630 | /* |
5631 | * Expand an inode by new_extra_isize bytes. | 5631 | * Expand an inode by new_extra_isize bytes. |
5632 | * Returns 0 on success or negative error number on failure. | 5632 | * Returns 0 on success or negative error number on failure. |
5633 | */ | 5633 | */ |
5634 | static int ext4_expand_extra_isize(struct inode *inode, | 5634 | static int ext4_expand_extra_isize(struct inode *inode, |
5635 | unsigned int new_extra_isize, | 5635 | unsigned int new_extra_isize, |
5636 | struct ext4_iloc iloc, | 5636 | struct ext4_iloc iloc, |
5637 | handle_t *handle) | 5637 | handle_t *handle) |
5638 | { | 5638 | { |
5639 | struct ext4_inode *raw_inode; | 5639 | struct ext4_inode *raw_inode; |
5640 | struct ext4_xattr_ibody_header *header; | 5640 | struct ext4_xattr_ibody_header *header; |
5641 | 5641 | ||
5642 | if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) | 5642 | if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) |
5643 | return 0; | 5643 | return 0; |
5644 | 5644 | ||
5645 | raw_inode = ext4_raw_inode(&iloc); | 5645 | raw_inode = ext4_raw_inode(&iloc); |
5646 | 5646 | ||
5647 | header = IHDR(inode, raw_inode); | 5647 | header = IHDR(inode, raw_inode); |
5648 | 5648 | ||
5649 | /* No extended attributes present */ | 5649 | /* No extended attributes present */ |
5650 | if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || | 5650 | if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || |
5651 | header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { | 5651 | header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { |
5652 | memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, | 5652 | memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, |
5653 | new_extra_isize); | 5653 | new_extra_isize); |
5654 | EXT4_I(inode)->i_extra_isize = new_extra_isize; | 5654 | EXT4_I(inode)->i_extra_isize = new_extra_isize; |
5655 | return 0; | 5655 | return 0; |
5656 | } | 5656 | } |
5657 | 5657 | ||
5658 | /* try to expand with EAs present */ | 5658 | /* try to expand with EAs present */ |
5659 | return ext4_expand_extra_isize_ea(inode, new_extra_isize, | 5659 | return ext4_expand_extra_isize_ea(inode, new_extra_isize, |
5660 | raw_inode, handle); | 5660 | raw_inode, handle); |
5661 | } | 5661 | } |
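
For orientation, the space being grown sits between the end of the original 128-byte inode (EXT4_GOOD_OLD_INODE_SIZE) and the on-disk inode size chosen at mkfs time; the extra fields and any in-inode xattrs share it. A toy calculation, assuming the common 256-byte inode size (all the numbers here are hypothetical):

	#include <stdio.h>

	#define GOOD_OLD_INODE_SIZE	128	/* size of the original ext2/3 inode */

	int main(void)
	{
		int inode_size = 256;		/* assumed mkfs-time inode size */
		int extra_isize = 32;		/* extra fields currently in use */
		int new_extra_isize = 64;	/* desired expanded size */

		/* room shared by extra fields and in-inode xattrs */
		int avail = inode_size - GOOD_OLD_INODE_SIZE;

		if (new_extra_isize <= extra_isize)
			printf("nothing to do\n");
		else if (new_extra_isize <= avail)
			printf("can expand in place by %d bytes\n",
			       new_extra_isize - extra_isize);	/* 32 here */
		else
			printf("no room: xattrs would have to move\n");
		return 0;
	}
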
5662 | 5662 | ||
5663 | /* | 5663 | /* |
5664 | * What we do here is to mark the in-core inode as clean with respect to inode | 5664 | * What we do here is to mark the in-core inode as clean with respect to inode |
5665 | * dirtiness (it may still be data-dirty). | 5665 | * dirtiness (it may still be data-dirty). |
5666 | * This means that the in-core inode may be reaped by prune_icache | 5666 | * This means that the in-core inode may be reaped by prune_icache |
5667 | * without having to perform any I/O. This is a very good thing, | 5667 | * without having to perform any I/O. This is a very good thing, |
5668 | * because *any* task may call prune_icache - even ones which | 5668 | * because *any* task may call prune_icache - even ones which |
5669 | * have a transaction open against a different journal. | 5669 | * have a transaction open against a different journal. |
5670 | * | 5670 | * |
5671 | * Is this cheating? Not really. Sure, we haven't written the | 5671 | * Is this cheating? Not really. Sure, we haven't written the |
5672 | * inode out, but prune_icache isn't a user-visible syncing function. | 5672 | * inode out, but prune_icache isn't a user-visible syncing function. |
5673 | * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) | 5673 | * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) |
5674 | * we start and wait on commits. | 5674 | * we start and wait on commits. |
5675 | * | 5675 | * |
5676 | * Is this efficient/effective? Well, we're being nice to the system | 5676 | * Is this efficient/effective? Well, we're being nice to the system |
5677 | * by cleaning up our inodes proactively so they can be reaped | 5677 | * by cleaning up our inodes proactively so they can be reaped |
5678 | * without I/O. But we are potentially leaving up to five seconds' | 5678 | * without I/O. But we are potentially leaving up to five seconds' |
5679 | * worth of inodes floating about which prune_icache wants us to | 5679 | * worth of inodes floating about which prune_icache wants us to |
5680 | * write out. One way to fix that would be to get prune_icache() | 5680 | * write out. One way to fix that would be to get prune_icache() |
5681 | * to do a write_super() to free up some memory. It has the desired | 5681 | * to do a write_super() to free up some memory. It has the desired |
5682 | * effect. | 5682 | * effect. |
5683 | */ | 5683 | */ |
5684 | int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) | 5684 | int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) |
5685 | { | 5685 | { |
5686 | struct ext4_iloc iloc; | 5686 | struct ext4_iloc iloc; |
5687 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 5687 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
5688 | static unsigned int mnt_count; | 5688 | static unsigned int mnt_count; |
5689 | int err, ret; | 5689 | int err, ret; |
5690 | 5690 | ||
5691 | might_sleep(); | 5691 | might_sleep(); |
5692 | trace_ext4_mark_inode_dirty(inode, _RET_IP_); | 5692 | trace_ext4_mark_inode_dirty(inode, _RET_IP_); |
5693 | err = ext4_reserve_inode_write(handle, inode, &iloc); | 5693 | err = ext4_reserve_inode_write(handle, inode, &iloc); |
5694 | if (ext4_handle_valid(handle) && | 5694 | if (ext4_handle_valid(handle) && |
5695 | EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && | 5695 | EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && |
5696 | !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { | 5696 | !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { |
5697 | /* | 5697 | /* |
5698 | * We need extra buffer credits since we may write into the EA block | 5698 | * We need extra buffer credits since we may write into the EA block
5699 | * with this same handle. If journal_extend fails, then it will | 5699 | * with this same handle. If journal_extend fails, then it will |
5700 | * only result in a minor loss of functionality for that inode. | 5700 | * only result in a minor loss of functionality for that inode. |
5701 | * If this is felt to be critical, then e2fsck should be run to | 5701 | * If this is felt to be critical, then e2fsck should be run to |
5702 | * force a large enough s_min_extra_isize. | 5702 | * force a large enough s_min_extra_isize. |
5703 | */ | 5703 | */ |
5704 | if ((jbd2_journal_extend(handle, | 5704 | if ((jbd2_journal_extend(handle, |
5705 | EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { | 5705 | EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { |
5706 | ret = ext4_expand_extra_isize(inode, | 5706 | ret = ext4_expand_extra_isize(inode, |
5707 | sbi->s_want_extra_isize, | 5707 | sbi->s_want_extra_isize, |
5708 | iloc, handle); | 5708 | iloc, handle); |
5709 | if (ret) { | 5709 | if (ret) { |
5710 | ext4_set_inode_state(inode, | 5710 | ext4_set_inode_state(inode, |
5711 | EXT4_STATE_NO_EXPAND); | 5711 | EXT4_STATE_NO_EXPAND); |
5712 | if (mnt_count != | 5712 | if (mnt_count != |
5713 | le16_to_cpu(sbi->s_es->s_mnt_count)) { | 5713 | le16_to_cpu(sbi->s_es->s_mnt_count)) { |
5714 | ext4_warning(inode->i_sb, | 5714 | ext4_warning(inode->i_sb, |
5715 | "Unable to expand inode %lu. Delete" | 5715 | "Unable to expand inode %lu. Delete" |
5716 | " some EAs or run e2fsck.", | 5716 | " some EAs or run e2fsck.", |
5717 | inode->i_ino); | 5717 | inode->i_ino); |
5718 | mnt_count = | 5718 | mnt_count = |
5719 | le16_to_cpu(sbi->s_es->s_mnt_count); | 5719 | le16_to_cpu(sbi->s_es->s_mnt_count); |
5720 | } | 5720 | } |
5721 | } | 5721 | } |
5722 | } | 5722 | } |
5723 | } | 5723 | } |
5724 | if (!err) | 5724 | if (!err) |
5725 | err = ext4_mark_iloc_dirty(handle, inode, &iloc); | 5725 | err = ext4_mark_iloc_dirty(handle, inode, &iloc); |
5726 | return err; | 5726 | return err; |
5727 | } | 5727 | } |
5728 | 5728 | ||
5729 | /* | 5729 | /* |
5730 | * ext4_dirty_inode() is called from __mark_inode_dirty() | 5730 | * ext4_dirty_inode() is called from __mark_inode_dirty() |
5731 | * | 5731 | * |
5732 | * We're really interested in the case where a file is being extended. | 5732 | * We're really interested in the case where a file is being extended. |
5733 | * i_size has been changed by generic_commit_write() and we thus need | 5733 | * i_size has been changed by generic_commit_write() and we thus need |
5734 | * to include the updated inode in the current transaction. | 5734 | * to include the updated inode in the current transaction. |
5735 | * | 5735 | * |
5736 | * Also, dquot_alloc_block() will always dirty the inode when blocks | 5736 | * Also, dquot_alloc_block() will always dirty the inode when blocks |
5737 | * are allocated to the file. | 5737 | * are allocated to the file. |
5738 | * | 5738 | * |
5739 | * If the inode is marked synchronous, we don't honour that here - doing | 5739 | * If the inode is marked synchronous, we don't honour that here - doing |
5740 | * so would cause a commit on atime updates, which we don't bother doing. | 5740 | * so would cause a commit on atime updates, which we don't bother doing. |
5741 | * We handle synchronous inodes at the highest possible level. | 5741 | * We handle synchronous inodes at the highest possible level. |
5742 | */ | 5742 | */ |
5743 | void ext4_dirty_inode(struct inode *inode, int flags) | 5743 | void ext4_dirty_inode(struct inode *inode, int flags) |
5744 | { | 5744 | { |
5745 | handle_t *handle; | 5745 | handle_t *handle; |
5746 | 5746 | ||
5747 | handle = ext4_journal_start(inode, 2); | 5747 | handle = ext4_journal_start(inode, 2); |
5748 | if (IS_ERR(handle)) | 5748 | if (IS_ERR(handle)) |
5749 | goto out; | 5749 | goto out; |
5750 | 5750 | ||
5751 | ext4_mark_inode_dirty(handle, inode); | 5751 | ext4_mark_inode_dirty(handle, inode); |
5752 | 5752 | ||
5753 | ext4_journal_stop(handle); | 5753 | ext4_journal_stop(handle); |
5754 | out: | 5754 | out: |
5755 | return; | 5755 | return; |
5756 | } | 5756 | } |
5757 | 5757 | ||
5758 | #if 0 | 5758 | #if 0 |
5759 | /* | 5759 | /* |
5760 | * Bind an inode's backing buffer_head into this transaction, to prevent | 5760 | * Bind an inode's backing buffer_head into this transaction, to prevent |
5761 | * it from being flushed to disk early. Unlike | 5761 | * it from being flushed to disk early. Unlike |
5762 | * ext4_reserve_inode_write, this leaves behind no bh reference and | 5762 | * ext4_reserve_inode_write, this leaves behind no bh reference and |
5763 | * returns no iloc structure, so the caller needs to repeat the iloc | 5763 | * returns no iloc structure, so the caller needs to repeat the iloc |
5764 | * lookup to mark the inode dirty later. | 5764 | * lookup to mark the inode dirty later. |
5765 | */ | 5765 | */ |
5766 | static int ext4_pin_inode(handle_t *handle, struct inode *inode) | 5766 | static int ext4_pin_inode(handle_t *handle, struct inode *inode) |
5767 | { | 5767 | { |
5768 | struct ext4_iloc iloc; | 5768 | struct ext4_iloc iloc; |
5769 | 5769 | ||
5770 | int err = 0; | 5770 | int err = 0; |
5771 | if (handle) { | 5771 | if (handle) { |
5772 | err = ext4_get_inode_loc(inode, &iloc); | 5772 | err = ext4_get_inode_loc(inode, &iloc); |
5773 | if (!err) { | 5773 | if (!err) { |
5774 | BUFFER_TRACE(iloc.bh, "get_write_access"); | 5774 | BUFFER_TRACE(iloc.bh, "get_write_access"); |
5775 | err = jbd2_journal_get_write_access(handle, iloc.bh); | 5775 | err = jbd2_journal_get_write_access(handle, iloc.bh); |
5776 | if (!err) | 5776 | if (!err) |
5777 | err = ext4_handle_dirty_metadata(handle, | 5777 | err = ext4_handle_dirty_metadata(handle, |
5778 | NULL, | 5778 | NULL, |
5779 | iloc.bh); | 5779 | iloc.bh); |
5780 | brelse(iloc.bh); | 5780 | brelse(iloc.bh); |
5781 | } | 5781 | } |
5782 | } | 5782 | } |
5783 | ext4_std_error(inode->i_sb, err); | 5783 | ext4_std_error(inode->i_sb, err); |
5784 | return err; | 5784 | return err; |
5785 | } | 5785 | } |
5786 | #endif | 5786 | #endif |
5787 | 5787 | ||
5788 | int ext4_change_inode_journal_flag(struct inode *inode, int val) | 5788 | int ext4_change_inode_journal_flag(struct inode *inode, int val) |
5789 | { | 5789 | { |
5790 | journal_t *journal; | 5790 | journal_t *journal; |
5791 | handle_t *handle; | 5791 | handle_t *handle; |
5792 | int err; | 5792 | int err; |
5793 | 5793 | ||
5794 | /* | 5794 | /* |
5795 | * We have to be very careful here: changing a data block's | 5795 | * We have to be very careful here: changing a data block's |
5796 | * journaling status dynamically is dangerous. If we write a | 5796 | * journaling status dynamically is dangerous. If we write a |
5797 | * data block to the journal, change the status and then delete | 5797 | * data block to the journal, change the status and then delete |
5798 | * that block, we risk forgetting to revoke the old log record | 5798 | * that block, we risk forgetting to revoke the old log record |
5799 | * from the journal and so a subsequent replay can corrupt data. | 5799 | * from the journal and so a subsequent replay can corrupt data. |
5800 | * So, first we make sure that the journal is empty and that | 5800 | * So, first we make sure that the journal is empty and that |
5801 | * nobody is changing anything. | 5801 | * nobody is changing anything. |
5802 | */ | 5802 | */ |
5803 | 5803 | ||
5804 | journal = EXT4_JOURNAL(inode); | 5804 | journal = EXT4_JOURNAL(inode); |
5805 | if (!journal) | 5805 | if (!journal) |
5806 | return 0; | 5806 | return 0; |
5807 | if (is_journal_aborted(journal)) | 5807 | if (is_journal_aborted(journal)) |
5808 | return -EROFS; | 5808 | return -EROFS; |
5809 | 5809 | ||
5810 | jbd2_journal_lock_updates(journal); | 5810 | jbd2_journal_lock_updates(journal); |
5811 | jbd2_journal_flush(journal); | 5811 | jbd2_journal_flush(journal); |
5812 | 5812 | ||
5813 | /* | 5813 | /* |
5814 | * OK, there are no updates running now, and all cached data is | 5814 | * OK, there are no updates running now, and all cached data is |
5815 | * synced to disk. We are now in a completely consistent state | 5815 | * synced to disk. We are now in a completely consistent state |
5816 | * which doesn't have anything in the journal, and we know that | 5816 | * which doesn't have anything in the journal, and we know that |
5817 | * no filesystem updates are running, so it is safe to modify | 5817 | * no filesystem updates are running, so it is safe to modify |
5818 | * the inode's in-core data-journaling state flag now. | 5818 | * the inode's in-core data-journaling state flag now. |
5819 | */ | 5819 | */ |
5820 | 5820 | ||
5821 | if (val) | 5821 | if (val) |
5822 | ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); | 5822 | ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); |
5823 | else | 5823 | else |
5824 | ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); | 5824 | ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); |
5825 | ext4_set_aops(inode); | 5825 | ext4_set_aops(inode); |
5826 | 5826 | ||
5827 | jbd2_journal_unlock_updates(journal); | 5827 | jbd2_journal_unlock_updates(journal); |
5828 | 5828 | ||
5829 | /* Finally we can mark the inode as dirty. */ | 5829 | /* Finally we can mark the inode as dirty. */ |
5830 | 5830 | ||
5831 | handle = ext4_journal_start(inode, 1); | 5831 | handle = ext4_journal_start(inode, 1); |
5832 | if (IS_ERR(handle)) | 5832 | if (IS_ERR(handle)) |
5833 | return PTR_ERR(handle); | 5833 | return PTR_ERR(handle); |
5834 | 5834 | ||
5835 | err = ext4_mark_inode_dirty(handle, inode); | 5835 | err = ext4_mark_inode_dirty(handle, inode); |
5836 | ext4_handle_sync(handle); | 5836 | ext4_handle_sync(handle); |
5837 | ext4_journal_stop(handle); | 5837 | ext4_journal_stop(handle); |
5838 | ext4_std_error(inode->i_sb, err); | 5838 | ext4_std_error(inode->i_sb, err); |
5839 | 5839 | ||
5840 | return err; | 5840 | return err; |
5841 | } | 5841 | } |
5842 | 5842 | ||
5843 | static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) | 5843 | static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) |
5844 | { | 5844 | { |
5845 | return !buffer_mapped(bh); | 5845 | return !buffer_mapped(bh); |
5846 | } | 5846 | } |
5847 | 5847 | ||
5848 | int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | 5848 | int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) |
5849 | { | 5849 | { |
5850 | struct page *page = vmf->page; | 5850 | struct page *page = vmf->page; |
5851 | loff_t size; | 5851 | loff_t size; |
5852 | unsigned long len; | 5852 | unsigned long len; |
5853 | int ret; | 5853 | int ret; |
5854 | struct file *file = vma->vm_file; | 5854 | struct file *file = vma->vm_file; |
5855 | struct inode *inode = file->f_path.dentry->d_inode; | 5855 | struct inode *inode = file->f_path.dentry->d_inode; |
5856 | struct address_space *mapping = inode->i_mapping; | 5856 | struct address_space *mapping = inode->i_mapping; |
5857 | handle_t *handle; | 5857 | handle_t *handle; |
5858 | get_block_t *get_block; | 5858 | get_block_t *get_block; |
5859 | int retries = 0; | 5859 | int retries = 0; |
5860 | 5860 | ||
5861 | /* | 5861 | /* |
5862 | * This check is racy but catches the common case. We rely on | 5862 | * This check is racy but catches the common case. We rely on |
5863 | * __block_page_mkwrite() to do a reliable check. | 5863 | * __block_page_mkwrite() to do a reliable check. |
5864 | */ | 5864 | */ |
5865 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | 5865 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); |
5866 | /* Delalloc case is easy... */ | 5866 | /* Delalloc case is easy... */ |
5867 | if (test_opt(inode->i_sb, DELALLOC) && | 5867 | if (test_opt(inode->i_sb, DELALLOC) && |
5868 | !ext4_should_journal_data(inode) && | 5868 | !ext4_should_journal_data(inode) && |
5869 | !ext4_nonda_switch(inode->i_sb)) { | 5869 | !ext4_nonda_switch(inode->i_sb)) { |
5870 | do { | 5870 | do { |
5871 | ret = __block_page_mkwrite(vma, vmf, | 5871 | ret = __block_page_mkwrite(vma, vmf, |
5872 | ext4_da_get_block_prep); | 5872 | ext4_da_get_block_prep); |
5873 | } while (ret == -ENOSPC && | 5873 | } while (ret == -ENOSPC && |
5874 | ext4_should_retry_alloc(inode->i_sb, &retries)); | 5874 | ext4_should_retry_alloc(inode->i_sb, &retries)); |
5875 | goto out_ret; | 5875 | goto out_ret; |
5876 | } | 5876 | } |
5877 | 5877 | ||
5878 | lock_page(page); | 5878 | lock_page(page); |
5879 | size = i_size_read(inode); | 5879 | size = i_size_read(inode); |
5880 | /* Page got truncated from under us? */ | 5880 | /* Page got truncated from under us? */ |
5881 | if (page->mapping != mapping || page_offset(page) > size) { | 5881 | if (page->mapping != mapping || page_offset(page) > size) { |
5882 | unlock_page(page); | 5882 | unlock_page(page); |
5883 | ret = VM_FAULT_NOPAGE; | 5883 | ret = VM_FAULT_NOPAGE; |
5884 | goto out; | 5884 | goto out; |
5885 | } | 5885 | } |
5886 | 5886 | ||
5887 | if (page->index == size >> PAGE_CACHE_SHIFT) | 5887 | if (page->index == size >> PAGE_CACHE_SHIFT) |
5888 | len = size & ~PAGE_CACHE_MASK; | 5888 | len = size & ~PAGE_CACHE_MASK; |
5889 | else | 5889 | else |
5890 | len = PAGE_CACHE_SIZE; | 5890 | len = PAGE_CACHE_SIZE; |
5891 | /* | 5891 | /* |
5892 | * Return if we have all the buffers mapped. This avoids the need to do | 5892 | * Return if we have all the buffers mapped. This avoids the need to do |
5893 | * journal_start/journal_stop which can block and take a long time | 5893 | * journal_start/journal_stop which can block and take a long time |
5894 | */ | 5894 | */ |
5895 | if (page_has_buffers(page)) { | 5895 | if (page_has_buffers(page)) { |
5896 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | 5896 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, |
5897 | ext4_bh_unmapped)) { | 5897 | ext4_bh_unmapped)) { |
5898 | /* Wait so that we don't change page under IO */ | 5898 | /* Wait so that we don't change page under IO */ |
5899 | wait_on_page_writeback(page); | 5899 | wait_on_page_writeback(page); |
5900 | ret = VM_FAULT_LOCKED; | 5900 | ret = VM_FAULT_LOCKED; |
5901 | goto out; | 5901 | goto out; |
5902 | } | 5902 | } |
5903 | } | 5903 | } |
5904 | unlock_page(page); | 5904 | unlock_page(page); |
5905 | /* OK, we need to fill the hole... */ | 5905 | /* OK, we need to fill the hole... */ |
5906 | if (ext4_should_dioread_nolock(inode)) | 5906 | if (ext4_should_dioread_nolock(inode)) |
5907 | get_block = ext4_get_block_write; | 5907 | get_block = ext4_get_block_write; |
5908 | else | 5908 | else |
5909 | get_block = ext4_get_block; | 5909 | get_block = ext4_get_block; |
5910 | retry_alloc: | 5910 | retry_alloc: |
5911 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | 5911 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); |
5912 | if (IS_ERR(handle)) { | 5912 | if (IS_ERR(handle)) { |
5913 | ret = VM_FAULT_SIGBUS; | 5913 | ret = VM_FAULT_SIGBUS; |
5914 | goto out; | 5914 | goto out; |
5915 | } | 5915 | } |
5916 | ret = __block_page_mkwrite(vma, vmf, get_block); | 5916 | ret = __block_page_mkwrite(vma, vmf, get_block); |
5917 | if (!ret && ext4_should_journal_data(inode)) { | 5917 | if (!ret && ext4_should_journal_data(inode)) { |
5918 | if (walk_page_buffers(handle, page_buffers(page), 0, | 5918 | if (walk_page_buffers(handle, page_buffers(page), 0, |
5919 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { | 5919 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { |
5920 | unlock_page(page); | 5920 | unlock_page(page); |
5921 | ret = VM_FAULT_SIGBUS; | 5921 | ret = VM_FAULT_SIGBUS; |
5922 | goto out; | 5922 | goto out; |
5923 | } | 5923 | } |
5924 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 5924 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
5925 | } | 5925 | } |
5926 | ext4_journal_stop(handle); | 5926 | ext4_journal_stop(handle); |
5927 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 5927 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
5928 | goto retry_alloc; | 5928 | goto retry_alloc; |
5929 | out_ret: | 5929 | out_ret: |
5930 | ret = block_page_mkwrite_return(ret); | 5930 | ret = block_page_mkwrite_return(ret); |
5931 | out: | 5931 | out: |
5932 | return ret; | 5932 | return ret; |
5933 | } | 5933 | } |
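
Both allocation paths above share one idiom: repeat the operation while it fails with -ENOSPC and ext4_should_retry_alloc() still grants attempts (it commits the journal to release reserved space, up to a small retry count). A generic userspace sketch of that bounded retry loop, with a stub standing in for the block allocation and an assumed retry cap:

	#include <stdio.h>
	#include <errno.h>

	static int failures_left = 2;	/* pretend space frees up after two tries */

	static int do_alloc(void)	/* stand-in for the allocating operation */
	{
		return failures_left-- > 0 ? -ENOSPC : 0;
	}

	static int should_retry(int *retries)	/* stand-in for ext4_should_retry_alloc() */
	{
		return (*retries)++ < 3;	/* assumed cap, illustrative only */
	}

	int main(void)
	{
		int retries = 0, ret;

		do {
			ret = do_alloc();
		} while (ret == -ENOSPC && should_retry(&retries));

		/* prints "ret=0 after 2 retries" */
		printf("ret=%d after %d retries\n", ret, retries);
		return 0;
	}
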
5934 | 5934 |
fs/fs-writeback.c
1 | /* | 1 | /* |
2 | * fs/fs-writeback.c | 2 | * fs/fs-writeback.c |
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds. | 4 | * Copyright (C) 2002, Linus Torvalds. |
5 | * | 5 | * |
6 | * Contains all the functions related to writing back and waiting | 6 | * Contains all the functions related to writing back and waiting |
7 | * upon dirty inodes against superblocks, and writing back dirty | 7 | * upon dirty inodes against superblocks, and writing back dirty |
8 | * pages against inodes, i.e. data writeback. Writeout of the | 8 | * pages against inodes, i.e. data writeback. Writeout of the
9 | * inode itself is not handled here. | 9 | * inode itself is not handled here. |
10 | * | 10 | * |
11 | * 10Apr2002 Andrew Morton | 11 | * 10Apr2002 Andrew Morton |
12 | * Split out of fs/inode.c | 12 | * Split out of fs/inode.c |
13 | * Additions for address_space-based writeback | 13 | * Additions for address_space-based writeback |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/spinlock.h> | 18 | #include <linux/spinlock.h> |
19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/kthread.h> | 23 | #include <linux/kthread.h> |
24 | #include <linux/freezer.h> | 24 | #include <linux/freezer.h> |
25 | #include <linux/writeback.h> | 25 | #include <linux/writeback.h> |
26 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/buffer_head.h> | 28 | #include <linux/buffer_head.h> |
29 | #include <linux/tracepoint.h> | 29 | #include <linux/tracepoint.h> |
30 | #include "internal.h" | 30 | #include "internal.h" |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * Passed into wb_writeback(), essentially a subset of writeback_control | 33 | * Passed into wb_writeback(), essentially a subset of writeback_control |
34 | */ | 34 | */ |
35 | struct wb_writeback_work { | 35 | struct wb_writeback_work { |
36 | long nr_pages; | 36 | long nr_pages; |
37 | struct super_block *sb; | 37 | struct super_block *sb; |
38 | unsigned long *older_than_this; | ||
38 | enum writeback_sync_modes sync_mode; | 39 | enum writeback_sync_modes sync_mode; |
40 | unsigned int tagged_writepages:1; | ||
39 | unsigned int for_kupdate:1; | 41 | unsigned int for_kupdate:1; |
40 | unsigned int range_cyclic:1; | 42 | unsigned int range_cyclic:1; |
41 | unsigned int for_background:1; | 43 | unsigned int for_background:1; |
42 | 44 | ||
43 | struct list_head list; /* pending work list */ | 45 | struct list_head list; /* pending work list */ |
44 | struct completion *done; /* set if the caller waits */ | 46 | struct completion *done; /* set if the caller waits */ |
45 | }; | 47 | }; |
46 | 48 | ||
47 | /* | 49 | /* |
48 | * Include the creation of the trace points after defining the | 50 | * Include the creation of the trace points after defining the |
49 | * wb_writeback_work structure so that the definition remains local to this | 51 | * wb_writeback_work structure so that the definition remains local to this |
50 | * file. | 52 | * file. |
51 | */ | 53 | */ |
52 | #define CREATE_TRACE_POINTS | 54 | #define CREATE_TRACE_POINTS |
53 | #include <trace/events/writeback.h> | 55 | #include <trace/events/writeback.h> |
54 | 56 | ||
55 | /* | 57 | /* |
56 | * We don't actually have pdflush, but this one is exported through /proc... | 58 | * We don't actually have pdflush, but this one is exported through /proc...
57 | */ | 59 | */ |
58 | int nr_pdflush_threads; | 60 | int nr_pdflush_threads; |
59 | 61 | ||
60 | /** | 62 | /** |
61 | * writeback_in_progress - determine whether there is writeback in progress | 63 | * writeback_in_progress - determine whether there is writeback in progress |
62 | * @bdi: the device's backing_dev_info structure. | 64 | * @bdi: the device's backing_dev_info structure. |
63 | * | 65 | * |
64 | * Determine whether there is writeback waiting to be handled against a | 66 | * Determine whether there is writeback waiting to be handled against a |
65 | * backing device. | 67 | * backing device. |
66 | */ | 68 | */ |
67 | int writeback_in_progress(struct backing_dev_info *bdi) | 69 | int writeback_in_progress(struct backing_dev_info *bdi) |
68 | { | 70 | { |
69 | return test_bit(BDI_writeback_running, &bdi->state); | 71 | return test_bit(BDI_writeback_running, &bdi->state); |
70 | } | 72 | } |
71 | 73 | ||
72 | static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) | 74 | static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) |
73 | { | 75 | { |
74 | struct super_block *sb = inode->i_sb; | 76 | struct super_block *sb = inode->i_sb; |
75 | 77 | ||
76 | if (strcmp(sb->s_type->name, "bdev") == 0) | 78 | if (strcmp(sb->s_type->name, "bdev") == 0) |
77 | return inode->i_mapping->backing_dev_info; | 79 | return inode->i_mapping->backing_dev_info; |
78 | 80 | ||
79 | return sb->s_bdi; | 81 | return sb->s_bdi; |
80 | } | 82 | } |
81 | 83 | ||
82 | static inline struct inode *wb_inode(struct list_head *head) | 84 | static inline struct inode *wb_inode(struct list_head *head) |
83 | { | 85 | { |
84 | return list_entry(head, struct inode, i_wb_list); | 86 | return list_entry(head, struct inode, i_wb_list); |
85 | } | 87 | } |
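
wb_inode() is the usual container_of trick: given a pointer to the embedded i_wb_list member, recover the enclosing struct inode. A self-contained userspace demonstration of the same pattern (the structs are trimmed stand-ins, not the real kernel definitions):

	#include <stdio.h>
	#include <stddef.h>

	struct list_head { struct list_head *next, *prev; };

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))
	#define list_entry(ptr, type, member) container_of(ptr, type, member)

	struct inode {
		long i_ino;
		struct list_head i_wb_list;	/* embedded list linkage */
	};

	static struct inode *wb_inode(struct list_head *head)
	{
		return list_entry(head, struct inode, i_wb_list);
	}

	int main(void)
	{
		struct inode ino = { .i_ino = 42 };

		/* from the member pointer back to the containing inode: prints 42 */
		printf("i_ino=%ld\n", wb_inode(&ino.i_wb_list)->i_ino);
		return 0;
	}
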
86 | 88 | ||
87 | /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ | 89 | /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ |
88 | static void bdi_wakeup_flusher(struct backing_dev_info *bdi) | 90 | static void bdi_wakeup_flusher(struct backing_dev_info *bdi) |
89 | { | 91 | { |
90 | if (bdi->wb.task) { | 92 | if (bdi->wb.task) { |
91 | wake_up_process(bdi->wb.task); | 93 | wake_up_process(bdi->wb.task); |
92 | } else { | 94 | } else { |
93 | /* | 95 | /* |
94 | * The bdi thread isn't there, wake up the forker thread which | 96 | * The bdi thread isn't there, wake up the forker thread which |
95 | * will create and run it. | 97 | * will create and run it. |
96 | */ | 98 | */ |
97 | wake_up_process(default_backing_dev_info.wb.task); | 99 | wake_up_process(default_backing_dev_info.wb.task); |
98 | } | 100 | } |
99 | } | 101 | } |
100 | 102 | ||
101 | static void bdi_queue_work(struct backing_dev_info *bdi, | 103 | static void bdi_queue_work(struct backing_dev_info *bdi, |
102 | struct wb_writeback_work *work) | 104 | struct wb_writeback_work *work) |
103 | { | 105 | { |
104 | trace_writeback_queue(bdi, work); | 106 | trace_writeback_queue(bdi, work); |
105 | 107 | ||
106 | spin_lock_bh(&bdi->wb_lock); | 108 | spin_lock_bh(&bdi->wb_lock); |
107 | list_add_tail(&work->list, &bdi->work_list); | 109 | list_add_tail(&work->list, &bdi->work_list); |
108 | if (!bdi->wb.task) | 110 | if (!bdi->wb.task) |
109 | trace_writeback_nothread(bdi, work); | 111 | trace_writeback_nothread(bdi, work); |
110 | bdi_wakeup_flusher(bdi); | 112 | bdi_wakeup_flusher(bdi); |
111 | spin_unlock_bh(&bdi->wb_lock); | 113 | spin_unlock_bh(&bdi->wb_lock); |
112 | } | 114 | } |
113 | 115 | ||
114 | static void | 116 | static void |
115 | __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, | 117 | __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, |
116 | bool range_cyclic) | 118 | bool range_cyclic) |
117 | { | 119 | { |
118 | struct wb_writeback_work *work; | 120 | struct wb_writeback_work *work; |
119 | 121 | ||
120 | /* | 122 | /* |
121 | * This is WB_SYNC_NONE writeback, so if allocation fails just | 123 | * This is WB_SYNC_NONE writeback, so if allocation fails just |
122 | * wakeup the thread for old dirty data writeback | 124 | * wakeup the thread for old dirty data writeback |
123 | */ | 125 | */ |
124 | work = kzalloc(sizeof(*work), GFP_ATOMIC); | 126 | work = kzalloc(sizeof(*work), GFP_ATOMIC); |
125 | if (!work) { | 127 | if (!work) { |
126 | if (bdi->wb.task) { | 128 | if (bdi->wb.task) { |
127 | trace_writeback_nowork(bdi); | 129 | trace_writeback_nowork(bdi); |
128 | wake_up_process(bdi->wb.task); | 130 | wake_up_process(bdi->wb.task); |
129 | } | 131 | } |
130 | return; | 132 | return; |
131 | } | 133 | } |
132 | 134 | ||
133 | work->sync_mode = WB_SYNC_NONE; | 135 | work->sync_mode = WB_SYNC_NONE; |
134 | work->nr_pages = nr_pages; | 136 | work->nr_pages = nr_pages; |
135 | work->range_cyclic = range_cyclic; | 137 | work->range_cyclic = range_cyclic; |
136 | 138 | ||
137 | bdi_queue_work(bdi, work); | 139 | bdi_queue_work(bdi, work); |
138 | } | 140 | } |
139 | 141 | ||
140 | /** | 142 | /** |
141 | * bdi_start_writeback - start writeback | 143 | * bdi_start_writeback - start writeback |
142 | * @bdi: the backing device to write from | 144 | * @bdi: the backing device to write from |
143 | * @nr_pages: the number of pages to write | 145 | * @nr_pages: the number of pages to write |
144 | * | 146 | * |
145 | * Description: | 147 | * Description: |
146 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only | 148 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only |
147 | * started when this function returns, we make no guarantees on | 149 | * started when this function returns, we make no guarantees on |
148 | * completion. Caller need not hold sb s_umount semaphore. | 150 | * completion. Caller need not hold sb s_umount semaphore. |
149 | * | 151 | * |
150 | */ | 152 | */ |
151 | void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) | 153 | void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) |
152 | { | 154 | { |
153 | __bdi_start_writeback(bdi, nr_pages, true); | 155 | __bdi_start_writeback(bdi, nr_pages, true); |
154 | } | 156 | } |
155 | 157 | ||
156 | /** | 158 | /** |
157 | * bdi_start_background_writeback - start background writeback | 159 | * bdi_start_background_writeback - start background writeback |
158 | * @bdi: the backing device to write from | 160 | * @bdi: the backing device to write from |
159 | * | 161 | * |
160 | * Description: | 162 | * Description: |
161 | * This makes sure WB_SYNC_NONE background writeback happens. When | 163 | * This makes sure WB_SYNC_NONE background writeback happens. When |
162 | * this function returns, it is only guaranteed that for given BDI | 164 | * this function returns, it is only guaranteed that for given BDI |
163 | * some IO is happening if we are over background dirty threshold. | 165 | * some IO is happening if we are over background dirty threshold. |
164 | * Caller need not hold sb s_umount semaphore. | 166 | * Caller need not hold sb s_umount semaphore. |
165 | */ | 167 | */ |
166 | void bdi_start_background_writeback(struct backing_dev_info *bdi) | 168 | void bdi_start_background_writeback(struct backing_dev_info *bdi) |
167 | { | 169 | { |
168 | /* | 170 | /* |
169 | * We just wake up the flusher thread. It will perform background | 171 | * We just wake up the flusher thread. It will perform background |
170 | * writeback as soon as there is no other work to do. | 172 | * writeback as soon as there is no other work to do. |
171 | */ | 173 | */ |
172 | trace_writeback_wake_background(bdi); | 174 | trace_writeback_wake_background(bdi); |
173 | spin_lock_bh(&bdi->wb_lock); | 175 | spin_lock_bh(&bdi->wb_lock); |
174 | bdi_wakeup_flusher(bdi); | 176 | bdi_wakeup_flusher(bdi); |
175 | spin_unlock_bh(&bdi->wb_lock); | 177 | spin_unlock_bh(&bdi->wb_lock); |
176 | } | 178 | } |
177 | 179 | ||
178 | /* | 180 | /* |
179 | * Remove the inode from the writeback list it is on. | 181 | * Remove the inode from the writeback list it is on. |
180 | */ | 182 | */ |
181 | void inode_wb_list_del(struct inode *inode) | 183 | void inode_wb_list_del(struct inode *inode) |
182 | { | 184 | { |
183 | spin_lock(&inode_wb_list_lock); | 185 | struct backing_dev_info *bdi = inode_to_bdi(inode); |
186 | |||
187 | spin_lock(&bdi->wb.list_lock); | ||
184 | list_del_init(&inode->i_wb_list); | 188 | list_del_init(&inode->i_wb_list); |
185 | spin_unlock(&inode_wb_list_lock); | 189 | spin_unlock(&bdi->wb.list_lock); |
186 | } | 190 | } |
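This hunk is part of the series-wide switch from the single global inode_wb_list_lock to a per-bdi wb.list_lock, so list manipulation for one backing device no longer contends with every other device's flusher. A minimal userspace sketch of the locking pattern, with pthreads standing in for spinlocks (all names here are illustrative, not from the tree):

    #include <pthread.h>

    /* Hypothetical model: one list lock per backing device, not one global. */
    struct wb_model {
        pthread_mutex_t list_lock;        /* models bdi->wb.list_lock */
        /* b_dirty / b_io / b_more_io list heads would live here */
    };

    static void wb_list_del_model(struct wb_model *wb)
    {
        pthread_mutex_lock(&wb->list_lock);   /* contends only within this bdi */
        /* list_del_init(&inode->i_wb_list) would happen here */
        pthread_mutex_unlock(&wb->list_lock);
    }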
187 | 191 | ||
188 | |||
189 | /* | 192 | /* |
190 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the | 193 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the |
191 | * furthest end of its superblock's dirty-inode list. | 194 | * furthest end of its superblock's dirty-inode list. |
192 | * | 195 | * |
193 | * Before stamping the inode's ->dirtied_when, we check to see whether it is | 196 | * Before stamping the inode's ->dirtied_when, we check to see whether it is |
194 | * already the most-recently-dirtied inode on the b_dirty list. If that is | 197 | * already the most-recently-dirtied inode on the b_dirty list. If that is |
195 | * the case then the inode must have been redirtied while it was being written | 198 | * the case then the inode must have been redirtied while it was being written |
196 | * out and we don't reset its dirtied_when. | 199 | * out and we don't reset its dirtied_when. |
197 | */ | 200 | */ |
198 | static void redirty_tail(struct inode *inode) | 201 | static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) |
199 | { | 202 | { |
200 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | 203 | assert_spin_locked(&wb->list_lock); |
201 | |||
202 | assert_spin_locked(&inode_wb_list_lock); | ||
203 | if (!list_empty(&wb->b_dirty)) { | 204 | if (!list_empty(&wb->b_dirty)) { |
204 | struct inode *tail; | 205 | struct inode *tail; |
205 | 206 | ||
206 | tail = wb_inode(wb->b_dirty.next); | 207 | tail = wb_inode(wb->b_dirty.next); |
207 | if (time_before(inode->dirtied_when, tail->dirtied_when)) | 208 | if (time_before(inode->dirtied_when, tail->dirtied_when)) |
208 | inode->dirtied_when = jiffies; | 209 | inode->dirtied_when = jiffies; |
209 | } | 210 | } |
210 | list_move(&inode->i_wb_list, &wb->b_dirty); | 211 | list_move(&inode->i_wb_list, &wb->b_dirty); |
211 | } | 212 | } |
212 | 213 | ||
213 | /* | 214 | /* |
214 | * requeue inode for re-scanning after bdi->b_io list is exhausted. | 215 | * requeue inode for re-scanning after bdi->b_io list is exhausted. |
215 | */ | 216 | */ |
216 | static void requeue_io(struct inode *inode) | 217 | static void requeue_io(struct inode *inode, struct bdi_writeback *wb) |
217 | { | 218 | { |
218 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | 219 | assert_spin_locked(&wb->list_lock); |
219 | |||
220 | assert_spin_locked(&inode_wb_list_lock); | ||
221 | list_move(&inode->i_wb_list, &wb->b_more_io); | 220 | list_move(&inode->i_wb_list, &wb->b_more_io); |
222 | } | 221 | } |
223 | 222 | ||
224 | static void inode_sync_complete(struct inode *inode) | 223 | static void inode_sync_complete(struct inode *inode) |
225 | { | 224 | { |
226 | /* | 225 | /* |
227 | * Prevent speculative execution through | 226 | * Prevent speculative execution through |
228 | * spin_unlock(&inode_wb_list_lock); | 227 | * spin_unlock(&wb->list_lock); |
229 | */ | 228 | */ |
230 | 229 | ||
231 | smp_mb(); | 230 | smp_mb(); |
232 | wake_up_bit(&inode->i_state, __I_SYNC); | 231 | wake_up_bit(&inode->i_state, __I_SYNC); |
233 | } | 232 | } |
234 | 233 | ||
235 | static bool inode_dirtied_after(struct inode *inode, unsigned long t) | 234 | static bool inode_dirtied_after(struct inode *inode, unsigned long t) |
236 | { | 235 | { |
237 | bool ret = time_after(inode->dirtied_when, t); | 236 | bool ret = time_after(inode->dirtied_when, t); |
238 | #ifndef CONFIG_64BIT | 237 | #ifndef CONFIG_64BIT |
239 | /* | 238 | /* |
240 | * For inodes being constantly redirtied, dirtied_when can get stuck. | 239 | * For inodes being constantly redirtied, dirtied_when can get stuck. |
241 | 	 * It _appears_ to be in the future, but is actually in the distant past. | 240 | 	 * It _appears_ to be in the future, but is actually in the distant past. |
242 | * This test is necessary to prevent such wrapped-around relative times | 241 | * This test is necessary to prevent such wrapped-around relative times |
243 | * from permanently stopping the whole bdi writeback. | 242 | * from permanently stopping the whole bdi writeback. |
244 | */ | 243 | */ |
245 | ret = ret && time_before_eq(inode->dirtied_when, jiffies); | 244 | ret = ret && time_before_eq(inode->dirtied_when, jiffies); |
246 | #endif | 245 | #endif |
247 | return ret; | 246 | return ret; |
248 | } | 247 | } |
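The #ifndef CONFIG_64BIT branch above guards against dirtied_when wrapping on 32-bit systems. A small, self-contained model of the jiffies comparison helpers shows why a wrapped timestamp looks like the future and why the extra time_before_eq() test rejects it (the macro bodies follow the signed-difference trick from include/linux/jiffies.h; the offset value is illustrative):

    #include <stdio.h>

    typedef unsigned long jif_t;
    #define time_after(a, b)      ((long)((b) - (a)) < 0)
    #define time_before_eq(a, b)  ((long)((a) - (b)) <= 0)

    int main(void)
    {
        jif_t jiffies = 1000;
        /* A constantly redirtied inode whose stamp is ~half a wrap away: */
        jif_t dirtied_when = jiffies + (1UL << 31);

        /* time_after() claims it was dirtied "in the future"... */
        printf("looks future: %d\n", time_after(dirtied_when, jiffies));
        /* ...but time_before_eq() fails, so ret is forced to false and the
         * inode cannot permanently stall bdi writeback. */
        printf("passes guard: %d\n", time_before_eq(dirtied_when, jiffies));
        return 0;
    }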
249 | 248 | ||
250 | /* | 249 | /* |
251 | * Move expired dirty inodes from @delaying_queue to @dispatch_queue. | 250 | * Move expired dirty inodes from @delaying_queue to @dispatch_queue. |
252 | */ | 251 | */ |
253 | static void move_expired_inodes(struct list_head *delaying_queue, | 252 | static int move_expired_inodes(struct list_head *delaying_queue, |
254 | struct list_head *dispatch_queue, | 253 | struct list_head *dispatch_queue, |
255 | unsigned long *older_than_this) | 254 | unsigned long *older_than_this) |
256 | { | 255 | { |
257 | LIST_HEAD(tmp); | 256 | LIST_HEAD(tmp); |
258 | struct list_head *pos, *node; | 257 | struct list_head *pos, *node; |
259 | struct super_block *sb = NULL; | 258 | struct super_block *sb = NULL; |
260 | struct inode *inode; | 259 | struct inode *inode; |
261 | int do_sb_sort = 0; | 260 | int do_sb_sort = 0; |
261 | int moved = 0; | ||
262 | 262 | ||
263 | while (!list_empty(delaying_queue)) { | 263 | while (!list_empty(delaying_queue)) { |
264 | inode = wb_inode(delaying_queue->prev); | 264 | inode = wb_inode(delaying_queue->prev); |
265 | if (older_than_this && | 265 | if (older_than_this && |
266 | inode_dirtied_after(inode, *older_than_this)) | 266 | inode_dirtied_after(inode, *older_than_this)) |
267 | break; | 267 | break; |
268 | if (sb && sb != inode->i_sb) | 268 | if (sb && sb != inode->i_sb) |
269 | do_sb_sort = 1; | 269 | do_sb_sort = 1; |
270 | sb = inode->i_sb; | 270 | sb = inode->i_sb; |
271 | list_move(&inode->i_wb_list, &tmp); | 271 | list_move(&inode->i_wb_list, &tmp); |
272 | moved++; | ||
272 | } | 273 | } |
273 | 274 | ||
274 | /* just one sb in list, splice to dispatch_queue and we're done */ | 275 | /* just one sb in list, splice to dispatch_queue and we're done */ |
275 | if (!do_sb_sort) { | 276 | if (!do_sb_sort) { |
276 | list_splice(&tmp, dispatch_queue); | 277 | list_splice(&tmp, dispatch_queue); |
277 | return; | 278 | goto out; |
278 | } | 279 | } |
279 | 280 | ||
280 | /* Move inodes from one superblock together */ | 281 | /* Move inodes from one superblock together */ |
281 | while (!list_empty(&tmp)) { | 282 | while (!list_empty(&tmp)) { |
282 | sb = wb_inode(tmp.prev)->i_sb; | 283 | sb = wb_inode(tmp.prev)->i_sb; |
283 | list_for_each_prev_safe(pos, node, &tmp) { | 284 | list_for_each_prev_safe(pos, node, &tmp) { |
284 | inode = wb_inode(pos); | 285 | inode = wb_inode(pos); |
285 | if (inode->i_sb == sb) | 286 | if (inode->i_sb == sb) |
286 | list_move(&inode->i_wb_list, dispatch_queue); | 287 | list_move(&inode->i_wb_list, dispatch_queue); |
287 | } | 288 | } |
288 | } | 289 | } |
290 | out: | ||
291 | return moved; | ||
289 | } | 292 | } |
290 | 293 | ||
291 | /* | 294 | /* |
292 | * Queue all expired dirty inodes for io, eldest first. | 295 | * Queue all expired dirty inodes for io, eldest first. |
293 | * Before | 296 | * Before |
294 | * newly dirtied b_dirty b_io b_more_io | 297 | * newly dirtied b_dirty b_io b_more_io |
295 | * =============> gf edc BA | 298 | * =============> gf edc BA |
296 | * After | 299 | * After |
297 | * newly dirtied b_dirty b_io b_more_io | 300 | * newly dirtied b_dirty b_io b_more_io |
298 | * =============> g fBAedc | 301 | * =============> g fBAedc |
299 | * | | 302 | * | |
300 | * +--> dequeue for IO | 303 | * +--> dequeue for IO |
301 | */ | 304 | */ |
302 | static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) | 305 | static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) |
303 | { | 306 | { |
304 | assert_spin_locked(&inode_wb_list_lock); | 307 | int moved; |
308 | assert_spin_locked(&wb->list_lock); | ||
305 | list_splice_init(&wb->b_more_io, &wb->b_io); | 309 | list_splice_init(&wb->b_more_io, &wb->b_io); |
306 | move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); | 310 | moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); |
311 | trace_writeback_queue_io(wb, older_than_this, moved); | ||
307 | } | 312 | } |
308 | 313 | ||
309 | static int write_inode(struct inode *inode, struct writeback_control *wbc) | 314 | static int write_inode(struct inode *inode, struct writeback_control *wbc) |
310 | { | 315 | { |
311 | if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) | 316 | if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) |
312 | return inode->i_sb->s_op->write_inode(inode, wbc); | 317 | return inode->i_sb->s_op->write_inode(inode, wbc); |
313 | return 0; | 318 | return 0; |
314 | } | 319 | } |
315 | 320 | ||
316 | /* | 321 | /* |
317 | * Wait for writeback on an inode to complete. | 322 | * Wait for writeback on an inode to complete. |
318 | */ | 323 | */ |
319 | static void inode_wait_for_writeback(struct inode *inode) | 324 | static void inode_wait_for_writeback(struct inode *inode, |
325 | struct bdi_writeback *wb) | ||
320 | { | 326 | { |
321 | DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); | 327 | DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); |
322 | wait_queue_head_t *wqh; | 328 | wait_queue_head_t *wqh; |
323 | 329 | ||
324 | wqh = bit_waitqueue(&inode->i_state, __I_SYNC); | 330 | wqh = bit_waitqueue(&inode->i_state, __I_SYNC); |
325 | while (inode->i_state & I_SYNC) { | 331 | while (inode->i_state & I_SYNC) { |
326 | spin_unlock(&inode->i_lock); | 332 | spin_unlock(&inode->i_lock); |
327 | spin_unlock(&inode_wb_list_lock); | 333 | spin_unlock(&wb->list_lock); |
328 | __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); | 334 | __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); |
329 | spin_lock(&inode_wb_list_lock); | 335 | spin_lock(&wb->list_lock); |
330 | spin_lock(&inode->i_lock); | 336 | spin_lock(&inode->i_lock); |
331 | } | 337 | } |
332 | } | 338 | } |
333 | 339 | ||
334 | /* | 340 | /* |
335 | * Write out an inode's dirty pages. Called under inode_wb_list_lock and | 341 | * Write out an inode's dirty pages. Called under wb->list_lock and |
336 | * inode->i_lock. Either the caller has an active reference on the inode or | 342 | * inode->i_lock. Either the caller has an active reference on the inode or |
337 | * the inode has I_WILL_FREE set. | 343 | * the inode has I_WILL_FREE set. |
338 | * | 344 | * |
339 | * If `wait' is set, wait on the writeout. | 345 | * If `wait' is set, wait on the writeout. |
340 | * | 346 | * |
341 | * The whole writeout design is quite complex and fragile. We want to avoid | 347 | * The whole writeout design is quite complex and fragile. We want to avoid |
342 | * starvation of particular inodes when others are being redirtied, prevent | 348 | * starvation of particular inodes when others are being redirtied, prevent |
343 | * livelocks, etc. | 349 | * livelocks, etc. |
344 | */ | 350 | */ |
345 | static int | 351 | static int |
346 | writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | 352 | writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, |
353 | struct writeback_control *wbc) | ||
347 | { | 354 | { |
348 | struct address_space *mapping = inode->i_mapping; | 355 | struct address_space *mapping = inode->i_mapping; |
356 | long nr_to_write = wbc->nr_to_write; | ||
349 | unsigned dirty; | 357 | unsigned dirty; |
350 | int ret; | 358 | int ret; |
351 | 359 | ||
352 | assert_spin_locked(&inode_wb_list_lock); | 360 | assert_spin_locked(&wb->list_lock); |
353 | assert_spin_locked(&inode->i_lock); | 361 | assert_spin_locked(&inode->i_lock); |
354 | 362 | ||
355 | if (!atomic_read(&inode->i_count)) | 363 | if (!atomic_read(&inode->i_count)) |
356 | WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); | 364 | WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); |
357 | else | 365 | else |
358 | WARN_ON(inode->i_state & I_WILL_FREE); | 366 | WARN_ON(inode->i_state & I_WILL_FREE); |
359 | 367 | ||
360 | if (inode->i_state & I_SYNC) { | 368 | if (inode->i_state & I_SYNC) { |
361 | /* | 369 | /* |
362 | * If this inode is locked for writeback and we are not doing | 370 | * If this inode is locked for writeback and we are not doing |
363 | * writeback-for-data-integrity, move it to b_more_io so that | 371 | * writeback-for-data-integrity, move it to b_more_io so that |
364 | * writeback can proceed with the other inodes on s_io. | 372 | * writeback can proceed with the other inodes on s_io. |
365 | * | 373 | * |
366 | * We'll have another go at writing back this inode when we | 374 | * We'll have another go at writing back this inode when we |
367 | 		 * have completed a full scan of b_io. | 375 | 		 * have completed a full scan of b_io. |
368 | */ | 376 | */ |
369 | if (wbc->sync_mode != WB_SYNC_ALL) { | 377 | if (wbc->sync_mode != WB_SYNC_ALL) { |
370 | requeue_io(inode); | 378 | requeue_io(inode, wb); |
379 | trace_writeback_single_inode_requeue(inode, wbc, | ||
380 | nr_to_write); | ||
371 | return 0; | 381 | return 0; |
372 | } | 382 | } |
373 | 383 | ||
374 | /* | 384 | /* |
375 | * It's a data-integrity sync. We must wait. | 385 | * It's a data-integrity sync. We must wait. |
376 | */ | 386 | */ |
377 | inode_wait_for_writeback(inode); | 387 | inode_wait_for_writeback(inode, wb); |
378 | } | 388 | } |
379 | 389 | ||
380 | BUG_ON(inode->i_state & I_SYNC); | 390 | BUG_ON(inode->i_state & I_SYNC); |
381 | 391 | ||
382 | /* Set I_SYNC, reset I_DIRTY_PAGES */ | 392 | /* Set I_SYNC, reset I_DIRTY_PAGES */ |
383 | inode->i_state |= I_SYNC; | 393 | inode->i_state |= I_SYNC; |
384 | inode->i_state &= ~I_DIRTY_PAGES; | 394 | inode->i_state &= ~I_DIRTY_PAGES; |
385 | spin_unlock(&inode->i_lock); | 395 | spin_unlock(&inode->i_lock); |
386 | spin_unlock(&inode_wb_list_lock); | 396 | spin_unlock(&wb->list_lock); |
387 | 397 | ||
388 | ret = do_writepages(mapping, wbc); | 398 | ret = do_writepages(mapping, wbc); |
389 | 399 | ||
390 | /* | 400 | /* |
391 | * Make sure to wait on the data before writing out the metadata. | 401 | * Make sure to wait on the data before writing out the metadata. |
392 | * This is important for filesystems that modify metadata on data | 402 | * This is important for filesystems that modify metadata on data |
393 | * I/O completion. | 403 | * I/O completion. |
394 | */ | 404 | */ |
395 | if (wbc->sync_mode == WB_SYNC_ALL) { | 405 | if (wbc->sync_mode == WB_SYNC_ALL) { |
396 | int err = filemap_fdatawait(mapping); | 406 | int err = filemap_fdatawait(mapping); |
397 | if (ret == 0) | 407 | if (ret == 0) |
398 | ret = err; | 408 | ret = err; |
399 | } | 409 | } |
400 | 410 | ||
401 | /* | 411 | /* |
402 | * Some filesystems may redirty the inode during the writeback | 412 | * Some filesystems may redirty the inode during the writeback |
403 | * due to delalloc, clear dirty metadata flags right before | 413 | * due to delalloc, clear dirty metadata flags right before |
404 | * write_inode() | 414 | * write_inode() |
405 | */ | 415 | */ |
406 | spin_lock(&inode->i_lock); | 416 | spin_lock(&inode->i_lock); |
407 | dirty = inode->i_state & I_DIRTY; | 417 | dirty = inode->i_state & I_DIRTY; |
408 | inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); | 418 | inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); |
409 | spin_unlock(&inode->i_lock); | 419 | spin_unlock(&inode->i_lock); |
410 | /* Don't write the inode if only I_DIRTY_PAGES was set */ | 420 | /* Don't write the inode if only I_DIRTY_PAGES was set */ |
411 | if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { | 421 | if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { |
412 | int err = write_inode(inode, wbc); | 422 | int err = write_inode(inode, wbc); |
413 | if (ret == 0) | 423 | if (ret == 0) |
414 | ret = err; | 424 | ret = err; |
415 | } | 425 | } |
416 | 426 | ||
417 | spin_lock(&inode_wb_list_lock); | 427 | spin_lock(&wb->list_lock); |
418 | spin_lock(&inode->i_lock); | 428 | spin_lock(&inode->i_lock); |
419 | inode->i_state &= ~I_SYNC; | 429 | inode->i_state &= ~I_SYNC; |
420 | if (!(inode->i_state & I_FREEING)) { | 430 | if (!(inode->i_state & I_FREEING)) { |
431 | /* | ||
432 | * Sync livelock prevention. Each inode is tagged and synced in | ||
433 | * one shot. If still dirty, it will be redirty_tail()'ed below. | ||
434 | 		 * Update the dirty time to prevent it from being enqueued and synced again. | ||
435 | */ | ||
436 | if ((inode->i_state & I_DIRTY) && | ||
437 | (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) | ||
438 | inode->dirtied_when = jiffies; | ||
439 | |||
421 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | 440 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { |
422 | /* | 441 | /* |
423 | * We didn't write back all the pages. nfs_writepages() | 442 | * We didn't write back all the pages. nfs_writepages() |
424 | 			 * sometimes bails out without doing anything. | 443 | 			 * sometimes bails out without doing anything. |
425 | */ | 444 | */ |
426 | inode->i_state |= I_DIRTY_PAGES; | 445 | inode->i_state |= I_DIRTY_PAGES; |
427 | if (wbc->nr_to_write <= 0) { | 446 | if (wbc->nr_to_write <= 0) { |
428 | /* | 447 | /* |
429 | * slice used up: queue for next turn | 448 | * slice used up: queue for next turn |
430 | */ | 449 | */ |
431 | requeue_io(inode); | 450 | requeue_io(inode, wb); |
432 | } else { | 451 | } else { |
433 | /* | 452 | /* |
434 | * Writeback blocked by something other than | 453 | * Writeback blocked by something other than |
435 | * congestion. Delay the inode for some time to | 454 | * congestion. Delay the inode for some time to |
436 | * avoid spinning on the CPU (100% iowait) | 455 | * avoid spinning on the CPU (100% iowait) |
437 | * retrying writeback of the dirty page/inode | 456 | * retrying writeback of the dirty page/inode |
438 | * that cannot be performed immediately. | 457 | * that cannot be performed immediately. |
439 | */ | 458 | */ |
440 | redirty_tail(inode); | 459 | redirty_tail(inode, wb); |
441 | } | 460 | } |
442 | } else if (inode->i_state & I_DIRTY) { | 461 | } else if (inode->i_state & I_DIRTY) { |
443 | /* | 462 | /* |
444 | * Filesystems can dirty the inode during writeback | 463 | * Filesystems can dirty the inode during writeback |
445 | * operations, such as delayed allocation during | 464 | * operations, such as delayed allocation during |
446 | * submission or metadata updates after data IO | 465 | * submission or metadata updates after data IO |
447 | * completion. | 466 | * completion. |
448 | */ | 467 | */ |
449 | redirty_tail(inode); | 468 | redirty_tail(inode, wb); |
450 | } else { | 469 | } else { |
451 | /* | 470 | /* |
452 | * The inode is clean. At this point we either have | 471 | * The inode is clean. At this point we either have |
453 | 			 * a reference to the inode or it's on its way out. | 472 | 			 * a reference to the inode or it's on its way out. |
454 | * No need to add it back to the LRU. | 473 | * No need to add it back to the LRU. |
455 | */ | 474 | */ |
456 | list_del_init(&inode->i_wb_list); | 475 | list_del_init(&inode->i_wb_list); |
457 | } | 476 | } |
458 | } | 477 | } |
459 | inode_sync_complete(inode); | 478 | inode_sync_complete(inode); |
479 | trace_writeback_single_inode(inode, wbc, nr_to_write); | ||
460 | return ret; | 480 | return ret; |
461 | } | 481 | } |
462 | 482 | ||
483 | static long writeback_chunk_size(struct backing_dev_info *bdi, | ||
484 | struct wb_writeback_work *work) | ||
485 | { | ||
486 | long pages; | ||
487 | |||
488 | /* | ||
489 | * WB_SYNC_ALL mode does livelock avoidance by syncing dirty | ||
490 | * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX | ||
491 | * here avoids calling into writeback_inodes_wb() more than once. | ||
492 | * | ||
493 | * The intended call sequence for WB_SYNC_ALL writeback is: | ||
494 | * | ||
495 | * wb_writeback() | ||
496 | * writeback_sb_inodes() <== called only once | ||
497 | * write_cache_pages() <== called once for each inode | ||
498 | * (quickly) tag currently dirty pages | ||
499 | * (maybe slowly) sync all tagged pages | ||
500 | */ | ||
501 | if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) | ||
502 | pages = LONG_MAX; | ||
503 | else { | ||
504 | pages = min(bdi->avg_write_bandwidth / 2, | ||
505 | global_dirty_limit / DIRTY_SCOPE); | ||
506 | pages = min(pages, work->nr_pages); | ||
507 | pages = round_down(pages + MIN_WRITEBACK_PAGES, | ||
508 | MIN_WRITEBACK_PAGES); | ||
509 | } | ||
510 | |||
511 | return pages; | ||
512 | } | ||
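For WB_SYNC_NONE work, the new helper above caps the per-inode slice at half the bdi's estimated write bandwidth and a fraction of the smoothed global dirty limit, then rounds the result. A standalone sketch of that arithmetic; the MIN_WRITEBACK_PAGES and DIRTY_SCOPE values below are assumed purely for illustration, with bandwidth expressed in pages as the kernel tracks it:

    #include <stdio.h>

    #define MIN_WRITEBACK_PAGES 1024L   /* assumed: 4 MB with 4 KB pages */
    #define DIRTY_SCOPE         8L      /* assumed divisor of the global limit */
    /* power-of-two round_down, as in the kernel helper */
    #define round_down(x, y)    ((x) & ~((y) - 1))

    static long chunk_model(long avg_write_bandwidth, long global_dirty_limit,
                            long work_nr_pages)
    {
        long pages = avg_write_bandwidth / 2;

        if (pages > global_dirty_limit / DIRTY_SCOPE)
            pages = global_dirty_limit / DIRTY_SCOPE;
        if (pages > work_nr_pages)
            pages = work_nr_pages;
        /* +MIN then round down: a multiple of MIN, never less than MIN */
        return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES);
    }

    int main(void)
    {
        /* e.g. ~100 MB/s of bandwidth, 64k-page dirty limit, 4000-page work */
        printf("%ld\n", chunk_model(25600, 65536, 4000));   /* prints 4096 */
        return 0;
    }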
513 | |||
463 | /* | 514 | /* |
464 | * Write a portion of b_io inodes which belong to @sb. | 515 | * Write a portion of b_io inodes which belong to @sb. |
465 | * | 516 | * |
466 | * If @only_this_sb is true, then find and write all such | 517 | * If @only_this_sb is true, then find and write all such |
467 | * inodes. Otherwise write only ones which go sequentially | 518 | * inodes. Otherwise write only ones which go sequentially |
468 | * in reverse order. | 519 | * in reverse order. |
469 | * | 520 | * |
470 | * Return 1, if the caller writeback routine should be | 521 | * Return the number of pages and/or inodes written. |
471 | * interrupted. Otherwise return 0. | ||
472 | */ | 522 | */ |
473 | static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, | 523 | static long writeback_sb_inodes(struct super_block *sb, |
474 | struct writeback_control *wbc, bool only_this_sb) | 524 | struct bdi_writeback *wb, |
525 | struct wb_writeback_work *work) | ||
475 | { | 526 | { |
527 | struct writeback_control wbc = { | ||
528 | .sync_mode = work->sync_mode, | ||
529 | .tagged_writepages = work->tagged_writepages, | ||
530 | .for_kupdate = work->for_kupdate, | ||
531 | .for_background = work->for_background, | ||
532 | .range_cyclic = work->range_cyclic, | ||
533 | .range_start = 0, | ||
534 | .range_end = LLONG_MAX, | ||
535 | }; | ||
536 | unsigned long start_time = jiffies; | ||
537 | long write_chunk; | ||
538 | long wrote = 0; /* count both pages and inodes */ | ||
539 | |||
476 | while (!list_empty(&wb->b_io)) { | 540 | while (!list_empty(&wb->b_io)) { |
477 | long pages_skipped; | ||
478 | struct inode *inode = wb_inode(wb->b_io.prev); | 541 | struct inode *inode = wb_inode(wb->b_io.prev); |
479 | 542 | ||
480 | if (inode->i_sb != sb) { | 543 | if (inode->i_sb != sb) { |
481 | if (only_this_sb) { | 544 | if (work->sb) { |
482 | /* | 545 | /* |
483 | * We only want to write back data for this | 546 | * We only want to write back data for this |
484 | * superblock, move all inodes not belonging | 547 | * superblock, move all inodes not belonging |
485 | * to it back onto the dirty list. | 548 | * to it back onto the dirty list. |
486 | */ | 549 | */ |
487 | redirty_tail(inode); | 550 | redirty_tail(inode, wb); |
488 | continue; | 551 | continue; |
489 | } | 552 | } |
490 | 553 | ||
491 | /* | 554 | /* |
492 | * The inode belongs to a different superblock. | 555 | * The inode belongs to a different superblock. |
493 | * Bounce back to the caller to unpin this and | 556 | * Bounce back to the caller to unpin this and |
494 | * pin the next superblock. | 557 | * pin the next superblock. |
495 | */ | 558 | */ |
496 | return 0; | 559 | break; |
497 | } | 560 | } |
498 | 561 | ||
499 | /* | 562 | /* |
500 | 		 * Don't bother with new inodes or inodes being freed; the first | 563 | 		 * Don't bother with new inodes or inodes being freed; the first |
501 | 		 * kind does not need periodic writeout yet, and for the latter | 564 | 		 * kind does not need periodic writeout yet, and for the latter |
502 | * kind writeout is handled by the freer. | 565 | * kind writeout is handled by the freer. |
503 | */ | 566 | */ |
504 | spin_lock(&inode->i_lock); | 567 | spin_lock(&inode->i_lock); |
505 | if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { | 568 | if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { |
506 | spin_unlock(&inode->i_lock); | 569 | spin_unlock(&inode->i_lock); |
507 | requeue_io(inode); | 570 | redirty_tail(inode, wb); |
508 | continue; | 571 | continue; |
509 | } | 572 | } |
510 | |||
511 | /* | ||
512 | * Was this inode dirtied after sync_sb_inodes was called? | ||
513 | * This keeps sync from extra jobs and livelock. | ||
514 | */ | ||
515 | if (inode_dirtied_after(inode, wbc->wb_start)) { | ||
516 | spin_unlock(&inode->i_lock); | ||
517 | return 1; | ||
518 | } | ||
519 | |||
520 | __iget(inode); | 573 | __iget(inode); |
574 | write_chunk = writeback_chunk_size(wb->bdi, work); | ||
575 | wbc.nr_to_write = write_chunk; | ||
576 | wbc.pages_skipped = 0; | ||
521 | 577 | ||
522 | pages_skipped = wbc->pages_skipped; | 578 | writeback_single_inode(inode, wb, &wbc); |
523 | writeback_single_inode(inode, wbc); | 579 | |
524 | if (wbc->pages_skipped != pages_skipped) { | 580 | work->nr_pages -= write_chunk - wbc.nr_to_write; |
581 | wrote += write_chunk - wbc.nr_to_write; | ||
582 | if (!(inode->i_state & I_DIRTY)) | ||
583 | wrote++; | ||
584 | if (wbc.pages_skipped) { | ||
525 | /* | 585 | /* |
526 | * writeback is not making progress due to locked | 586 | * writeback is not making progress due to locked |
527 | * buffers. Skip this inode for now. | 587 | * buffers. Skip this inode for now. |
528 | */ | 588 | */ |
529 | redirty_tail(inode); | 589 | redirty_tail(inode, wb); |
530 | } | 590 | } |
531 | spin_unlock(&inode->i_lock); | 591 | spin_unlock(&inode->i_lock); |
532 | spin_unlock(&inode_wb_list_lock); | 592 | spin_unlock(&wb->list_lock); |
533 | iput(inode); | 593 | iput(inode); |
534 | cond_resched(); | 594 | cond_resched(); |
535 | spin_lock(&inode_wb_list_lock); | 595 | spin_lock(&wb->list_lock); |
536 | if (wbc->nr_to_write <= 0) { | 596 | /* |
537 | wbc->more_io = 1; | 597 | * bail out to wb_writeback() often enough to check |
538 | return 1; | 598 | * background threshold and other termination conditions. |
599 | */ | ||
600 | if (wrote) { | ||
601 | if (time_is_before_jiffies(start_time + HZ / 10UL)) | ||
602 | break; | ||
603 | if (work->nr_pages <= 0) | ||
604 | break; | ||
539 | } | 605 | } |
540 | if (!list_empty(&wb->b_more_io)) | ||
541 | wbc->more_io = 1; | ||
542 | } | 606 | } |
543 | /* b_io is empty */ | 607 | return wrote; |
544 | return 1; | ||
545 | } | 608 | } |
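With more_io gone, this loop now signals progress through its return value and bails back to wb_writeback() on its own schedule. A compact runnable model of the two exit tests at the bottom of the loop (HZ is assumed for the example; time_is_before_jiffies(t) in the tree means "jiffies is already past t"):

    #include <stdbool.h>
    #include <stdio.h>

    #define HZ 100UL   /* assumed tick rate for the example */

    /* Only once some progress was made do we check the ~100 ms budget and
     * the remaining page quota; a pass that wrote nothing keeps scanning. */
    static bool should_bail(unsigned long jiffies, unsigned long start_time,
                            long wrote, long nr_pages_left)
    {
        if (!wrote)
            return false;
        if ((long)(jiffies - (start_time + HZ / 10)) > 0)  /* past 100 ms */
            return true;
        return nr_pages_left <= 0;                         /* quota consumed */
    }

    int main(void)
    {
        printf("%d\n", should_bail(1011, 1000, 128, 512)); /* 1: time is up */
        return 0;
    }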
546 | 609 | ||
547 | void writeback_inodes_wb(struct bdi_writeback *wb, | 610 | static long __writeback_inodes_wb(struct bdi_writeback *wb, |
548 | struct writeback_control *wbc) | 611 | struct wb_writeback_work *work) |
549 | { | 612 | { |
550 | int ret = 0; | 613 | unsigned long start_time = jiffies; |
614 | long wrote = 0; | ||
551 | 615 | ||
552 | if (!wbc->wb_start) | ||
553 | wbc->wb_start = jiffies; /* livelock avoidance */ | ||
554 | spin_lock(&inode_wb_list_lock); | ||
555 | if (!wbc->for_kupdate || list_empty(&wb->b_io)) | ||
556 | queue_io(wb, wbc->older_than_this); | ||
557 | |||
558 | while (!list_empty(&wb->b_io)) { | 616 | while (!list_empty(&wb->b_io)) { |
559 | struct inode *inode = wb_inode(wb->b_io.prev); | 617 | struct inode *inode = wb_inode(wb->b_io.prev); |
560 | struct super_block *sb = inode->i_sb; | 618 | struct super_block *sb = inode->i_sb; |
561 | 619 | ||
562 | if (!grab_super_passive(sb)) { | 620 | if (!grab_super_passive(sb)) { |
563 | requeue_io(inode); | 621 | requeue_io(inode, wb); |
564 | continue; | 622 | continue; |
565 | } | 623 | } |
566 | ret = writeback_sb_inodes(sb, wb, wbc, false); | 624 | wrote += writeback_sb_inodes(sb, wb, work); |
567 | drop_super(sb); | 625 | drop_super(sb); |
568 | 626 | ||
569 | if (ret) | 627 | /* refer to the same tests at the end of writeback_sb_inodes */ |
570 | break; | 628 | if (wrote) { |
629 | if (time_is_before_jiffies(start_time + HZ / 10UL)) | ||
630 | break; | ||
631 | if (work->nr_pages <= 0) | ||
632 | break; | ||
633 | } | ||
571 | } | 634 | } |
572 | spin_unlock(&inode_wb_list_lock); | ||
573 | /* Leave any unwritten inodes on b_io */ | 635 | /* Leave any unwritten inodes on b_io */ |
636 | return wrote; | ||
574 | } | 637 | } |
575 | 638 | ||
576 | static void __writeback_inodes_sb(struct super_block *sb, | 639 | long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) |
577 | struct bdi_writeback *wb, struct writeback_control *wbc) | ||
578 | { | 640 | { |
579 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 641 | struct wb_writeback_work work = { |
642 | .nr_pages = nr_pages, | ||
643 | .sync_mode = WB_SYNC_NONE, | ||
644 | .range_cyclic = 1, | ||
645 | }; | ||
580 | 646 | ||
581 | spin_lock(&inode_wb_list_lock); | 647 | spin_lock(&wb->list_lock); |
582 | if (!wbc->for_kupdate || list_empty(&wb->b_io)) | 648 | if (list_empty(&wb->b_io)) |
583 | queue_io(wb, wbc->older_than_this); | 649 | queue_io(wb, NULL); |
584 | writeback_sb_inodes(sb, wb, wbc, true); | 650 | __writeback_inodes_wb(wb, &work); |
585 | spin_unlock(&inode_wb_list_lock); | 651 | spin_unlock(&wb->list_lock); |
652 | |||
653 | return nr_pages - work.nr_pages; | ||
586 | } | 654 | } |
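writeback_inodes_wb() is now a thin entry point that builds a WB_SYNC_NONE work item itself and reports pages written via the nr_pages delta. An illustrative caller fragment (kernel context assumed, so this is not compilable standalone; the function name and the 1024-page budget are made up):

    /* Hypothetical use: try to clean up to 1024 pages on this bdi and
     * learn how many actually went out. */
    static long flush_some_pages(struct backing_dev_info *bdi)
    {
        return writeback_inodes_wb(&bdi->wb, 1024);
    }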
587 | 655 | ||
588 | /* | ||
589 | * The maximum number of pages to writeout in a single bdi flush/kupdate | ||
590 | * operation. We do this so we don't hold I_SYNC against an inode for | ||
591 | * enormous amounts of time, which would block a userspace task which has | ||
592 | * been forced to throttle against that inode. Also, the code reevaluates | ||
593 | * the dirty each time it has written this many pages. | ||
594 | */ | ||
595 | #define MAX_WRITEBACK_PAGES 1024 | ||
596 | |||
597 | static inline bool over_bground_thresh(void) | 656 | static inline bool over_bground_thresh(void) |
598 | { | 657 | { |
599 | unsigned long background_thresh, dirty_thresh; | 658 | unsigned long background_thresh, dirty_thresh; |
600 | 659 | ||
601 | global_dirty_limits(&background_thresh, &dirty_thresh); | 660 | global_dirty_limits(&background_thresh, &dirty_thresh); |
602 | 661 | ||
603 | return (global_page_state(NR_FILE_DIRTY) + | 662 | return (global_page_state(NR_FILE_DIRTY) + |
604 | global_page_state(NR_UNSTABLE_NFS) > background_thresh); | 663 | global_page_state(NR_UNSTABLE_NFS) > background_thresh); |
605 | } | 664 | } |
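over_bground_thresh() is the loop's steady-state brake: background writeback keeps running only while dirty plus unstable-NFS pages exceed the background threshold. A runnable model with made-up page counts:

    #include <stdbool.h>
    #include <stdio.h>

    static bool over_bground_model(unsigned long nr_file_dirty,
                                   unsigned long nr_unstable_nfs,
                                   unsigned long background_thresh)
    {
        return nr_file_dirty + nr_unstable_nfs > background_thresh;
    }

    int main(void)
    {
        /* 30000 dirty + 500 unstable > 25000 threshold: keep flushing */
        printf("%d\n", over_bground_model(30000, 500, 25000));
        return 0;
    }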
606 | 665 | ||
607 | /* | 666 | /* |
667 | * Called under wb->list_lock. If there are multiple wb per bdi, | ||
668 | * only the flusher working on the first wb should do it. | ||
669 | */ | ||
670 | static void wb_update_bandwidth(struct bdi_writeback *wb, | ||
671 | unsigned long start_time) | ||
672 | { | ||
673 | __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); | ||
674 | } | ||
675 | |||
676 | /* | ||
608 | * Explicit flushing or periodic writeback of "old" data. | 677 | * Explicit flushing or periodic writeback of "old" data. |
609 | * | 678 | * |
610 | * Define "old": the first time one of an inode's pages is dirtied, we mark the | 679 | * Define "old": the first time one of an inode's pages is dirtied, we mark the |
611 | * dirtying-time in the inode's address_space. So this periodic writeback code | 680 | * dirtying-time in the inode's address_space. So this periodic writeback code |
612 | * just walks the superblock inode list, writing back any inodes which are | 681 | * just walks the superblock inode list, writing back any inodes which are |
613 | * older than a specific point in time. | 682 | * older than a specific point in time. |
614 | * | 683 | * |
615 | * Try to run once per dirty_writeback_interval. But if a writeback event | 684 | * Try to run once per dirty_writeback_interval. But if a writeback event |
616 | * takes longer than a dirty_writeback_interval interval, then leave a | 685 | * takes longer than a dirty_writeback_interval interval, then leave a |
617 | * one-second gap. | 686 | * one-second gap. |
618 | * | 687 | * |
619 | * older_than_this takes precedence over nr_to_write. So we'll only write back | 688 | * older_than_this takes precedence over nr_to_write. So we'll only write back |
620 | * all dirty pages if they are all attached to "old" mappings. | 689 | * all dirty pages if they are all attached to "old" mappings. |
621 | */ | 690 | */ |
622 | static long wb_writeback(struct bdi_writeback *wb, | 691 | static long wb_writeback(struct bdi_writeback *wb, |
623 | struct wb_writeback_work *work) | 692 | struct wb_writeback_work *work) |
624 | { | 693 | { |
625 | struct writeback_control wbc = { | 694 | unsigned long wb_start = jiffies; |
626 | .sync_mode = work->sync_mode, | 695 | long nr_pages = work->nr_pages; |
627 | .older_than_this = NULL, | ||
628 | .for_kupdate = work->for_kupdate, | ||
629 | .for_background = work->for_background, | ||
630 | .range_cyclic = work->range_cyclic, | ||
631 | }; | ||
632 | unsigned long oldest_jif; | 696 | unsigned long oldest_jif; |
633 | long wrote = 0; | ||
634 | long write_chunk; | ||
635 | struct inode *inode; | 697 | struct inode *inode; |
698 | long progress; | ||
636 | 699 | ||
637 | if (wbc.for_kupdate) { | 700 | oldest_jif = jiffies; |
638 | wbc.older_than_this = &oldest_jif; | 701 | work->older_than_this = &oldest_jif; |
639 | oldest_jif = jiffies - | ||
640 | msecs_to_jiffies(dirty_expire_interval * 10); | ||
641 | } | ||
642 | if (!wbc.range_cyclic) { | ||
643 | wbc.range_start = 0; | ||
644 | wbc.range_end = LLONG_MAX; | ||
645 | } | ||
646 | 702 | ||
647 | /* | 703 | spin_lock(&wb->list_lock); |
648 | * WB_SYNC_ALL mode does livelock avoidance by syncing dirty | ||
649 | * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX | ||
650 | * here avoids calling into writeback_inodes_wb() more than once. | ||
651 | * | ||
652 | * The intended call sequence for WB_SYNC_ALL writeback is: | ||
653 | * | ||
654 | * wb_writeback() | ||
655 | * __writeback_inodes_sb() <== called only once | ||
656 | * write_cache_pages() <== called once for each inode | ||
657 | * (quickly) tag currently dirty pages | ||
658 | * (maybe slowly) sync all tagged pages | ||
659 | */ | ||
660 | if (wbc.sync_mode == WB_SYNC_NONE) | ||
661 | write_chunk = MAX_WRITEBACK_PAGES; | ||
662 | else | ||
663 | write_chunk = LONG_MAX; | ||
664 | |||
665 | wbc.wb_start = jiffies; /* livelock avoidance */ | ||
666 | for (;;) { | 704 | for (;;) { |
667 | /* | 705 | /* |
668 | * Stop writeback when nr_pages has been consumed | 706 | * Stop writeback when nr_pages has been consumed |
669 | */ | 707 | */ |
670 | if (work->nr_pages <= 0) | 708 | if (work->nr_pages <= 0) |
671 | break; | 709 | break; |
672 | 710 | ||
673 | /* | 711 | /* |
674 | * Background writeout and kupdate-style writeback may | 712 | * Background writeout and kupdate-style writeback may |
675 | * run forever. Stop them if there is other work to do | 713 | * run forever. Stop them if there is other work to do |
676 | * so that e.g. sync can proceed. They'll be restarted | 714 | * so that e.g. sync can proceed. They'll be restarted |
677 | * after the other works are all done. | 715 | * after the other works are all done. |
678 | */ | 716 | */ |
679 | if ((work->for_background || work->for_kupdate) && | 717 | if ((work->for_background || work->for_kupdate) && |
680 | !list_empty(&wb->bdi->work_list)) | 718 | !list_empty(&wb->bdi->work_list)) |
681 | break; | 719 | break; |
682 | 720 | ||
683 | /* | 721 | /* |
684 | * For background writeout, stop when we are below the | 722 | * For background writeout, stop when we are below the |
685 | * background dirty threshold | 723 | * background dirty threshold |
686 | */ | 724 | */ |
687 | if (work->for_background && !over_bground_thresh()) | 725 | if (work->for_background && !over_bground_thresh()) |
688 | break; | 726 | break; |
689 | 727 | ||
690 | wbc.more_io = 0; | 728 | if (work->for_kupdate) { |
691 | wbc.nr_to_write = write_chunk; | 729 | oldest_jif = jiffies - |
692 | wbc.pages_skipped = 0; | 730 | msecs_to_jiffies(dirty_expire_interval * 10); |
731 | work->older_than_this = &oldest_jif; | ||
732 | } | ||
693 | 733 | ||
694 | trace_wbc_writeback_start(&wbc, wb->bdi); | 734 | trace_writeback_start(wb->bdi, work); |
735 | if (list_empty(&wb->b_io)) | ||
736 | queue_io(wb, work->older_than_this); | ||
695 | if (work->sb) | 737 | if (work->sb) |
696 | __writeback_inodes_sb(work->sb, wb, &wbc); | 738 | progress = writeback_sb_inodes(work->sb, wb, work); |
697 | else | 739 | else |
698 | writeback_inodes_wb(wb, &wbc); | 740 | progress = __writeback_inodes_wb(wb, work); |
699 | trace_wbc_writeback_written(&wbc, wb->bdi); | 741 | trace_writeback_written(wb->bdi, work); |
700 | 742 | ||
701 | work->nr_pages -= write_chunk - wbc.nr_to_write; | 743 | wb_update_bandwidth(wb, wb_start); |
702 | wrote += write_chunk - wbc.nr_to_write; | ||
703 | 744 | ||
704 | /* | 745 | /* |
705 | * If we consumed everything, see if we have more | 746 | * Did we write something? Try for more |
747 | * | ||
748 | * Dirty inodes are moved to b_io for writeback in batches. | ||
749 | * The completion of the current batch does not necessarily | ||
750 | * mean the overall work is done. So we keep looping as long | ||
751 | * as made some progress on cleaning pages or inodes. | ||
706 | */ | 752 | */ |
707 | if (wbc.nr_to_write <= 0) | 753 | if (progress) |
708 | continue; | 754 | continue; |
709 | /* | 755 | /* |
710 | * Didn't write everything and we don't have more IO, bail | 756 | * No more inodes for IO, bail |
711 | */ | 757 | */ |
712 | if (!wbc.more_io) | 758 | if (list_empty(&wb->b_more_io)) |
713 | break; | 759 | break; |
714 | /* | 760 | /* |
715 | * Did we write something? Try for more | ||
716 | */ | ||
717 | if (wbc.nr_to_write < write_chunk) | ||
718 | continue; | ||
719 | /* | ||
720 | * Nothing written. Wait for some inode to | 761 | * Nothing written. Wait for some inode to |
721 | * become available for writeback. Otherwise | 762 | * become available for writeback. Otherwise |
722 | * we'll just busyloop. | 763 | * we'll just busyloop. |
723 | */ | 764 | */ |
724 | spin_lock(&inode_wb_list_lock); | ||
725 | if (!list_empty(&wb->b_more_io)) { | 765 | if (!list_empty(&wb->b_more_io)) { |
766 | trace_writeback_wait(wb->bdi, work); | ||
726 | inode = wb_inode(wb->b_more_io.prev); | 767 | inode = wb_inode(wb->b_more_io.prev); |
727 | trace_wbc_writeback_wait(&wbc, wb->bdi); | ||
728 | spin_lock(&inode->i_lock); | 768 | spin_lock(&inode->i_lock); |
729 | inode_wait_for_writeback(inode); | 769 | inode_wait_for_writeback(inode, wb); |
730 | spin_unlock(&inode->i_lock); | 770 | spin_unlock(&inode->i_lock); |
731 | } | 771 | } |
732 | spin_unlock(&inode_wb_list_lock); | ||
733 | } | 772 | } |
773 | spin_unlock(&wb->list_lock); | ||
734 | 774 | ||
735 | return wrote; | 775 | return nr_pages - work->nr_pages; |
736 | } | 776 | } |
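The rewritten loop above replaces the old nr_to_write/more_io bookkeeping with a single question: did the last pass clean anything? A self-contained model of that retry logic; the stub behavior (two productive passes, then none) is invented purely to make the sketch run:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-ins so the model runs: two passes clean pages, then none. */
    static int passes;
    static long do_one_pass(void)              { return passes++ < 2 ? 100 : 0; }
    static bool more_io_empty(void)            { return true; }
    static void wait_for_inode_writeback(void) { }

    static long writeback_loop_model(long nr_pages)
    {
        long total = nr_pages;

        while (nr_pages > 0) {
            long progress = do_one_pass();

            nr_pages -= progress;
            if (progress)
                continue;                   /* made headway: try for more */
            if (more_io_empty())
                break;                      /* no more inodes for IO: done */
            wait_for_inode_writeback();     /* all under I_SYNC: don't spin */
        }
        return total - nr_pages;            /* pages written, as wb_writeback() */
    }

    int main(void)
    {
        printf("%ld\n", writeback_loop_model(1024));   /* prints 200 */
        return 0;
    }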
737 | 777 | ||
738 | /* | 778 | /* |
739 | * Return the next wb_writeback_work struct that hasn't been processed yet. | 779 | * Return the next wb_writeback_work struct that hasn't been processed yet. |
740 | */ | 780 | */ |
741 | static struct wb_writeback_work * | 781 | static struct wb_writeback_work * |
742 | get_next_work_item(struct backing_dev_info *bdi) | 782 | get_next_work_item(struct backing_dev_info *bdi) |
743 | { | 783 | { |
744 | struct wb_writeback_work *work = NULL; | 784 | struct wb_writeback_work *work = NULL; |
745 | 785 | ||
746 | spin_lock_bh(&bdi->wb_lock); | 786 | spin_lock_bh(&bdi->wb_lock); |
747 | if (!list_empty(&bdi->work_list)) { | 787 | if (!list_empty(&bdi->work_list)) { |
748 | work = list_entry(bdi->work_list.next, | 788 | work = list_entry(bdi->work_list.next, |
749 | struct wb_writeback_work, list); | 789 | struct wb_writeback_work, list); |
750 | list_del_init(&work->list); | 790 | list_del_init(&work->list); |
751 | } | 791 | } |
752 | spin_unlock_bh(&bdi->wb_lock); | 792 | spin_unlock_bh(&bdi->wb_lock); |
753 | return work; | 793 | return work; |
754 | } | 794 | } |
755 | 795 | ||
756 | /* | 796 | /* |
757 | * Add in the number of potentially dirty inodes, because each inode | 797 | * Add in the number of potentially dirty inodes, because each inode |
758 | * write can dirty pagecache in the underlying blockdev. | 798 | * write can dirty pagecache in the underlying blockdev. |
759 | */ | 799 | */ |
760 | static unsigned long get_nr_dirty_pages(void) | 800 | static unsigned long get_nr_dirty_pages(void) |
761 | { | 801 | { |
762 | return global_page_state(NR_FILE_DIRTY) + | 802 | return global_page_state(NR_FILE_DIRTY) + |
763 | global_page_state(NR_UNSTABLE_NFS) + | 803 | global_page_state(NR_UNSTABLE_NFS) + |
764 | get_nr_dirty_inodes(); | 804 | get_nr_dirty_inodes(); |
765 | } | 805 | } |
766 | 806 | ||
767 | static long wb_check_background_flush(struct bdi_writeback *wb) | 807 | static long wb_check_background_flush(struct bdi_writeback *wb) |
768 | { | 808 | { |
769 | if (over_bground_thresh()) { | 809 | if (over_bground_thresh()) { |
770 | 810 | ||
771 | struct wb_writeback_work work = { | 811 | struct wb_writeback_work work = { |
772 | .nr_pages = LONG_MAX, | 812 | .nr_pages = LONG_MAX, |
773 | .sync_mode = WB_SYNC_NONE, | 813 | .sync_mode = WB_SYNC_NONE, |
774 | .for_background = 1, | 814 | .for_background = 1, |
775 | .range_cyclic = 1, | 815 | .range_cyclic = 1, |
776 | }; | 816 | }; |
777 | 817 | ||
778 | return wb_writeback(wb, &work); | 818 | return wb_writeback(wb, &work); |
779 | } | 819 | } |
780 | 820 | ||
781 | return 0; | 821 | return 0; |
782 | } | 822 | } |
783 | 823 | ||
784 | static long wb_check_old_data_flush(struct bdi_writeback *wb) | 824 | static long wb_check_old_data_flush(struct bdi_writeback *wb) |
785 | { | 825 | { |
786 | unsigned long expired; | 826 | unsigned long expired; |
787 | long nr_pages; | 827 | long nr_pages; |
788 | 828 | ||
789 | /* | 829 | /* |
790 | * When set to zero, disable periodic writeback | 830 | * When set to zero, disable periodic writeback |
791 | */ | 831 | */ |
792 | if (!dirty_writeback_interval) | 832 | if (!dirty_writeback_interval) |
793 | return 0; | 833 | return 0; |
794 | 834 | ||
795 | expired = wb->last_old_flush + | 835 | expired = wb->last_old_flush + |
796 | msecs_to_jiffies(dirty_writeback_interval * 10); | 836 | msecs_to_jiffies(dirty_writeback_interval * 10); |
797 | if (time_before(jiffies, expired)) | 837 | if (time_before(jiffies, expired)) |
798 | return 0; | 838 | return 0; |
799 | 839 | ||
800 | wb->last_old_flush = jiffies; | 840 | wb->last_old_flush = jiffies; |
801 | nr_pages = get_nr_dirty_pages(); | 841 | nr_pages = get_nr_dirty_pages(); |
802 | 842 | ||
803 | if (nr_pages) { | 843 | if (nr_pages) { |
804 | struct wb_writeback_work work = { | 844 | struct wb_writeback_work work = { |
805 | .nr_pages = nr_pages, | 845 | .nr_pages = nr_pages, |
806 | .sync_mode = WB_SYNC_NONE, | 846 | .sync_mode = WB_SYNC_NONE, |
807 | .for_kupdate = 1, | 847 | .for_kupdate = 1, |
808 | .range_cyclic = 1, | 848 | .range_cyclic = 1, |
809 | }; | 849 | }; |
810 | 850 | ||
811 | return wb_writeback(wb, &work); | 851 | return wb_writeback(wb, &work); |
812 | } | 852 | } |
813 | 853 | ||
814 | return 0; | 854 | return 0; |
815 | } | 855 | } |
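dirty_writeback_interval is a sysctl expressed in centiseconds, which is why the code multiplies by 10 before converting milliseconds to jiffies. A tiny model of that conversion (HZ is assumed here, and the real msecs_to_jiffies() rounds up rather than truncating):

    #include <stdio.h>

    #define HZ 100UL                          /* assumed for the example */

    static unsigned long msecs_to_jiffies_model(unsigned long ms)
    {
        return ms * HZ / 1000;                /* simplified: no round-up */
    }

    int main(void)
    {
        unsigned long dirty_writeback_interval = 500;   /* centiseconds */
        /* 500 cs -> 5000 ms -> 500 jiffies at HZ=100 */
        printf("%lu\n", msecs_to_jiffies_model(dirty_writeback_interval * 10));
        return 0;
    }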
816 | 856 | ||
817 | /* | 857 | /* |
818 | * Retrieve work items and do the writeback they describe | 858 | * Retrieve work items and do the writeback they describe |
819 | */ | 859 | */ |
820 | long wb_do_writeback(struct bdi_writeback *wb, int force_wait) | 860 | long wb_do_writeback(struct bdi_writeback *wb, int force_wait) |
821 | { | 861 | { |
822 | struct backing_dev_info *bdi = wb->bdi; | 862 | struct backing_dev_info *bdi = wb->bdi; |
823 | struct wb_writeback_work *work; | 863 | struct wb_writeback_work *work; |
824 | long wrote = 0; | 864 | long wrote = 0; |
825 | 865 | ||
826 | set_bit(BDI_writeback_running, &wb->bdi->state); | 866 | set_bit(BDI_writeback_running, &wb->bdi->state); |
827 | while ((work = get_next_work_item(bdi)) != NULL) { | 867 | while ((work = get_next_work_item(bdi)) != NULL) { |
828 | /* | 868 | /* |
829 | * Override sync mode, in case we must wait for completion | 869 | * Override sync mode, in case we must wait for completion |
830 | * because this thread is exiting now. | 870 | * because this thread is exiting now. |
831 | */ | 871 | */ |
832 | if (force_wait) | 872 | if (force_wait) |
833 | work->sync_mode = WB_SYNC_ALL; | 873 | work->sync_mode = WB_SYNC_ALL; |
834 | 874 | ||
835 | trace_writeback_exec(bdi, work); | 875 | trace_writeback_exec(bdi, work); |
836 | 876 | ||
837 | wrote += wb_writeback(wb, work); | 877 | wrote += wb_writeback(wb, work); |
838 | 878 | ||
839 | /* | 879 | /* |
840 | * Notify the caller of completion if this is a synchronous | 880 | * Notify the caller of completion if this is a synchronous |
841 | * work item, otherwise just free it. | 881 | * work item, otherwise just free it. |
842 | */ | 882 | */ |
843 | if (work->done) | 883 | if (work->done) |
844 | complete(work->done); | 884 | complete(work->done); |
845 | else | 885 | else |
846 | kfree(work); | 886 | kfree(work); |
847 | } | 887 | } |
848 | 888 | ||
849 | /* | 889 | /* |
850 | * Check for periodic writeback, kupdated() style | 890 | * Check for periodic writeback, kupdated() style |
851 | */ | 891 | */ |
852 | wrote += wb_check_old_data_flush(wb); | 892 | wrote += wb_check_old_data_flush(wb); |
853 | wrote += wb_check_background_flush(wb); | 893 | wrote += wb_check_background_flush(wb); |
854 | clear_bit(BDI_writeback_running, &wb->bdi->state); | 894 | clear_bit(BDI_writeback_running, &wb->bdi->state); |
855 | 895 | ||
856 | return wrote; | 896 | return wrote; |
857 | } | 897 | } |
858 | 898 | ||
859 | /* | 899 | /* |
860 | * Handle writeback of dirty data for the device backed by this bdi. Also | 900 | * Handle writeback of dirty data for the device backed by this bdi. Also |
861 | * wakes up periodically and does kupdated style flushing. | 901 | * wakes up periodically and does kupdated style flushing. |
862 | */ | 902 | */ |
863 | int bdi_writeback_thread(void *data) | 903 | int bdi_writeback_thread(void *data) |
864 | { | 904 | { |
865 | struct bdi_writeback *wb = data; | 905 | struct bdi_writeback *wb = data; |
866 | struct backing_dev_info *bdi = wb->bdi; | 906 | struct backing_dev_info *bdi = wb->bdi; |
867 | long pages_written; | 907 | long pages_written; |
868 | 908 | ||
869 | current->flags |= PF_SWAPWRITE; | 909 | current->flags |= PF_SWAPWRITE; |
870 | set_freezable(); | 910 | set_freezable(); |
871 | wb->last_active = jiffies; | 911 | wb->last_active = jiffies; |
872 | 912 | ||
873 | /* | 913 | /* |
874 | 	 * Our parent may run at a different priority; just set us to normal | 914 | 	 * Our parent may run at a different priority; just set us to normal |
875 | */ | 915 | */ |
876 | set_user_nice(current, 0); | 916 | set_user_nice(current, 0); |
877 | 917 | ||
878 | trace_writeback_thread_start(bdi); | 918 | trace_writeback_thread_start(bdi); |
879 | 919 | ||
880 | while (!kthread_should_stop()) { | 920 | while (!kthread_should_stop()) { |
881 | /* | 921 | /* |
882 | * Remove own delayed wake-up timer, since we are already awake | 922 | * Remove own delayed wake-up timer, since we are already awake |
883 | 		 * and we'll take care of the periodic write-back. | 923 | 		 * and we'll take care of the periodic write-back. |
884 | */ | 924 | */ |
885 | del_timer(&wb->wakeup_timer); | 925 | del_timer(&wb->wakeup_timer); |
886 | 926 | ||
887 | pages_written = wb_do_writeback(wb, 0); | 927 | pages_written = wb_do_writeback(wb, 0); |
888 | 928 | ||
889 | trace_writeback_pages_written(pages_written); | 929 | trace_writeback_pages_written(pages_written); |
890 | 930 | ||
891 | if (pages_written) | 931 | if (pages_written) |
892 | wb->last_active = jiffies; | 932 | wb->last_active = jiffies; |
893 | 933 | ||
894 | set_current_state(TASK_INTERRUPTIBLE); | 934 | set_current_state(TASK_INTERRUPTIBLE); |
895 | if (!list_empty(&bdi->work_list) || kthread_should_stop()) { | 935 | if (!list_empty(&bdi->work_list) || kthread_should_stop()) { |
896 | __set_current_state(TASK_RUNNING); | 936 | __set_current_state(TASK_RUNNING); |
897 | continue; | 937 | continue; |
898 | } | 938 | } |
899 | 939 | ||
900 | if (wb_has_dirty_io(wb) && dirty_writeback_interval) | 940 | if (wb_has_dirty_io(wb) && dirty_writeback_interval) |
901 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); | 941 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); |
902 | else { | 942 | else { |
903 | /* | 943 | /* |
904 | 			 * We have nothing to do, so we can sleep without any | 944 | 			 * We have nothing to do, so we can sleep without any |
905 | 			 * timeout and save power. When work is queued or | 945 | 			 * timeout and save power. When work is queued or |
906 | 			 * something is made dirty, we will be woken up. | 946 | 			 * something is made dirty, we will be woken up. |
907 | */ | 947 | */ |
908 | schedule(); | 948 | schedule(); |
909 | } | 949 | } |
910 | 950 | ||
911 | try_to_freeze(); | 951 | try_to_freeze(); |
912 | } | 952 | } |
913 | 953 | ||
914 | /* Flush any work that raced with us exiting */ | 954 | /* Flush any work that raced with us exiting */ |
915 | if (!list_empty(&bdi->work_list)) | 955 | if (!list_empty(&bdi->work_list)) |
916 | wb_do_writeback(wb, 1); | 956 | wb_do_writeback(wb, 1); |
917 | 957 | ||
918 | trace_writeback_thread_stop(bdi); | 958 | trace_writeback_thread_stop(bdi); |
919 | return 0; | 959 | return 0; |
920 | } | 960 | } |
921 | 961 | ||
922 | 962 | ||
923 | /* | 963 | /* |
924 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back | 964 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back |
925 | * the whole world. | 965 | * the whole world. |
926 | */ | 966 | */ |
927 | void wakeup_flusher_threads(long nr_pages) | 967 | void wakeup_flusher_threads(long nr_pages) |
928 | { | 968 | { |
929 | struct backing_dev_info *bdi; | 969 | struct backing_dev_info *bdi; |
930 | 970 | ||
931 | if (!nr_pages) { | 971 | if (!nr_pages) { |
932 | nr_pages = global_page_state(NR_FILE_DIRTY) + | 972 | nr_pages = global_page_state(NR_FILE_DIRTY) + |
933 | global_page_state(NR_UNSTABLE_NFS); | 973 | global_page_state(NR_UNSTABLE_NFS); |
934 | } | 974 | } |
935 | 975 | ||
936 | rcu_read_lock(); | 976 | rcu_read_lock(); |
937 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { | 977 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { |
938 | if (!bdi_has_dirty_io(bdi)) | 978 | if (!bdi_has_dirty_io(bdi)) |
939 | continue; | 979 | continue; |
940 | __bdi_start_writeback(bdi, nr_pages, false); | 980 | __bdi_start_writeback(bdi, nr_pages, false); |
941 | } | 981 | } |
942 | rcu_read_unlock(); | 982 | rcu_read_unlock(); |
943 | } | 983 | } |
944 | 984 | ||
945 | static noinline void block_dump___mark_inode_dirty(struct inode *inode) | 985 | static noinline void block_dump___mark_inode_dirty(struct inode *inode) |
946 | { | 986 | { |
947 | if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { | 987 | if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { |
948 | struct dentry *dentry; | 988 | struct dentry *dentry; |
949 | const char *name = "?"; | 989 | const char *name = "?"; |
950 | 990 | ||
951 | dentry = d_find_alias(inode); | 991 | dentry = d_find_alias(inode); |
952 | if (dentry) { | 992 | if (dentry) { |
953 | spin_lock(&dentry->d_lock); | 993 | spin_lock(&dentry->d_lock); |
954 | name = (const char *) dentry->d_name.name; | 994 | name = (const char *) dentry->d_name.name; |
955 | } | 995 | } |
956 | printk(KERN_DEBUG | 996 | printk(KERN_DEBUG |
957 | "%s(%d): dirtied inode %lu (%s) on %s\n", | 997 | "%s(%d): dirtied inode %lu (%s) on %s\n", |
958 | current->comm, task_pid_nr(current), inode->i_ino, | 998 | current->comm, task_pid_nr(current), inode->i_ino, |
959 | name, inode->i_sb->s_id); | 999 | name, inode->i_sb->s_id); |
960 | if (dentry) { | 1000 | if (dentry) { |
961 | spin_unlock(&dentry->d_lock); | 1001 | spin_unlock(&dentry->d_lock); |
962 | dput(dentry); | 1002 | dput(dentry); |
963 | } | 1003 | } |
964 | } | 1004 | } |
965 | } | 1005 | } |
966 | 1006 | ||
967 | /** | 1007 | /** |
968 | * __mark_inode_dirty - internal function | 1008 | * __mark_inode_dirty - internal function |
969 | * @inode: inode to mark | 1009 | * @inode: inode to mark |
970 | * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) | 1010 | * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) |
971 | * Mark an inode as dirty. Callers should use mark_inode_dirty or | 1011 | * Mark an inode as dirty. Callers should use mark_inode_dirty or |
972 | * mark_inode_dirty_sync. | 1012 | * mark_inode_dirty_sync. |
973 | * | 1013 | * |
974 | * Put the inode on the super block's dirty list. | 1014 | * Put the inode on the super block's dirty list. |
975 | * | 1015 | * |
976 | * CAREFUL! We mark it dirty unconditionally, but move it onto the | 1016 | * CAREFUL! We mark it dirty unconditionally, but move it onto the |
977 | * dirty list only if it is hashed or if it refers to a blockdev. | 1017 | * dirty list only if it is hashed or if it refers to a blockdev. |
978 | * If it was not hashed, it will never be added to the dirty list | 1018 | * If it was not hashed, it will never be added to the dirty list |
979 | * even if it is later hashed, as it will have been marked dirty already. | 1019 | * even if it is later hashed, as it will have been marked dirty already. |
980 | * | 1020 | * |
981 | * In short, make sure you hash any inodes _before_ you start marking | 1021 | * In short, make sure you hash any inodes _before_ you start marking |
982 | * them dirty. | 1022 | * them dirty. |
983 | * | 1023 | * |
984 | * Note that for blockdevs, inode->dirtied_when represents the dirtying time of | 1024 | * Note that for blockdevs, inode->dirtied_when represents the dirtying time of |
985 | * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of | 1025 | * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of |
986 | * the kernel-internal blockdev inode represents the dirtying time of the | 1026 | * the kernel-internal blockdev inode represents the dirtying time of the |
987 | * blockdev's pages. This is why for I_DIRTY_PAGES we always use | 1027 | * blockdev's pages. This is why for I_DIRTY_PAGES we always use |
988 | * page->mapping->host, so the page-dirtying time is recorded in the internal | 1028 | * page->mapping->host, so the page-dirtying time is recorded in the internal |
989 | * blockdev inode. | 1029 | * blockdev inode. |
990 | */ | 1030 | */ |
991 | void __mark_inode_dirty(struct inode *inode, int flags) | 1031 | void __mark_inode_dirty(struct inode *inode, int flags) |
992 | { | 1032 | { |
993 | struct super_block *sb = inode->i_sb; | 1033 | struct super_block *sb = inode->i_sb; |
994 | struct backing_dev_info *bdi = NULL; | 1034 | struct backing_dev_info *bdi = NULL; |
995 | 1035 | ||
996 | /* | 1036 | /* |
997 | * Don't do this for I_DIRTY_PAGES - that doesn't actually | 1037 | * Don't do this for I_DIRTY_PAGES - that doesn't actually |
998 | * dirty the inode itself | 1038 | * dirty the inode itself |
999 | */ | 1039 | */ |
1000 | if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { | 1040 | if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { |
1001 | if (sb->s_op->dirty_inode) | 1041 | if (sb->s_op->dirty_inode) |
1002 | sb->s_op->dirty_inode(inode, flags); | 1042 | sb->s_op->dirty_inode(inode, flags); |
1003 | } | 1043 | } |
1004 | 1044 | ||
1005 | /* | 1045 | /* |
1006 | * make sure that changes are seen by all cpus before we test i_state | 1046 | * make sure that changes are seen by all cpus before we test i_state |
1007 | * -- mikulas | 1047 | * -- mikulas |
1008 | */ | 1048 | */ |
1009 | smp_mb(); | 1049 | smp_mb(); |
1010 | 1050 | ||
1011 | /* avoid the locking if we can */ | 1051 | /* avoid the locking if we can */ |
1012 | if ((inode->i_state & flags) == flags) | 1052 | if ((inode->i_state & flags) == flags) |
1013 | return; | 1053 | return; |
1014 | 1054 | ||
1015 | if (unlikely(block_dump)) | 1055 | if (unlikely(block_dump)) |
1016 | block_dump___mark_inode_dirty(inode); | 1056 | block_dump___mark_inode_dirty(inode); |
1017 | 1057 | ||
1018 | spin_lock(&inode->i_lock); | 1058 | spin_lock(&inode->i_lock); |
1019 | if ((inode->i_state & flags) != flags) { | 1059 | if ((inode->i_state & flags) != flags) { |
1020 | const int was_dirty = inode->i_state & I_DIRTY; | 1060 | const int was_dirty = inode->i_state & I_DIRTY; |
1021 | 1061 | ||
1022 | inode->i_state |= flags; | 1062 | inode->i_state |= flags; |
1023 | 1063 | ||
1024 | /* | 1064 | /* |
1025 | * If the inode is being synced, just update its dirty state. | 1065 | * If the inode is being synced, just update its dirty state. |
1026 | * The unlocker will place the inode on the appropriate | 1066 | * The unlocker will place the inode on the appropriate |
1027 | * superblock list, based upon its state. | 1067 | * superblock list, based upon its state. |
1028 | */ | 1068 | */ |
1029 | if (inode->i_state & I_SYNC) | 1069 | if (inode->i_state & I_SYNC) |
1030 | goto out_unlock_inode; | 1070 | goto out_unlock_inode; |
1031 | 1071 | ||
1032 | /* | 1072 | /* |
1033 | * Only add valid (hashed) inodes to the superblock's | 1073 | * Only add valid (hashed) inodes to the superblock's |
1034 | * dirty list. Add blockdev inodes as well. | 1074 | * dirty list. Add blockdev inodes as well. |
1035 | */ | 1075 | */ |
1036 | if (!S_ISBLK(inode->i_mode)) { | 1076 | if (!S_ISBLK(inode->i_mode)) { |
1037 | if (inode_unhashed(inode)) | 1077 | if (inode_unhashed(inode)) |
1038 | goto out_unlock_inode; | 1078 | goto out_unlock_inode; |
1039 | } | 1079 | } |
1040 | if (inode->i_state & I_FREEING) | 1080 | if (inode->i_state & I_FREEING) |
1041 | goto out_unlock_inode; | 1081 | goto out_unlock_inode; |
1042 | 1082 | ||
1043 | /* | 1083 | /* |
1044 | * If the inode was already on b_dirty/b_io/b_more_io, don't | 1084 | * If the inode was already on b_dirty/b_io/b_more_io, don't |
1045 | * reposition it (that would break b_dirty time-ordering). | 1085 | * reposition it (that would break b_dirty time-ordering). |
1046 | */ | 1086 | */ |
1047 | if (!was_dirty) { | 1087 | if (!was_dirty) { |
1048 | bool wakeup_bdi = false; | 1088 | bool wakeup_bdi = false; |
1049 | bdi = inode_to_bdi(inode); | 1089 | bdi = inode_to_bdi(inode); |
1050 | 1090 | ||
1051 | if (bdi_cap_writeback_dirty(bdi)) { | 1091 | if (bdi_cap_writeback_dirty(bdi)) { |
1052 | WARN(!test_bit(BDI_registered, &bdi->state), | 1092 | WARN(!test_bit(BDI_registered, &bdi->state), |
1053 | "bdi-%s not registered\n", bdi->name); | 1093 | "bdi-%s not registered\n", bdi->name); |
1054 | 1094 | ||
1055 | /* | 1095 | /* |
1056 | * If this is the first dirty inode for this | 1096 | * If this is the first dirty inode for this |
1057 | * bdi, we have to wake-up the corresponding | 1097 | * bdi, we have to wake-up the corresponding |
1058 | * bdi thread to make sure background | 1098 | * bdi thread to make sure background |
1059 | * write-back happens later. | 1099 | * write-back happens later. |
1060 | */ | 1100 | */ |
1061 | if (!wb_has_dirty_io(&bdi->wb)) | 1101 | if (!wb_has_dirty_io(&bdi->wb)) |
1062 | wakeup_bdi = true; | 1102 | wakeup_bdi = true; |
1063 | } | 1103 | } |
1064 | 1104 | ||
1065 | spin_unlock(&inode->i_lock); | 1105 | spin_unlock(&inode->i_lock); |
1066 | spin_lock(&inode_wb_list_lock); | 1106 | spin_lock(&bdi->wb.list_lock); |
1067 | inode->dirtied_when = jiffies; | 1107 | inode->dirtied_when = jiffies; |
1068 | list_move(&inode->i_wb_list, &bdi->wb.b_dirty); | 1108 | list_move(&inode->i_wb_list, &bdi->wb.b_dirty); |
1069 | spin_unlock(&inode_wb_list_lock); | 1109 | spin_unlock(&bdi->wb.list_lock); |
1070 | 1110 | ||
1071 | if (wakeup_bdi) | 1111 | if (wakeup_bdi) |
1072 | bdi_wakeup_thread_delayed(bdi); | 1112 | bdi_wakeup_thread_delayed(bdi); |
1073 | return; | 1113 | return; |
1074 | } | 1114 | } |
1075 | } | 1115 | } |
1076 | out_unlock_inode: | 1116 | out_unlock_inode: |
1077 | spin_unlock(&inode->i_lock); | 1117 | spin_unlock(&inode->i_lock); |
1078 | 1118 | ||
1079 | } | 1119 | } |
1080 | EXPORT_SYMBOL(__mark_inode_dirty); | 1120 | EXPORT_SYMBOL(__mark_inode_dirty); |
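Filesystems rarely call __mark_inode_dirty() directly; the mark_inode_dirty() and mark_inode_dirty_sync() wrappers in include/linux/fs.h pass I_DIRTY and I_DIRTY_SYNC respectively. A minimal sketch of the usual caller side, with example_update_times() as a hypothetical helper:

        /* hypothetical helper, not part of this commit */
        static void example_update_times(struct inode *inode)
        {
                inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                mark_inode_dirty_sync(inode);   /* __mark_inode_dirty(inode, I_DIRTY_SYNC) */
        }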
1081 | 1121 | ||
1082 | /* | 1122 | /* |
1083 | * Write out a superblock's list of dirty inodes. A wait will be performed | 1123 | * Write out a superblock's list of dirty inodes. A wait will be performed |
1084 | * upon no inodes, all inodes or the final one, depending upon sync_mode. | 1124 | * upon no inodes, all inodes or the final one, depending upon sync_mode. |
1085 | * | 1125 | * |
1086 | * If older_than_this is non-NULL, then only write out inodes which | 1126 | * If older_than_this is non-NULL, then only write out inodes which |
1087 | * had their first dirtying at a time earlier than *older_than_this. | 1127 | * had their first dirtying at a time earlier than *older_than_this. |
1088 | * | 1128 | * |
1089 | * If `bdi' is non-NULL then we're being asked to write back a specific queue. | 1129 | * If `bdi' is non-NULL then we're being asked to write back a specific queue. |
1090 | * This function assumes that the blockdev superblock's inodes are backed by | 1130 | * This function assumes that the blockdev superblock's inodes are backed by |
1091 | * a variety of queues, so all inodes are searched. For other superblocks, | 1131 | * a variety of queues, so all inodes are searched. For other superblocks, |
1092 | * assume that all inodes are backed by the same queue. | 1132 | * assume that all inodes are backed by the same queue. |
1093 | * | 1133 | * |
1094 | * The inodes to be written are parked on bdi->b_io. They are moved back onto | 1134 | * The inodes to be written are parked on bdi->b_io. They are moved back onto |
1095 | * bdi->b_dirty as they are selected for writing. This way, none can be missed | 1135 | * bdi->b_dirty as they are selected for writing. This way, none can be missed |
1096 | * on the writer throttling path, and we get decent balancing between many | 1136 | * on the writer throttling path, and we get decent balancing between many |
1097 | * throttled threads: we don't want them all piling up on inode_sync_wait. | 1137 | * throttled threads: we don't want them all piling up on inode_sync_wait. |
1098 | */ | 1138 | */ |
1099 | static void wait_sb_inodes(struct super_block *sb) | 1139 | static void wait_sb_inodes(struct super_block *sb) |
1100 | { | 1140 | { |
1101 | struct inode *inode, *old_inode = NULL; | 1141 | struct inode *inode, *old_inode = NULL; |
1102 | 1142 | ||
1103 | /* | 1143 | /* |
1104 | * We need to be protected against the filesystem going from | 1144 | * We need to be protected against the filesystem going from |
1105 | * r/o to r/w or vice versa. | 1145 | * r/o to r/w or vice versa. |
1106 | */ | 1146 | */ |
1107 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 1147 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
1108 | 1148 | ||
1109 | spin_lock(&inode_sb_list_lock); | 1149 | spin_lock(&inode_sb_list_lock); |
1110 | 1150 | ||
1111 | /* | 1151 | /* |
1112 | * Data integrity sync. Must wait for all pages under writeback, | 1152 | * Data integrity sync. Must wait for all pages under writeback, |
1113 | * because there may have been pages dirtied before our sync | 1153 | * because there may have been pages dirtied before our sync |
1114 | * call, but whose writeout had already started before we got here. | 1154 | * call, but whose writeout had already started before we got here. |
1115 | * In that case, the inode may not be on the dirty list, but | 1155 | * In that case, the inode may not be on the dirty list, but |
1116 | * we still have to wait for that writeout. | 1156 | * we still have to wait for that writeout. |
1117 | */ | 1157 | */ |
1118 | list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { | 1158 | list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { |
1119 | struct address_space *mapping = inode->i_mapping; | 1159 | struct address_space *mapping = inode->i_mapping; |
1120 | 1160 | ||
1121 | spin_lock(&inode->i_lock); | 1161 | spin_lock(&inode->i_lock); |
1122 | if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || | 1162 | if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || |
1123 | (mapping->nrpages == 0)) { | 1163 | (mapping->nrpages == 0)) { |
1124 | spin_unlock(&inode->i_lock); | 1164 | spin_unlock(&inode->i_lock); |
1125 | continue; | 1165 | continue; |
1126 | } | 1166 | } |
1127 | __iget(inode); | 1167 | __iget(inode); |
1128 | spin_unlock(&inode->i_lock); | 1168 | spin_unlock(&inode->i_lock); |
1129 | spin_unlock(&inode_sb_list_lock); | 1169 | spin_unlock(&inode_sb_list_lock); |
1130 | 1170 | ||
1131 | /* | 1171 | /* |
1132 | * We hold a reference to 'inode' so it couldn't have been | 1172 | * We hold a reference to 'inode' so it couldn't have been |
1133 | * removed from s_inodes list while we dropped the | 1173 | * removed from s_inodes list while we dropped the |
1134 | * inode_sb_list_lock. We cannot iput the inode now as we can | 1174 | * inode_sb_list_lock. We cannot iput the inode now as we can |
1135 | * be holding the last reference and we cannot iput it under | 1175 | * be holding the last reference and we cannot iput it under |
1136 | * inode_sb_list_lock. So we keep the reference and iput it | 1176 | * inode_sb_list_lock. So we keep the reference and iput it |
1137 | * later. | 1177 | * later. |
1138 | */ | 1178 | */ |
1139 | iput(old_inode); | 1179 | iput(old_inode); |
1140 | old_inode = inode; | 1180 | old_inode = inode; |
1141 | 1181 | ||
1142 | filemap_fdatawait(mapping); | 1182 | filemap_fdatawait(mapping); |
1143 | 1183 | ||
1144 | cond_resched(); | 1184 | cond_resched(); |
1145 | 1185 | ||
1146 | spin_lock(&inode_sb_list_lock); | 1186 | spin_lock(&inode_sb_list_lock); |
1147 | } | 1187 | } |
1148 | spin_unlock(&inode_sb_list_lock); | 1188 | spin_unlock(&inode_sb_list_lock); |
1149 | iput(old_inode); | 1189 | iput(old_inode); |
1150 | } | 1190 | } |
1151 | 1191 | ||
1152 | /** | 1192 | /** |
1153 | * writeback_inodes_sb_nr - writeback dirty inodes from given super_block | 1193 | * writeback_inodes_sb_nr - writeback dirty inodes from given super_block |
1154 | * @sb: the superblock | 1194 | * @sb: the superblock |
1155 | * @nr: the number of pages to write | 1195 | * @nr: the number of pages to write |
1156 | * | 1196 | * |
1157 | * Start writeback on some inodes on this super_block. No guarantees are made | 1197 | * Start writeback on some inodes on this super_block. No guarantees are made |
1158 | * on how many (if any) will be written; this function does not wait | 1198 | * on how many (if any) will be written; this function does not wait |
1159 | * for the submitted IO to complete. | 1199 | * for the submitted IO to complete. |
1160 | */ | 1200 | */ |
1161 | void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) | 1201 | void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) |
1162 | { | 1202 | { |
1163 | DECLARE_COMPLETION_ONSTACK(done); | 1203 | DECLARE_COMPLETION_ONSTACK(done); |
1164 | struct wb_writeback_work work = { | 1204 | struct wb_writeback_work work = { |
1165 | .sb = sb, | 1205 | .sb = sb, |
1166 | .sync_mode = WB_SYNC_NONE, | 1206 | .sync_mode = WB_SYNC_NONE, |
1167 | .done = &done, | 1207 | .tagged_writepages = 1, |
1168 | .nr_pages = nr, | 1208 | .done = &done, |
1209 | .nr_pages = nr, | ||
1169 | }; | 1210 | }; |
1170 | 1211 | ||
1171 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 1212 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
1172 | bdi_queue_work(sb->s_bdi, &work); | 1213 | bdi_queue_work(sb->s_bdi, &work); |
1173 | wait_for_completion(&done); | 1214 | wait_for_completion(&done); |
1174 | } | 1215 | } |
1175 | EXPORT_SYMBOL(writeback_inodes_sb_nr); | 1216 | EXPORT_SYMBOL(writeback_inodes_sb_nr); |
1176 | 1217 | ||
1177 | /** | 1218 | /** |
1178 | * writeback_inodes_sb - writeback dirty inodes from given super_block | 1219 | * writeback_inodes_sb - writeback dirty inodes from given super_block |
1179 | * @sb: the superblock | 1220 | * @sb: the superblock |
1180 | * | 1221 | * |
1181 | * Start writeback on some inodes on this super_block. No guarantees are made | 1222 | * Start writeback on some inodes on this super_block. No guarantees are made |
1182 | * on how many (if any) will be written; this function does not wait | 1223 | * on how many (if any) will be written; this function does not wait |
1183 | * for the submitted IO to complete. | 1224 | * for the submitted IO to complete. |
1184 | */ | 1225 | */ |
1185 | void writeback_inodes_sb(struct super_block *sb) | 1226 | void writeback_inodes_sb(struct super_block *sb) |
1186 | { | 1227 | { |
1187 | return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); | 1228 | return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); |
1188 | } | 1229 | } |
1189 | EXPORT_SYMBOL(writeback_inodes_sb); | 1230 | EXPORT_SYMBOL(writeback_inodes_sb); |
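A hedged usage sketch for both entry points: the caller must hold sb->s_umount for read (the WARN_ON above checks it), and the nr variant merely bounds how much work is queued; example_kick_writeback() is hypothetical:

        static void example_kick_writeback(struct super_block *sb)
        {
                down_read(&sb->s_umount);
                writeback_inodes_sb_nr(sb, 1024);       /* ask for up to ~1024 pages, no waiting */
                up_read(&sb->s_umount);
        }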
1190 | 1231 | ||
1191 | /** | 1232 | /** |
1192 | * writeback_inodes_sb_if_idle - start writeback if none underway | 1233 | * writeback_inodes_sb_if_idle - start writeback if none underway |
1193 | * @sb: the superblock | 1234 | * @sb: the superblock |
1194 | * | 1235 | * |
1195 | * Invoke writeback_inodes_sb if no writeback is currently underway. | 1236 | * Invoke writeback_inodes_sb if no writeback is currently underway. |
1196 | * Returns 1 if writeback was started, 0 if not. | 1237 | * Returns 1 if writeback was started, 0 if not. |
1197 | */ | 1238 | */ |
1198 | int writeback_inodes_sb_if_idle(struct super_block *sb) | 1239 | int writeback_inodes_sb_if_idle(struct super_block *sb) |
1199 | { | 1240 | { |
1200 | if (!writeback_in_progress(sb->s_bdi)) { | 1241 | if (!writeback_in_progress(sb->s_bdi)) { |
1201 | down_read(&sb->s_umount); | 1242 | down_read(&sb->s_umount); |
1202 | writeback_inodes_sb(sb); | 1243 | writeback_inodes_sb(sb); |
1203 | up_read(&sb->s_umount); | 1244 | up_read(&sb->s_umount); |
1204 | return 1; | 1245 | return 1; |
1205 | } else | 1246 | } else |
1206 | return 0; | 1247 | return 0; |
1207 | } | 1248 | } |
1208 | EXPORT_SYMBOL(writeback_inodes_sb_if_idle); | 1249 | EXPORT_SYMBOL(writeback_inodes_sb_if_idle); |
1209 | 1250 | ||
1210 | /** | 1251 | /** |
1211 | * writeback_inodes_sb_nr_if_idle - start writeback if none underway | 1252 | * writeback_inodes_sb_nr_if_idle - start writeback if none underway |
1212 | * @sb: the superblock | 1253 | * @sb: the superblock |
1213 | * @nr: the number of pages to write | 1254 | * @nr: the number of pages to write |
1214 | * | 1255 | * |
1215 | * Invoke writeback_inodes_sb_nr if no writeback is currently underway. | 1256 | * Invoke writeback_inodes_sb_nr if no writeback is currently underway. |
1216 | * Returns 1 if writeback was started, 0 if not. | 1257 | * Returns 1 if writeback was started, 0 if not. |
1217 | */ | 1258 | */ |
1218 | int writeback_inodes_sb_nr_if_idle(struct super_block *sb, | 1259 | int writeback_inodes_sb_nr_if_idle(struct super_block *sb, |
1219 | unsigned long nr) | 1260 | unsigned long nr) |
1220 | { | 1261 | { |
1221 | if (!writeback_in_progress(sb->s_bdi)) { | 1262 | if (!writeback_in_progress(sb->s_bdi)) { |
1222 | down_read(&sb->s_umount); | 1263 | down_read(&sb->s_umount); |
1223 | writeback_inodes_sb_nr(sb, nr); | 1264 | writeback_inodes_sb_nr(sb, nr); |
1224 | up_read(&sb->s_umount); | 1265 | up_read(&sb->s_umount); |
1225 | return 1; | 1266 | return 1; |
1226 | } else | 1267 | } else |
1227 | return 0; | 1268 | return 0; |
1228 | } | 1269 | } |
1229 | EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); | 1270 | EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); |
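A sketch of the opportunistic pattern the *_if_idle variants support, e.g. nudging the flusher before retrying an allocation; example_nudge_writeback() and the congestion_wait() pause are assumptions of this sketch, not part of the commit:

        static void example_nudge_writeback(struct super_block *sb)
        {
                /* only queues work when no writeback is already running */
                if (writeback_inodes_sb_if_idle(sb))
                        congestion_wait(BLK_RW_ASYNC, HZ / 50);
        }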
1230 | 1271 | ||
1231 | /** | 1272 | /** |
1232 | * sync_inodes_sb - sync sb inode pages | 1273 | * sync_inodes_sb - sync sb inode pages |
1233 | * @sb: the superblock | 1274 | * @sb: the superblock |
1234 | * | 1275 | * |
1235 | * This function writes and waits on any dirty inode belonging to this | 1276 | * This function writes and waits on any dirty inode belonging to this |
1236 | * super_block. | 1277 | * super_block. |
1237 | */ | 1278 | */ |
1238 | void sync_inodes_sb(struct super_block *sb) | 1279 | void sync_inodes_sb(struct super_block *sb) |
1239 | { | 1280 | { |
1240 | DECLARE_COMPLETION_ONSTACK(done); | 1281 | DECLARE_COMPLETION_ONSTACK(done); |
1241 | struct wb_writeback_work work = { | 1282 | struct wb_writeback_work work = { |
1242 | .sb = sb, | 1283 | .sb = sb, |
1243 | .sync_mode = WB_SYNC_ALL, | 1284 | .sync_mode = WB_SYNC_ALL, |
1244 | .nr_pages = LONG_MAX, | 1285 | .nr_pages = LONG_MAX, |
1245 | .range_cyclic = 0, | 1286 | .range_cyclic = 0, |
1246 | .done = &done, | 1287 | .done = &done, |
1247 | }; | 1288 | }; |
1248 | 1289 | ||
1249 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 1290 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
1250 | 1291 | ||
1251 | bdi_queue_work(sb->s_bdi, &work); | 1292 | bdi_queue_work(sb->s_bdi, &work); |
1252 | wait_for_completion(&done); | 1293 | wait_for_completion(&done); |
1253 | 1294 | ||
1254 | wait_sb_inodes(sb); | 1295 | wait_sb_inodes(sb); |
1255 | } | 1296 | } |
1256 | EXPORT_SYMBOL(sync_inodes_sb); | 1297 | EXPORT_SYMBOL(sync_inodes_sb); |
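A sketch of how a sync(2)-style caller might drive this, loosely modelled on the generic sync path; the s_root and read-only checks are the caller's responsibility, not sync_inodes_sb()'s:

        static void example_sync_one_sb(struct super_block *sb)
        {
                down_read(&sb->s_umount);
                if (sb->s_root && !(sb->s_flags & MS_RDONLY))
                        sync_inodes_sb(sb);     /* WB_SYNC_ALL, then wait_sb_inodes() */
                up_read(&sb->s_umount);
        }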
1257 | 1298 | ||
1258 | /** | 1299 | /** |
1259 | * write_inode_now - write an inode to disk | 1300 | * write_inode_now - write an inode to disk |
1260 | * @inode: inode to write to disk | 1301 | * @inode: inode to write to disk |
fs/inode.c
1 | /* | 1 | /* |
2 | * (C) 1997 Linus Torvalds | 2 | * (C) 1997 Linus Torvalds |
3 | * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) | 3 | * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) |
4 | */ | 4 | */ |
5 | #include <linux/fs.h> | 5 | #include <linux/fs.h> |
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/dcache.h> | 7 | #include <linux/dcache.h> |
8 | #include <linux/init.h> | 8 | #include <linux/init.h> |
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/writeback.h> | 10 | #include <linux/writeback.h> |
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/backing-dev.h> | 12 | #include <linux/backing-dev.h> |
13 | #include <linux/wait.h> | 13 | #include <linux/wait.h> |
14 | #include <linux/rwsem.h> | 14 | #include <linux/rwsem.h> |
15 | #include <linux/hash.h> | 15 | #include <linux/hash.h> |
16 | #include <linux/swap.h> | 16 | #include <linux/swap.h> |
17 | #include <linux/security.h> | 17 | #include <linux/security.h> |
18 | #include <linux/pagemap.h> | 18 | #include <linux/pagemap.h> |
19 | #include <linux/cdev.h> | 19 | #include <linux/cdev.h> |
20 | #include <linux/bootmem.h> | 20 | #include <linux/bootmem.h> |
21 | #include <linux/fsnotify.h> | 21 | #include <linux/fsnotify.h> |
22 | #include <linux/mount.h> | 22 | #include <linux/mount.h> |
23 | #include <linux/async.h> | 23 | #include <linux/async.h> |
24 | #include <linux/posix_acl.h> | 24 | #include <linux/posix_acl.h> |
25 | #include <linux/prefetch.h> | 25 | #include <linux/prefetch.h> |
26 | #include <linux/ima.h> | 26 | #include <linux/ima.h> |
27 | #include <linux/cred.h> | 27 | #include <linux/cred.h> |
28 | #include <linux/buffer_head.h> /* for inode_has_buffers */ | 28 | #include <linux/buffer_head.h> /* for inode_has_buffers */ |
29 | #include "internal.h" | 29 | #include "internal.h" |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Inode locking rules: | 32 | * Inode locking rules: |
33 | * | 33 | * |
34 | * inode->i_lock protects: | 34 | * inode->i_lock protects: |
35 | * inode->i_state, inode->i_hash, __iget() | 35 | * inode->i_state, inode->i_hash, __iget() |
36 | * inode->i_sb->s_inode_lru_lock protects: | 36 | * inode->i_sb->s_inode_lru_lock protects: |
37 | * inode->i_sb->s_inode_lru, inode->i_lru | 37 | * inode->i_sb->s_inode_lru, inode->i_lru |
38 | * inode_sb_list_lock protects: | 38 | * inode_sb_list_lock protects: |
39 | * sb->s_inodes, inode->i_sb_list | 39 | * sb->s_inodes, inode->i_sb_list |
40 | * inode_wb_list_lock protects: | 40 | * bdi->wb.list_lock protects: |
41 | * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list | 41 | * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list |
42 | * inode_hash_lock protects: | 42 | * inode_hash_lock protects: |
43 | * inode_hashtable, inode->i_hash | 43 | * inode_hashtable, inode->i_hash |
44 | * | 44 | * |
45 | * Lock ordering: | 45 | * Lock ordering: |
46 | * | 46 | * |
47 | * inode_sb_list_lock | 47 | * inode_sb_list_lock |
48 | * inode->i_lock | 48 | * inode->i_lock |
49 | * inode->i_sb->s_inode_lru_lock | 49 | * inode->i_sb->s_inode_lru_lock |
50 | * | 50 | * |
51 | * inode_wb_list_lock | 51 | * bdi->wb.list_lock |
52 | * inode->i_lock | 52 | * inode->i_lock |
53 | * | 53 | * |
54 | * inode_hash_lock | 54 | * inode_hash_lock |
55 | * inode_sb_list_lock | 55 | * inode_sb_list_lock |
56 | * inode->i_lock | 56 | * inode->i_lock |
57 | * | 57 | * |
58 | * iunique_lock | 58 | * iunique_lock |
59 | * inode_hash_lock | 59 | * inode_hash_lock |
60 | */ | 60 | */ |
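The same ordering rules in code form, as a minimal sketch (cf. evict_inodes() below, which follows this nesting): inode_sb_list_lock is taken first and inode->i_lock nests inside it, never the reverse:

        spin_lock(&inode_sb_list_lock);
        spin_lock(&inode->i_lock);              /* i_lock always nests inside */
        /* ... inspect or modify inode->i_state here ... */
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_sb_list_lock);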
61 | 61 | ||
62 | static unsigned int i_hash_mask __read_mostly; | 62 | static unsigned int i_hash_mask __read_mostly; |
63 | static unsigned int i_hash_shift __read_mostly; | 63 | static unsigned int i_hash_shift __read_mostly; |
64 | static struct hlist_head *inode_hashtable __read_mostly; | 64 | static struct hlist_head *inode_hashtable __read_mostly; |
65 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); | 65 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); |
66 | 66 | ||
67 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); | 67 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); |
68 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); | ||
69 | 68 | ||
70 | /* | 69 | /* |
71 | * Empty aops. Can be used for the cases where the user does not | 70 | * Empty aops. Can be used for the cases where the user does not |
72 | * define any of the address_space operations. | 71 | * define any of the address_space operations. |
73 | */ | 72 | */ |
74 | const struct address_space_operations empty_aops = { | 73 | const struct address_space_operations empty_aops = { |
75 | }; | 74 | }; |
76 | EXPORT_SYMBOL(empty_aops); | 75 | EXPORT_SYMBOL(empty_aops); |
77 | 76 | ||
78 | /* | 77 | /* |
79 | * Statistics gathering. | 78 | * Statistics gathering. |
80 | */ | 79 | */ |
81 | struct inodes_stat_t inodes_stat; | 80 | struct inodes_stat_t inodes_stat; |
82 | 81 | ||
83 | static DEFINE_PER_CPU(unsigned int, nr_inodes); | 82 | static DEFINE_PER_CPU(unsigned int, nr_inodes); |
84 | static DEFINE_PER_CPU(unsigned int, nr_unused); | 83 | static DEFINE_PER_CPU(unsigned int, nr_unused); |
85 | 84 | ||
86 | static struct kmem_cache *inode_cachep __read_mostly; | 85 | static struct kmem_cache *inode_cachep __read_mostly; |
87 | 86 | ||
88 | static int get_nr_inodes(void) | 87 | static int get_nr_inodes(void) |
89 | { | 88 | { |
90 | int i; | 89 | int i; |
91 | int sum = 0; | 90 | int sum = 0; |
92 | for_each_possible_cpu(i) | 91 | for_each_possible_cpu(i) |
93 | sum += per_cpu(nr_inodes, i); | 92 | sum += per_cpu(nr_inodes, i); |
94 | return sum < 0 ? 0 : sum; | 93 | return sum < 0 ? 0 : sum; |
95 | } | 94 | } |
96 | 95 | ||
97 | static inline int get_nr_inodes_unused(void) | 96 | static inline int get_nr_inodes_unused(void) |
98 | { | 97 | { |
99 | int i; | 98 | int i; |
100 | int sum = 0; | 99 | int sum = 0; |
101 | for_each_possible_cpu(i) | 100 | for_each_possible_cpu(i) |
102 | sum += per_cpu(nr_unused, i); | 101 | sum += per_cpu(nr_unused, i); |
103 | return sum < 0 ? 0 : sum; | 102 | return sum < 0 ? 0 : sum; |
104 | } | 103 | } |
105 | 104 | ||
106 | int get_nr_dirty_inodes(void) | 105 | int get_nr_dirty_inodes(void) |
107 | { | 106 | { |
108 | /* not actually dirty inodes, but a wild approximation */ | 107 | /* not actually dirty inodes, but a wild approximation */ |
109 | int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); | 108 | int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); |
110 | return nr_dirty > 0 ? nr_dirty : 0; | 109 | return nr_dirty > 0 ? nr_dirty : 0; |
111 | } | 110 | } |
112 | 111 | ||
113 | /* | 112 | /* |
114 | * Handle nr_inode sysctl | 113 | * Handle nr_inode sysctl |
115 | */ | 114 | */ |
116 | #ifdef CONFIG_SYSCTL | 115 | #ifdef CONFIG_SYSCTL |
117 | int proc_nr_inodes(ctl_table *table, int write, | 116 | int proc_nr_inodes(ctl_table *table, int write, |
118 | void __user *buffer, size_t *lenp, loff_t *ppos) | 117 | void __user *buffer, size_t *lenp, loff_t *ppos) |
119 | { | 118 | { |
120 | inodes_stat.nr_inodes = get_nr_inodes(); | 119 | inodes_stat.nr_inodes = get_nr_inodes(); |
121 | inodes_stat.nr_unused = get_nr_inodes_unused(); | 120 | inodes_stat.nr_unused = get_nr_inodes_unused(); |
122 | return proc_dointvec(table, write, buffer, lenp, ppos); | 121 | return proc_dointvec(table, write, buffer, lenp, ppos); |
123 | } | 122 | } |
124 | #endif | 123 | #endif |
125 | 124 | ||
126 | /** | 125 | /** |
127 | * inode_init_always - perform inode structure initialisation | 126 | * inode_init_always - perform inode structure initialisation |
128 | * @sb: superblock inode belongs to | 127 | * @sb: superblock inode belongs to |
129 | * @inode: inode to initialise | 128 | * @inode: inode to initialise |
130 | * | 129 | * |
131 | * These are initializations that need to be done on every inode | 130 | * These are initializations that need to be done on every inode |
132 | * allocation as the fields are not initialised by slab allocation. | 131 | * allocation as the fields are not initialised by slab allocation. |
133 | */ | 132 | */ |
134 | int inode_init_always(struct super_block *sb, struct inode *inode) | 133 | int inode_init_always(struct super_block *sb, struct inode *inode) |
135 | { | 134 | { |
136 | static const struct inode_operations empty_iops; | 135 | static const struct inode_operations empty_iops; |
137 | static const struct file_operations empty_fops; | 136 | static const struct file_operations empty_fops; |
138 | struct address_space *const mapping = &inode->i_data; | 137 | struct address_space *const mapping = &inode->i_data; |
139 | 138 | ||
140 | inode->i_sb = sb; | 139 | inode->i_sb = sb; |
141 | inode->i_blkbits = sb->s_blocksize_bits; | 140 | inode->i_blkbits = sb->s_blocksize_bits; |
142 | inode->i_flags = 0; | 141 | inode->i_flags = 0; |
143 | atomic_set(&inode->i_count, 1); | 142 | atomic_set(&inode->i_count, 1); |
144 | inode->i_op = &empty_iops; | 143 | inode->i_op = &empty_iops; |
145 | inode->i_fop = &empty_fops; | 144 | inode->i_fop = &empty_fops; |
146 | inode->i_nlink = 1; | 145 | inode->i_nlink = 1; |
147 | inode->i_uid = 0; | 146 | inode->i_uid = 0; |
148 | inode->i_gid = 0; | 147 | inode->i_gid = 0; |
149 | atomic_set(&inode->i_writecount, 0); | 148 | atomic_set(&inode->i_writecount, 0); |
150 | inode->i_size = 0; | 149 | inode->i_size = 0; |
151 | inode->i_blocks = 0; | 150 | inode->i_blocks = 0; |
152 | inode->i_bytes = 0; | 151 | inode->i_bytes = 0; |
153 | inode->i_generation = 0; | 152 | inode->i_generation = 0; |
154 | #ifdef CONFIG_QUOTA | 153 | #ifdef CONFIG_QUOTA |
155 | memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); | 154 | memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); |
156 | #endif | 155 | #endif |
157 | inode->i_pipe = NULL; | 156 | inode->i_pipe = NULL; |
158 | inode->i_bdev = NULL; | 157 | inode->i_bdev = NULL; |
159 | inode->i_cdev = NULL; | 158 | inode->i_cdev = NULL; |
160 | inode->i_rdev = 0; | 159 | inode->i_rdev = 0; |
161 | inode->dirtied_when = 0; | 160 | inode->dirtied_when = 0; |
162 | 161 | ||
163 | if (security_inode_alloc(inode)) | 162 | if (security_inode_alloc(inode)) |
164 | goto out; | 163 | goto out; |
165 | spin_lock_init(&inode->i_lock); | 164 | spin_lock_init(&inode->i_lock); |
166 | lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); | 165 | lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); |
167 | 166 | ||
168 | mutex_init(&inode->i_mutex); | 167 | mutex_init(&inode->i_mutex); |
169 | lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); | 168 | lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); |
170 | 169 | ||
171 | atomic_set(&inode->i_dio_count, 0); | 170 | atomic_set(&inode->i_dio_count, 0); |
172 | 171 | ||
173 | mapping->a_ops = &empty_aops; | 172 | mapping->a_ops = &empty_aops; |
174 | mapping->host = inode; | 173 | mapping->host = inode; |
175 | mapping->flags = 0; | 174 | mapping->flags = 0; |
176 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); | 175 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); |
177 | mapping->assoc_mapping = NULL; | 176 | mapping->assoc_mapping = NULL; |
178 | mapping->backing_dev_info = &default_backing_dev_info; | 177 | mapping->backing_dev_info = &default_backing_dev_info; |
179 | mapping->writeback_index = 0; | 178 | mapping->writeback_index = 0; |
180 | 179 | ||
181 | /* | 180 | /* |
182 | * If the block_device provides a backing_dev_info for client | 181 | * If the block_device provides a backing_dev_info for client |
183 | * inodes then use that. Otherwise the inode shares the bdev's | 182 | * inodes then use that. Otherwise the inode shares the bdev's |
184 | * backing_dev_info. | 183 | * backing_dev_info. |
185 | */ | 184 | */ |
186 | if (sb->s_bdev) { | 185 | if (sb->s_bdev) { |
187 | struct backing_dev_info *bdi; | 186 | struct backing_dev_info *bdi; |
188 | 187 | ||
189 | bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; | 188 | bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; |
190 | mapping->backing_dev_info = bdi; | 189 | mapping->backing_dev_info = bdi; |
191 | } | 190 | } |
192 | inode->i_private = NULL; | 191 | inode->i_private = NULL; |
193 | inode->i_mapping = mapping; | 192 | inode->i_mapping = mapping; |
194 | #ifdef CONFIG_FS_POSIX_ACL | 193 | #ifdef CONFIG_FS_POSIX_ACL |
195 | inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; | 194 | inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; |
196 | #endif | 195 | #endif |
197 | 196 | ||
198 | #ifdef CONFIG_FSNOTIFY | 197 | #ifdef CONFIG_FSNOTIFY |
199 | inode->i_fsnotify_mask = 0; | 198 | inode->i_fsnotify_mask = 0; |
200 | #endif | 199 | #endif |
201 | 200 | ||
202 | this_cpu_inc(nr_inodes); | 201 | this_cpu_inc(nr_inodes); |
203 | 202 | ||
204 | return 0; | 203 | return 0; |
205 | out: | 204 | out: |
206 | return -ENOMEM; | 205 | return -ENOMEM; |
207 | } | 206 | } |
208 | EXPORT_SYMBOL(inode_init_always); | 207 | EXPORT_SYMBOL(inode_init_always); |
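A hedged sketch of the out-of-line use this export allows: a filesystem recycling inode memory from its own cache can reinitialise it with inode_init_always(); examplefs_recycle_inode() is hypothetical, and the only failure mode is -ENOMEM from security_inode_alloc():

        static struct inode *examplefs_recycle_inode(struct super_block *sb,
                                                     struct inode *inode)
        {
                if (inode_init_always(sb, inode))
                        return NULL;    /* security_inode_alloc() failed */
                return inode;
        }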
209 | 208 | ||
210 | static struct inode *alloc_inode(struct super_block *sb) | 209 | static struct inode *alloc_inode(struct super_block *sb) |
211 | { | 210 | { |
212 | struct inode *inode; | 211 | struct inode *inode; |
213 | 212 | ||
214 | if (sb->s_op->alloc_inode) | 213 | if (sb->s_op->alloc_inode) |
215 | inode = sb->s_op->alloc_inode(sb); | 214 | inode = sb->s_op->alloc_inode(sb); |
216 | else | 215 | else |
217 | inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); | 216 | inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); |
218 | 217 | ||
219 | if (!inode) | 218 | if (!inode) |
220 | return NULL; | 219 | return NULL; |
221 | 220 | ||
222 | if (unlikely(inode_init_always(sb, inode))) { | 221 | if (unlikely(inode_init_always(sb, inode))) { |
223 | if (inode->i_sb->s_op->destroy_inode) | 222 | if (inode->i_sb->s_op->destroy_inode) |
224 | inode->i_sb->s_op->destroy_inode(inode); | 223 | inode->i_sb->s_op->destroy_inode(inode); |
225 | else | 224 | else |
226 | kmem_cache_free(inode_cachep, inode); | 225 | kmem_cache_free(inode_cachep, inode); |
227 | return NULL; | 226 | return NULL; |
228 | } | 227 | } |
229 | 228 | ||
230 | return inode; | 229 | return inode; |
231 | } | 230 | } |
232 | 231 | ||
233 | void free_inode_nonrcu(struct inode *inode) | 232 | void free_inode_nonrcu(struct inode *inode) |
234 | { | 233 | { |
235 | kmem_cache_free(inode_cachep, inode); | 234 | kmem_cache_free(inode_cachep, inode); |
236 | } | 235 | } |
237 | EXPORT_SYMBOL(free_inode_nonrcu); | 236 | EXPORT_SYMBOL(free_inode_nonrcu); |
238 | 237 | ||
239 | void __destroy_inode(struct inode *inode) | 238 | void __destroy_inode(struct inode *inode) |
240 | { | 239 | { |
241 | BUG_ON(inode_has_buffers(inode)); | 240 | BUG_ON(inode_has_buffers(inode)); |
242 | security_inode_free(inode); | 241 | security_inode_free(inode); |
243 | fsnotify_inode_delete(inode); | 242 | fsnotify_inode_delete(inode); |
244 | #ifdef CONFIG_FS_POSIX_ACL | 243 | #ifdef CONFIG_FS_POSIX_ACL |
245 | if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) | 244 | if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) |
246 | posix_acl_release(inode->i_acl); | 245 | posix_acl_release(inode->i_acl); |
247 | if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) | 246 | if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) |
248 | posix_acl_release(inode->i_default_acl); | 247 | posix_acl_release(inode->i_default_acl); |
249 | #endif | 248 | #endif |
250 | this_cpu_dec(nr_inodes); | 249 | this_cpu_dec(nr_inodes); |
251 | } | 250 | } |
252 | EXPORT_SYMBOL(__destroy_inode); | 251 | EXPORT_SYMBOL(__destroy_inode); |
253 | 252 | ||
254 | static void i_callback(struct rcu_head *head) | 253 | static void i_callback(struct rcu_head *head) |
255 | { | 254 | { |
256 | struct inode *inode = container_of(head, struct inode, i_rcu); | 255 | struct inode *inode = container_of(head, struct inode, i_rcu); |
257 | INIT_LIST_HEAD(&inode->i_dentry); | 256 | INIT_LIST_HEAD(&inode->i_dentry); |
258 | kmem_cache_free(inode_cachep, inode); | 257 | kmem_cache_free(inode_cachep, inode); |
259 | } | 258 | } |
260 | 259 | ||
261 | static void destroy_inode(struct inode *inode) | 260 | static void destroy_inode(struct inode *inode) |
262 | { | 261 | { |
263 | BUG_ON(!list_empty(&inode->i_lru)); | 262 | BUG_ON(!list_empty(&inode->i_lru)); |
264 | __destroy_inode(inode); | 263 | __destroy_inode(inode); |
265 | if (inode->i_sb->s_op->destroy_inode) | 264 | if (inode->i_sb->s_op->destroy_inode) |
266 | inode->i_sb->s_op->destroy_inode(inode); | 265 | inode->i_sb->s_op->destroy_inode(inode); |
267 | else | 266 | else |
268 | call_rcu(&inode->i_rcu, i_callback); | 267 | call_rcu(&inode->i_rcu, i_callback); |
269 | } | 268 | } |
270 | 269 | ||
271 | void address_space_init_once(struct address_space *mapping) | 270 | void address_space_init_once(struct address_space *mapping) |
272 | { | 271 | { |
273 | memset(mapping, 0, sizeof(*mapping)); | 272 | memset(mapping, 0, sizeof(*mapping)); |
274 | INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); | 273 | INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); |
275 | spin_lock_init(&mapping->tree_lock); | 274 | spin_lock_init(&mapping->tree_lock); |
276 | mutex_init(&mapping->i_mmap_mutex); | 275 | mutex_init(&mapping->i_mmap_mutex); |
277 | INIT_LIST_HEAD(&mapping->private_list); | 276 | INIT_LIST_HEAD(&mapping->private_list); |
278 | spin_lock_init(&mapping->private_lock); | 277 | spin_lock_init(&mapping->private_lock); |
279 | INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); | 278 | INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); |
280 | INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); | 279 | INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); |
281 | } | 280 | } |
282 | EXPORT_SYMBOL(address_space_init_once); | 281 | EXPORT_SYMBOL(address_space_init_once); |
283 | 282 | ||
284 | /* | 283 | /* |
285 | * These are initializations that only need to be done | 284 | * These are initializations that only need to be done |
286 | * once, because the fields are idempotent across use | 285 | * once, because the fields are idempotent across use |
287 | * of the inode, so let the slab be aware of that. | 286 | * of the inode, so let the slab be aware of that. |
288 | */ | 287 | */ |
289 | void inode_init_once(struct inode *inode) | 288 | void inode_init_once(struct inode *inode) |
290 | { | 289 | { |
291 | memset(inode, 0, sizeof(*inode)); | 290 | memset(inode, 0, sizeof(*inode)); |
292 | INIT_HLIST_NODE(&inode->i_hash); | 291 | INIT_HLIST_NODE(&inode->i_hash); |
293 | INIT_LIST_HEAD(&inode->i_dentry); | 292 | INIT_LIST_HEAD(&inode->i_dentry); |
294 | INIT_LIST_HEAD(&inode->i_devices); | 293 | INIT_LIST_HEAD(&inode->i_devices); |
295 | INIT_LIST_HEAD(&inode->i_wb_list); | 294 | INIT_LIST_HEAD(&inode->i_wb_list); |
296 | INIT_LIST_HEAD(&inode->i_lru); | 295 | INIT_LIST_HEAD(&inode->i_lru); |
297 | address_space_init_once(&inode->i_data); | 296 | address_space_init_once(&inode->i_data); |
298 | i_size_ordered_init(inode); | 297 | i_size_ordered_init(inode); |
299 | #ifdef CONFIG_FSNOTIFY | 298 | #ifdef CONFIG_FSNOTIFY |
300 | INIT_HLIST_HEAD(&inode->i_fsnotify_marks); | 299 | INIT_HLIST_HEAD(&inode->i_fsnotify_marks); |
301 | #endif | 300 | #endif |
302 | } | 301 | } |
303 | EXPORT_SYMBOL(inode_init_once); | 302 | EXPORT_SYMBOL(inode_init_once); |
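A sketch of the classic consumer of this once-only initialiser: a filesystem's slab constructor, assuming a hypothetical struct examplefs_inode that embeds struct inode as vfs_inode:

        static struct kmem_cache *examplefs_inode_cachep;

        static void examplefs_init_once(void *foo)
        {
                struct examplefs_inode *ei = foo;

                inode_init_once(&ei->vfs_inode);
        }

        static int __init examplefs_init_inodecache(void)
        {
                examplefs_inode_cachep = kmem_cache_create("examplefs_inode_cache",
                                        sizeof(struct examplefs_inode), 0,
                                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
                                        examplefs_init_once);
                return examplefs_inode_cachep ? 0 : -ENOMEM;
        }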
304 | 303 | ||
305 | static void init_once(void *foo) | 304 | static void init_once(void *foo) |
306 | { | 305 | { |
307 | struct inode *inode = (struct inode *) foo; | 306 | struct inode *inode = (struct inode *) foo; |
308 | 307 | ||
309 | inode_init_once(inode); | 308 | inode_init_once(inode); |
310 | } | 309 | } |
311 | 310 | ||
312 | /* | 311 | /* |
313 | * inode->i_lock must be held | 312 | * inode->i_lock must be held |
314 | */ | 313 | */ |
315 | void __iget(struct inode *inode) | 314 | void __iget(struct inode *inode) |
316 | { | 315 | { |
317 | atomic_inc(&inode->i_count); | 316 | atomic_inc(&inode->i_count); |
318 | } | 317 | } |
319 | 318 | ||
320 | /* | 319 | /* |
321 | * get additional reference to inode; caller must already hold one. | 320 | * get additional reference to inode; caller must already hold one. |
322 | */ | 321 | */ |
323 | void ihold(struct inode *inode) | 322 | void ihold(struct inode *inode) |
324 | { | 323 | { |
325 | WARN_ON(atomic_inc_return(&inode->i_count) < 2); | 324 | WARN_ON(atomic_inc_return(&inode->i_count) < 2); |
326 | } | 325 | } |
327 | EXPORT_SYMBOL(ihold); | 326 | EXPORT_SYMBOL(ihold); |
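A minimal sketch of the contract: ihold() is only legal while the caller already owns a reference, and the extra reference must eventually be balanced with iput(); example_pin_for_later() is hypothetical:

        static void example_pin_for_later(struct inode *inode)
        {
                ihold(inode);   /* we already hold one reference */
                /* ... hand inode to a deferred consumer, which does iput() ... */
        }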
328 | 327 | ||
329 | static void inode_lru_list_add(struct inode *inode) | 328 | static void inode_lru_list_add(struct inode *inode) |
330 | { | 329 | { |
331 | spin_lock(&inode->i_sb->s_inode_lru_lock); | 330 | spin_lock(&inode->i_sb->s_inode_lru_lock); |
332 | if (list_empty(&inode->i_lru)) { | 331 | if (list_empty(&inode->i_lru)) { |
333 | list_add(&inode->i_lru, &inode->i_sb->s_inode_lru); | 332 | list_add(&inode->i_lru, &inode->i_sb->s_inode_lru); |
334 | inode->i_sb->s_nr_inodes_unused++; | 333 | inode->i_sb->s_nr_inodes_unused++; |
335 | this_cpu_inc(nr_unused); | 334 | this_cpu_inc(nr_unused); |
336 | } | 335 | } |
337 | spin_unlock(&inode->i_sb->s_inode_lru_lock); | 336 | spin_unlock(&inode->i_sb->s_inode_lru_lock); |
338 | } | 337 | } |
339 | 338 | ||
340 | static void inode_lru_list_del(struct inode *inode) | 339 | static void inode_lru_list_del(struct inode *inode) |
341 | { | 340 | { |
342 | spin_lock(&inode->i_sb->s_inode_lru_lock); | 341 | spin_lock(&inode->i_sb->s_inode_lru_lock); |
343 | if (!list_empty(&inode->i_lru)) { | 342 | if (!list_empty(&inode->i_lru)) { |
344 | list_del_init(&inode->i_lru); | 343 | list_del_init(&inode->i_lru); |
345 | inode->i_sb->s_nr_inodes_unused--; | 344 | inode->i_sb->s_nr_inodes_unused--; |
346 | this_cpu_dec(nr_unused); | 345 | this_cpu_dec(nr_unused); |
347 | } | 346 | } |
348 | spin_unlock(&inode->i_sb->s_inode_lru_lock); | 347 | spin_unlock(&inode->i_sb->s_inode_lru_lock); |
349 | } | 348 | } |
350 | 349 | ||
351 | /** | 350 | /** |
352 | * inode_sb_list_add - add inode to the superblock list of inodes | 351 | * inode_sb_list_add - add inode to the superblock list of inodes |
353 | * @inode: inode to add | 352 | * @inode: inode to add |
354 | */ | 353 | */ |
355 | void inode_sb_list_add(struct inode *inode) | 354 | void inode_sb_list_add(struct inode *inode) |
356 | { | 355 | { |
357 | spin_lock(&inode_sb_list_lock); | 356 | spin_lock(&inode_sb_list_lock); |
358 | list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); | 357 | list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); |
359 | spin_unlock(&inode_sb_list_lock); | 358 | spin_unlock(&inode_sb_list_lock); |
360 | } | 359 | } |
361 | EXPORT_SYMBOL_GPL(inode_sb_list_add); | 360 | EXPORT_SYMBOL_GPL(inode_sb_list_add); |
362 | 361 | ||
363 | static inline void inode_sb_list_del(struct inode *inode) | 362 | static inline void inode_sb_list_del(struct inode *inode) |
364 | { | 363 | { |
365 | spin_lock(&inode_sb_list_lock); | 364 | spin_lock(&inode_sb_list_lock); |
366 | list_del_init(&inode->i_sb_list); | 365 | list_del_init(&inode->i_sb_list); |
367 | spin_unlock(&inode_sb_list_lock); | 366 | spin_unlock(&inode_sb_list_lock); |
368 | } | 367 | } |
369 | 368 | ||
370 | static unsigned long hash(struct super_block *sb, unsigned long hashval) | 369 | static unsigned long hash(struct super_block *sb, unsigned long hashval) |
371 | { | 370 | { |
372 | unsigned long tmp; | 371 | unsigned long tmp; |
373 | 372 | ||
374 | tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / | 373 | tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / |
375 | L1_CACHE_BYTES; | 374 | L1_CACHE_BYTES; |
376 | tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); | 375 | tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); |
377 | return tmp & i_hash_mask; | 376 | return tmp & i_hash_mask; |
378 | } | 377 | } |
379 | 378 | ||
380 | /** | 379 | /** |
381 | * __insert_inode_hash - hash an inode | 380 | * __insert_inode_hash - hash an inode |
382 | * @inode: unhashed inode | 381 | * @inode: unhashed inode |
383 | * @hashval: unsigned long value used to locate this object in the | 382 | * @hashval: unsigned long value used to locate this object in the |
384 | * inode_hashtable. | 383 | * inode_hashtable. |
385 | * | 384 | * |
386 | * Add an inode to the inode hash for this superblock. | 385 | * Add an inode to the inode hash for this superblock. |
387 | */ | 386 | */ |
388 | void __insert_inode_hash(struct inode *inode, unsigned long hashval) | 387 | void __insert_inode_hash(struct inode *inode, unsigned long hashval) |
389 | { | 388 | { |
390 | struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); | 389 | struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); |
391 | 390 | ||
392 | spin_lock(&inode_hash_lock); | 391 | spin_lock(&inode_hash_lock); |
393 | spin_lock(&inode->i_lock); | 392 | spin_lock(&inode->i_lock); |
394 | hlist_add_head(&inode->i_hash, b); | 393 | hlist_add_head(&inode->i_hash, b); |
395 | spin_unlock(&inode->i_lock); | 394 | spin_unlock(&inode->i_lock); |
396 | spin_unlock(&inode_hash_lock); | 395 | spin_unlock(&inode_hash_lock); |
397 | } | 396 | } |
398 | EXPORT_SYMBOL(__insert_inode_hash); | 397 | EXPORT_SYMBOL(__insert_inode_hash); |
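A sketch tying this to the "hash before dirtying" rule documented above __mark_inode_dirty(): most filesystems go through the insert_inode_hash() wrapper, which passes inode->i_ino as the hash value; example_install_inode() is hypothetical:

        static void example_install_inode(struct inode *inode, unsigned long ino)
        {
                inode->i_ino = ino;
                insert_inode_hash(inode);       /* __insert_inode_hash(inode, inode->i_ino) */
                mark_inode_dirty(inode);        /* safe: inode is hashed first */
        }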
399 | 398 | ||
400 | /** | 399 | /** |
401 | * remove_inode_hash - remove an inode from the hash | 400 | * remove_inode_hash - remove an inode from the hash |
402 | * @inode: inode to unhash | 401 | * @inode: inode to unhash |
403 | * | 402 | * |
404 | * Remove an inode from the superblock. | 403 | * Remove an inode from the superblock. |
405 | */ | 404 | */ |
406 | void remove_inode_hash(struct inode *inode) | 405 | void remove_inode_hash(struct inode *inode) |
407 | { | 406 | { |
408 | spin_lock(&inode_hash_lock); | 407 | spin_lock(&inode_hash_lock); |
409 | spin_lock(&inode->i_lock); | 408 | spin_lock(&inode->i_lock); |
410 | hlist_del_init(&inode->i_hash); | 409 | hlist_del_init(&inode->i_hash); |
411 | spin_unlock(&inode->i_lock); | 410 | spin_unlock(&inode->i_lock); |
412 | spin_unlock(&inode_hash_lock); | 411 | spin_unlock(&inode_hash_lock); |
413 | } | 412 | } |
414 | EXPORT_SYMBOL(remove_inode_hash); | 413 | EXPORT_SYMBOL(remove_inode_hash); |
415 | 414 | ||
416 | void end_writeback(struct inode *inode) | 415 | void end_writeback(struct inode *inode) |
417 | { | 416 | { |
418 | might_sleep(); | 417 | might_sleep(); |
419 | /* | 418 | /* |
420 | * We have to cycle tree_lock here because reclaim can still be in the | 419 | * We have to cycle tree_lock here because reclaim can still be in the |
421 | * process of removing the last page (in __delete_from_page_cache()) | 420 | * process of removing the last page (in __delete_from_page_cache()) |
422 | * and we must not free mapping under it. | 421 | * and we must not free mapping under it. |
423 | */ | 422 | */ |
424 | spin_lock_irq(&inode->i_data.tree_lock); | 423 | spin_lock_irq(&inode->i_data.tree_lock); |
425 | BUG_ON(inode->i_data.nrpages); | 424 | BUG_ON(inode->i_data.nrpages); |
426 | spin_unlock_irq(&inode->i_data.tree_lock); | 425 | spin_unlock_irq(&inode->i_data.tree_lock); |
427 | BUG_ON(!list_empty(&inode->i_data.private_list)); | 426 | BUG_ON(!list_empty(&inode->i_data.private_list)); |
428 | BUG_ON(!(inode->i_state & I_FREEING)); | 427 | BUG_ON(!(inode->i_state & I_FREEING)); |
429 | BUG_ON(inode->i_state & I_CLEAR); | 428 | BUG_ON(inode->i_state & I_CLEAR); |
430 | inode_sync_wait(inode); | 429 | inode_sync_wait(inode); |
431 | /* don't need i_lock here, no concurrent mods to i_state */ | 430 | /* don't need i_lock here, no concurrent mods to i_state */ |
432 | inode->i_state = I_FREEING | I_CLEAR; | 431 | inode->i_state = I_FREEING | I_CLEAR; |
433 | } | 432 | } |
434 | EXPORT_SYMBOL(end_writeback); | 433 | EXPORT_SYMBOL(end_writeback); |
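A minimal sketch of a ->evict_inode implementation built on this helper, mirroring the default path in evict() below; examplefs_evict_inode() is hypothetical:

        static void examplefs_evict_inode(struct inode *inode)
        {
                truncate_inode_pages(&inode->i_data, 0);
                end_writeback(inode);   /* leaves i_state == I_FREEING | I_CLEAR */
                /* release fs-private state here */
        }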
435 | 434 | ||
436 | /* | 435 | /* |
437 | * Free the inode passed in, removing it from the lists it is still connected | 436 | * Free the inode passed in, removing it from the lists it is still connected |
438 | * to. We remove any pages still attached to the inode and wait for any IO that | 437 | * to. We remove any pages still attached to the inode and wait for any IO that |
439 | * is still in progress before finally destroying the inode. | 438 | * is still in progress before finally destroying the inode. |
440 | * | 439 | * |
441 | * An inode must already be marked I_FREEING so that we avoid the inode being | 440 | * An inode must already be marked I_FREEING so that we avoid the inode being |
442 | * moved back onto lists if we race with other code that manipulates the lists | 441 | * moved back onto lists if we race with other code that manipulates the lists |
443 | * (e.g. writeback_single_inode). The caller is responsible for setting this. | 442 | * (e.g. writeback_single_inode). The caller is responsible for setting this. |
444 | * | 443 | * |
445 | * An inode must already be removed from the LRU list before being evicted from | 444 | * An inode must already be removed from the LRU list before being evicted from |
446 | * the cache. This should occur atomically with setting the I_FREEING state | 445 | * the cache. This should occur atomically with setting the I_FREEING state |
447 | * flag, so no inodes here should ever be on the LRU when being evicted. | 446 | * flag, so no inodes here should ever be on the LRU when being evicted. |
448 | */ | 447 | */ |
449 | static void evict(struct inode *inode) | 448 | static void evict(struct inode *inode) |
450 | { | 449 | { |
451 | const struct super_operations *op = inode->i_sb->s_op; | 450 | const struct super_operations *op = inode->i_sb->s_op; |
452 | 451 | ||
453 | BUG_ON(!(inode->i_state & I_FREEING)); | 452 | BUG_ON(!(inode->i_state & I_FREEING)); |
454 | BUG_ON(!list_empty(&inode->i_lru)); | 453 | BUG_ON(!list_empty(&inode->i_lru)); |
455 | 454 | ||
456 | inode_wb_list_del(inode); | 455 | inode_wb_list_del(inode); |
457 | inode_sb_list_del(inode); | 456 | inode_sb_list_del(inode); |
458 | 457 | ||
459 | if (op->evict_inode) { | 458 | if (op->evict_inode) { |
460 | op->evict_inode(inode); | 459 | op->evict_inode(inode); |
461 | } else { | 460 | } else { |
462 | if (inode->i_data.nrpages) | 461 | if (inode->i_data.nrpages) |
463 | truncate_inode_pages(&inode->i_data, 0); | 462 | truncate_inode_pages(&inode->i_data, 0); |
464 | end_writeback(inode); | 463 | end_writeback(inode); |
465 | } | 464 | } |
466 | if (S_ISBLK(inode->i_mode) && inode->i_bdev) | 465 | if (S_ISBLK(inode->i_mode) && inode->i_bdev) |
467 | bd_forget(inode); | 466 | bd_forget(inode); |
468 | if (S_ISCHR(inode->i_mode) && inode->i_cdev) | 467 | if (S_ISCHR(inode->i_mode) && inode->i_cdev) |
469 | cd_forget(inode); | 468 | cd_forget(inode); |
470 | 469 | ||
471 | remove_inode_hash(inode); | 470 | remove_inode_hash(inode); |
472 | 471 | ||
473 | spin_lock(&inode->i_lock); | 472 | spin_lock(&inode->i_lock); |
474 | wake_up_bit(&inode->i_state, __I_NEW); | 473 | wake_up_bit(&inode->i_state, __I_NEW); |
475 | BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); | 474 | BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); |
476 | spin_unlock(&inode->i_lock); | 475 | spin_unlock(&inode->i_lock); |
477 | 476 | ||
478 | destroy_inode(inode); | 477 | destroy_inode(inode); |
479 | } | 478 | } |
480 | 479 | ||
481 | /* | 480 | /* |
482 | * dispose_list - dispose of the contents of a local list | 481 | * dispose_list - dispose of the contents of a local list |
483 | * @head: the head of the list to free | 482 | * @head: the head of the list to free |
484 | * | 483 | * |
485 | * Dispose-list gets a local list with local inodes in it, so it doesn't | 484 | * Dispose-list gets a local list with local inodes in it, so it doesn't |
486 | * need to worry about list corruption and SMP locks. | 485 | * need to worry about list corruption and SMP locks. |
487 | */ | 486 | */ |
488 | static void dispose_list(struct list_head *head) | 487 | static void dispose_list(struct list_head *head) |
489 | { | 488 | { |
490 | while (!list_empty(head)) { | 489 | while (!list_empty(head)) { |
491 | struct inode *inode; | 490 | struct inode *inode; |
492 | 491 | ||
493 | inode = list_first_entry(head, struct inode, i_lru); | 492 | inode = list_first_entry(head, struct inode, i_lru); |
494 | list_del_init(&inode->i_lru); | 493 | list_del_init(&inode->i_lru); |
495 | 494 | ||
496 | evict(inode); | 495 | evict(inode); |
497 | } | 496 | } |
498 | } | 497 | } |
499 | 498 | ||
500 | /** | 499 | /** |
501 | * evict_inodes - evict all evictable inodes for a superblock | 500 | * evict_inodes - evict all evictable inodes for a superblock |
502 | * @sb: superblock to operate on | 501 | * @sb: superblock to operate on |
503 | * | 502 | * |
504 | * Make sure that no inodes with zero refcount are retained. This is | 503 | * Make sure that no inodes with zero refcount are retained. This is |
505 | * called by superblock shutdown after having MS_ACTIVE flag removed, | 504 | * called by superblock shutdown after having MS_ACTIVE flag removed, |
506 | * so any inode reaching zero refcount during or after that call will | 505 | * so any inode reaching zero refcount during or after that call will |
507 | * be immediately evicted. | 506 | * be immediately evicted. |
508 | */ | 507 | */ |
509 | void evict_inodes(struct super_block *sb) | 508 | void evict_inodes(struct super_block *sb) |
510 | { | 509 | { |
511 | struct inode *inode, *next; | 510 | struct inode *inode, *next; |
512 | LIST_HEAD(dispose); | 511 | LIST_HEAD(dispose); |
513 | 512 | ||
514 | spin_lock(&inode_sb_list_lock); | 513 | spin_lock(&inode_sb_list_lock); |
515 | list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { | 514 | list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { |
516 | if (atomic_read(&inode->i_count)) | 515 | if (atomic_read(&inode->i_count)) |
517 | continue; | 516 | continue; |
518 | 517 | ||
519 | spin_lock(&inode->i_lock); | 518 | spin_lock(&inode->i_lock); |
520 | if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { | 519 | if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { |
521 | spin_unlock(&inode->i_lock); | 520 | spin_unlock(&inode->i_lock); |
522 | continue; | 521 | continue; |
523 | } | 522 | } |
524 | 523 | ||
525 | inode->i_state |= I_FREEING; | 524 | inode->i_state |= I_FREEING; |
526 | inode_lru_list_del(inode); | 525 | inode_lru_list_del(inode); |
527 | spin_unlock(&inode->i_lock); | 526 | spin_unlock(&inode->i_lock); |
528 | list_add(&inode->i_lru, &dispose); | 527 | list_add(&inode->i_lru, &dispose); |
529 | } | 528 | } |
530 | spin_unlock(&inode_sb_list_lock); | 529 | spin_unlock(&inode_sb_list_lock); |
531 | 530 | ||
532 | dispose_list(&dispose); | 531 | dispose_list(&dispose); |
533 | } | 532 | } |
534 | 533 | ||
535 | /** | 534 | /** |
536 | * invalidate_inodes - attempt to free all inodes on a superblock | 535 | * invalidate_inodes - attempt to free all inodes on a superblock |
537 | * @sb: superblock to operate on | 536 | * @sb: superblock to operate on |
538 | * @kill_dirty: flag to guide handling of dirty inodes | 537 | * @kill_dirty: flag to guide handling of dirty inodes |
539 | * | 538 | * |
540 | * Attempts to free all inodes for a given superblock. If there were any | 539 | * Attempts to free all inodes for a given superblock. If there were any |
541 | * busy inodes return a non-zero value, else zero. | 540 | * busy inodes return a non-zero value, else zero. |
542 | * If @kill_dirty is set, discard dirty inodes too, otherwise treat | 541 | * If @kill_dirty is set, discard dirty inodes too, otherwise treat |
543 | * them as busy. | 542 | * them as busy. |
544 | */ | 543 | */ |
545 | int invalidate_inodes(struct super_block *sb, bool kill_dirty) | 544 | int invalidate_inodes(struct super_block *sb, bool kill_dirty) |
546 | { | 545 | { |
547 | int busy = 0; | 546 | int busy = 0; |
548 | struct inode *inode, *next; | 547 | struct inode *inode, *next; |
549 | LIST_HEAD(dispose); | 548 | LIST_HEAD(dispose); |
550 | 549 | ||
551 | spin_lock(&inode_sb_list_lock); | 550 | spin_lock(&inode_sb_list_lock); |
552 | list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { | 551 | list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { |
553 | spin_lock(&inode->i_lock); | 552 | spin_lock(&inode->i_lock); |
554 | if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { | 553 | if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { |
555 | spin_unlock(&inode->i_lock); | 554 | spin_unlock(&inode->i_lock); |
556 | continue; | 555 | continue; |
557 | } | 556 | } |
558 | if (inode->i_state & I_DIRTY && !kill_dirty) { | 557 | if (inode->i_state & I_DIRTY && !kill_dirty) { |
559 | spin_unlock(&inode->i_lock); | 558 | spin_unlock(&inode->i_lock); |
560 | busy = 1; | 559 | busy = 1; |
561 | continue; | 560 | continue; |
562 | } | 561 | } |
563 | if (atomic_read(&inode->i_count)) { | 562 | if (atomic_read(&inode->i_count)) { |
564 | spin_unlock(&inode->i_lock); | 563 | spin_unlock(&inode->i_lock); |
565 | busy = 1; | 564 | busy = 1; |
566 | continue; | 565 | continue; |
567 | } | 566 | } |
568 | 567 | ||
569 | inode->i_state |= I_FREEING; | 568 | inode->i_state |= I_FREEING; |
570 | inode_lru_list_del(inode); | 569 | inode_lru_list_del(inode); |
571 | spin_unlock(&inode->i_lock); | 570 | spin_unlock(&inode->i_lock); |
572 | list_add(&inode->i_lru, &dispose); | 571 | list_add(&inode->i_lru, &dispose); |
573 | } | 572 | } |
574 | spin_unlock(&inode_sb_list_lock); | 573 | spin_unlock(&inode_sb_list_lock); |
575 | 574 | ||
576 | dispose_list(&dispose); | 575 | dispose_list(&dispose); |
577 | 576 | ||
578 | return busy; | 577 | return busy; |
579 | } | 578 | } |
580 | 579 | ||
581 | static int can_unuse(struct inode *inode) | 580 | static int can_unuse(struct inode *inode) |
582 | { | 581 | { |
583 | if (inode->i_state & ~I_REFERENCED) | 582 | if (inode->i_state & ~I_REFERENCED) |
584 | return 0; | 583 | return 0; |
585 | if (inode_has_buffers(inode)) | 584 | if (inode_has_buffers(inode)) |
586 | return 0; | 585 | return 0; |
587 | if (atomic_read(&inode->i_count)) | 586 | if (atomic_read(&inode->i_count)) |
588 | return 0; | 587 | return 0; |
589 | if (inode->i_data.nrpages) | 588 | if (inode->i_data.nrpages) |
590 | return 0; | 589 | return 0; |
591 | return 1; | 590 | return 1; |
592 | } | 591 | } |
593 | 592 | ||
594 | /* | 593 | /* |
595 | * Walk the superblock inode LRU for freeable inodes and attempt to free them. | 594 | * Walk the superblock inode LRU for freeable inodes and attempt to free them. |
596 | * This is called from the superblock shrinker function with a number of inodes | 595 | * This is called from the superblock shrinker function with a number of inodes |
597 | * to trim from the LRU. Inodes to be freed are moved to a temporary list and | 596 | * to trim from the LRU. Inodes to be freed are moved to a temporary list and |
598 | * then are freed outside inode_lock by dispose_list(). | 597 | * then are freed outside inode_lock by dispose_list(). |
599 | * | 598 | * |
600 | * Any inodes which are pinned purely because of attached pagecache have their | 599 | * Any inodes which are pinned purely because of attached pagecache have their |
601 | * pagecache removed. If the inode has metadata buffers attached to | 600 | * pagecache removed. If the inode has metadata buffers attached to |
602 | * mapping->private_list then try to remove them. | 601 | * mapping->private_list then try to remove them. |
603 | * | 602 | * |
604 | * If the inode has the I_REFERENCED flag set, then it means that it has been | 603 | * If the inode has the I_REFERENCED flag set, then it means that it has been |
605 | * used recently - the flag is set in iput_final(). When we encounter such an | 604 | * used recently - the flag is set in iput_final(). When we encounter such an |
606 | * inode, clear the flag and move it to the back of the LRU so it gets another | 605 | * inode, clear the flag and move it to the back of the LRU so it gets another |
607 | * pass through the LRU before it gets reclaimed. This is necessary because of | 606 | * pass through the LRU before it gets reclaimed. This is necessary because of |
608 | * the fact we are doing lazy LRU updates to minimise lock contention so the | 607 | * the fact we are doing lazy LRU updates to minimise lock contention so the |
609 | * LRU does not have strict ordering. Hence we don't want to reclaim inodes | 608 | * LRU does not have strict ordering. Hence we don't want to reclaim inodes |
610 | * with this flag set because they are the inodes that are out of order. | 609 | * with this flag set because they are the inodes that are out of order. |
611 | */ | 610 | */ |
612 | void prune_icache_sb(struct super_block *sb, int nr_to_scan) | 611 | void prune_icache_sb(struct super_block *sb, int nr_to_scan) |
613 | { | 612 | { |
614 | LIST_HEAD(freeable); | 613 | LIST_HEAD(freeable); |
615 | int nr_scanned; | 614 | int nr_scanned; |
616 | unsigned long reap = 0; | 615 | unsigned long reap = 0; |
617 | 616 | ||
618 | spin_lock(&sb->s_inode_lru_lock); | 617 | spin_lock(&sb->s_inode_lru_lock); |
619 | for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { | 618 | for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { |
620 | struct inode *inode; | 619 | struct inode *inode; |
621 | 620 | ||
622 | if (list_empty(&sb->s_inode_lru)) | 621 | if (list_empty(&sb->s_inode_lru)) |
623 | break; | 622 | break; |
624 | 623 | ||
625 | inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); | 624 | inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); |
626 | 625 | ||
627 | /* | 626 | /* |
628 | * we are inverting the sb->s_inode_lru_lock/inode->i_lock lock order here, | 627 | * we are inverting the sb->s_inode_lru_lock/inode->i_lock lock order here, |
629 | * so use a trylock. If we fail to get the lock, just move the | 628 | * so use a trylock. If we fail to get the lock, just move the |
630 | * inode to the back of the list so we don't spin on it. | 629 | * inode to the back of the list so we don't spin on it. |
631 | */ | 630 | */ |
632 | if (!spin_trylock(&inode->i_lock)) { | 631 | if (!spin_trylock(&inode->i_lock)) { |
633 | list_move(&inode->i_lru, &sb->s_inode_lru); | 632 | list_move(&inode->i_lru, &sb->s_inode_lru); |
634 | continue; | 633 | continue; |
635 | } | 634 | } |
636 | 635 | ||
637 | /* | 636 | /* |
638 | * Referenced or dirty inodes are still in use. Give them | 637 | * Referenced or dirty inodes are still in use. Give them |
639 | * another pass through the LRU as we cannot reclaim them now. | 638 | * another pass through the LRU as we cannot reclaim them now. |
640 | */ | 639 | */ |
641 | if (atomic_read(&inode->i_count) || | 640 | if (atomic_read(&inode->i_count) || |
642 | (inode->i_state & ~I_REFERENCED)) { | 641 | (inode->i_state & ~I_REFERENCED)) { |
643 | list_del_init(&inode->i_lru); | 642 | list_del_init(&inode->i_lru); |
644 | spin_unlock(&inode->i_lock); | 643 | spin_unlock(&inode->i_lock); |
645 | sb->s_nr_inodes_unused--; | 644 | sb->s_nr_inodes_unused--; |
646 | this_cpu_dec(nr_unused); | 645 | this_cpu_dec(nr_unused); |
647 | continue; | 646 | continue; |
648 | } | 647 | } |
649 | 648 | ||
650 | /* recently referenced inodes get one more pass */ | 649 | /* recently referenced inodes get one more pass */ |
651 | if (inode->i_state & I_REFERENCED) { | 650 | if (inode->i_state & I_REFERENCED) { |
652 | inode->i_state &= ~I_REFERENCED; | 651 | inode->i_state &= ~I_REFERENCED; |
653 | list_move(&inode->i_lru, &sb->s_inode_lru); | 652 | list_move(&inode->i_lru, &sb->s_inode_lru); |
654 | spin_unlock(&inode->i_lock); | 653 | spin_unlock(&inode->i_lock); |
655 | continue; | 654 | continue; |
656 | } | 655 | } |
657 | if (inode_has_buffers(inode) || inode->i_data.nrpages) { | 656 | if (inode_has_buffers(inode) || inode->i_data.nrpages) { |
658 | __iget(inode); | 657 | __iget(inode); |
659 | spin_unlock(&inode->i_lock); | 658 | spin_unlock(&inode->i_lock); |
660 | spin_unlock(&sb->s_inode_lru_lock); | 659 | spin_unlock(&sb->s_inode_lru_lock); |
661 | if (remove_inode_buffers(inode)) | 660 | if (remove_inode_buffers(inode)) |
662 | reap += invalidate_mapping_pages(&inode->i_data, | 661 | reap += invalidate_mapping_pages(&inode->i_data, |
663 | 0, -1); | 662 | 0, -1); |
664 | iput(inode); | 663 | iput(inode); |
665 | spin_lock(&sb->s_inode_lru_lock); | 664 | spin_lock(&sb->s_inode_lru_lock); |
666 | 665 | ||
667 | if (inode != list_entry(sb->s_inode_lru.next, | 666 | if (inode != list_entry(sb->s_inode_lru.next, |
668 | struct inode, i_lru)) | 667 | struct inode, i_lru)) |
669 | continue; /* wrong inode or list_empty */ | 668 | continue; /* wrong inode or list_empty */ |
670 | /* avoid lock inversions with trylock */ | 669 | /* avoid lock inversions with trylock */ |
671 | if (!spin_trylock(&inode->i_lock)) | 670 | if (!spin_trylock(&inode->i_lock)) |
672 | continue; | 671 | continue; |
673 | if (!can_unuse(inode)) { | 672 | if (!can_unuse(inode)) { |
674 | spin_unlock(&inode->i_lock); | 673 | spin_unlock(&inode->i_lock); |
675 | continue; | 674 | continue; |
676 | } | 675 | } |
677 | } | 676 | } |
678 | WARN_ON(inode->i_state & I_NEW); | 677 | WARN_ON(inode->i_state & I_NEW); |
679 | inode->i_state |= I_FREEING; | 678 | inode->i_state |= I_FREEING; |
680 | spin_unlock(&inode->i_lock); | 679 | spin_unlock(&inode->i_lock); |
681 | 680 | ||
682 | list_move(&inode->i_lru, &freeable); | 681 | list_move(&inode->i_lru, &freeable); |
683 | sb->s_nr_inodes_unused--; | 682 | sb->s_nr_inodes_unused--; |
684 | this_cpu_dec(nr_unused); | 683 | this_cpu_dec(nr_unused); |
685 | } | 684 | } |
686 | if (current_is_kswapd()) | 685 | if (current_is_kswapd()) |
687 | __count_vm_events(KSWAPD_INODESTEAL, reap); | 686 | __count_vm_events(KSWAPD_INODESTEAL, reap); |
688 | else | 687 | else |
689 | __count_vm_events(PGINODESTEAL, reap); | 688 | __count_vm_events(PGINODESTEAL, reap); |
690 | spin_unlock(&sb->s_inode_lru_lock); | 689 | spin_unlock(&sb->s_inode_lru_lock); |
691 | 690 | ||
692 | dispose_list(&freeable); | 691 | dispose_list(&freeable); |
693 | } | 692 | } |
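The loop above is essentially a second-chance (clock-style) LRU scan: entries touched since the last pass lose their reference bit and are rotated back for another pass, while untouched entries are unlinked onto a private dispose list that is freed outside the lock. A minimal user-space sketch of that policy (all names hypothetical, not kernel API):

#include <stddef.h>

struct entry {
	int id;
	int referenced;			/* analogue of I_REFERENCED */
	struct entry *next;
};

/*
 * Scan up to nr_to_scan entries from the head of *lru.  Referenced
 * entries get one more pass: clear the bit and rotate to the tail.
 * Unreferenced entries are unlinked onto *dispose for later freeing.
 */
static void prune(struct entry **lru, struct entry **dispose, int nr_to_scan)
{
	while (nr_to_scan-- > 0 && *lru) {
		struct entry *e = *lru;
		struct entry **tail;

		*lru = e->next;
		if (e->referenced) {
			e->referenced = 0;
			for (tail = lru; *tail; tail = &(*tail)->next)
				;
			e->next = NULL;
			*tail = e;		/* second chance */
		} else {
			e->next = *dispose;	/* reclaimable */
			*dispose = e;
		}
	}
}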
694 | 693 | ||
695 | static void __wait_on_freeing_inode(struct inode *inode); | 694 | static void __wait_on_freeing_inode(struct inode *inode); |
696 | /* | 695 | /* |
697 | * Called with the inode_hash_lock held. | 696 | * Called with the inode_hash_lock held. |
698 | */ | 697 | */ |
699 | static struct inode *find_inode(struct super_block *sb, | 698 | static struct inode *find_inode(struct super_block *sb, |
700 | struct hlist_head *head, | 699 | struct hlist_head *head, |
701 | int (*test)(struct inode *, void *), | 700 | int (*test)(struct inode *, void *), |
702 | void *data) | 701 | void *data) |
703 | { | 702 | { |
704 | struct hlist_node *node; | 703 | struct hlist_node *node; |
705 | struct inode *inode = NULL; | 704 | struct inode *inode = NULL; |
706 | 705 | ||
707 | repeat: | 706 | repeat: |
708 | hlist_for_each_entry(inode, node, head, i_hash) { | 707 | hlist_for_each_entry(inode, node, head, i_hash) { |
709 | spin_lock(&inode->i_lock); | 708 | spin_lock(&inode->i_lock); |
710 | if (inode->i_sb != sb) { | 709 | if (inode->i_sb != sb) { |
711 | spin_unlock(&inode->i_lock); | 710 | spin_unlock(&inode->i_lock); |
712 | continue; | 711 | continue; |
713 | } | 712 | } |
714 | if (!test(inode, data)) { | 713 | if (!test(inode, data)) { |
715 | spin_unlock(&inode->i_lock); | 714 | spin_unlock(&inode->i_lock); |
716 | continue; | 715 | continue; |
717 | } | 716 | } |
718 | if (inode->i_state & (I_FREEING|I_WILL_FREE)) { | 717 | if (inode->i_state & (I_FREEING|I_WILL_FREE)) { |
719 | __wait_on_freeing_inode(inode); | 718 | __wait_on_freeing_inode(inode); |
720 | goto repeat; | 719 | goto repeat; |
721 | } | 720 | } |
722 | __iget(inode); | 721 | __iget(inode); |
723 | spin_unlock(&inode->i_lock); | 722 | spin_unlock(&inode->i_lock); |
724 | return inode; | 723 | return inode; |
725 | } | 724 | } |
726 | return NULL; | 725 | return NULL; |
727 | } | 726 | } |
728 | 727 | ||
729 | /* | 728 | /* |
730 | * find_inode_fast is the fast path version of find_inode, see the comment at | 729 | * find_inode_fast is the fast path version of find_inode, see the comment at |
731 | * iget_locked for details. | 730 | * iget_locked for details. |
732 | */ | 731 | */ |
733 | static struct inode *find_inode_fast(struct super_block *sb, | 732 | static struct inode *find_inode_fast(struct super_block *sb, |
734 | struct hlist_head *head, unsigned long ino) | 733 | struct hlist_head *head, unsigned long ino) |
735 | { | 734 | { |
736 | struct hlist_node *node; | 735 | struct hlist_node *node; |
737 | struct inode *inode = NULL; | 736 | struct inode *inode = NULL; |
738 | 737 | ||
739 | repeat: | 738 | repeat: |
740 | hlist_for_each_entry(inode, node, head, i_hash) { | 739 | hlist_for_each_entry(inode, node, head, i_hash) { |
741 | spin_lock(&inode->i_lock); | 740 | spin_lock(&inode->i_lock); |
742 | if (inode->i_ino != ino) { | 741 | if (inode->i_ino != ino) { |
743 | spin_unlock(&inode->i_lock); | 742 | spin_unlock(&inode->i_lock); |
744 | continue; | 743 | continue; |
745 | } | 744 | } |
746 | if (inode->i_sb != sb) { | 745 | if (inode->i_sb != sb) { |
747 | spin_unlock(&inode->i_lock); | 746 | spin_unlock(&inode->i_lock); |
748 | continue; | 747 | continue; |
749 | } | 748 | } |
750 | if (inode->i_state & (I_FREEING|I_WILL_FREE)) { | 749 | if (inode->i_state & (I_FREEING|I_WILL_FREE)) { |
751 | __wait_on_freeing_inode(inode); | 750 | __wait_on_freeing_inode(inode); |
752 | goto repeat; | 751 | goto repeat; |
753 | } | 752 | } |
754 | __iget(inode); | 753 | __iget(inode); |
755 | spin_unlock(&inode->i_lock); | 754 | spin_unlock(&inode->i_lock); |
756 | return inode; | 755 | return inode; |
757 | } | 756 | } |
758 | return NULL; | 757 | return NULL; |
759 | } | 758 | } |
760 | 759 | ||
761 | /* | 760 | /* |
762 | * Each cpu owns a range of LAST_INO_BATCH numbers. | 761 | * Each cpu owns a range of LAST_INO_BATCH numbers. |
763 | * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, | 762 | * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, |
764 | * to renew the exhausted range. | 763 | * to renew the exhausted range. |
765 | * | 764 | * |
766 | * This does not significantly increase overflow rate because every CPU can | 765 | * This does not significantly increase overflow rate because every CPU can |
767 | * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is | 766 | * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is |
768 | * NR_CPUS*(LAST_INO_BATCH-1) wastage. With NR_CPUS = 4096 and LAST_INO_BATCH = 1024, this is ~0.1% of the | 767 | * NR_CPUS*(LAST_INO_BATCH-1) wastage. With NR_CPUS = 4096 and LAST_INO_BATCH = 1024, this is ~0.1% of the |
769 | * 2^32 range, and is a worst-case. Even a 50% wastage would only increase | 768 | * 2^32 range, and is a worst-case. Even a 50% wastage would only increase |
770 | * overflow rate by 2x, which does not seem too significant. | 769 | * overflow rate by 2x, which does not seem too significant. |
771 | * | 770 | * |
772 | * On a 32-bit, non-LFS stat() call, glibc will generate an EOVERFLOW | 771 | * On a 32-bit, non-LFS stat() call, glibc will generate an EOVERFLOW |
773 | * error if st_ino won't fit in the target struct field. Use a 32-bit counter | 772 | * error if st_ino won't fit in the target struct field. Use a 32-bit counter |
774 | * here to attempt to avoid that. | 773 | * here to attempt to avoid that. |
775 | */ | 774 | */ |
776 | #define LAST_INO_BATCH 1024 | 775 | #define LAST_INO_BATCH 1024 |
777 | static DEFINE_PER_CPU(unsigned int, last_ino); | 776 | static DEFINE_PER_CPU(unsigned int, last_ino); |
778 | 777 | ||
779 | unsigned int get_next_ino(void) | 778 | unsigned int get_next_ino(void) |
780 | { | 779 | { |
781 | unsigned int *p = &get_cpu_var(last_ino); | 780 | unsigned int *p = &get_cpu_var(last_ino); |
782 | unsigned int res = *p; | 781 | unsigned int res = *p; |
783 | 782 | ||
784 | #ifdef CONFIG_SMP | 783 | #ifdef CONFIG_SMP |
785 | if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { | 784 | if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { |
786 | static atomic_t shared_last_ino; | 785 | static atomic_t shared_last_ino; |
787 | int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); | 786 | int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); |
788 | 787 | ||
789 | res = next - LAST_INO_BATCH; | 788 | res = next - LAST_INO_BATCH; |
790 | } | 789 | } |
791 | #endif | 790 | #endif |
792 | 791 | ||
793 | *p = ++res; | 792 | *p = ++res; |
794 | put_cpu_var(last_ino); | 793 | put_cpu_var(last_ino); |
795 | return res; | 794 | return res; |
796 | } | 795 | } |
797 | EXPORT_SYMBOL(get_next_ino); | 796 | EXPORT_SYMBOL(get_next_ino); |
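The comment above describes the batching trick; a user-space analogue with C11 atomics and a thread-local counter may make the arithmetic easier to follow. The kernel uses per-CPU variables with preemption disabled instead of TLS; this is an illustrative sketch only:

#include <stdatomic.h>

#define BATCH 1024

static atomic_uint shared_last;			/* dirtied once per batch */
static _Thread_local unsigned int last;		/* per-thread counter */

unsigned int next_ino(void)
{
	unsigned int res = last;

	if ((res & (BATCH - 1)) == 0) {
		/* range exhausted (or never initialised): grab a new one */
		unsigned int next = atomic_fetch_add(&shared_last, BATCH) + BATCH;

		res = next - BATCH;
	}
	last = ++res;
	return res;
}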
798 | 797 | ||
799 | /** | 798 | /** |
800 | * new_inode - obtain an inode | 799 | * new_inode - obtain an inode |
801 | * @sb: superblock | 800 | * @sb: superblock |
802 | * | 801 | * |
803 | * Allocates a new inode for given superblock. The default gfp_mask | 802 | * Allocates a new inode for given superblock. The default gfp_mask |
804 | * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. | 803 | * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. |
805 | * If HIGHMEM pages are unsuitable or it is known that pages allocated | 804 | * If HIGHMEM pages are unsuitable or it is known that pages allocated |
806 | * for the page cache are not reclaimable or migratable, | 805 | * for the page cache are not reclaimable or migratable, |
807 | * mapping_set_gfp_mask() must be called with suitable flags on the | 806 | * mapping_set_gfp_mask() must be called with suitable flags on the |
808 | * newly created inode's mapping. | 807 | * newly created inode's mapping. |
809 | * | 808 | * |
810 | */ | 809 | */ |
811 | struct inode *new_inode(struct super_block *sb) | 810 | struct inode *new_inode(struct super_block *sb) |
812 | { | 811 | { |
813 | struct inode *inode; | 812 | struct inode *inode; |
814 | 813 | ||
815 | spin_lock_prefetch(&inode_sb_list_lock); | 814 | spin_lock_prefetch(&inode_sb_list_lock); |
816 | 815 | ||
817 | inode = alloc_inode(sb); | 816 | inode = alloc_inode(sb); |
818 | if (inode) { | 817 | if (inode) { |
819 | spin_lock(&inode->i_lock); | 818 | spin_lock(&inode->i_lock); |
820 | inode->i_state = 0; | 819 | inode->i_state = 0; |
821 | spin_unlock(&inode->i_lock); | 820 | spin_unlock(&inode->i_lock); |
822 | inode_sb_list_add(inode); | 821 | inode_sb_list_add(inode); |
823 | } | 822 | } |
824 | return inode; | 823 | return inode; |
825 | } | 824 | } |
826 | EXPORT_SYMBOL(new_inode); | 825 | EXPORT_SYMBOL(new_inode); |
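As a sketch of the rule in the kernel-doc above: a filesystem whose page-cache pages must not come from highmem or be migratable would override the mask right after allocation. Hypothetical filesystem code, assuming GFP_USER is the desired policy:

#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>

static struct inode *example_new_inode(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;
	/* page-cache pages for this mapping: lowmem, non-movable */
	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
	return inode;
}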
827 | 826 | ||
828 | /** | 827 | /** |
829 | * unlock_new_inode - clear the I_NEW state and wake up any waiters | 828 | * unlock_new_inode - clear the I_NEW state and wake up any waiters |
830 | * @inode: new inode to unlock | 829 | * @inode: new inode to unlock |
831 | * | 830 | * |
832 | * Called when the inode is fully initialised to clear the new state of the | 831 | * Called when the inode is fully initialised to clear the new state of the |
833 | * inode and wake up anyone waiting for the inode to finish initialisation. | 832 | * inode and wake up anyone waiting for the inode to finish initialisation. |
834 | */ | 833 | */ |
835 | void unlock_new_inode(struct inode *inode) | 834 | void unlock_new_inode(struct inode *inode) |
836 | { | 835 | { |
837 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 836 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
838 | if (S_ISDIR(inode->i_mode)) { | 837 | if (S_ISDIR(inode->i_mode)) { |
839 | struct file_system_type *type = inode->i_sb->s_type; | 838 | struct file_system_type *type = inode->i_sb->s_type; |
840 | 839 | ||
841 | /* Set new key only if filesystem hasn't already changed it */ | 840 | /* Set new key only if filesystem hasn't already changed it */ |
842 | if (!lockdep_match_class(&inode->i_mutex, | 841 | if (!lockdep_match_class(&inode->i_mutex, |
843 | &type->i_mutex_key)) { | 842 | &type->i_mutex_key)) { |
844 | /* | 843 | /* |
845 | * ensure nobody is actually holding i_mutex | 844 | * ensure nobody is actually holding i_mutex |
846 | */ | 845 | */ |
847 | mutex_destroy(&inode->i_mutex); | 846 | mutex_destroy(&inode->i_mutex); |
848 | mutex_init(&inode->i_mutex); | 847 | mutex_init(&inode->i_mutex); |
849 | lockdep_set_class(&inode->i_mutex, | 848 | lockdep_set_class(&inode->i_mutex, |
850 | &type->i_mutex_dir_key); | 849 | &type->i_mutex_dir_key); |
851 | } | 850 | } |
852 | } | 851 | } |
853 | #endif | 852 | #endif |
854 | spin_lock(&inode->i_lock); | 853 | spin_lock(&inode->i_lock); |
855 | WARN_ON(!(inode->i_state & I_NEW)); | 854 | WARN_ON(!(inode->i_state & I_NEW)); |
856 | inode->i_state &= ~I_NEW; | 855 | inode->i_state &= ~I_NEW; |
857 | wake_up_bit(&inode->i_state, __I_NEW); | 856 | wake_up_bit(&inode->i_state, __I_NEW); |
858 | spin_unlock(&inode->i_lock); | 857 | spin_unlock(&inode->i_lock); |
859 | } | 858 | } |
860 | EXPORT_SYMBOL(unlock_new_inode); | 859 | EXPORT_SYMBOL(unlock_new_inode); |
861 | 860 | ||
862 | /** | 861 | /** |
863 | * iget5_locked - obtain an inode from a mounted file system | 862 | * iget5_locked - obtain an inode from a mounted file system |
864 | * @sb: super block of file system | 863 | * @sb: super block of file system |
865 | * @hashval: hash value (usually inode number) to get | 864 | * @hashval: hash value (usually inode number) to get |
866 | * @test: callback used for comparisons between inodes | 865 | * @test: callback used for comparisons between inodes |
867 | * @set: callback used to initialize a new struct inode | 866 | * @set: callback used to initialize a new struct inode |
868 | * @data: opaque data pointer to pass to @test and @set | 867 | * @data: opaque data pointer to pass to @test and @set |
869 | * | 868 | * |
870 | * Search for the inode specified by @hashval and @data in the inode cache, | 869 | * Search for the inode specified by @hashval and @data in the inode cache, |
871 | * and if present return it with an increased reference count. This is | 870 | * and if present return it with an increased reference count. This is |
872 | * a generalized version of iget_locked() for file systems where the inode | 871 | * a generalized version of iget_locked() for file systems where the inode |
873 | * number is not sufficient for unique identification of an inode. | 872 | * number is not sufficient for unique identification of an inode. |
874 | * | 873 | * |
875 | * If the inode is not in cache, allocate a new inode and return it locked, | 874 | * If the inode is not in cache, allocate a new inode and return it locked, |
876 | * hashed, and with the I_NEW flag set. The file system gets to fill it in | 875 | * hashed, and with the I_NEW flag set. The file system gets to fill it in |
877 | * before unlocking it via unlock_new_inode(). | 876 | * before unlocking it via unlock_new_inode(). |
878 | * | 877 | * |
879 | * Note both @test and @set are called with the inode_hash_lock held, so can't | 878 | * Note both @test and @set are called with the inode_hash_lock held, so can't |
880 | * sleep. | 879 | * sleep. |
881 | */ | 880 | */ |
882 | struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, | 881 | struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, |
883 | int (*test)(struct inode *, void *), | 882 | int (*test)(struct inode *, void *), |
884 | int (*set)(struct inode *, void *), void *data) | 883 | int (*set)(struct inode *, void *), void *data) |
885 | { | 884 | { |
886 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); | 885 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); |
887 | struct inode *inode; | 886 | struct inode *inode; |
888 | 887 | ||
889 | spin_lock(&inode_hash_lock); | 888 | spin_lock(&inode_hash_lock); |
890 | inode = find_inode(sb, head, test, data); | 889 | inode = find_inode(sb, head, test, data); |
891 | spin_unlock(&inode_hash_lock); | 890 | spin_unlock(&inode_hash_lock); |
892 | 891 | ||
893 | if (inode) { | 892 | if (inode) { |
894 | wait_on_inode(inode); | 893 | wait_on_inode(inode); |
895 | return inode; | 894 | return inode; |
896 | } | 895 | } |
897 | 896 | ||
898 | inode = alloc_inode(sb); | 897 | inode = alloc_inode(sb); |
899 | if (inode) { | 898 | if (inode) { |
900 | struct inode *old; | 899 | struct inode *old; |
901 | 900 | ||
902 | spin_lock(&inode_hash_lock); | 901 | spin_lock(&inode_hash_lock); |
903 | /* We released the lock, so.. */ | 902 | /* We released the lock, so.. */ |
904 | old = find_inode(sb, head, test, data); | 903 | old = find_inode(sb, head, test, data); |
905 | if (!old) { | 904 | if (!old) { |
906 | if (set(inode, data)) | 905 | if (set(inode, data)) |
907 | goto set_failed; | 906 | goto set_failed; |
908 | 907 | ||
909 | spin_lock(&inode->i_lock); | 908 | spin_lock(&inode->i_lock); |
910 | inode->i_state = I_NEW; | 909 | inode->i_state = I_NEW; |
911 | hlist_add_head(&inode->i_hash, head); | 910 | hlist_add_head(&inode->i_hash, head); |
912 | spin_unlock(&inode->i_lock); | 911 | spin_unlock(&inode->i_lock); |
913 | inode_sb_list_add(inode); | 912 | inode_sb_list_add(inode); |
914 | spin_unlock(&inode_hash_lock); | 913 | spin_unlock(&inode_hash_lock); |
915 | 914 | ||
916 | /* Return the locked inode with I_NEW set, the | 915 | /* Return the locked inode with I_NEW set, the |
917 | * caller is responsible for filling in the contents | 916 | * caller is responsible for filling in the contents |
918 | */ | 917 | */ |
919 | return inode; | 918 | return inode; |
920 | } | 919 | } |
921 | 920 | ||
922 | /* | 921 | /* |
923 | * Uhhuh, somebody else created the same inode under | 922 | * Uhhuh, somebody else created the same inode under |
924 | * us. Use the old inode instead of the one we just | 923 | * us. Use the old inode instead of the one we just |
925 | * allocated. | 924 | * allocated. |
926 | */ | 925 | */ |
927 | spin_unlock(&inode_hash_lock); | 926 | spin_unlock(&inode_hash_lock); |
928 | destroy_inode(inode); | 927 | destroy_inode(inode); |
929 | inode = old; | 928 | inode = old; |
930 | wait_on_inode(inode); | 929 | wait_on_inode(inode); |
931 | } | 930 | } |
932 | return inode; | 931 | return inode; |
933 | 932 | ||
934 | set_failed: | 933 | set_failed: |
935 | spin_unlock(&inode_hash_lock); | 934 | spin_unlock(&inode_hash_lock); |
936 | destroy_inode(inode); | 935 | destroy_inode(inode); |
937 | return NULL; | 936 | return NULL; |
938 | } | 937 | } |
939 | EXPORT_SYMBOL(iget5_locked); | 938 | EXPORT_SYMBOL(iget5_locked); |
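A hedged sketch of a caller, for a hypothetical filesystem ("exfs") whose inodes are keyed by a 64-bit object id, so the hash value alone cannot identify them. Note that both callbacks run under inode_hash_lock and therefore must not sleep:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/kernel.h>

struct exfs_inode {
	u64 objid;			/* on-disk object id */
	struct inode vfs_inode;
};

static inline struct exfs_inode *EXFS_I(struct inode *inode)
{
	return container_of(inode, struct exfs_inode, vfs_inode);
}

/* both callbacks run under inode_hash_lock: no sleeping allowed */
static int exfs_test(struct inode *inode, void *data)
{
	return EXFS_I(inode)->objid == *(u64 *)data;
}

static int exfs_set(struct inode *inode, void *data)
{
	EXFS_I(inode)->objid = *(u64 *)data;
	return 0;
}

static struct inode *exfs_iget(struct super_block *sb, u64 objid)
{
	struct inode *inode;

	inode = iget5_locked(sb, (unsigned long)objid, exfs_test,
			     exfs_set, &objid);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW) {
		/* ... read the on-disk inode here ... */
		unlock_new_inode(inode);
	}
	return inode;
}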
940 | 939 | ||
941 | /** | 940 | /** |
942 | * iget_locked - obtain an inode from a mounted file system | 941 | * iget_locked - obtain an inode from a mounted file system |
943 | * @sb: super block of file system | 942 | * @sb: super block of file system |
944 | * @ino: inode number to get | 943 | * @ino: inode number to get |
945 | * | 944 | * |
946 | * Search for the inode specified by @ino in the inode cache and if present | 945 | * Search for the inode specified by @ino in the inode cache and if present |
947 | * return it with an increased reference count. This is for file systems | 946 | * return it with an increased reference count. This is for file systems |
948 | * where the inode number is sufficient for unique identification of an inode. | 947 | * where the inode number is sufficient for unique identification of an inode. |
949 | * | 948 | * |
950 | * If the inode is not in cache, allocate a new inode and return it locked, | 949 | * If the inode is not in cache, allocate a new inode and return it locked, |
951 | * hashed, and with the I_NEW flag set. The file system gets to fill it in | 950 | * hashed, and with the I_NEW flag set. The file system gets to fill it in |
952 | * before unlocking it via unlock_new_inode(). | 951 | * before unlocking it via unlock_new_inode(). |
953 | */ | 952 | */ |
954 | struct inode *iget_locked(struct super_block *sb, unsigned long ino) | 953 | struct inode *iget_locked(struct super_block *sb, unsigned long ino) |
955 | { | 954 | { |
956 | struct hlist_head *head = inode_hashtable + hash(sb, ino); | 955 | struct hlist_head *head = inode_hashtable + hash(sb, ino); |
957 | struct inode *inode; | 956 | struct inode *inode; |
958 | 957 | ||
959 | spin_lock(&inode_hash_lock); | 958 | spin_lock(&inode_hash_lock); |
960 | inode = find_inode_fast(sb, head, ino); | 959 | inode = find_inode_fast(sb, head, ino); |
961 | spin_unlock(&inode_hash_lock); | 960 | spin_unlock(&inode_hash_lock); |
962 | if (inode) { | 961 | if (inode) { |
963 | wait_on_inode(inode); | 962 | wait_on_inode(inode); |
964 | return inode; | 963 | return inode; |
965 | } | 964 | } |
966 | 965 | ||
967 | inode = alloc_inode(sb); | 966 | inode = alloc_inode(sb); |
968 | if (inode) { | 967 | if (inode) { |
969 | struct inode *old; | 968 | struct inode *old; |
970 | 969 | ||
971 | spin_lock(&inode_hash_lock); | 970 | spin_lock(&inode_hash_lock); |
972 | /* We released the lock, so.. */ | 971 | /* We released the lock, so.. */ |
973 | old = find_inode_fast(sb, head, ino); | 972 | old = find_inode_fast(sb, head, ino); |
974 | if (!old) { | 973 | if (!old) { |
975 | inode->i_ino = ino; | 974 | inode->i_ino = ino; |
976 | spin_lock(&inode->i_lock); | 975 | spin_lock(&inode->i_lock); |
977 | inode->i_state = I_NEW; | 976 | inode->i_state = I_NEW; |
978 | hlist_add_head(&inode->i_hash, head); | 977 | hlist_add_head(&inode->i_hash, head); |
979 | spin_unlock(&inode->i_lock); | 978 | spin_unlock(&inode->i_lock); |
980 | inode_sb_list_add(inode); | 979 | inode_sb_list_add(inode); |
981 | spin_unlock(&inode_hash_lock); | 980 | spin_unlock(&inode_hash_lock); |
982 | 981 | ||
983 | /* Return the locked inode with I_NEW set, the | 982 | /* Return the locked inode with I_NEW set, the |
984 | * caller is responsible for filling in the contents | 983 | * caller is responsible for filling in the contents |
985 | */ | 984 | */ |
986 | return inode; | 985 | return inode; |
987 | } | 986 | } |
988 | 987 | ||
989 | /* | 988 | /* |
990 | * Uhhuh, somebody else created the same inode under | 989 | * Uhhuh, somebody else created the same inode under |
991 | * us. Use the old inode instead of the one we just | 990 | * us. Use the old inode instead of the one we just |
992 | * allocated. | 991 | * allocated. |
993 | */ | 992 | */ |
994 | spin_unlock(&inode_hash_lock); | 993 | spin_unlock(&inode_hash_lock); |
995 | destroy_inode(inode); | 994 | destroy_inode(inode); |
996 | inode = old; | 995 | inode = old; |
997 | wait_on_inode(inode); | 996 | wait_on_inode(inode); |
998 | } | 997 | } |
999 | return inode; | 998 | return inode; |
1000 | } | 999 | } |
1001 | EXPORT_SYMBOL(iget_locked); | 1000 | EXPORT_SYMBOL(iget_locked); |
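The canonical caller pattern for the above, sketched for a hypothetical filesystem; myfs_read_inode() stands in for whatever fills the in-core inode from disk:

#include <linux/err.h>
#include <linux/fs.h>

/* hypothetical: fill the in-core inode from its on-disk copy */
static void myfs_read_inode(struct inode *inode)
{
	/* ... omitted ... */
}

struct inode *myfs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* cache hit, already initialised */

	/* freshly allocated: fill it in, then wake any I_NEW waiters */
	myfs_read_inode(inode);
	unlock_new_inode(inode);
	return inode;
}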
1002 | 1001 | ||
1003 | /* | 1002 | /* |
1004 | * search the inode cache for a matching inode number. | 1003 | * search the inode cache for a matching inode number. |
1005 | * If we find one, then the inode number we are trying to | 1004 | * If we find one, then the inode number we are trying to |
1006 | * allocate is not unique and so we should not use it. | 1005 | * allocate is not unique and so we should not use it. |
1007 | * | 1006 | * |
1008 | * Returns 1 if the inode number is unique, 0 if it is not. | 1007 | * Returns 1 if the inode number is unique, 0 if it is not. |
1009 | */ | 1008 | */ |
1010 | static int test_inode_iunique(struct super_block *sb, unsigned long ino) | 1009 | static int test_inode_iunique(struct super_block *sb, unsigned long ino) |
1011 | { | 1010 | { |
1012 | struct hlist_head *b = inode_hashtable + hash(sb, ino); | 1011 | struct hlist_head *b = inode_hashtable + hash(sb, ino); |
1013 | struct hlist_node *node; | 1012 | struct hlist_node *node; |
1014 | struct inode *inode; | 1013 | struct inode *inode; |
1015 | 1014 | ||
1016 | spin_lock(&inode_hash_lock); | 1015 | spin_lock(&inode_hash_lock); |
1017 | hlist_for_each_entry(inode, node, b, i_hash) { | 1016 | hlist_for_each_entry(inode, node, b, i_hash) { |
1018 | if (inode->i_ino == ino && inode->i_sb == sb) { | 1017 | if (inode->i_ino == ino && inode->i_sb == sb) { |
1019 | spin_unlock(&inode_hash_lock); | 1018 | spin_unlock(&inode_hash_lock); |
1020 | return 0; | 1019 | return 0; |
1021 | } | 1020 | } |
1022 | } | 1021 | } |
1023 | spin_unlock(&inode_hash_lock); | 1022 | spin_unlock(&inode_hash_lock); |
1024 | 1023 | ||
1025 | return 1; | 1024 | return 1; |
1026 | } | 1025 | } |
1027 | 1026 | ||
1028 | /** | 1027 | /** |
1029 | * iunique - get a unique inode number | 1028 | * iunique - get a unique inode number |
1030 | * @sb: superblock | 1029 | * @sb: superblock |
1031 | * @max_reserved: highest reserved inode number | 1030 | * @max_reserved: highest reserved inode number |
1032 | * | 1031 | * |
1033 | * Obtain an inode number that is unique on the system for a given | 1032 | * Obtain an inode number that is unique on the system for a given |
1034 | * superblock. This is used by file systems that have no natural | 1033 | * superblock. This is used by file systems that have no natural |
1035 | * permanent inode numbering system. An inode number is returned that | 1034 | * permanent inode numbering system. An inode number is returned that |
1036 | * is higher than the reserved limit but unique. | 1035 | * is higher than the reserved limit but unique. |
1037 | * | 1036 | * |
1038 | * BUGS: | 1037 | * BUGS: |
1039 | * With a large number of inodes live on the file system this function | 1038 | * With a large number of inodes live on the file system this function |
1040 | * currently becomes quite slow. | 1039 | * currently becomes quite slow. |
1041 | */ | 1040 | */ |
1042 | ino_t iunique(struct super_block *sb, ino_t max_reserved) | 1041 | ino_t iunique(struct super_block *sb, ino_t max_reserved) |
1043 | { | 1042 | { |
1044 | /* | 1043 | /* |
1045 | * On a 32-bit, non-LFS stat() call, glibc will generate an EOVERFLOW | 1044 | * On a 32-bit, non-LFS stat() call, glibc will generate an EOVERFLOW |
1046 | * error if st_ino won't fit in the target struct field. Use a 32-bit counter | 1045 | * error if st_ino won't fit in the target struct field. Use a 32-bit counter |
1047 | * here to attempt to avoid that. | 1046 | * here to attempt to avoid that. |
1048 | */ | 1047 | */ |
1049 | static DEFINE_SPINLOCK(iunique_lock); | 1048 | static DEFINE_SPINLOCK(iunique_lock); |
1050 | static unsigned int counter; | 1049 | static unsigned int counter; |
1051 | ino_t res; | 1050 | ino_t res; |
1052 | 1051 | ||
1053 | spin_lock(&iunique_lock); | 1052 | spin_lock(&iunique_lock); |
1054 | do { | 1053 | do { |
1055 | if (counter <= max_reserved) | 1054 | if (counter <= max_reserved) |
1056 | counter = max_reserved + 1; | 1055 | counter = max_reserved + 1; |
1057 | res = counter++; | 1056 | res = counter++; |
1058 | } while (!test_inode_iunique(sb, res)); | 1057 | } while (!test_inode_iunique(sb, res)); |
1059 | spin_unlock(&iunique_lock); | 1058 | spin_unlock(&iunique_lock); |
1060 | 1059 | ||
1061 | return res; | 1060 | return res; |
1062 | } | 1061 | } |
1063 | EXPORT_SYMBOL(iunique); | 1062 | EXPORT_SYMBOL(iunique); |
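A sketch of the intended use: a synthetic filesystem with a handful of reserved, fixed inode numbers draws every other number from iunique(). Hypothetical code:

#include <linux/fs.h>

#define MYFS_MAX_RESERVED_INO 15	/* inodes 0-15 are fixed objects */

static struct inode *myfs_make_inode(struct super_block *sb, umode_t mode)
{
	struct inode *inode = new_inode(sb);

	if (inode) {
		inode->i_ino = iunique(sb, MYFS_MAX_RESERVED_INO);
		inode->i_mode = mode;
	}
	return inode;
}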
1064 | 1063 | ||
1065 | struct inode *igrab(struct inode *inode) | 1064 | struct inode *igrab(struct inode *inode) |
1066 | { | 1065 | { |
1067 | spin_lock(&inode->i_lock); | 1066 | spin_lock(&inode->i_lock); |
1068 | if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { | 1067 | if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { |
1069 | __iget(inode); | 1068 | __iget(inode); |
1070 | spin_unlock(&inode->i_lock); | 1069 | spin_unlock(&inode->i_lock); |
1071 | } else { | 1070 | } else { |
1072 | spin_unlock(&inode->i_lock); | 1071 | spin_unlock(&inode->i_lock); |
1073 | /* | 1072 | /* |
1074 | * Handle the case where s_op->clear_inode has not been | 1073 | * Handle the case where s_op->clear_inode has not been |
1075 | * called yet, and somebody is calling igrab | 1074 | * called yet, and somebody is calling igrab |
1076 | * while the inode is getting freed. | 1075 | * while the inode is getting freed. |
1077 | */ | 1076 | */ |
1078 | inode = NULL; | 1077 | inode = NULL; |
1079 | } | 1078 | } |
1080 | return inode; | 1079 | return inode; |
1081 | } | 1080 | } |
1082 | EXPORT_SYMBOL(igrab); | 1081 | EXPORT_SYMBOL(igrab); |
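The usual igrab() pattern, sketched: pin an inode reached through a structure that does not hold its own reference, treating NULL as "already being torn down". The watch container is purely illustrative:

#include <linux/fs.h>

struct watch {
	struct inode *inode;	/* back-pointer, holds no reference */
};

static int watch_do_work(struct watch *w)
{
	struct inode *inode = igrab(w->inode);

	if (!inode)
		return -ESTALE;	/* inode was already being freed */

	/* ... inode is pinned and safe to use here ... */

	iput(inode);
	return 0;
}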
1083 | 1082 | ||
1084 | /** | 1083 | /** |
1085 | * ilookup5_nowait - search for an inode in the inode cache | 1084 | * ilookup5_nowait - search for an inode in the inode cache |
1086 | * @sb: super block of file system to search | 1085 | * @sb: super block of file system to search |
1087 | * @hashval: hash value (usually inode number) to search for | 1086 | * @hashval: hash value (usually inode number) to search for |
1088 | * @test: callback used for comparisons between inodes | 1087 | * @test: callback used for comparisons between inodes |
1089 | * @data: opaque data pointer to pass to @test | 1088 | * @data: opaque data pointer to pass to @test |
1090 | * | 1089 | * |
1091 | * Search for the inode specified by @hashval and @data in the inode cache. | 1090 | * Search for the inode specified by @hashval and @data in the inode cache. |
1092 | * If the inode is in the cache, the inode is returned with an incremented | 1091 | * If the inode is in the cache, the inode is returned with an incremented |
1093 | * reference count. | 1092 | * reference count. |
1094 | * | 1093 | * |
1095 | * Note: I_NEW is not waited upon so you have to be very careful what you do | 1094 | * Note: I_NEW is not waited upon so you have to be very careful what you do |
1096 | * with the returned inode. You probably should be using ilookup5() instead. | 1095 | * with the returned inode. You probably should be using ilookup5() instead. |
1097 | * | 1096 | * |
1098 | * Note2: @test is called with the inode_hash_lock held, so can't sleep. | 1097 | * Note2: @test is called with the inode_hash_lock held, so can't sleep. |
1099 | */ | 1098 | */ |
1100 | struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, | 1099 | struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, |
1101 | int (*test)(struct inode *, void *), void *data) | 1100 | int (*test)(struct inode *, void *), void *data) |
1102 | { | 1101 | { |
1103 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); | 1102 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); |
1104 | struct inode *inode; | 1103 | struct inode *inode; |
1105 | 1104 | ||
1106 | spin_lock(&inode_hash_lock); | 1105 | spin_lock(&inode_hash_lock); |
1107 | inode = find_inode(sb, head, test, data); | 1106 | inode = find_inode(sb, head, test, data); |
1108 | spin_unlock(&inode_hash_lock); | 1107 | spin_unlock(&inode_hash_lock); |
1109 | 1108 | ||
1110 | return inode; | 1109 | return inode; |
1111 | } | 1110 | } |
1112 | EXPORT_SYMBOL(ilookup5_nowait); | 1111 | EXPORT_SYMBOL(ilookup5_nowait); |
1113 | 1112 | ||
1114 | /** | 1113 | /** |
1115 | * ilookup5 - search for an inode in the inode cache | 1114 | * ilookup5 - search for an inode in the inode cache |
1116 | * @sb: super block of file system to search | 1115 | * @sb: super block of file system to search |
1117 | * @hashval: hash value (usually inode number) to search for | 1116 | * @hashval: hash value (usually inode number) to search for |
1118 | * @test: callback used for comparisons between inodes | 1117 | * @test: callback used for comparisons between inodes |
1119 | * @data: opaque data pointer to pass to @test | 1118 | * @data: opaque data pointer to pass to @test |
1120 | * | 1119 | * |
1121 | * Search for the inode specified by @hashval and @data in the inode cache, | 1120 | * Search for the inode specified by @hashval and @data in the inode cache, |
1122 | * and if the inode is in the cache, return the inode with an incremented | 1121 | * and if the inode is in the cache, return the inode with an incremented |
1123 | * reference count. Waits on I_NEW before returning the inode. | 1122 | * reference count. Waits on I_NEW before returning the inode. |
1125 | * | 1124 | * |
1126 | * This is a generalized version of ilookup() for file systems where the | 1125 | * This is a generalized version of ilookup() for file systems where the |
1127 | * inode number is not sufficient for unique identification of an inode. | 1126 | * inode number is not sufficient for unique identification of an inode. |
1128 | * | 1127 | * |
1129 | * Note: @test is called with the inode_hash_lock held, so can't sleep. | 1128 | * Note: @test is called with the inode_hash_lock held, so can't sleep. |
1130 | */ | 1129 | */ |
1131 | struct inode *ilookup5(struct super_block *sb, unsigned long hashval, | 1130 | struct inode *ilookup5(struct super_block *sb, unsigned long hashval, |
1132 | int (*test)(struct inode *, void *), void *data) | 1131 | int (*test)(struct inode *, void *), void *data) |
1133 | { | 1132 | { |
1134 | struct inode *inode = ilookup5_nowait(sb, hashval, test, data); | 1133 | struct inode *inode = ilookup5_nowait(sb, hashval, test, data); |
1135 | 1134 | ||
1136 | if (inode) | 1135 | if (inode) |
1137 | wait_on_inode(inode); | 1136 | wait_on_inode(inode); |
1138 | return inode; | 1137 | return inode; |
1139 | } | 1138 | } |
1140 | EXPORT_SYMBOL(ilookup5); | 1139 | EXPORT_SYMBOL(ilookup5); |
1141 | 1140 | ||
1142 | /** | 1141 | /** |
1143 | * ilookup - search for an inode in the inode cache | 1142 | * ilookup - search for an inode in the inode cache |
1144 | * @sb: super block of file system to search | 1143 | * @sb: super block of file system to search |
1145 | * @ino: inode number to search for | 1144 | * @ino: inode number to search for |
1146 | * | 1145 | * |
1147 | * Search for the inode @ino in the inode cache, and if the inode is in the | 1146 | * Search for the inode @ino in the inode cache, and if the inode is in the |
1148 | * cache, the inode is returned with an incremented reference count. | 1147 | * cache, the inode is returned with an incremented reference count. |
1149 | */ | 1148 | */ |
1150 | struct inode *ilookup(struct super_block *sb, unsigned long ino) | 1149 | struct inode *ilookup(struct super_block *sb, unsigned long ino) |
1151 | { | 1150 | { |
1152 | struct hlist_head *head = inode_hashtable + hash(sb, ino); | 1151 | struct hlist_head *head = inode_hashtable + hash(sb, ino); |
1153 | struct inode *inode; | 1152 | struct inode *inode; |
1154 | 1153 | ||
1155 | spin_lock(&inode_hash_lock); | 1154 | spin_lock(&inode_hash_lock); |
1156 | inode = find_inode_fast(sb, head, ino); | 1155 | inode = find_inode_fast(sb, head, ino); |
1157 | spin_unlock(&inode_hash_lock); | 1156 | spin_unlock(&inode_hash_lock); |
1158 | 1157 | ||
1159 | if (inode) | 1158 | if (inode) |
1160 | wait_on_inode(inode); | 1159 | wait_on_inode(inode); |
1161 | return inode; | 1160 | return inode; |
1162 | } | 1161 | } |
1163 | EXPORT_SYMBOL(ilookup); | 1162 | EXPORT_SYMBOL(ilookup); |
1164 | 1163 | ||
1165 | int insert_inode_locked(struct inode *inode) | 1164 | int insert_inode_locked(struct inode *inode) |
1166 | { | 1165 | { |
1167 | struct super_block *sb = inode->i_sb; | 1166 | struct super_block *sb = inode->i_sb; |
1168 | ino_t ino = inode->i_ino; | 1167 | ino_t ino = inode->i_ino; |
1169 | struct hlist_head *head = inode_hashtable + hash(sb, ino); | 1168 | struct hlist_head *head = inode_hashtable + hash(sb, ino); |
1170 | 1169 | ||
1171 | while (1) { | 1170 | while (1) { |
1172 | struct hlist_node *node; | 1171 | struct hlist_node *node; |
1173 | struct inode *old = NULL; | 1172 | struct inode *old = NULL; |
1174 | spin_lock(&inode_hash_lock); | 1173 | spin_lock(&inode_hash_lock); |
1175 | hlist_for_each_entry(old, node, head, i_hash) { | 1174 | hlist_for_each_entry(old, node, head, i_hash) { |
1176 | if (old->i_ino != ino) | 1175 | if (old->i_ino != ino) |
1177 | continue; | 1176 | continue; |
1178 | if (old->i_sb != sb) | 1177 | if (old->i_sb != sb) |
1179 | continue; | 1178 | continue; |
1180 | spin_lock(&old->i_lock); | 1179 | spin_lock(&old->i_lock); |
1181 | if (old->i_state & (I_FREEING|I_WILL_FREE)) { | 1180 | if (old->i_state & (I_FREEING|I_WILL_FREE)) { |
1182 | spin_unlock(&old->i_lock); | 1181 | spin_unlock(&old->i_lock); |
1183 | continue; | 1182 | continue; |
1184 | } | 1183 | } |
1185 | break; | 1184 | break; |
1186 | } | 1185 | } |
1187 | if (likely(!node)) { | 1186 | if (likely(!node)) { |
1188 | spin_lock(&inode->i_lock); | 1187 | spin_lock(&inode->i_lock); |
1189 | inode->i_state |= I_NEW; | 1188 | inode->i_state |= I_NEW; |
1190 | hlist_add_head(&inode->i_hash, head); | 1189 | hlist_add_head(&inode->i_hash, head); |
1191 | spin_unlock(&inode->i_lock); | 1190 | spin_unlock(&inode->i_lock); |
1192 | spin_unlock(&inode_hash_lock); | 1191 | spin_unlock(&inode_hash_lock); |
1193 | return 0; | 1192 | return 0; |
1194 | } | 1193 | } |
1195 | __iget(old); | 1194 | __iget(old); |
1196 | spin_unlock(&old->i_lock); | 1195 | spin_unlock(&old->i_lock); |
1197 | spin_unlock(&inode_hash_lock); | 1196 | spin_unlock(&inode_hash_lock); |
1198 | wait_on_inode(old); | 1197 | wait_on_inode(old); |
1199 | if (unlikely(!inode_unhashed(old))) { | 1198 | if (unlikely(!inode_unhashed(old))) { |
1200 | iput(old); | 1199 | iput(old); |
1201 | return -EBUSY; | 1200 | return -EBUSY; |
1202 | } | 1201 | } |
1203 | iput(old); | 1202 | iput(old); |
1204 | } | 1203 | } |
1205 | } | 1204 | } |
1206 | EXPORT_SYMBOL(insert_inode_locked); | 1205 | EXPORT_SYMBOL(insert_inode_locked); |
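A sketch of a create-path caller: the filesystem picks the inode number itself, and insert_inode_locked() hashes the inode with I_NEW set, or fails if a live inode already claims that number. myfs_alloc_ino() is a hypothetical allocator:

#include <linux/err.h>
#include <linux/fs.h>

/* hypothetical allocator; a real fs would consult its inode bitmap */
static unsigned long myfs_alloc_ino(struct super_block *sb)
{
	return get_next_ino();
}

static struct inode *myfs_create_inode(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return ERR_PTR(-ENOMEM);

	inode->i_ino = myfs_alloc_ino(sb);
	if (insert_inode_locked(inode) < 0) {
		/* a live inode already claims this number */
		iput(inode);
		return ERR_PTR(-EBUSY);
	}
	/* ... initialise the rest, write it to disk ... */
	unlock_new_inode(inode);
	return inode;
}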
1207 | 1206 | ||
1208 | int insert_inode_locked4(struct inode *inode, unsigned long hashval, | 1207 | int insert_inode_locked4(struct inode *inode, unsigned long hashval, |
1209 | int (*test)(struct inode *, void *), void *data) | 1208 | int (*test)(struct inode *, void *), void *data) |
1210 | { | 1209 | { |
1211 | struct super_block *sb = inode->i_sb; | 1210 | struct super_block *sb = inode->i_sb; |
1212 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); | 1211 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); |
1213 | 1212 | ||
1214 | while (1) { | 1213 | while (1) { |
1215 | struct hlist_node *node; | 1214 | struct hlist_node *node; |
1216 | struct inode *old = NULL; | 1215 | struct inode *old = NULL; |
1217 | 1216 | ||
1218 | spin_lock(&inode_hash_lock); | 1217 | spin_lock(&inode_hash_lock); |
1219 | hlist_for_each_entry(old, node, head, i_hash) { | 1218 | hlist_for_each_entry(old, node, head, i_hash) { |
1220 | if (old->i_sb != sb) | 1219 | if (old->i_sb != sb) |
1221 | continue; | 1220 | continue; |
1222 | if (!test(old, data)) | 1221 | if (!test(old, data)) |
1223 | continue; | 1222 | continue; |
1224 | spin_lock(&old->i_lock); | 1223 | spin_lock(&old->i_lock); |
1225 | if (old->i_state & (I_FREEING|I_WILL_FREE)) { | 1224 | if (old->i_state & (I_FREEING|I_WILL_FREE)) { |
1226 | spin_unlock(&old->i_lock); | 1225 | spin_unlock(&old->i_lock); |
1227 | continue; | 1226 | continue; |
1228 | } | 1227 | } |
1229 | break; | 1228 | break; |
1230 | } | 1229 | } |
1231 | if (likely(!node)) { | 1230 | if (likely(!node)) { |
1232 | spin_lock(&inode->i_lock); | 1231 | spin_lock(&inode->i_lock); |
1233 | inode->i_state |= I_NEW; | 1232 | inode->i_state |= I_NEW; |
1234 | hlist_add_head(&inode->i_hash, head); | 1233 | hlist_add_head(&inode->i_hash, head); |
1235 | spin_unlock(&inode->i_lock); | 1234 | spin_unlock(&inode->i_lock); |
1236 | spin_unlock(&inode_hash_lock); | 1235 | spin_unlock(&inode_hash_lock); |
1237 | return 0; | 1236 | return 0; |
1238 | } | 1237 | } |
1239 | __iget(old); | 1238 | __iget(old); |
1240 | spin_unlock(&old->i_lock); | 1239 | spin_unlock(&old->i_lock); |
1241 | spin_unlock(&inode_hash_lock); | 1240 | spin_unlock(&inode_hash_lock); |
1242 | wait_on_inode(old); | 1241 | wait_on_inode(old); |
1243 | if (unlikely(!inode_unhashed(old))) { | 1242 | if (unlikely(!inode_unhashed(old))) { |
1244 | iput(old); | 1243 | iput(old); |
1245 | return -EBUSY; | 1244 | return -EBUSY; |
1246 | } | 1245 | } |
1247 | iput(old); | 1246 | iput(old); |
1248 | } | 1247 | } |
1249 | } | 1248 | } |
1250 | EXPORT_SYMBOL(insert_inode_locked4); | 1249 | EXPORT_SYMBOL(insert_inode_locked4); |
1251 | 1250 | ||
1252 | 1251 | ||
1253 | int generic_delete_inode(struct inode *inode) | 1252 | int generic_delete_inode(struct inode *inode) |
1254 | { | 1253 | { |
1255 | return 1; | 1254 | return 1; |
1256 | } | 1255 | } |
1257 | EXPORT_SYMBOL(generic_delete_inode); | 1256 | EXPORT_SYMBOL(generic_delete_inode); |
1258 | 1257 | ||
1259 | /* | 1258 | /* |
1260 | * Normal UNIX filesystem behaviour: delete the | 1259 | * Normal UNIX filesystem behaviour: delete the |
1261 | * inode when the usage count drops to zero, and | 1260 | * inode when the usage count drops to zero, and |
1262 | * i_nlink is zero. | 1261 | * i_nlink is zero. |
1263 | */ | 1262 | */ |
1264 | int generic_drop_inode(struct inode *inode) | 1263 | int generic_drop_inode(struct inode *inode) |
1265 | { | 1264 | { |
1266 | return !inode->i_nlink || inode_unhashed(inode); | 1265 | return !inode->i_nlink || inode_unhashed(inode); |
1267 | } | 1266 | } |
1268 | EXPORT_SYMBOL_GPL(generic_drop_inode); | 1267 | EXPORT_SYMBOL_GPL(generic_drop_inode); |
1269 | 1268 | ||
1270 | /* | 1269 | /* |
1271 | * Called when we're dropping the last reference | 1270 | * Called when we're dropping the last reference |
1272 | * to an inode. | 1271 | * to an inode. |
1273 | * | 1272 | * |
1274 | * Call the FS "drop_inode()" function, defaulting to | 1273 | * Call the FS "drop_inode()" function, defaulting to |
1275 | * the legacy UNIX filesystem behaviour. If it tells | 1274 | * the legacy UNIX filesystem behaviour. If it tells |
1276 | * us to evict the inode, do so. Otherwise, retain the inode | 1275 | * us to evict the inode, do so. Otherwise, retain the inode |
1277 | * in the cache if the fs is alive; sync and evict it if the fs is | 1276 | * in the cache if the fs is alive; sync and evict it if the fs is |
1278 | * shutting down. | 1277 | * shutting down. |
1279 | */ | 1278 | */ |
1280 | static void iput_final(struct inode *inode) | 1279 | static void iput_final(struct inode *inode) |
1281 | { | 1280 | { |
1282 | struct super_block *sb = inode->i_sb; | 1281 | struct super_block *sb = inode->i_sb; |
1283 | const struct super_operations *op = inode->i_sb->s_op; | 1282 | const struct super_operations *op = inode->i_sb->s_op; |
1284 | int drop; | 1283 | int drop; |
1285 | 1284 | ||
1286 | WARN_ON(inode->i_state & I_NEW); | 1285 | WARN_ON(inode->i_state & I_NEW); |
1287 | 1286 | ||
1288 | if (op->drop_inode) | 1287 | if (op->drop_inode) |
1289 | drop = op->drop_inode(inode); | 1288 | drop = op->drop_inode(inode); |
1290 | else | 1289 | else |
1291 | drop = generic_drop_inode(inode); | 1290 | drop = generic_drop_inode(inode); |
1292 | 1291 | ||
1293 | if (!drop && (sb->s_flags & MS_ACTIVE)) { | 1292 | if (!drop && (sb->s_flags & MS_ACTIVE)) { |
1294 | inode->i_state |= I_REFERENCED; | 1293 | inode->i_state |= I_REFERENCED; |
1295 | if (!(inode->i_state & (I_DIRTY|I_SYNC))) | 1294 | if (!(inode->i_state & (I_DIRTY|I_SYNC))) |
1296 | inode_lru_list_add(inode); | 1295 | inode_lru_list_add(inode); |
1297 | spin_unlock(&inode->i_lock); | 1296 | spin_unlock(&inode->i_lock); |
1298 | return; | 1297 | return; |
1299 | } | 1298 | } |
1300 | 1299 | ||
1301 | if (!drop) { | 1300 | if (!drop) { |
1302 | inode->i_state |= I_WILL_FREE; | 1301 | inode->i_state |= I_WILL_FREE; |
1303 | spin_unlock(&inode->i_lock); | 1302 | spin_unlock(&inode->i_lock); |
1304 | write_inode_now(inode, 1); | 1303 | write_inode_now(inode, 1); |
1305 | spin_lock(&inode->i_lock); | 1304 | spin_lock(&inode->i_lock); |
1306 | WARN_ON(inode->i_state & I_NEW); | 1305 | WARN_ON(inode->i_state & I_NEW); |
1307 | inode->i_state &= ~I_WILL_FREE; | 1306 | inode->i_state &= ~I_WILL_FREE; |
1308 | } | 1307 | } |
1309 | 1308 | ||
1310 | inode->i_state |= I_FREEING; | 1309 | inode->i_state |= I_FREEING; |
1311 | inode_lru_list_del(inode); | 1310 | inode_lru_list_del(inode); |
1312 | spin_unlock(&inode->i_lock); | 1311 | spin_unlock(&inode->i_lock); |
1313 | 1312 | ||
1314 | evict(inode); | 1313 | evict(inode); |
1315 | } | 1314 | } |
1316 | 1315 | ||
1317 | /** | 1316 | /** |
1318 | * iput - put an inode | 1317 | * iput - put an inode |
1319 | * @inode: inode to put | 1318 | * @inode: inode to put |
1320 | * | 1319 | * |
1321 | * Puts an inode, dropping its usage count. If the inode use count hits | 1320 | * Puts an inode, dropping its usage count. If the inode use count hits |
1322 | * zero, the inode is then freed and may also be destroyed. | 1321 | * zero, the inode is then freed and may also be destroyed. |
1323 | * | 1322 | * |
1324 | * Consequently, iput() can sleep. | 1323 | * Consequently, iput() can sleep. |
1325 | */ | 1324 | */ |
1326 | void iput(struct inode *inode) | 1325 | void iput(struct inode *inode) |
1327 | { | 1326 | { |
1328 | if (inode) { | 1327 | if (inode) { |
1329 | BUG_ON(inode->i_state & I_CLEAR); | 1328 | BUG_ON(inode->i_state & I_CLEAR); |
1330 | 1329 | ||
1331 | if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) | 1330 | if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) |
1332 | iput_final(inode); | 1331 | iput_final(inode); |
1333 | } | 1332 | } |
1334 | } | 1333 | } |
1335 | EXPORT_SYMBOL(iput); | 1334 | EXPORT_SYMBOL(iput); |
1336 | 1335 | ||
1337 | /** | 1336 | /** |
1338 | * bmap - find a block number in a file | 1337 | * bmap - find a block number in a file |
1339 | * @inode: inode of file | 1338 | * @inode: inode of file |
1340 | * @block: block to find | 1339 | * @block: block to find |
1341 | * | 1340 | * |
1342 | * Returns the disk block number, relative to the start of the device | 1341 | * Returns the disk block number, relative to the start of the device |
1343 | * holding the inode, for the requested block of the file. | 1342 | * holding the inode, for the requested block of the file. |
1344 | * That is, asked for block 4 of inode 1, the function returns the | 1343 | * That is, asked for block 4 of inode 1, the function returns the |
1345 | * number of the disk block, relative to the start of the disk, that | 1344 | * number of the disk block, relative to the start of the disk, that |
1346 | * holds block 4 of that file. | 1345 | * holds block 4 of that file. |
1347 | */ | 1346 | */ |
1348 | sector_t bmap(struct inode *inode, sector_t block) | 1347 | sector_t bmap(struct inode *inode, sector_t block) |
1349 | { | 1348 | { |
1350 | sector_t res = 0; | 1349 | sector_t res = 0; |
1351 | if (inode->i_mapping->a_ops->bmap) | 1350 | if (inode->i_mapping->a_ops->bmap) |
1352 | res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); | 1351 | res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); |
1353 | return res; | 1352 | return res; |
1354 | } | 1353 | } |
1355 | EXPORT_SYMBOL(bmap); | 1354 | EXPORT_SYMBOL(bmap); |
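From user space this is reachable via the FIBMAP ioctl, which ends up in bmap(). A short sketch (requires CAP_SYS_RAWIO; a returned block of 0 means a hole or an unmapped block):

#include <linux/fs.h>		/* FIBMAP */
#include <stdio.h>
#include <sys/ioctl.h>

/* print where file block 4 of the open file fd lives on disk */
int print_block4(int fd)
{
	int blk = 4;		/* file block index in, disk block out */

	if (ioctl(fd, FIBMAP, &blk) < 0)
		return -1;
	printf("file block 4 -> disk block %d\n", blk);
	return 0;
}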
1356 | 1355 | ||
1357 | /* | 1356 | /* |
1358 | * With relative atime, only update atime if the previous atime is | 1357 | * With relative atime, only update atime if the previous atime is |
1359 | * earlier than either the ctime or mtime or if at least a day has | 1358 | * earlier than either the ctime or mtime or if at least a day has |
1360 | * passed since the last atime update. | 1359 | * passed since the last atime update. |
1361 | */ | 1360 | */ |
1362 | static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, | 1361 | static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, |
1363 | struct timespec now) | 1362 | struct timespec now) |
1364 | { | 1363 | { |
1365 | 1364 | ||
1366 | if (!(mnt->mnt_flags & MNT_RELATIME)) | 1365 | if (!(mnt->mnt_flags & MNT_RELATIME)) |
1367 | return 1; | 1366 | return 1; |
1368 | /* | 1367 | /* |
1369 | * Is mtime younger than atime? If yes, update atime: | 1368 | * Is mtime younger than atime? If yes, update atime: |
1370 | */ | 1369 | */ |
1371 | if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) | 1370 | if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) |
1372 | return 1; | 1371 | return 1; |
1373 | /* | 1372 | /* |
1374 | * Is ctime younger than atime? If yes, update atime: | 1373 | * Is ctime younger than atime? If yes, update atime: |
1375 | */ | 1374 | */ |
1376 | if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) | 1375 | if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) |
1377 | return 1; | 1376 | return 1; |
1378 | 1377 | ||
1379 | /* | 1378 | /* |
1380 | * Is the previous atime value older than a day? If yes, | 1379 | * Is the previous atime value older than a day? If yes, |
1381 | * update atime: | 1380 | * update atime: |
1382 | */ | 1381 | */ |
1383 | if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) | 1382 | if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) |
1384 | return 1; | 1383 | return 1; |
1385 | /* | 1384 | /* |
1386 | * Good, we can skip the atime update: | 1385 | * Good, we can skip the atime update: |
1387 | */ | 1386 | */ |
1388 | return 0; | 1387 | return 0; |
1389 | } | 1388 | } |
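The predicate may read more plainly outside kernel types; a user-space restatement of the relatime rule above (illustrative only):

#include <stdbool.h>
#include <time.h>

/* a >= b, at nanosecond resolution */
static bool ts_ge(const struct timespec *a, const struct timespec *b)
{
	return a->tv_sec > b->tv_sec ||
	       (a->tv_sec == b->tv_sec && a->tv_nsec >= b->tv_nsec);
}

/* true if atime should be written under relatime semantics */
bool relatime_update(const struct timespec *atim,
		     const struct timespec *mtim,
		     const struct timespec *ctim,
		     const struct timespec *now)
{
	if (ts_ge(mtim, atim))		/* mtime younger than atime */
		return true;
	if (ts_ge(ctim, atim))		/* ctime younger than atime */
		return true;
	/* previous atime older than a day */
	return now->tv_sec - atim->tv_sec >= 24 * 60 * 60;
}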
1390 | 1389 | ||
1391 | /** | 1390 | /** |
1392 | * touch_atime - update the access time | 1391 | * touch_atime - update the access time |
1393 | * @mnt: mount the inode is accessed on | 1392 | * @mnt: mount the inode is accessed on |
1394 | * @dentry: dentry accessed | 1393 | * @dentry: dentry accessed |
1395 | * | 1394 | * |
1396 | * Update the accessed time on an inode and mark it for writeback. | 1395 | * Update the accessed time on an inode and mark it for writeback. |
1397 | * This function automatically handles read only file systems and media, | 1396 | * This function automatically handles read only file systems and media, |
1398 | * as well as the "noatime" flag and inode specific "noatime" markers. | 1397 | * as well as the "noatime" flag and inode specific "noatime" markers. |
1399 | */ | 1398 | */ |
1400 | void touch_atime(struct vfsmount *mnt, struct dentry *dentry) | 1399 | void touch_atime(struct vfsmount *mnt, struct dentry *dentry) |
1401 | { | 1400 | { |
1402 | struct inode *inode = dentry->d_inode; | 1401 | struct inode *inode = dentry->d_inode; |
1403 | struct timespec now; | 1402 | struct timespec now; |
1404 | 1403 | ||
1405 | if (inode->i_flags & S_NOATIME) | 1404 | if (inode->i_flags & S_NOATIME) |
1406 | return; | 1405 | return; |
1407 | if (IS_NOATIME(inode)) | 1406 | if (IS_NOATIME(inode)) |
1408 | return; | 1407 | return; |
1409 | if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) | 1408 | if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) |
1410 | return; | 1409 | return; |
1411 | 1410 | ||
1412 | if (mnt->mnt_flags & MNT_NOATIME) | 1411 | if (mnt->mnt_flags & MNT_NOATIME) |
1413 | return; | 1412 | return; |
1414 | if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) | 1413 | if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) |
1415 | return; | 1414 | return; |
1416 | 1415 | ||
1417 | now = current_fs_time(inode->i_sb); | 1416 | now = current_fs_time(inode->i_sb); |
1418 | 1417 | ||
1419 | if (!relatime_need_update(mnt, inode, now)) | 1418 | if (!relatime_need_update(mnt, inode, now)) |
1420 | return; | 1419 | return; |
1421 | 1420 | ||
1422 | if (timespec_equal(&inode->i_atime, &now)) | 1421 | if (timespec_equal(&inode->i_atime, &now)) |
1423 | return; | 1422 | return; |
1424 | 1423 | ||
1425 | if (mnt_want_write(mnt)) | 1424 | if (mnt_want_write(mnt)) |
1426 | return; | 1425 | return; |
1427 | 1426 | ||
1428 | inode->i_atime = now; | 1427 | inode->i_atime = now; |
1429 | mark_inode_dirty_sync(inode); | 1428 | mark_inode_dirty_sync(inode); |
1430 | mnt_drop_write(mnt); | 1429 | mnt_drop_write(mnt); |
1431 | } | 1430 | } |
1432 | EXPORT_SYMBOL(touch_atime); | 1431 | EXPORT_SYMBOL(touch_atime); |
1433 | 1432 | ||
1434 | /** | 1433 | /** |
1435 | * file_update_time - update mtime and ctime | 1434 | * file_update_time - update mtime and ctime |
1436 | * @file: file accessed | 1435 | * @file: file accessed |
1437 | * | 1436 | * |
1438 | * Update the mtime and ctime members of an inode and mark the inode | 1437 | * Update the mtime and ctime members of an inode and mark the inode |
1439 | * for writeback. Note that this function is meant exclusively for | 1438 | * for writeback. Note that this function is meant exclusively for |
1440 | * usage in the file write path of filesystems, and filesystems may | 1439 | * usage in the file write path of filesystems, and filesystems may |
1441 | * choose to explicitly ignore updates via this function with the | 1440 | * choose to explicitly ignore updates via this function with the |
1442 | * S_NOCMTIME inode flag, e.g. for network filesystems where these | 1441 | * S_NOCMTIME inode flag, e.g. for network filesystems where these |
1443 | * timestamps are handled by the server. | 1442 | * timestamps are handled by the server. |
1444 | */ | 1443 | */ |
1445 | 1444 | ||
1446 | void file_update_time(struct file *file) | 1445 | void file_update_time(struct file *file) |
1447 | { | 1446 | { |
1448 | struct inode *inode = file->f_path.dentry->d_inode; | 1447 | struct inode *inode = file->f_path.dentry->d_inode; |
1449 | struct timespec now; | 1448 | struct timespec now; |
1450 | enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; | 1449 | enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; |
1451 | 1450 | ||
1452 | /* First try to exhaust all avenues to not sync */ | 1451 | /* First try to exhaust all avenues to not sync */ |
1453 | if (IS_NOCMTIME(inode)) | 1452 | if (IS_NOCMTIME(inode)) |
1454 | return; | 1453 | return; |
1455 | 1454 | ||
1456 | now = current_fs_time(inode->i_sb); | 1455 | now = current_fs_time(inode->i_sb); |
1457 | if (!timespec_equal(&inode->i_mtime, &now)) | 1456 | if (!timespec_equal(&inode->i_mtime, &now)) |
1458 | sync_it = S_MTIME; | 1457 | sync_it = S_MTIME; |
1459 | 1458 | ||
1460 | if (!timespec_equal(&inode->i_ctime, &now)) | 1459 | if (!timespec_equal(&inode->i_ctime, &now)) |
1461 | sync_it |= S_CTIME; | 1460 | sync_it |= S_CTIME; |
1462 | 1461 | ||
1463 | if (IS_I_VERSION(inode)) | 1462 | if (IS_I_VERSION(inode)) |
1464 | sync_it |= S_VERSION; | 1463 | sync_it |= S_VERSION; |
1465 | 1464 | ||
1466 | if (!sync_it) | 1465 | if (!sync_it) |
1467 | return; | 1466 | return; |
1468 | 1467 | ||
1469 | /* Finally allowed to write? Takes lock. */ | 1468 | /* Finally allowed to write? Takes lock. */ |
1470 | if (mnt_want_write_file(file)) | 1469 | if (mnt_want_write_file(file)) |
1471 | return; | 1470 | return; |
1472 | 1471 | ||
1473 | /* Only change inode inside the lock region */ | 1472 | /* Only change inode inside the lock region */ |
1474 | if (sync_it & S_VERSION) | 1473 | if (sync_it & S_VERSION) |
1475 | inode_inc_iversion(inode); | 1474 | inode_inc_iversion(inode); |
1476 | if (sync_it & S_CTIME) | 1475 | if (sync_it & S_CTIME) |
1477 | inode->i_ctime = now; | 1476 | inode->i_ctime = now; |
1478 | if (sync_it & S_MTIME) | 1477 | if (sync_it & S_MTIME) |
1479 | inode->i_mtime = now; | 1478 | inode->i_mtime = now; |
1480 | mark_inode_dirty_sync(inode); | 1479 | mark_inode_dirty_sync(inode); |
1481 | mnt_drop_write(file->f_path.mnt); | 1480 | mnt_drop_write(file->f_path.mnt); |
1482 | } | 1481 | } |
1483 | EXPORT_SYMBOL(file_update_time); | 1482 | EXPORT_SYMBOL(file_update_time); |
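file_update_time() first decides, bit by bit, which fields actually differ (the sync_it enum), and bails out before mnt_want_write_file() if nothing does. A hedged userspace sketch of the same accumulate-then-commit shape, with all names made up for the demo:

    #include <time.h>

    /* Hypothetical mirror of the sync_it accumulation above: decide
     * everything first, pay for write access only if needed. */
    enum { UPD_MTIME = 1, UPD_CTIME = 2, UPD_VERSION = 4 };

    struct demo_inode {
        time_t mtime, ctime;
        unsigned long version;
        int versioned;          /* stands in for IS_I_VERSION() */
    };

    static void update_times(struct demo_inode *ino)
    {
        time_t now = time(NULL);
        int sync_it = 0;

        if (ino->mtime != now)
            sync_it |= UPD_MTIME;
        if (ino->ctime != now)
            sync_it |= UPD_CTIME;
        if (ino->versioned)
            sync_it |= UPD_VERSION;
        if (!sync_it)
            return;             /* nothing changed: skip the lock */

        /* kernel: mnt_want_write_file() succeeds here */
        if (sync_it & UPD_VERSION)
            ino->version++;
        if (sync_it & UPD_CTIME)
            ino->ctime = now;
        if (sync_it & UPD_MTIME)
            ino->mtime = now;
        /* kernel: mark_inode_dirty_sync(); mnt_drop_write() */
    }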
1484 | 1483 | ||
1485 | int inode_needs_sync(struct inode *inode) | 1484 | int inode_needs_sync(struct inode *inode) |
1486 | { | 1485 | { |
1487 | if (IS_SYNC(inode)) | 1486 | if (IS_SYNC(inode)) |
1488 | return 1; | 1487 | return 1; |
1489 | if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) | 1488 | if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) |
1490 | return 1; | 1489 | return 1; |
1491 | return 0; | 1490 | return 0; |
1492 | } | 1491 | } |
1493 | EXPORT_SYMBOL(inode_needs_sync); | 1492 | EXPORT_SYMBOL(inode_needs_sync); |
1494 | 1493 | ||
1495 | int inode_wait(void *word) | 1494 | int inode_wait(void *word) |
1496 | { | 1495 | { |
1497 | schedule(); | 1496 | schedule(); |
1498 | return 0; | 1497 | return 0; |
1499 | } | 1498 | } |
1500 | EXPORT_SYMBOL(inode_wait); | 1499 | EXPORT_SYMBOL(inode_wait); |
1501 | 1500 | ||
1502 | /* | 1501 | /* |
1503 | * If we try to find an inode in the inode hash while it is being | 1502 | * If we try to find an inode in the inode hash while it is being |
1504 | * deleted, we have to wait until the filesystem completes its | 1503 | * deleted, we have to wait until the filesystem completes its |
1505 | * deletion before reporting that it isn't found. This function waits | 1504 | * deletion before reporting that it isn't found. This function waits |
1506 | * until the deletion _might_ have completed. Callers are responsible | 1505 | * until the deletion _might_ have completed. Callers are responsible |
1507 | * for rechecking the inode state. | 1506 | * for rechecking the inode state. |
1508 | * | 1507 | * |
1509 | * It doesn't matter if I_NEW is not set initially, a call to | 1508 | * It doesn't matter if I_NEW is not set initially, a call to |
1510 | * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list | 1509 | * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list |
1511 | * will DTRT. | 1510 | * will DTRT. |
1512 | */ | 1511 | */ |
1513 | static void __wait_on_freeing_inode(struct inode *inode) | 1512 | static void __wait_on_freeing_inode(struct inode *inode) |
1514 | { | 1513 | { |
1515 | wait_queue_head_t *wq; | 1514 | wait_queue_head_t *wq; |
1516 | DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); | 1515 | DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); |
1517 | wq = bit_waitqueue(&inode->i_state, __I_NEW); | 1516 | wq = bit_waitqueue(&inode->i_state, __I_NEW); |
1518 | prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); | 1517 | prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); |
1519 | spin_unlock(&inode->i_lock); | 1518 | spin_unlock(&inode->i_lock); |
1520 | spin_unlock(&inode_hash_lock); | 1519 | spin_unlock(&inode_hash_lock); |
1521 | schedule(); | 1520 | schedule(); |
1522 | finish_wait(wq, &wait.wait); | 1521 | finish_wait(wq, &wait.wait); |
1523 | spin_lock(&inode_hash_lock); | 1522 | spin_lock(&inode_hash_lock); |
1524 | } | 1523 | } |
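The helper drops both i_lock and inode_hash_lock before sleeping and only re-takes the hash lock afterwards, which is why the comment above insists callers recheck state. Roughly the same shape in userspace pthreads, offered strictly as a sketch (the kernel uses bit waitqueues on i_state, not condition variables):

    #include <pthread.h>
    #include <stdbool.h>

    /* Hypothetical analogue: wait for a "being freed" flag to clear
     * while the table lock is dropped, then re-take it; the caller
     * must recheck whatever it found in the table. */
    struct entry {
        bool being_freed;
        pthread_mutex_t lock;       /* protects being_freed */
        pthread_cond_t freed_cv;    /* signalled when freeing is done */
    };

    static void wait_on_freeing(struct entry *e,
                                pthread_mutex_t *table_lock)
    {
        pthread_mutex_unlock(table_lock);   /* like inode_hash_lock */
        pthread_mutex_lock(&e->lock);
        while (e->being_freed)
            pthread_cond_wait(&e->freed_cv, &e->lock);
        pthread_mutex_unlock(&e->lock);
        pthread_mutex_lock(table_lock);     /* caller rechecks the hash */
    }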
1525 | 1524 | ||
1526 | static __initdata unsigned long ihash_entries; | 1525 | static __initdata unsigned long ihash_entries; |
1527 | static int __init set_ihash_entries(char *str) | 1526 | static int __init set_ihash_entries(char *str) |
1528 | { | 1527 | { |
1529 | if (!str) | 1528 | if (!str) |
1530 | return 0; | 1529 | return 0; |
1531 | ihash_entries = simple_strtoul(str, &str, 0); | 1530 | ihash_entries = simple_strtoul(str, &str, 0); |
1532 | return 1; | 1531 | return 1; |
1533 | } | 1532 | } |
1534 | __setup("ihash_entries=", set_ihash_entries); | 1533 | __setup("ihash_entries=", set_ihash_entries); |
1535 | 1534 | ||
1536 | /* | 1535 | /* |
1537 | * Initialize the waitqueues and inode hash table. | 1536 | * Initialize the waitqueues and inode hash table. |
1538 | */ | 1537 | */ |
1539 | void __init inode_init_early(void) | 1538 | void __init inode_init_early(void) |
1540 | { | 1539 | { |
1541 | int loop; | 1540 | int loop; |
1542 | 1541 | ||
1543 | /* If hashes are distributed across NUMA nodes, defer | 1542 | /* If hashes are distributed across NUMA nodes, defer |
1544 | * hash allocation until vmalloc space is available. | 1543 | * hash allocation until vmalloc space is available. |
1545 | */ | 1544 | */ |
1546 | if (hashdist) | 1545 | if (hashdist) |
1547 | return; | 1546 | return; |
1548 | 1547 | ||
1549 | inode_hashtable = | 1548 | inode_hashtable = |
1550 | alloc_large_system_hash("Inode-cache", | 1549 | alloc_large_system_hash("Inode-cache", |
1551 | sizeof(struct hlist_head), | 1550 | sizeof(struct hlist_head), |
1552 | ihash_entries, | 1551 | ihash_entries, |
1553 | 14, | 1552 | 14, |
1554 | HASH_EARLY, | 1553 | HASH_EARLY, |
1555 | &i_hash_shift, | 1554 | &i_hash_shift, |
1556 | &i_hash_mask, | 1555 | &i_hash_mask, |
1557 | 0); | 1556 | 0); |
1558 | 1557 | ||
1559 | for (loop = 0; loop < (1 << i_hash_shift); loop++) | 1558 | for (loop = 0; loop < (1 << i_hash_shift); loop++) |
1560 | INIT_HLIST_HEAD(&inode_hashtable[loop]); | 1559 | INIT_HLIST_HEAD(&inode_hashtable[loop]); |
1561 | } | 1560 | } |
1562 | 1561 | ||
1563 | void __init inode_init(void) | 1562 | void __init inode_init(void) |
1564 | { | 1563 | { |
1565 | int loop; | 1564 | int loop; |
1566 | 1565 | ||
1567 | /* inode slab cache */ | 1566 | /* inode slab cache */ |
1568 | inode_cachep = kmem_cache_create("inode_cache", | 1567 | inode_cachep = kmem_cache_create("inode_cache", |
1569 | sizeof(struct inode), | 1568 | sizeof(struct inode), |
1570 | 0, | 1569 | 0, |
1571 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| | 1570 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| |
1572 | SLAB_MEM_SPREAD), | 1571 | SLAB_MEM_SPREAD), |
1573 | init_once); | 1572 | init_once); |
1574 | 1573 | ||
1575 | /* Hash may have been set up in inode_init_early */ | 1574 | /* Hash may have been set up in inode_init_early */ |
1576 | if (!hashdist) | 1575 | if (!hashdist) |
1577 | return; | 1576 | return; |
1578 | 1577 | ||
1579 | inode_hashtable = | 1578 | inode_hashtable = |
1580 | alloc_large_system_hash("Inode-cache", | 1579 | alloc_large_system_hash("Inode-cache", |
1581 | sizeof(struct hlist_head), | 1580 | sizeof(struct hlist_head), |
1582 | ihash_entries, | 1581 | ihash_entries, |
1583 | 14, | 1582 | 14, |
1584 | 0, | 1583 | 0, |
1585 | &i_hash_shift, | 1584 | &i_hash_shift, |
1586 | &i_hash_mask, | 1585 | &i_hash_mask, |
1587 | 0); | 1586 | 0); |
1588 | 1587 | ||
1589 | for (loop = 0; loop < (1 << i_hash_shift); loop++) | 1588 | for (loop = 0; loop < (1 << i_hash_shift); loop++) |
1590 | INIT_HLIST_HEAD(&inode_hashtable[loop]); | 1589 | INIT_HLIST_HEAD(&inode_hashtable[loop]); |
1591 | } | 1590 | } |
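Both init paths size the hash as a power of two and publish it as a shift plus a mask, so a bucket lookup is a single AND rather than a modulo. A hedged userspace sketch of that arrangement; alloc_large_system_hash() itself does far more (early bootmem allocation, NUMA spreading, sizing from available memory), and every name below is invented:

    #include <stdlib.h>

    struct bucket_head { struct bucket_head *next; };

    static struct bucket_head *table;
    static unsigned int hash_shift;   /* table has 1 << hash_shift buckets */
    static unsigned long hash_mask;   /* (1 << hash_shift) - 1 */

    static int init_table(unsigned int shift)
    {
        unsigned long i, n = 1UL << shift;

        table = malloc(n * sizeof(*table));
        if (!table)
            return -1;
        hash_shift = shift;
        hash_mask = n - 1;
        for (i = 0; i < n; i++)
            table[i].next = NULL;     /* like INIT_HLIST_HEAD() */
        return 0;
    }

    static struct bucket_head *bucket(unsigned long hash)
    {
        return &table[hash & hash_mask];   /* AND, not '%' */
    }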
1592 | 1591 | ||
1593 | void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) | 1592 | void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) |
1594 | { | 1593 | { |
1595 | inode->i_mode = mode; | 1594 | inode->i_mode = mode; |
1596 | if (S_ISCHR(mode)) { | 1595 | if (S_ISCHR(mode)) { |
1597 | inode->i_fop = &def_chr_fops; | 1596 | inode->i_fop = &def_chr_fops; |
1598 | inode->i_rdev = rdev; | 1597 | inode->i_rdev = rdev; |
1599 | } else if (S_ISBLK(mode)) { | 1598 | } else if (S_ISBLK(mode)) { |
1600 | inode->i_fop = &def_blk_fops; | 1599 | inode->i_fop = &def_blk_fops; |
1601 | inode->i_rdev = rdev; | 1600 | inode->i_rdev = rdev; |
1602 | } else if (S_ISFIFO(mode)) | 1601 | } else if (S_ISFIFO(mode)) |
1603 | inode->i_fop = &def_fifo_fops; | 1602 | inode->i_fop = &def_fifo_fops; |
1604 | else if (S_ISSOCK(mode)) | 1603 | else if (S_ISSOCK(mode)) |
1605 | inode->i_fop = &bad_sock_fops; | 1604 | inode->i_fop = &bad_sock_fops; |
1606 | else | 1605 | else |
1607 | printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" | 1606 | printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" |
1608 | " inode %s:%lu\n", mode, inode->i_sb->s_id, | 1607 | " inode %s:%lu\n", mode, inode->i_sb->s_id, |
1609 | inode->i_ino); | 1608 | inode->i_ino); |
1610 | } | 1609 | } |
1611 | EXPORT_SYMBOL(init_special_inode); | 1610 | EXPORT_SYMBOL(init_special_inode); |
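The S_IS*() dispatch above is the same mode classification userspace gets from <sys/stat.h>. A runnable demo; mode_kind() is made up for the example:

    #include <stdio.h>
    #include <sys/stat.h>

    static const char *mode_kind(mode_t mode)
    {
        if (S_ISCHR(mode))  return "char device";
        if (S_ISBLK(mode))  return "block device";
        if (S_ISFIFO(mode)) return "fifo";
        if (S_ISSOCK(mode)) return "socket";
        return "bogus for a special inode";
    }

    int main(void)
    {
        printf("%s\n", mode_kind(S_IFCHR | 0600));  /* char device */
        printf("%s\n", mode_kind(S_IFREG | 0644));  /* bogus ... */
        return 0;
    }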
1612 | 1611 | ||
1613 | /** | 1612 | /** |
1614 | * inode_init_owner - Init uid, gid, mode for new inode according to POSIX standards | 1613 | * inode_init_owner - Init uid, gid, mode for new inode according to POSIX standards |
1615 | * @inode: New inode | 1614 | * @inode: New inode |
1616 | * @dir: Directory inode | 1615 | * @dir: Directory inode |
1617 | * @mode: mode of the new inode | 1616 | * @mode: mode of the new inode |
1618 | */ | 1617 | */ |
1619 | void inode_init_owner(struct inode *inode, const struct inode *dir, | 1618 | void inode_init_owner(struct inode *inode, const struct inode *dir, |
1620 | mode_t mode) | 1619 | mode_t mode) |
1621 | { | 1620 | { |
1622 | inode->i_uid = current_fsuid(); | 1621 | inode->i_uid = current_fsuid(); |
1623 | if (dir && dir->i_mode & S_ISGID) { | 1622 | if (dir && dir->i_mode & S_ISGID) { |
1624 | inode->i_gid = dir->i_gid; | 1623 | inode->i_gid = dir->i_gid; |
1625 | if (S_ISDIR(mode)) | 1624 | if (S_ISDIR(mode)) |
1626 | mode |= S_ISGID; | 1625 | mode |= S_ISGID; |
1627 | } else | 1626 | } else |
1628 | inode->i_gid = current_fsgid(); | 1627 | inode->i_gid = current_fsgid(); |
1629 | inode->i_mode = mode; | 1628 | inode->i_mode = mode; |
1630 | } | 1629 | } |
1631 | EXPORT_SYMBOL(inode_init_owner); | 1630 | EXPORT_SYMBOL(inode_init_owner); |
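The POSIX rule being implemented: a child created in a setgid directory inherits the directory's group, and a new subdirectory also keeps S_ISGID so the behaviour propagates downward. A userspace sketch, with geteuid()/getegid() standing in for current_fsuid()/current_fsgid():

    #include <sys/stat.h>
    #include <unistd.h>

    struct owner { uid_t uid; gid_t gid; mode_t mode; };

    static void init_owner(struct owner *child, const struct owner *dir,
                           mode_t mode)
    {
        child->uid = geteuid();
        if (dir && (dir->mode & S_ISGID)) {
            child->gid = dir->gid;      /* inherit the group */
            if (S_ISDIR(mode))
                mode |= S_ISGID;        /* subdirs propagate the bit */
        } else {
            child->gid = getegid();
        }
        child->mode = mode;
    }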
1632 | 1631 | ||
1633 | /** | 1632 | /** |
1634 | * inode_owner_or_capable - check current task permissions to inode | 1633 | * inode_owner_or_capable - check current task permissions to inode |
1635 | * @inode: inode being checked | 1634 | * @inode: inode being checked |
1636 | * | 1635 | * |
1637 | * Return true if current either has CAP_FOWNER in the inode's user | 1636 | * Return true if current either has CAP_FOWNER in the inode's user |
1638 | * namespace, or owns the file. | 1637 | * namespace, or owns the file. |
1639 | */ | 1638 | */ |
1640 | bool inode_owner_or_capable(const struct inode *inode) | 1639 | bool inode_owner_or_capable(const struct inode *inode) |
1641 | { | 1640 | { |
1642 | struct user_namespace *ns = inode_userns(inode); | 1641 | struct user_namespace *ns = inode_userns(inode); |
1643 | 1642 | ||
1644 | if (current_user_ns() == ns && current_fsuid() == inode->i_uid) | 1643 | if (current_user_ns() == ns && current_fsuid() == inode->i_uid) |
1645 | return true; | 1644 | return true; |
1646 | if (ns_capable(ns, CAP_FOWNER)) | 1645 | if (ns_capable(ns, CAP_FOWNER)) |
1647 | return true; | 1646 | return true; |
1648 | return false; | 1647 | return false; |
1649 | } | 1648 | } |
1650 | EXPORT_SYMBOL(inode_owner_or_capable); | 1649 | EXPORT_SYMBOL(inode_owner_or_capable); |
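Sketched in userspace, the check is "owner, or privileged enough". In the sketch below has_cap_fowner() is a crude stub for ns_capable(ns, CAP_FOWNER), and the user-namespace comparison is omitted entirely:

    #include <stdbool.h>
    #include <unistd.h>

    static bool has_cap_fowner(void)
    {
        return geteuid() == 0;  /* crude stand-in for a capability query */
    }

    static bool owner_or_capable(uid_t file_uid)
    {
        if (geteuid() == file_uid)
            return true;        /* owner always passes */
        return has_cap_fowner();
    }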
1651 | 1650 |
fs/nfs/write.c
1 | /* | 1 | /* |
2 | * linux/fs/nfs/write.c | 2 | * linux/fs/nfs/write.c |
3 | * | 3 | * |
4 | * Write file data over NFS. | 4 | * Write file data over NFS. |
5 | * | 5 | * |
6 | * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de> | 6 | * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de> |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/types.h> | 9 | #include <linux/types.h> |
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
12 | #include <linux/pagemap.h> | 12 | #include <linux/pagemap.h> |
13 | #include <linux/file.h> | 13 | #include <linux/file.h> |
14 | #include <linux/writeback.h> | 14 | #include <linux/writeback.h> |
15 | #include <linux/swap.h> | 15 | #include <linux/swap.h> |
16 | #include <linux/migrate.h> | 16 | #include <linux/migrate.h> |
17 | 17 | ||
18 | #include <linux/sunrpc/clnt.h> | 18 | #include <linux/sunrpc/clnt.h> |
19 | #include <linux/nfs_fs.h> | 19 | #include <linux/nfs_fs.h> |
20 | #include <linux/nfs_mount.h> | 20 | #include <linux/nfs_mount.h> |
21 | #include <linux/nfs_page.h> | 21 | #include <linux/nfs_page.h> |
22 | #include <linux/backing-dev.h> | 22 | #include <linux/backing-dev.h> |
23 | 23 | ||
24 | #include <asm/uaccess.h> | 24 | #include <asm/uaccess.h> |
25 | 25 | ||
26 | #include "delegation.h" | 26 | #include "delegation.h" |
27 | #include "internal.h" | 27 | #include "internal.h" |
28 | #include "iostat.h" | 28 | #include "iostat.h" |
29 | #include "nfs4_fs.h" | 29 | #include "nfs4_fs.h" |
30 | #include "fscache.h" | 30 | #include "fscache.h" |
31 | #include "pnfs.h" | 31 | #include "pnfs.h" |
32 | 32 | ||
33 | #define NFSDBG_FACILITY NFSDBG_PAGECACHE | 33 | #define NFSDBG_FACILITY NFSDBG_PAGECACHE |
34 | 34 | ||
35 | #define MIN_POOL_WRITE (32) | 35 | #define MIN_POOL_WRITE (32) |
36 | #define MIN_POOL_COMMIT (4) | 36 | #define MIN_POOL_COMMIT (4) |
37 | 37 | ||
38 | /* | 38 | /* |
39 | * Local function declarations | 39 | * Local function declarations |
40 | */ | 40 | */ |
41 | static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, | 41 | static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, |
42 | struct inode *inode, int ioflags); | 42 | struct inode *inode, int ioflags); |
43 | static void nfs_redirty_request(struct nfs_page *req); | 43 | static void nfs_redirty_request(struct nfs_page *req); |
44 | static const struct rpc_call_ops nfs_write_partial_ops; | 44 | static const struct rpc_call_ops nfs_write_partial_ops; |
45 | static const struct rpc_call_ops nfs_write_full_ops; | 45 | static const struct rpc_call_ops nfs_write_full_ops; |
46 | static const struct rpc_call_ops nfs_commit_ops; | 46 | static const struct rpc_call_ops nfs_commit_ops; |
47 | 47 | ||
48 | static struct kmem_cache *nfs_wdata_cachep; | 48 | static struct kmem_cache *nfs_wdata_cachep; |
49 | static mempool_t *nfs_wdata_mempool; | 49 | static mempool_t *nfs_wdata_mempool; |
50 | static mempool_t *nfs_commit_mempool; | 50 | static mempool_t *nfs_commit_mempool; |
51 | 51 | ||
52 | struct nfs_write_data *nfs_commitdata_alloc(void) | 52 | struct nfs_write_data *nfs_commitdata_alloc(void) |
53 | { | 53 | { |
54 | struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); | 54 | struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); |
55 | 55 | ||
56 | if (p) { | 56 | if (p) { |
57 | memset(p, 0, sizeof(*p)); | 57 | memset(p, 0, sizeof(*p)); |
58 | INIT_LIST_HEAD(&p->pages); | 58 | INIT_LIST_HEAD(&p->pages); |
59 | } | 59 | } |
60 | return p; | 60 | return p; |
61 | } | 61 | } |
62 | EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); | 62 | EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); |
63 | 63 | ||
64 | void nfs_commit_free(struct nfs_write_data *p) | 64 | void nfs_commit_free(struct nfs_write_data *p) |
65 | { | 65 | { |
66 | if (p && (p->pagevec != &p->page_array[0])) | 66 | if (p && (p->pagevec != &p->page_array[0])) |
67 | kfree(p->pagevec); | 67 | kfree(p->pagevec); |
68 | mempool_free(p, nfs_commit_mempool); | 68 | mempool_free(p, nfs_commit_mempool); |
69 | } | 69 | } |
70 | EXPORT_SYMBOL_GPL(nfs_commit_free); | 70 | EXPORT_SYMBOL_GPL(nfs_commit_free); |
71 | 71 | ||
72 | struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) | 72 | struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) |
73 | { | 73 | { |
74 | struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); | 74 | struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); |
75 | 75 | ||
76 | if (p) { | 76 | if (p) { |
77 | memset(p, 0, sizeof(*p)); | 77 | memset(p, 0, sizeof(*p)); |
78 | INIT_LIST_HEAD(&p->pages); | 78 | INIT_LIST_HEAD(&p->pages); |
79 | p->npages = pagecount; | 79 | p->npages = pagecount; |
80 | if (pagecount <= ARRAY_SIZE(p->page_array)) | 80 | if (pagecount <= ARRAY_SIZE(p->page_array)) |
81 | p->pagevec = p->page_array; | 81 | p->pagevec = p->page_array; |
82 | else { | 82 | else { |
83 | p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); | 83 | p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); |
84 | if (!p->pagevec) { | 84 | if (!p->pagevec) { |
85 | mempool_free(p, nfs_wdata_mempool); | 85 | mempool_free(p, nfs_wdata_mempool); |
86 | p = NULL; | 86 | p = NULL; |
87 | } | 87 | } |
88 | } | 88 | } |
89 | } | 89 | } |
90 | return p; | 90 | return p; |
91 | } | 91 | } |
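nfs_writedata_alloc() shows a common small-vector optimisation: short page vectors live in the descriptor's inline array, and only larger requests pay for a second allocation, which the free path must then distinguish. A self-contained sketch with calloc()/free() standing in for the mempool and kcalloc(); struct wdata and its helpers are invented for illustration:

    #include <stdlib.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    struct wdata {
        unsigned int npages;
        void **pagevec;
        void *page_array[8];    /* inline storage for the common case */
    };

    static struct wdata *wdata_alloc(unsigned int pagecount)
    {
        struct wdata *p = calloc(1, sizeof(*p));

        if (!p)
            return NULL;
        p->npages = pagecount;
        if (pagecount <= ARRAY_SIZE(p->page_array)) {
            p->pagevec = p->page_array;         /* no extra allocation */
        } else {
            p->pagevec = calloc(pagecount, sizeof(void *));
            if (!p->pagevec) {
                free(p);
                return NULL;
            }
        }
        return p;
    }

    static void wdata_free(struct wdata *p)
    {
        if (p && p->pagevec != p->page_array)
            free(p->pagevec);   /* only the spilled vector is separate */
        free(p);
    }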
92 | 92 | ||
93 | void nfs_writedata_free(struct nfs_write_data *p) | 93 | void nfs_writedata_free(struct nfs_write_data *p) |
94 | { | 94 | { |
95 | if (p && (p->pagevec != &p->page_array[0])) | 95 | if (p && (p->pagevec != &p->page_array[0])) |
96 | kfree(p->pagevec); | 96 | kfree(p->pagevec); |
97 | mempool_free(p, nfs_wdata_mempool); | 97 | mempool_free(p, nfs_wdata_mempool); |
98 | } | 98 | } |
99 | 99 | ||
100 | static void nfs_writedata_release(struct nfs_write_data *wdata) | 100 | static void nfs_writedata_release(struct nfs_write_data *wdata) |
101 | { | 101 | { |
102 | put_lseg(wdata->lseg); | 102 | put_lseg(wdata->lseg); |
103 | put_nfs_open_context(wdata->args.context); | 103 | put_nfs_open_context(wdata->args.context); |
104 | nfs_writedata_free(wdata); | 104 | nfs_writedata_free(wdata); |
105 | } | 105 | } |
106 | 106 | ||
107 | static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) | 107 | static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) |
108 | { | 108 | { |
109 | ctx->error = error; | 109 | ctx->error = error; |
110 | smp_wmb(); | 110 | smp_wmb(); |
111 | set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); | 111 | set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); |
112 | } | 112 | } |
113 | 113 | ||
114 | static struct nfs_page *nfs_page_find_request_locked(struct page *page) | 114 | static struct nfs_page *nfs_page_find_request_locked(struct page *page) |
115 | { | 115 | { |
116 | struct nfs_page *req = NULL; | 116 | struct nfs_page *req = NULL; |
117 | 117 | ||
118 | if (PagePrivate(page)) { | 118 | if (PagePrivate(page)) { |
119 | req = (struct nfs_page *)page_private(page); | 119 | req = (struct nfs_page *)page_private(page); |
120 | if (req != NULL) | 120 | if (req != NULL) |
121 | kref_get(&req->wb_kref); | 121 | kref_get(&req->wb_kref); |
122 | } | 122 | } |
123 | return req; | 123 | return req; |
124 | } | 124 | } |
125 | 125 | ||
126 | static struct nfs_page *nfs_page_find_request(struct page *page) | 126 | static struct nfs_page *nfs_page_find_request(struct page *page) |
127 | { | 127 | { |
128 | struct inode *inode = page->mapping->host; | 128 | struct inode *inode = page->mapping->host; |
129 | struct nfs_page *req = NULL; | 129 | struct nfs_page *req = NULL; |
130 | 130 | ||
131 | spin_lock(&inode->i_lock); | 131 | spin_lock(&inode->i_lock); |
132 | req = nfs_page_find_request_locked(page); | 132 | req = nfs_page_find_request_locked(page); |
133 | spin_unlock(&inode->i_lock); | 133 | spin_unlock(&inode->i_lock); |
134 | return req; | 134 | return req; |
135 | } | 135 | } |
136 | 136 | ||
137 | /* Adjust the file length if we're writing beyond the end */ | 137 | /* Adjust the file length if we're writing beyond the end */ |
138 | static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) | 138 | static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) |
139 | { | 139 | { |
140 | struct inode *inode = page->mapping->host; | 140 | struct inode *inode = page->mapping->host; |
141 | loff_t end, i_size; | 141 | loff_t end, i_size; |
142 | pgoff_t end_index; | 142 | pgoff_t end_index; |
143 | 143 | ||
144 | spin_lock(&inode->i_lock); | 144 | spin_lock(&inode->i_lock); |
145 | i_size = i_size_read(inode); | 145 | i_size = i_size_read(inode); |
146 | end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; | 146 | end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; |
147 | if (i_size > 0 && page->index < end_index) | 147 | if (i_size > 0 && page->index < end_index) |
148 | goto out; | 148 | goto out; |
149 | end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); | 149 | end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); |
150 | if (i_size >= end) | 150 | if (i_size >= end) |
151 | goto out; | 151 | goto out; |
152 | i_size_write(inode, end); | 152 | i_size_write(inode, end); |
153 | nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); | 153 | nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); |
154 | out: | 154 | out: |
155 | spin_unlock(&inode->i_lock); | 155 | spin_unlock(&inode->i_lock); |
156 | } | 156 | } |
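The arithmetic in nfs_grow_file(): a write ending beyond EOF extends i_size to the byte just past the write, computed from the page index and the in-page offset. A hedged sketch assuming a fixed 4 KiB page (PAGE_CACHE_SHIFT is per-architecture), with the locking and statistics dropped:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    static long long grow_file(long long i_size, unsigned long index,
                               unsigned int offset, unsigned int count)
    {
        unsigned long end_index = (i_size - 1) >> PAGE_SHIFT;
        long long end = ((long long)index << PAGE_SHIFT) + offset + count;

        if (i_size > 0 && index < end_index)
            return i_size;      /* write is entirely below the last page */
        if (i_size >= end)
            return i_size;      /* no extension needed */
        return end;             /* new size: byte just past the write */
    }

    int main(void)
    {
        /* 100 bytes at offset 0 of page 1 of an 80-byte file */
        printf("%lld\n", grow_file(80, 1, 0, 100));  /* prints 4196 */
        return 0;
    }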
157 | 157 | ||
158 | /* A writeback failed: mark the page as bad, and invalidate the page cache */ | 158 | /* A writeback failed: mark the page as bad, and invalidate the page cache */ |
159 | static void nfs_set_pageerror(struct page *page) | 159 | static void nfs_set_pageerror(struct page *page) |
160 | { | 160 | { |
161 | SetPageError(page); | 161 | SetPageError(page); |
162 | nfs_zap_mapping(page->mapping->host, page->mapping); | 162 | nfs_zap_mapping(page->mapping->host, page->mapping); |
163 | } | 163 | } |
164 | 164 | ||
165 | /* We can set the PG_uptodate flag if we see that a write request | 165 | /* We can set the PG_uptodate flag if we see that a write request |
166 | * covers the full page. | 166 | * covers the full page. |
167 | */ | 167 | */ |
168 | static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) | 168 | static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) |
169 | { | 169 | { |
170 | if (PageUptodate(page)) | 170 | if (PageUptodate(page)) |
171 | return; | 171 | return; |
172 | if (base != 0) | 172 | if (base != 0) |
173 | return; | 173 | return; |
174 | if (count != nfs_page_length(page)) | 174 | if (count != nfs_page_length(page)) |
175 | return; | 175 | return; |
176 | SetPageUptodate(page); | 176 | SetPageUptodate(page); |
177 | } | 177 | } |
178 | 178 | ||
179 | static int wb_priority(struct writeback_control *wbc) | 179 | static int wb_priority(struct writeback_control *wbc) |
180 | { | 180 | { |
181 | if (wbc->for_reclaim) | 181 | if (wbc->for_reclaim) |
182 | return FLUSH_HIGHPRI | FLUSH_STABLE; | 182 | return FLUSH_HIGHPRI | FLUSH_STABLE; |
183 | if (wbc->for_kupdate || wbc->for_background) | 183 | if (wbc->for_kupdate || wbc->for_background) |
184 | return FLUSH_LOWPRI | FLUSH_COND_STABLE; | 184 | return FLUSH_LOWPRI | FLUSH_COND_STABLE; |
185 | return FLUSH_COND_STABLE; | 185 | return FLUSH_COND_STABLE; |
186 | } | 186 | } |
187 | 187 | ||
188 | /* | 188 | /* |
189 | * NFS congestion control | 189 | * NFS congestion control |
190 | */ | 190 | */ |
191 | 191 | ||
192 | int nfs_congestion_kb; | 192 | int nfs_congestion_kb; |
193 | 193 | ||
194 | #define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) | 194 | #define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) |
195 | #define NFS_CONGESTION_OFF_THRESH \ | 195 | #define NFS_CONGESTION_OFF_THRESH \ |
196 | (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) | 196 | (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) |
197 | 197 | ||
198 | static int nfs_set_page_writeback(struct page *page) | 198 | static int nfs_set_page_writeback(struct page *page) |
199 | { | 199 | { |
200 | int ret = test_set_page_writeback(page); | 200 | int ret = test_set_page_writeback(page); |
201 | 201 | ||
202 | if (!ret) { | 202 | if (!ret) { |
203 | struct inode *inode = page->mapping->host; | 203 | struct inode *inode = page->mapping->host; |
204 | struct nfs_server *nfss = NFS_SERVER(inode); | 204 | struct nfs_server *nfss = NFS_SERVER(inode); |
205 | 205 | ||
206 | page_cache_get(page); | 206 | page_cache_get(page); |
207 | if (atomic_long_inc_return(&nfss->writeback) > | 207 | if (atomic_long_inc_return(&nfss->writeback) > |
208 | NFS_CONGESTION_ON_THRESH) { | 208 | NFS_CONGESTION_ON_THRESH) { |
209 | set_bdi_congested(&nfss->backing_dev_info, | 209 | set_bdi_congested(&nfss->backing_dev_info, |
210 | BLK_RW_ASYNC); | 210 | BLK_RW_ASYNC); |
211 | } | 211 | } |
212 | } | 212 | } |
213 | return ret; | 213 | return ret; |
214 | } | 214 | } |
215 | 215 | ||
216 | static void nfs_end_page_writeback(struct page *page) | 216 | static void nfs_end_page_writeback(struct page *page) |
217 | { | 217 | { |
218 | struct inode *inode = page->mapping->host; | 218 | struct inode *inode = page->mapping->host; |
219 | struct nfs_server *nfss = NFS_SERVER(inode); | 219 | struct nfs_server *nfss = NFS_SERVER(inode); |
220 | 220 | ||
221 | end_page_writeback(page); | 221 | end_page_writeback(page); |
222 | page_cache_release(page); | 222 | page_cache_release(page); |
223 | if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) | 223 | if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) |
224 | clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); | 224 | clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); |
225 | } | 225 | } |
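The two thresholds above deliberately differ: congestion is raised when the writeback count passes NFS_CONGESTION_ON_THRESH, but cleared only once it drains below three quarters of that, so a count hovering near the limit cannot flap the bdi congested bit. A sketch of the hysteresis with a plain counter (the kernel uses atomic_long_* and per-bdi state; the fixed threshold here is made up):

    #include <stdbool.h>

    #define ON_THRESH  1024
    #define OFF_THRESH (ON_THRESH - (ON_THRESH >> 2))   /* 768 */

    static long writeback_pages;
    static bool congested;

    static void page_writeback_started(void)
    {
        if (++writeback_pages > ON_THRESH)
            congested = true;       /* like set_bdi_congested() */
    }

    static void page_writeback_done(void)
    {
        if (--writeback_pages < OFF_THRESH)
            congested = false;      /* like clear_bdi_congested() */
    }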
226 | 226 | ||
227 | static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) | 227 | static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) |
228 | { | 228 | { |
229 | struct inode *inode = page->mapping->host; | 229 | struct inode *inode = page->mapping->host; |
230 | struct nfs_page *req; | 230 | struct nfs_page *req; |
231 | int ret; | 231 | int ret; |
232 | 232 | ||
233 | spin_lock(&inode->i_lock); | 233 | spin_lock(&inode->i_lock); |
234 | for (;;) { | 234 | for (;;) { |
235 | req = nfs_page_find_request_locked(page); | 235 | req = nfs_page_find_request_locked(page); |
236 | if (req == NULL) | 236 | if (req == NULL) |
237 | break; | 237 | break; |
238 | if (nfs_set_page_tag_locked(req)) | 238 | if (nfs_set_page_tag_locked(req)) |
239 | break; | 239 | break; |
240 | /* Note: If we hold the page lock, as is the case in nfs_writepage, | 240 | /* Note: If we hold the page lock, as is the case in nfs_writepage, |
241 | * then the call to nfs_set_page_tag_locked() will always | 241 | * then the call to nfs_set_page_tag_locked() will always |
242 | * succeed provided that someone hasn't already marked the | 242 | * succeed provided that someone hasn't already marked the |
243 | * request as dirty (in which case we don't care). | 243 | * request as dirty (in which case we don't care). |
244 | */ | 244 | */ |
245 | spin_unlock(&inode->i_lock); | 245 | spin_unlock(&inode->i_lock); |
246 | if (!nonblock) | 246 | if (!nonblock) |
247 | ret = nfs_wait_on_request(req); | 247 | ret = nfs_wait_on_request(req); |
248 | else | 248 | else |
249 | ret = -EAGAIN; | 249 | ret = -EAGAIN; |
250 | nfs_release_request(req); | 250 | nfs_release_request(req); |
251 | if (ret != 0) | 251 | if (ret != 0) |
252 | return ERR_PTR(ret); | 252 | return ERR_PTR(ret); |
253 | spin_lock(&inode->i_lock); | 253 | spin_lock(&inode->i_lock); |
254 | } | 254 | } |
255 | spin_unlock(&inode->i_lock); | 255 | spin_unlock(&inode->i_lock); |
256 | return req; | 256 | return req; |
257 | } | 257 | } |
258 | 258 | ||
259 | /* | 259 | /* |
260 | * Find an associated nfs write request, and prepare to flush it out. | 260 | * Find an associated nfs write request, and prepare to flush it out. |
261 | * May return an error if the user signalled nfs_wait_on_request(). | 261 | * May return an error if the user signalled nfs_wait_on_request(). |
262 | */ | 262 | */ |
263 | static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, | 263 | static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, |
264 | struct page *page, bool nonblock) | 264 | struct page *page, bool nonblock) |
265 | { | 265 | { |
266 | struct nfs_page *req; | 266 | struct nfs_page *req; |
267 | int ret = 0; | 267 | int ret = 0; |
268 | 268 | ||
269 | req = nfs_find_and_lock_request(page, nonblock); | 269 | req = nfs_find_and_lock_request(page, nonblock); |
270 | if (!req) | 270 | if (!req) |
271 | goto out; | 271 | goto out; |
272 | ret = PTR_ERR(req); | 272 | ret = PTR_ERR(req); |
273 | if (IS_ERR(req)) | 273 | if (IS_ERR(req)) |
274 | goto out; | 274 | goto out; |
275 | 275 | ||
276 | ret = nfs_set_page_writeback(page); | 276 | ret = nfs_set_page_writeback(page); |
277 | BUG_ON(ret != 0); | 277 | BUG_ON(ret != 0); |
278 | BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); | 278 | BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); |
279 | 279 | ||
280 | if (!nfs_pageio_add_request(pgio, req)) { | 280 | if (!nfs_pageio_add_request(pgio, req)) { |
281 | nfs_redirty_request(req); | 281 | nfs_redirty_request(req); |
282 | ret = pgio->pg_error; | 282 | ret = pgio->pg_error; |
283 | } | 283 | } |
284 | out: | 284 | out: |
285 | return ret; | 285 | return ret; |
286 | } | 286 | } |
287 | 287 | ||
288 | static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) | 288 | static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) |
289 | { | 289 | { |
290 | struct inode *inode = page->mapping->host; | 290 | struct inode *inode = page->mapping->host; |
291 | int ret; | 291 | int ret; |
292 | 292 | ||
293 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); | 293 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); |
294 | nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); | 294 | nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); |
295 | 295 | ||
296 | nfs_pageio_cond_complete(pgio, page->index); | 296 | nfs_pageio_cond_complete(pgio, page->index); |
297 | ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); | 297 | ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); |
298 | if (ret == -EAGAIN) { | 298 | if (ret == -EAGAIN) { |
299 | redirty_page_for_writepage(wbc, page); | 299 | redirty_page_for_writepage(wbc, page); |
300 | ret = 0; | 300 | ret = 0; |
301 | } | 301 | } |
302 | return ret; | 302 | return ret; |
303 | } | 303 | } |
304 | 304 | ||
305 | /* | 305 | /* |
306 | * Write an mmapped page to the server. | 306 | * Write an mmapped page to the server. |
307 | */ | 307 | */ |
308 | static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) | 308 | static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) |
309 | { | 309 | { |
310 | struct nfs_pageio_descriptor pgio; | 310 | struct nfs_pageio_descriptor pgio; |
311 | int err; | 311 | int err; |
312 | 312 | ||
313 | nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); | 313 | nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); |
314 | err = nfs_do_writepage(page, wbc, &pgio); | 314 | err = nfs_do_writepage(page, wbc, &pgio); |
315 | nfs_pageio_complete(&pgio); | 315 | nfs_pageio_complete(&pgio); |
316 | if (err < 0) | 316 | if (err < 0) |
317 | return err; | 317 | return err; |
318 | if (pgio.pg_error < 0) | 318 | if (pgio.pg_error < 0) |
319 | return pgio.pg_error; | 319 | return pgio.pg_error; |
320 | return 0; | 320 | return 0; |
321 | } | 321 | } |
322 | 322 | ||
323 | int nfs_writepage(struct page *page, struct writeback_control *wbc) | 323 | int nfs_writepage(struct page *page, struct writeback_control *wbc) |
324 | { | 324 | { |
325 | int ret; | 325 | int ret; |
326 | 326 | ||
327 | ret = nfs_writepage_locked(page, wbc); | 327 | ret = nfs_writepage_locked(page, wbc); |
328 | unlock_page(page); | 328 | unlock_page(page); |
329 | return ret; | 329 | return ret; |
330 | } | 330 | } |
331 | 331 | ||
332 | static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) | 332 | static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) |
333 | { | 333 | { |
334 | int ret; | 334 | int ret; |
335 | 335 | ||
336 | ret = nfs_do_writepage(page, wbc, data); | 336 | ret = nfs_do_writepage(page, wbc, data); |
337 | unlock_page(page); | 337 | unlock_page(page); |
338 | return ret; | 338 | return ret; |
339 | } | 339 | } |
340 | 340 | ||
341 | int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) | 341 | int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) |
342 | { | 342 | { |
343 | struct inode *inode = mapping->host; | 343 | struct inode *inode = mapping->host; |
344 | unsigned long *bitlock = &NFS_I(inode)->flags; | 344 | unsigned long *bitlock = &NFS_I(inode)->flags; |
345 | struct nfs_pageio_descriptor pgio; | 345 | struct nfs_pageio_descriptor pgio; |
346 | int err; | 346 | int err; |
347 | 347 | ||
348 | /* Stop dirtying of new pages while we sync */ | 348 | /* Stop dirtying of new pages while we sync */ |
349 | err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, | 349 | err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, |
350 | nfs_wait_bit_killable, TASK_KILLABLE); | 350 | nfs_wait_bit_killable, TASK_KILLABLE); |
351 | if (err) | 351 | if (err) |
352 | goto out_err; | 352 | goto out_err; |
353 | 353 | ||
354 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); | 354 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); |
355 | 355 | ||
356 | nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); | 356 | nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); |
357 | err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); | 357 | err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); |
358 | nfs_pageio_complete(&pgio); | 358 | nfs_pageio_complete(&pgio); |
359 | 359 | ||
360 | clear_bit_unlock(NFS_INO_FLUSHING, bitlock); | 360 | clear_bit_unlock(NFS_INO_FLUSHING, bitlock); |
361 | smp_mb__after_clear_bit(); | 361 | smp_mb__after_clear_bit(); |
362 | wake_up_bit(bitlock, NFS_INO_FLUSHING); | 362 | wake_up_bit(bitlock, NFS_INO_FLUSHING); |
363 | 363 | ||
364 | if (err < 0) | 364 | if (err < 0) |
365 | goto out_err; | 365 | goto out_err; |
366 | err = pgio.pg_error; | 366 | err = pgio.pg_error; |
367 | if (err < 0) | 367 | if (err < 0) |
368 | goto out_err; | 368 | goto out_err; |
369 | return 0; | 369 | return 0; |
370 | out_err: | 370 | out_err: |
371 | return err; | 371 | return err; |
372 | } | 372 | } |
373 | 373 | ||
374 | /* | 374 | /* |
375 | * Insert a write request into an inode | 375 | * Insert a write request into an inode |
376 | */ | 376 | */ |
377 | static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) | 377 | static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) |
378 | { | 378 | { |
379 | struct nfs_inode *nfsi = NFS_I(inode); | 379 | struct nfs_inode *nfsi = NFS_I(inode); |
380 | int error; | 380 | int error; |
381 | 381 | ||
382 | error = radix_tree_preload(GFP_NOFS); | 382 | error = radix_tree_preload(GFP_NOFS); |
383 | if (error != 0) | 383 | if (error != 0) |
384 | goto out; | 384 | goto out; |
385 | 385 | ||
386 | /* Lock the request! */ | 386 | /* Lock the request! */ |
387 | nfs_lock_request_dontget(req); | 387 | nfs_lock_request_dontget(req); |
388 | 388 | ||
389 | spin_lock(&inode->i_lock); | 389 | spin_lock(&inode->i_lock); |
390 | error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); | 390 | error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); |
391 | BUG_ON(error); | 391 | BUG_ON(error); |
392 | if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) | 392 | if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) |
393 | nfsi->change_attr++; | 393 | nfsi->change_attr++; |
394 | set_bit(PG_MAPPED, &req->wb_flags); | 394 | set_bit(PG_MAPPED, &req->wb_flags); |
395 | SetPagePrivate(req->wb_page); | 395 | SetPagePrivate(req->wb_page); |
396 | set_page_private(req->wb_page, (unsigned long)req); | 396 | set_page_private(req->wb_page, (unsigned long)req); |
397 | nfsi->npages++; | 397 | nfsi->npages++; |
398 | kref_get(&req->wb_kref); | 398 | kref_get(&req->wb_kref); |
399 | radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, | 399 | radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, |
400 | NFS_PAGE_TAG_LOCKED); | 400 | NFS_PAGE_TAG_LOCKED); |
401 | spin_unlock(&inode->i_lock); | 401 | spin_unlock(&inode->i_lock); |
402 | radix_tree_preload_end(); | 402 | radix_tree_preload_end(); |
403 | out: | 403 | out: |
404 | return error; | 404 | return error; |
405 | } | 405 | } |
406 | 406 | ||
407 | /* | 407 | /* |
408 | * Remove a write request from an inode | 408 | * Remove a write request from an inode |
409 | */ | 409 | */ |
410 | static void nfs_inode_remove_request(struct nfs_page *req) | 410 | static void nfs_inode_remove_request(struct nfs_page *req) |
411 | { | 411 | { |
412 | struct inode *inode = req->wb_context->dentry->d_inode; | 412 | struct inode *inode = req->wb_context->dentry->d_inode; |
413 | struct nfs_inode *nfsi = NFS_I(inode); | 413 | struct nfs_inode *nfsi = NFS_I(inode); |
414 | 414 | ||
415 | BUG_ON(!NFS_WBACK_BUSY(req)); | 415 | BUG_ON(!NFS_WBACK_BUSY(req)); |
416 | 416 | ||
417 | spin_lock(&inode->i_lock); | 417 | spin_lock(&inode->i_lock); |
418 | set_page_private(req->wb_page, 0); | 418 | set_page_private(req->wb_page, 0); |
419 | ClearPagePrivate(req->wb_page); | 419 | ClearPagePrivate(req->wb_page); |
420 | clear_bit(PG_MAPPED, &req->wb_flags); | 420 | clear_bit(PG_MAPPED, &req->wb_flags); |
421 | radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); | 421 | radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); |
422 | nfsi->npages--; | 422 | nfsi->npages--; |
423 | spin_unlock(&inode->i_lock); | 423 | spin_unlock(&inode->i_lock); |
424 | nfs_release_request(req); | 424 | nfs_release_request(req); |
425 | } | 425 | } |
426 | 426 | ||
427 | static void | 427 | static void |
428 | nfs_mark_request_dirty(struct nfs_page *req) | 428 | nfs_mark_request_dirty(struct nfs_page *req) |
429 | { | 429 | { |
430 | __set_page_dirty_nobuffers(req->wb_page); | 430 | __set_page_dirty_nobuffers(req->wb_page); |
431 | __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); | 431 | __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); |
432 | } | 432 | } |
433 | 433 | ||
434 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) | 434 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) |
435 | /* | 435 | /* |
436 | * Add a request to the inode's commit list. | 436 | * Add a request to the inode's commit list. |
437 | */ | 437 | */ |
438 | static void | 438 | static void |
439 | nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) | 439 | nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) |
440 | { | 440 | { |
441 | struct inode *inode = req->wb_context->dentry->d_inode; | 441 | struct inode *inode = req->wb_context->dentry->d_inode; |
442 | struct nfs_inode *nfsi = NFS_I(inode); | 442 | struct nfs_inode *nfsi = NFS_I(inode); |
443 | 443 | ||
444 | spin_lock(&inode->i_lock); | 444 | spin_lock(&inode->i_lock); |
445 | set_bit(PG_CLEAN, &(req)->wb_flags); | 445 | set_bit(PG_CLEAN, &(req)->wb_flags); |
446 | radix_tree_tag_set(&nfsi->nfs_page_tree, | 446 | radix_tree_tag_set(&nfsi->nfs_page_tree, |
447 | req->wb_index, | 447 | req->wb_index, |
448 | NFS_PAGE_TAG_COMMIT); | 448 | NFS_PAGE_TAG_COMMIT); |
449 | nfsi->ncommit++; | 449 | nfsi->ncommit++; |
450 | spin_unlock(&inode->i_lock); | 450 | spin_unlock(&inode->i_lock); |
451 | pnfs_mark_request_commit(req, lseg); | 451 | pnfs_mark_request_commit(req, lseg); |
452 | inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); | 452 | inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); |
453 | inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); | 453 | inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); |
454 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); | 454 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); |
455 | } | 455 | } |
456 | 456 | ||
457 | static int | 457 | static int |
458 | nfs_clear_request_commit(struct nfs_page *req) | 458 | nfs_clear_request_commit(struct nfs_page *req) |
459 | { | 459 | { |
460 | struct page *page = req->wb_page; | 460 | struct page *page = req->wb_page; |
461 | 461 | ||
462 | if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { | 462 | if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { |
463 | dec_zone_page_state(page, NR_UNSTABLE_NFS); | 463 | dec_zone_page_state(page, NR_UNSTABLE_NFS); |
464 | dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); | 464 | dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); |
465 | return 1; | 465 | return 1; |
466 | } | 466 | } |
467 | return 0; | 467 | return 0; |
468 | } | 468 | } |
469 | 469 | ||
470 | static inline | 470 | static inline |
471 | int nfs_write_need_commit(struct nfs_write_data *data) | 471 | int nfs_write_need_commit(struct nfs_write_data *data) |
472 | { | 472 | { |
473 | if (data->verf.committed == NFS_DATA_SYNC) | 473 | if (data->verf.committed == NFS_DATA_SYNC) |
474 | return data->lseg == NULL; | 474 | return data->lseg == NULL; |
475 | else | 475 | else |
476 | return data->verf.committed != NFS_FILE_SYNC; | 476 | return data->verf.committed != NFS_FILE_SYNC; |
477 | } | 477 | } |
478 | 478 | ||
479 | static inline | 479 | static inline |
480 | int nfs_reschedule_unstable_write(struct nfs_page *req, | 480 | int nfs_reschedule_unstable_write(struct nfs_page *req, |
481 | struct nfs_write_data *data) | 481 | struct nfs_write_data *data) |
482 | { | 482 | { |
483 | if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { | 483 | if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { |
484 | nfs_mark_request_commit(req, data->lseg); | 484 | nfs_mark_request_commit(req, data->lseg); |
485 | return 1; | 485 | return 1; |
486 | } | 486 | } |
487 | if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { | 487 | if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { |
488 | nfs_mark_request_dirty(req); | 488 | nfs_mark_request_dirty(req); |
489 | return 1; | 489 | return 1; |
490 | } | 490 | } |
491 | return 0; | 491 | return 0; |
492 | } | 492 | } |
493 | #else | 493 | #else |
494 | static inline void | 494 | static inline void |
495 | nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) | 495 | nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) |
496 | { | 496 | { |
497 | } | 497 | } |
498 | 498 | ||
499 | static inline int | 499 | static inline int |
500 | nfs_clear_request_commit(struct nfs_page *req) | 500 | nfs_clear_request_commit(struct nfs_page *req) |
501 | { | 501 | { |
502 | return 0; | 502 | return 0; |
503 | } | 503 | } |
504 | 504 | ||
505 | static inline | 505 | static inline |
506 | int nfs_write_need_commit(struct nfs_write_data *data) | 506 | int nfs_write_need_commit(struct nfs_write_data *data) |
507 | { | 507 | { |
508 | return 0; | 508 | return 0; |
509 | } | 509 | } |
510 | 510 | ||
511 | static inline | 511 | static inline |
512 | int nfs_reschedule_unstable_write(struct nfs_page *req, | 512 | int nfs_reschedule_unstable_write(struct nfs_page *req, |
513 | struct nfs_write_data *data) | 513 | struct nfs_write_data *data) |
514 | { | 514 | { |
515 | return 0; | 515 | return 0; |
516 | } | 516 | } |
517 | #endif | 517 | #endif |
518 | 518 | ||
519 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) | 519 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) |
520 | static int | 520 | static int |
521 | nfs_need_commit(struct nfs_inode *nfsi) | 521 | nfs_need_commit(struct nfs_inode *nfsi) |
522 | { | 522 | { |
523 | return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); | 523 | return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); |
524 | } | 524 | } |
525 | 525 | ||
526 | /* | 526 | /* |
527 | * nfs_scan_commit - Scan an inode for commit requests | 527 | * nfs_scan_commit - Scan an inode for commit requests |
528 | * @inode: NFS inode to scan | 528 | * @inode: NFS inode to scan |
529 | * @dst: destination list | 529 | * @dst: destination list |
530 | * @idx_start: lower bound of page->index to scan. | 530 | * @idx_start: lower bound of page->index to scan. |
531 | * @npages: idx_start + npages sets the upper bound to scan. | 531 | * @npages: idx_start + npages sets the upper bound to scan. |
532 | * | 532 | * |
533 | * Moves requests from the inode's 'commit' request list. | 533 | * Moves requests from the inode's 'commit' request list. |
534 | * The requests are *not* checked to ensure that they form a contiguous set. | 534 | * The requests are *not* checked to ensure that they form a contiguous set. |
535 | */ | 535 | */ |
536 | static int | 536 | static int |
537 | nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) | 537 | nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) |
538 | { | 538 | { |
539 | struct nfs_inode *nfsi = NFS_I(inode); | 539 | struct nfs_inode *nfsi = NFS_I(inode); |
540 | int ret; | 540 | int ret; |
541 | 541 | ||
542 | if (!nfs_need_commit(nfsi)) | 542 | if (!nfs_need_commit(nfsi)) |
543 | return 0; | 543 | return 0; |
544 | 544 | ||
545 | spin_lock(&inode->i_lock); | 545 | spin_lock(&inode->i_lock); |
546 | ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); | 546 | ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); |
547 | if (ret > 0) | 547 | if (ret > 0) |
548 | nfsi->ncommit -= ret; | 548 | nfsi->ncommit -= ret; |
549 | spin_unlock(&inode->i_lock); | 549 | spin_unlock(&inode->i_lock); |
550 | 550 | ||
551 | if (nfs_need_commit(NFS_I(inode))) | 551 | if (nfs_need_commit(NFS_I(inode))) |
552 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); | 552 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); |
553 | 553 | ||
554 | return ret; | 554 | return ret; |
555 | } | 555 | } |
556 | #else | 556 | #else |
557 | static inline int nfs_need_commit(struct nfs_inode *nfsi) | 557 | static inline int nfs_need_commit(struct nfs_inode *nfsi) |
558 | { | 558 | { |
559 | return 0; | 559 | return 0; |
560 | } | 560 | } |
561 | 561 | ||
562 | static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) | 562 | static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) |
563 | { | 563 | { |
564 | return 0; | 564 | return 0; |
565 | } | 565 | } |
566 | #endif | 566 | #endif |
567 | 567 | ||
568 | /* | 568 | /* |
569 | * Search for an existing write request, and attempt to update | 569 | * Search for an existing write request, and attempt to update |
570 | * it to reflect a new dirty region on a given page. | 570 | * it to reflect a new dirty region on a given page. |
571 | * | 571 | * |
572 | * If the attempt fails, then the existing request is flushed out | 572 | * If the attempt fails, then the existing request is flushed out |
573 | * to disk. | 573 | * to disk. |
574 | */ | 574 | */ |
575 | static struct nfs_page *nfs_try_to_update_request(struct inode *inode, | 575 | static struct nfs_page *nfs_try_to_update_request(struct inode *inode, |
576 | struct page *page, | 576 | struct page *page, |
577 | unsigned int offset, | 577 | unsigned int offset, |
578 | unsigned int bytes) | 578 | unsigned int bytes) |
579 | { | 579 | { |
580 | struct nfs_page *req; | 580 | struct nfs_page *req; |
581 | unsigned int rqend; | 581 | unsigned int rqend; |
582 | unsigned int end; | 582 | unsigned int end; |
583 | int error; | 583 | int error; |
584 | 584 | ||
585 | if (!PagePrivate(page)) | 585 | if (!PagePrivate(page)) |
586 | return NULL; | 586 | return NULL; |
587 | 587 | ||
588 | end = offset + bytes; | 588 | end = offset + bytes; |
589 | spin_lock(&inode->i_lock); | 589 | spin_lock(&inode->i_lock); |
590 | 590 | ||
591 | for (;;) { | 591 | for (;;) { |
592 | req = nfs_page_find_request_locked(page); | 592 | req = nfs_page_find_request_locked(page); |
593 | if (req == NULL) | 593 | if (req == NULL) |
594 | goto out_unlock; | 594 | goto out_unlock; |
595 | 595 | ||
596 | rqend = req->wb_offset + req->wb_bytes; | 596 | rqend = req->wb_offset + req->wb_bytes; |
597 | /* | 597 | /* |
598 | * Tell the caller to flush out the request if | 598 | * Tell the caller to flush out the request if |
599 | * the offsets are non-contiguous. | 599 | * the offsets are non-contiguous. |
600 | * Note: nfs_flush_incompatible() will already | 600 | * Note: nfs_flush_incompatible() will already |
601 | * have flushed out requests having wrong owners. | 601 | * have flushed out requests having wrong owners. |
602 | */ | 602 | */ |
603 | if (offset > rqend | 603 | if (offset > rqend |
604 | || end < req->wb_offset) | 604 | || end < req->wb_offset) |
605 | goto out_flushme; | 605 | goto out_flushme; |
606 | 606 | ||
607 | if (nfs_set_page_tag_locked(req)) | 607 | if (nfs_set_page_tag_locked(req)) |
608 | break; | 608 | break; |
609 | 609 | ||
610 | /* The request is locked, so wait and then retry */ | 610 | /* The request is locked, so wait and then retry */ |
611 | spin_unlock(&inode->i_lock); | 611 | spin_unlock(&inode->i_lock); |
612 | error = nfs_wait_on_request(req); | 612 | error = nfs_wait_on_request(req); |
613 | nfs_release_request(req); | 613 | nfs_release_request(req); |
614 | if (error != 0) | 614 | if (error != 0) |
615 | goto out_err; | 615 | goto out_err; |
616 | spin_lock(&inode->i_lock); | 616 | spin_lock(&inode->i_lock); |
617 | } | 617 | } |
618 | 618 | ||
619 | if (nfs_clear_request_commit(req) && | 619 | if (nfs_clear_request_commit(req) && |
620 | radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, | 620 | radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, |
621 | req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { | 621 | req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { |
622 | NFS_I(inode)->ncommit--; | 622 | NFS_I(inode)->ncommit--; |
623 | pnfs_clear_request_commit(req); | 623 | pnfs_clear_request_commit(req); |
624 | } | 624 | } |
625 | 625 | ||
626 | /* Okay, the request matches. Update the region */ | 626 | /* Okay, the request matches. Update the region */ |
627 | if (offset < req->wb_offset) { | 627 | if (offset < req->wb_offset) { |
628 | req->wb_offset = offset; | 628 | req->wb_offset = offset; |
629 | req->wb_pgbase = offset; | 629 | req->wb_pgbase = offset; |
630 | } | 630 | } |
631 | if (end > rqend) | 631 | if (end > rqend) |
632 | req->wb_bytes = end - req->wb_offset; | 632 | req->wb_bytes = end - req->wb_offset; |
633 | else | 633 | else |
634 | req->wb_bytes = rqend - req->wb_offset; | 634 | req->wb_bytes = rqend - req->wb_offset; |
635 | out_unlock: | 635 | out_unlock: |
636 | spin_unlock(&inode->i_lock); | 636 | spin_unlock(&inode->i_lock); |
637 | return req; | 637 | return req; |
638 | out_flushme: | 638 | out_flushme: |
639 | spin_unlock(&inode->i_lock); | 639 | spin_unlock(&inode->i_lock); |
640 | nfs_release_request(req); | 640 | nfs_release_request(req); |
641 | error = nfs_wb_page(inode, page); | 641 | error = nfs_wb_page(inode, page); |
642 | out_err: | 642 | out_err: |
643 | return ERR_PTR(error); | 643 | return ERR_PTR(error); |
644 | } | 644 | } |
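The merge logic in nfs_try_to_update_request() only accepts a new dirty span that touches or overlaps the existing request; anything disjoint forces a flush, since an NFS write request describes a single contiguous byte range. The range arithmetic in isolation, as a sketch (struct span is invented; the kernel also updates wb_pgbase alongside wb_offset):

    #include <stdbool.h>

    struct span { unsigned int off, len; };

    static bool try_merge(struct span *req, unsigned int offset,
                          unsigned int bytes)
    {
        unsigned int rqend = req->off + req->len;
        unsigned int end = offset + bytes;

        if (offset > rqend || end < req->off)
            return false;       /* non-contiguous: caller must flush */

        if (offset < req->off)
            req->off = offset;  /* extend downwards */
        req->len = (end > rqend ? end : rqend) - req->off;
        return true;
    }

Note that exact adjacency (offset == rqend, or end == req->off) still merges, which is what lets byte-at-a-time appends coalesce into one request.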
645 | 645 | ||
646 | /* | 646 | /* |
647 | * Try to update an existing write request, or create one if there is none. | 647 | * Try to update an existing write request, or create one if there is none. |
648 | * | 648 | * |
649 | * Note: Should always be called with the Page Lock held to prevent races | 649 | * Note: Should always be called with the Page Lock held to prevent races |
650 | * if we have to add a new request. Also assumes that the caller has | 650 | * if we have to add a new request. Also assumes that the caller has |
651 | * already called nfs_flush_incompatible() if necessary. | 651 | * already called nfs_flush_incompatible() if necessary. |
652 | */ | 652 | */ |
653 | static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, | 653 | static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, |
654 | struct page *page, unsigned int offset, unsigned int bytes) | 654 | struct page *page, unsigned int offset, unsigned int bytes) |
655 | { | 655 | { |
656 | struct inode *inode = page->mapping->host; | 656 | struct inode *inode = page->mapping->host; |
657 | struct nfs_page *req; | 657 | struct nfs_page *req; |
658 | int error; | 658 | int error; |
659 | 659 | ||
660 | req = nfs_try_to_update_request(inode, page, offset, bytes); | 660 | req = nfs_try_to_update_request(inode, page, offset, bytes); |
661 | if (req != NULL) | 661 | if (req != NULL) |
662 | goto out; | 662 | goto out; |
663 | req = nfs_create_request(ctx, inode, page, offset, bytes); | 663 | req = nfs_create_request(ctx, inode, page, offset, bytes); |
664 | if (IS_ERR(req)) | 664 | if (IS_ERR(req)) |
665 | goto out; | 665 | goto out; |
666 | error = nfs_inode_add_request(inode, req); | 666 | error = nfs_inode_add_request(inode, req); |
667 | if (error != 0) { | 667 | if (error != 0) { |
668 | nfs_release_request(req); | 668 | nfs_release_request(req); |
669 | req = ERR_PTR(error); | 669 | req = ERR_PTR(error); |
670 | } | 670 | } |
671 | out: | 671 | out: |
672 | return req; | 672 | return req; |
673 | } | 673 | } |
674 | 674 | ||
675 | static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, | 675 | static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, |
676 | unsigned int offset, unsigned int count) | 676 | unsigned int offset, unsigned int count) |
677 | { | 677 | { |
678 | struct nfs_page *req; | 678 | struct nfs_page *req; |
679 | 679 | ||
680 | req = nfs_setup_write_request(ctx, page, offset, count); | 680 | req = nfs_setup_write_request(ctx, page, offset, count); |
681 | if (IS_ERR(req)) | 681 | if (IS_ERR(req)) |
682 | return PTR_ERR(req); | 682 | return PTR_ERR(req); |
683 | /* Update file length */ | 683 | /* Update file length */ |
684 | nfs_grow_file(page, offset, count); | 684 | nfs_grow_file(page, offset, count); |
685 | nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); | 685 | nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); |
686 | nfs_mark_request_dirty(req); | 686 | nfs_mark_request_dirty(req); |
687 | nfs_clear_page_tag_locked(req); | 687 | nfs_clear_page_tag_locked(req); |
688 | return 0; | 688 | return 0; |
689 | } | 689 | } |
690 | 690 | ||
691 | int nfs_flush_incompatible(struct file *file, struct page *page) | 691 | int nfs_flush_incompatible(struct file *file, struct page *page) |
692 | { | 692 | { |
693 | struct nfs_open_context *ctx = nfs_file_open_context(file); | 693 | struct nfs_open_context *ctx = nfs_file_open_context(file); |
694 | struct nfs_page *req; | 694 | struct nfs_page *req; |
695 | int do_flush, status; | 695 | int do_flush, status; |
696 | /* | 696 | /* |
697 | * Look for a request corresponding to this page. If there | 697 | * Look for a request corresponding to this page. If there |
698 | * is one, and it belongs to another file, we flush it out | 698 | * is one, and it belongs to another file, we flush it out |
699 | * before we try to copy anything into the page. Do this | 699 | * before we try to copy anything into the page. Do this |
700 | * because NFSv2 lacks an ACCESS-type call. | 700 | * because NFSv2 lacks an ACCESS-type call. |
701 | * Also do the same if we find a request left over from | 701 | * Also do the same if we find a request left over from |
702 | * a dropped page. | 702 | * a dropped page. |
703 | */ | 703 | */ |
704 | do { | 704 | do { |
705 | req = nfs_page_find_request(page); | 705 | req = nfs_page_find_request(page); |
706 | if (req == NULL) | 706 | if (req == NULL) |
707 | return 0; | 707 | return 0; |
708 | do_flush = req->wb_page != page || req->wb_context != ctx || | 708 | do_flush = req->wb_page != page || req->wb_context != ctx || |
709 | req->wb_lock_context->lockowner != current->files || | 709 | req->wb_lock_context->lockowner != current->files || |
710 | req->wb_lock_context->pid != current->tgid; | 710 | req->wb_lock_context->pid != current->tgid; |
711 | nfs_release_request(req); | 711 | nfs_release_request(req); |
712 | if (!do_flush) | 712 | if (!do_flush) |
713 | return 0; | 713 | return 0; |
714 | status = nfs_wb_page(page->mapping->host, page); | 714 | status = nfs_wb_page(page->mapping->host, page); |
715 | } while (status == 0); | 715 | } while (status == 0); |
716 | return status; | 716 | return status; |
717 | } | 717 | } |
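/*
 * Editor's note: a self-contained sketch of the retry loop in
 * nfs_flush_incompatible() above, with the kernel types replaced by
 * hypothetical stand-ins (find_request/flush_page do not exist in the
 * tree) and the request refcounting elided. It shows only the control
 * flow: keep flushing while a conflicting request exists and each flush
 * succeeds; stop at "no request" (return 0) or on the first error.
 */
struct sketch_req { int same_owner; };

extern struct sketch_req *find_request(void *page); /* cf. nfs_page_find_request() */
extern int flush_page(void *page);                  /* cf. nfs_wb_page() */

static int flush_incompatible_sketch(void *page)
{
	struct sketch_req *req;
	int status;

	do {
		req = find_request(page);
		if (req == NULL)
			return 0;              /* nothing queued: safe to write */
		if (req->same_owner)
			return 0;              /* compatible request: reuse it */
		status = flush_page(page); /* flush the conflicting write */
	} while (status == 0);                 /* retry until clean or error */
	return status;
}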
718 | 718 | ||
719 | /* | 719 | /* |
720 | * If the page cache is marked as unsafe or invalid, then we can't rely on | 720 | * If the page cache is marked as unsafe or invalid, then we can't rely on |
721 | * the PageUptodate() flag. In this case, we will need to turn off | 721 | * the PageUptodate() flag. In this case, we will need to turn off |
722 | * write optimisations that depend on the page contents being correct. | 722 | * write optimisations that depend on the page contents being correct. |
723 | */ | 723 | */ |
724 | static int nfs_write_pageuptodate(struct page *page, struct inode *inode) | 724 | static int nfs_write_pageuptodate(struct page *page, struct inode *inode) |
725 | { | 725 | { |
726 | return PageUptodate(page) && | 726 | return PageUptodate(page) && |
727 | !(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA)); | 727 | !(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA)); |
728 | } | 728 | } |
729 | 729 | ||
730 | /* | 730 | /* |
731 | * Update and possibly write a cached page of an NFS file. | 731 | * Update and possibly write a cached page of an NFS file. |
732 | * | 732 | * |
733 | * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad | 733 | * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad |
734 | * things with a page scheduled for an RPC call (e.g. invalidate it). | 734 | * things with a page scheduled for an RPC call (e.g. invalidate it). |
735 | */ | 735 | */ |
736 | int nfs_updatepage(struct file *file, struct page *page, | 736 | int nfs_updatepage(struct file *file, struct page *page, |
737 | unsigned int offset, unsigned int count) | 737 | unsigned int offset, unsigned int count) |
738 | { | 738 | { |
739 | struct nfs_open_context *ctx = nfs_file_open_context(file); | 739 | struct nfs_open_context *ctx = nfs_file_open_context(file); |
740 | struct inode *inode = page->mapping->host; | 740 | struct inode *inode = page->mapping->host; |
741 | int status = 0; | 741 | int status = 0; |
742 | 742 | ||
743 | nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); | 743 | nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); |
744 | 744 | ||
745 | dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", | 745 | dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", |
746 | file->f_path.dentry->d_parent->d_name.name, | 746 | file->f_path.dentry->d_parent->d_name.name, |
747 | file->f_path.dentry->d_name.name, count, | 747 | file->f_path.dentry->d_name.name, count, |
748 | (long long)(page_offset(page) + offset)); | 748 | (long long)(page_offset(page) + offset)); |
749 | 749 | ||
750 | /* If we're not using byte range locks, and we know the page | 750 | /* If we're not using byte range locks, and we know the page |
751 | * is up to date, it may be more efficient to extend the write | 751 | * is up to date, it may be more efficient to extend the write |
752 | * to cover the entire page in order to avoid fragmentation | 752 | * to cover the entire page in order to avoid fragmentation |
753 | * inefficiencies. | 753 | * inefficiencies. |
754 | */ | 754 | */ |
755 | if (nfs_write_pageuptodate(page, inode) && | 755 | if (nfs_write_pageuptodate(page, inode) && |
756 | inode->i_flock == NULL && | 756 | inode->i_flock == NULL && |
757 | !(file->f_flags & O_DSYNC)) { | 757 | !(file->f_flags & O_DSYNC)) { |
758 | count = max(count + offset, nfs_page_length(page)); | 758 | count = max(count + offset, nfs_page_length(page)); |
759 | offset = 0; | 759 | offset = 0; |
760 | } | 760 | } |
761 | 761 | ||
762 | status = nfs_writepage_setup(ctx, page, offset, count); | 762 | status = nfs_writepage_setup(ctx, page, offset, count); |
763 | if (status < 0) | 763 | if (status < 0) |
764 | nfs_set_pageerror(page); | 764 | nfs_set_pageerror(page); |
765 | 765 | ||
766 | dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", | 766 | dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", |
767 | status, (long long)i_size_read(inode)); | 767 | status, (long long)i_size_read(inode)); |
768 | return status; | 768 | return status; |
769 | } | 769 | } |
770 | 770 | ||
771 | static void nfs_writepage_release(struct nfs_page *req, | 771 | static void nfs_writepage_release(struct nfs_page *req, |
772 | struct nfs_write_data *data) | 772 | struct nfs_write_data *data) |
773 | { | 773 | { |
774 | struct page *page = req->wb_page; | 774 | struct page *page = req->wb_page; |
775 | 775 | ||
776 | if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) | 776 | if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) |
777 | nfs_inode_remove_request(req); | 777 | nfs_inode_remove_request(req); |
778 | nfs_clear_page_tag_locked(req); | 778 | nfs_clear_page_tag_locked(req); |
779 | nfs_end_page_writeback(page); | 779 | nfs_end_page_writeback(page); |
780 | } | 780 | } |
781 | 781 | ||
782 | static int flush_task_priority(int how) | 782 | static int flush_task_priority(int how) |
783 | { | 783 | { |
784 | switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { | 784 | switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { |
785 | case FLUSH_HIGHPRI: | 785 | case FLUSH_HIGHPRI: |
786 | return RPC_PRIORITY_HIGH; | 786 | return RPC_PRIORITY_HIGH; |
787 | case FLUSH_LOWPRI: | 787 | case FLUSH_LOWPRI: |
788 | return RPC_PRIORITY_LOW; | 788 | return RPC_PRIORITY_LOW; |
789 | } | 789 | } |
790 | return RPC_PRIORITY_NORMAL; | 790 | return RPC_PRIORITY_NORMAL; |
791 | } | 791 | } |
792 | 792 | ||
793 | int nfs_initiate_write(struct nfs_write_data *data, | 793 | int nfs_initiate_write(struct nfs_write_data *data, |
794 | struct rpc_clnt *clnt, | 794 | struct rpc_clnt *clnt, |
795 | const struct rpc_call_ops *call_ops, | 795 | const struct rpc_call_ops *call_ops, |
796 | int how) | 796 | int how) |
797 | { | 797 | { |
798 | struct inode *inode = data->inode; | 798 | struct inode *inode = data->inode; |
799 | int priority = flush_task_priority(how); | 799 | int priority = flush_task_priority(how); |
800 | struct rpc_task *task; | 800 | struct rpc_task *task; |
801 | struct rpc_message msg = { | 801 | struct rpc_message msg = { |
802 | .rpc_argp = &data->args, | 802 | .rpc_argp = &data->args, |
803 | .rpc_resp = &data->res, | 803 | .rpc_resp = &data->res, |
804 | .rpc_cred = data->cred, | 804 | .rpc_cred = data->cred, |
805 | }; | 805 | }; |
806 | struct rpc_task_setup task_setup_data = { | 806 | struct rpc_task_setup task_setup_data = { |
807 | .rpc_client = clnt, | 807 | .rpc_client = clnt, |
808 | .task = &data->task, | 808 | .task = &data->task, |
809 | .rpc_message = &msg, | 809 | .rpc_message = &msg, |
810 | .callback_ops = call_ops, | 810 | .callback_ops = call_ops, |
811 | .callback_data = data, | 811 | .callback_data = data, |
812 | .workqueue = nfsiod_workqueue, | 812 | .workqueue = nfsiod_workqueue, |
813 | .flags = RPC_TASK_ASYNC, | 813 | .flags = RPC_TASK_ASYNC, |
814 | .priority = priority, | 814 | .priority = priority, |
815 | }; | 815 | }; |
816 | int ret = 0; | 816 | int ret = 0; |
817 | 817 | ||
818 | /* Set up the initial task struct. */ | 818 | /* Set up the initial task struct. */ |
819 | NFS_PROTO(inode)->write_setup(data, &msg); | 819 | NFS_PROTO(inode)->write_setup(data, &msg); |
820 | 820 | ||
821 | dprintk("NFS: %5u initiated write call " | 821 | dprintk("NFS: %5u initiated write call " |
822 | "(req %s/%lld, %u bytes @ offset %llu)\n", | 822 | "(req %s/%lld, %u bytes @ offset %llu)\n", |
823 | data->task.tk_pid, | 823 | data->task.tk_pid, |
824 | inode->i_sb->s_id, | 824 | inode->i_sb->s_id, |
825 | (long long)NFS_FILEID(inode), | 825 | (long long)NFS_FILEID(inode), |
826 | data->args.count, | 826 | data->args.count, |
827 | (unsigned long long)data->args.offset); | 827 | (unsigned long long)data->args.offset); |
828 | 828 | ||
829 | task = rpc_run_task(&task_setup_data); | 829 | task = rpc_run_task(&task_setup_data); |
830 | if (IS_ERR(task)) { | 830 | if (IS_ERR(task)) { |
831 | ret = PTR_ERR(task); | 831 | ret = PTR_ERR(task); |
832 | goto out; | 832 | goto out; |
833 | } | 833 | } |
834 | if (how & FLUSH_SYNC) { | 834 | if (how & FLUSH_SYNC) { |
835 | ret = rpc_wait_for_completion_task(task); | 835 | ret = rpc_wait_for_completion_task(task); |
836 | if (ret == 0) | 836 | if (ret == 0) |
837 | ret = task->tk_status; | 837 | ret = task->tk_status; |
838 | } | 838 | } |
839 | rpc_put_task(task); | 839 | rpc_put_task(task); |
840 | out: | 840 | out: |
841 | return ret; | 841 | return ret; |
842 | } | 842 | } |
843 | EXPORT_SYMBOL_GPL(nfs_initiate_write); | 843 | EXPORT_SYMBOL_GPL(nfs_initiate_write); |
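/*
 * Editor's note: a hedged usage sketch (not from the kernel tree) of the
 * task pattern above: rpc_run_task() queues the write asynchronously,
 * FLUSH_SYNC additionally waits for completion and propagates tk_status,
 * and rpc_put_task() always drops the caller's task reference. "data" is
 * assumed to be a fully initialized struct nfs_write_data, and
 * nfs_write_full_ops is the completion table defined later in this file.
 */
static int initiate_write_sync_sketch(struct nfs_write_data *data)
{
	struct inode *inode = data->inode;

	return nfs_initiate_write(data, NFS_CLIENT(inode),
				  &nfs_write_full_ops, FLUSH_SYNC);
}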
844 | 844 | ||
845 | /* | 845 | /* |
846 | * Set up the argument/result storage required for the RPC call. | 846 | * Set up the argument/result storage required for the RPC call. |
847 | */ | 847 | */ |
848 | static int nfs_write_rpcsetup(struct nfs_page *req, | 848 | static int nfs_write_rpcsetup(struct nfs_page *req, |
849 | struct nfs_write_data *data, | 849 | struct nfs_write_data *data, |
850 | const struct rpc_call_ops *call_ops, | 850 | const struct rpc_call_ops *call_ops, |
851 | unsigned int count, unsigned int offset, | 851 | unsigned int count, unsigned int offset, |
852 | struct pnfs_layout_segment *lseg, | 852 | struct pnfs_layout_segment *lseg, |
853 | int how) | 853 | int how) |
854 | { | 854 | { |
855 | struct inode *inode = req->wb_context->dentry->d_inode; | 855 | struct inode *inode = req->wb_context->dentry->d_inode; |
856 | 856 | ||
857 | /* Set up the RPC argument and reply structs. | 857 | /* Set up the RPC argument and reply structs. |
858 | * NB: take care not to mess about with data->commit et al. */ | 858 | * NB: take care not to mess about with data->commit et al. */ |
859 | 859 | ||
860 | data->req = req; | 860 | data->req = req; |
861 | data->inode = inode = req->wb_context->dentry->d_inode; | 861 | data->inode = inode = req->wb_context->dentry->d_inode; |
862 | data->cred = req->wb_context->cred; | 862 | data->cred = req->wb_context->cred; |
863 | data->lseg = get_lseg(lseg); | 863 | data->lseg = get_lseg(lseg); |
864 | 864 | ||
865 | data->args.fh = NFS_FH(inode); | 865 | data->args.fh = NFS_FH(inode); |
866 | data->args.offset = req_offset(req) + offset; | 866 | data->args.offset = req_offset(req) + offset; |
867 | /* pnfs_set_layoutcommit needs this */ | 867 | /* pnfs_set_layoutcommit needs this */ |
868 | data->mds_offset = data->args.offset; | 868 | data->mds_offset = data->args.offset; |
869 | data->args.pgbase = req->wb_pgbase + offset; | 869 | data->args.pgbase = req->wb_pgbase + offset; |
870 | data->args.pages = data->pagevec; | 870 | data->args.pages = data->pagevec; |
871 | data->args.count = count; | 871 | data->args.count = count; |
872 | data->args.context = get_nfs_open_context(req->wb_context); | 872 | data->args.context = get_nfs_open_context(req->wb_context); |
873 | data->args.lock_context = req->wb_lock_context; | 873 | data->args.lock_context = req->wb_lock_context; |
874 | data->args.stable = NFS_UNSTABLE; | 874 | data->args.stable = NFS_UNSTABLE; |
875 | if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { | 875 | if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { |
876 | data->args.stable = NFS_DATA_SYNC; | 876 | data->args.stable = NFS_DATA_SYNC; |
877 | if (!nfs_need_commit(NFS_I(inode))) | 877 | if (!nfs_need_commit(NFS_I(inode))) |
878 | data->args.stable = NFS_FILE_SYNC; | 878 | data->args.stable = NFS_FILE_SYNC; |
879 | } | 879 | } |
880 | 880 | ||
881 | data->res.fattr = &data->fattr; | 881 | data->res.fattr = &data->fattr; |
882 | data->res.count = count; | 882 | data->res.count = count; |
883 | data->res.verf = &data->verf; | 883 | data->res.verf = &data->verf; |
884 | nfs_fattr_init(&data->fattr); | 884 | nfs_fattr_init(&data->fattr); |
885 | 885 | ||
886 | if (data->lseg && | 886 | if (data->lseg && |
887 | (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) | 887 | (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) |
888 | return 0; | 888 | return 0; |
889 | 889 | ||
890 | return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); | 890 | return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); |
891 | } | 891 | } |
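/*
 * Editor's note: the stability selection above, restated in isolation.
 * Writes default to NFS_UNSTABLE; under FLUSH_STABLE/FLUSH_COND_STABLE
 * they are upgraded to NFS_DATA_SYNC, and all the way to NFS_FILE_SYNC
 * when no other requests are awaiting a COMMIT (so stopping at data-sync
 * would not save a separate COMMIT round trip anyway). Illustrative
 * helper only; pick_stable_how() does not exist in the tree.
 */
static enum nfs3_stable_how pick_stable_how(int how, int commit_pending)
{
	if (!(how & (FLUSH_STABLE | FLUSH_COND_STABLE)))
		return NFS_UNSTABLE;
	return commit_pending ? NFS_DATA_SYNC : NFS_FILE_SYNC;
}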
892 | 892 | ||
893 | /* If an nfs_flush_* function fails, it should remove reqs from @head and | 893 | /* If an nfs_flush_* function fails, it should remove reqs from @head and |
894 | * call this on each, which will prepare them to be retried on the next | 894 | * call this on each, which will prepare them to be retried on the next |
895 | * writeback using standard NFS. | 895 | * writeback using standard NFS. |
896 | */ | 896 | */ |
897 | static void nfs_redirty_request(struct nfs_page *req) | 897 | static void nfs_redirty_request(struct nfs_page *req) |
898 | { | 898 | { |
899 | struct page *page = req->wb_page; | 899 | struct page *page = req->wb_page; |
900 | 900 | ||
901 | nfs_mark_request_dirty(req); | 901 | nfs_mark_request_dirty(req); |
902 | nfs_clear_page_tag_locked(req); | 902 | nfs_clear_page_tag_locked(req); |
903 | nfs_end_page_writeback(page); | 903 | nfs_end_page_writeback(page); |
904 | } | 904 | } |
905 | 905 | ||
906 | /* | 906 | /* |
907 | * Generate multiple small requests to write out a single | 907 | * Generate multiple small requests to write out a single |
908 | * contiguous dirty area on one page. | 908 | * contiguous dirty area on one page. |
909 | */ | 909 | */ |
910 | static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) | 910 | static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) |
911 | { | 911 | { |
912 | struct nfs_page *req = nfs_list_entry(desc->pg_list.next); | 912 | struct nfs_page *req = nfs_list_entry(desc->pg_list.next); |
913 | struct page *page = req->wb_page; | 913 | struct page *page = req->wb_page; |
914 | struct nfs_write_data *data; | 914 | struct nfs_write_data *data; |
915 | size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; | 915 | size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; |
916 | unsigned int offset; | 916 | unsigned int offset; |
917 | int requests = 0; | 917 | int requests = 0; |
918 | int ret = 0; | 918 | int ret = 0; |
919 | struct pnfs_layout_segment *lseg; | 919 | struct pnfs_layout_segment *lseg; |
920 | LIST_HEAD(list); | 920 | LIST_HEAD(list); |
921 | 921 | ||
922 | nfs_list_remove_request(req); | 922 | nfs_list_remove_request(req); |
923 | 923 | ||
924 | if ((desc->pg_ioflags & FLUSH_COND_STABLE) && | 924 | if ((desc->pg_ioflags & FLUSH_COND_STABLE) && |
925 | (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || | 925 | (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || |
926 | desc->pg_count > wsize)) | 926 | desc->pg_count > wsize)) |
927 | desc->pg_ioflags &= ~FLUSH_COND_STABLE; | 927 | desc->pg_ioflags &= ~FLUSH_COND_STABLE; |
928 | 928 | ||
929 | 929 | ||
930 | nbytes = desc->pg_count; | 930 | nbytes = desc->pg_count; |
931 | do { | 931 | do { |
932 | size_t len = min(nbytes, wsize); | 932 | size_t len = min(nbytes, wsize); |
933 | 933 | ||
934 | data = nfs_writedata_alloc(1); | 934 | data = nfs_writedata_alloc(1); |
935 | if (!data) | 935 | if (!data) |
936 | goto out_bad; | 936 | goto out_bad; |
937 | list_add(&data->pages, &list); | 937 | list_add(&data->pages, &list); |
938 | requests++; | 938 | requests++; |
939 | nbytes -= len; | 939 | nbytes -= len; |
940 | } while (nbytes != 0); | 940 | } while (nbytes != 0); |
941 | atomic_set(&req->wb_complete, requests); | 941 | atomic_set(&req->wb_complete, requests); |
942 | 942 | ||
943 | BUG_ON(desc->pg_lseg); | 943 | BUG_ON(desc->pg_lseg); |
944 | lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, | 944 | lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, |
945 | req_offset(req), desc->pg_count, | 945 | req_offset(req), desc->pg_count, |
946 | IOMODE_RW, GFP_NOFS); | 946 | IOMODE_RW, GFP_NOFS); |
947 | ClearPageError(page); | 947 | ClearPageError(page); |
948 | offset = 0; | 948 | offset = 0; |
949 | nbytes = desc->pg_count; | 949 | nbytes = desc->pg_count; |
950 | do { | 950 | do { |
951 | int ret2; | 951 | int ret2; |
952 | 952 | ||
953 | data = list_entry(list.next, struct nfs_write_data, pages); | 953 | data = list_entry(list.next, struct nfs_write_data, pages); |
954 | list_del_init(&data->pages); | 954 | list_del_init(&data->pages); |
955 | 955 | ||
956 | data->pagevec[0] = page; | 956 | data->pagevec[0] = page; |
957 | 957 | ||
958 | if (nbytes < wsize) | 958 | if (nbytes < wsize) |
959 | wsize = nbytes; | 959 | wsize = nbytes; |
960 | ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, | 960 | ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, |
961 | wsize, offset, lseg, desc->pg_ioflags); | 961 | wsize, offset, lseg, desc->pg_ioflags); |
962 | if (ret == 0) | 962 | if (ret == 0) |
963 | ret = ret2; | 963 | ret = ret2; |
964 | offset += wsize; | 964 | offset += wsize; |
965 | nbytes -= wsize; | 965 | nbytes -= wsize; |
966 | } while (nbytes != 0); | 966 | } while (nbytes != 0); |
967 | 967 | ||
968 | put_lseg(lseg); | 968 | put_lseg(lseg); |
969 | desc->pg_lseg = NULL; | 969 | desc->pg_lseg = NULL; |
970 | return ret; | 970 | return ret; |
971 | 971 | ||
972 | out_bad: | 972 | out_bad: |
973 | while (!list_empty(&list)) { | 973 | while (!list_empty(&list)) { |
974 | data = list_entry(list.next, struct nfs_write_data, pages); | 974 | data = list_entry(list.next, struct nfs_write_data, pages); |
975 | list_del(&data->pages); | 975 | list_del(&data->pages); |
976 | nfs_writedata_free(data); | 976 | nfs_writedata_free(data); |
977 | } | 977 | } |
978 | nfs_redirty_request(req); | 978 | nfs_redirty_request(req); |
979 | return -ENOMEM; | 979 | return -ENOMEM; |
980 | } | 980 | } |
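/*
 * Editor's note: the allocation loop above creates one nfs_write_data
 * per wsize-sized chunk, i.e. ceil(pg_count / wsize) sub-requests, and
 * the second loop then issues them at offsets 0, wsize, 2*wsize, ...
 * A minimal user-space check of that arithmetic (names illustrative):
 */
#include <assert.h>
#include <stddef.h>

static size_t chunks_needed(size_t count, size_t wsize)
{
	return (count + wsize - 1) / wsize;	/* cf. DIV_ROUND_UP() */
}

static void chunks_selftest(void)
{
	assert(chunks_needed(4096, 1024) == 4);	/* exact multiple */
	assert(chunks_needed(4097, 1024) == 5);	/* trailing partial chunk */
}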
981 | 981 | ||
982 | /* | 982 | /* |
983 | * Create an RPC task for the given write request and kick it. | 983 | * Create an RPC task for the given write request and kick it. |
984 | * The page must have been locked by the caller. | 984 | * The page must have been locked by the caller. |
985 | * | 985 | * |
986 | * It may happen that the page we're passed is not marked dirty. | 986 | * It may happen that the page we're passed is not marked dirty. |
987 | * This is the case if nfs_updatepage detects a conflicting request | 987 | * This is the case if nfs_updatepage detects a conflicting request |
988 | * that has been written but not committed. | 988 | * that has been written but not committed. |
989 | */ | 989 | */ |
990 | static int nfs_flush_one(struct nfs_pageio_descriptor *desc) | 990 | static int nfs_flush_one(struct nfs_pageio_descriptor *desc) |
991 | { | 991 | { |
992 | struct nfs_page *req; | 992 | struct nfs_page *req; |
993 | struct page **pages; | 993 | struct page **pages; |
994 | struct nfs_write_data *data; | 994 | struct nfs_write_data *data; |
995 | struct list_head *head = &desc->pg_list; | 995 | struct list_head *head = &desc->pg_list; |
996 | struct pnfs_layout_segment *lseg = desc->pg_lseg; | 996 | struct pnfs_layout_segment *lseg = desc->pg_lseg; |
997 | int ret; | 997 | int ret; |
998 | 998 | ||
999 | data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, | 999 | data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, |
1000 | desc->pg_count)); | 1000 | desc->pg_count)); |
1001 | if (!data) { | 1001 | if (!data) { |
1002 | while (!list_empty(head)) { | 1002 | while (!list_empty(head)) { |
1003 | req = nfs_list_entry(head->next); | 1003 | req = nfs_list_entry(head->next); |
1004 | nfs_list_remove_request(req); | 1004 | nfs_list_remove_request(req); |
1005 | nfs_redirty_request(req); | 1005 | nfs_redirty_request(req); |
1006 | } | 1006 | } |
1007 | ret = -ENOMEM; | 1007 | ret = -ENOMEM; |
1008 | goto out; | 1008 | goto out; |
1009 | } | 1009 | } |
1010 | pages = data->pagevec; | 1010 | pages = data->pagevec; |
1011 | while (!list_empty(head)) { | 1011 | while (!list_empty(head)) { |
1012 | req = nfs_list_entry(head->next); | 1012 | req = nfs_list_entry(head->next); |
1013 | nfs_list_remove_request(req); | 1013 | nfs_list_remove_request(req); |
1014 | nfs_list_add_request(req, &data->pages); | 1014 | nfs_list_add_request(req, &data->pages); |
1015 | ClearPageError(req->wb_page); | 1015 | ClearPageError(req->wb_page); |
1016 | *pages++ = req->wb_page; | 1016 | *pages++ = req->wb_page; |
1017 | } | 1017 | } |
1018 | req = nfs_list_entry(data->pages.next); | 1018 | req = nfs_list_entry(data->pages.next); |
1019 | if ((!lseg) && list_is_singular(&data->pages)) | 1019 | if ((!lseg) && list_is_singular(&data->pages)) |
1020 | lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, | 1020 | lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, |
1021 | req_offset(req), desc->pg_count, | 1021 | req_offset(req), desc->pg_count, |
1022 | IOMODE_RW, GFP_NOFS); | 1022 | IOMODE_RW, GFP_NOFS); |
1023 | 1023 | ||
1024 | if ((desc->pg_ioflags & FLUSH_COND_STABLE) && | 1024 | if ((desc->pg_ioflags & FLUSH_COND_STABLE) && |
1025 | (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) | 1025 | (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) |
1026 | desc->pg_ioflags &= ~FLUSH_COND_STABLE; | 1026 | desc->pg_ioflags &= ~FLUSH_COND_STABLE; |
1027 | 1027 | ||
1028 | /* Set up the argument struct */ | 1028 | /* Set up the argument struct */ |
1029 | ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); | 1029 | ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); |
1030 | out: | 1030 | out: |
1031 | put_lseg(lseg); /* Drops any lseg reference taken in ->pg_test */ | 1031 | put_lseg(lseg); /* Drops any lseg reference taken in ->pg_test */ |
1032 | desc->pg_lseg = NULL; | 1032 | desc->pg_lseg = NULL; |
1033 | return ret; | 1033 | return ret; |
1034 | } | 1034 | } |
1035 | 1035 | ||
1036 | static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, | 1036 | static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, |
1037 | struct inode *inode, int ioflags) | 1037 | struct inode *inode, int ioflags) |
1038 | { | 1038 | { |
1039 | size_t wsize = NFS_SERVER(inode)->wsize; | 1039 | size_t wsize = NFS_SERVER(inode)->wsize; |
1040 | 1040 | ||
1041 | if (wsize < PAGE_CACHE_SIZE) | 1041 | if (wsize < PAGE_CACHE_SIZE) |
1042 | nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); | 1042 | nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); |
1043 | else | 1043 | else |
1044 | nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); | 1044 | nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); |
1045 | } | 1045 | } |
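/*
 * Editor's note: the dispatch above reduces to a single comparison: when
 * the server's write size cannot cover a whole page-cache page, every
 * page has to be split into several RPCs (nfs_flush_multi); otherwise
 * whole pages can be batched into one RPC (nfs_flush_one). Hedged,
 * illustrative restatement only:
 */
typedef int (*flush_fn_t)(struct nfs_pageio_descriptor *);

static flush_fn_t pick_flush_op(size_t wsize)
{
	return (wsize < PAGE_CACHE_SIZE) ? nfs_flush_multi : nfs_flush_one;
}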
1046 | 1046 | ||
1047 | /* | 1047 | /* |
1048 | * Handle a write reply that flushed part of a page. | 1048 | * Handle a write reply that flushed part of a page. |
1049 | */ | 1049 | */ |
1050 | static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) | 1050 | static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) |
1051 | { | 1051 | { |
1052 | struct nfs_write_data *data = calldata; | 1052 | struct nfs_write_data *data = calldata; |
1053 | 1053 | ||
1054 | dprintk("NFS: %5u write(%s/%lld %d@%lld)", | 1054 | dprintk("NFS: %5u write(%s/%lld %d@%lld)", |
1055 | task->tk_pid, | 1055 | task->tk_pid, |
1056 | data->req->wb_context->dentry->d_inode->i_sb->s_id, | 1056 | data->req->wb_context->dentry->d_inode->i_sb->s_id, |
1057 | (long long) | 1057 | (long long) |
1058 | NFS_FILEID(data->req->wb_context->dentry->d_inode), | 1058 | NFS_FILEID(data->req->wb_context->dentry->d_inode), |
1059 | data->req->wb_bytes, (long long)req_offset(data->req)); | 1059 | data->req->wb_bytes, (long long)req_offset(data->req)); |
1060 | 1060 | ||
1061 | nfs_writeback_done(task, data); | 1061 | nfs_writeback_done(task, data); |
1062 | } | 1062 | } |
1063 | 1063 | ||
1064 | static void nfs_writeback_release_partial(void *calldata) | 1064 | static void nfs_writeback_release_partial(void *calldata) |
1065 | { | 1065 | { |
1066 | struct nfs_write_data *data = calldata; | 1066 | struct nfs_write_data *data = calldata; |
1067 | struct nfs_page *req = data->req; | 1067 | struct nfs_page *req = data->req; |
1068 | struct page *page = req->wb_page; | 1068 | struct page *page = req->wb_page; |
1069 | int status = data->task.tk_status; | 1069 | int status = data->task.tk_status; |
1070 | 1070 | ||
1071 | if (status < 0) { | 1071 | if (status < 0) { |
1072 | nfs_set_pageerror(page); | 1072 | nfs_set_pageerror(page); |
1073 | nfs_context_set_write_error(req->wb_context, status); | 1073 | nfs_context_set_write_error(req->wb_context, status); |
1074 | dprintk(", error = %d\n", status); | 1074 | dprintk(", error = %d\n", status); |
1075 | goto out; | 1075 | goto out; |
1076 | } | 1076 | } |
1077 | 1077 | ||
1078 | if (nfs_write_need_commit(data)) { | 1078 | if (nfs_write_need_commit(data)) { |
1079 | struct inode *inode = page->mapping->host; | 1079 | struct inode *inode = page->mapping->host; |
1080 | 1080 | ||
1081 | spin_lock(&inode->i_lock); | 1081 | spin_lock(&inode->i_lock); |
1082 | if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { | 1082 | if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { |
1083 | /* Do nothing; we need to resend the writes */ | 1083 | /* Do nothing; we need to resend the writes */ |
1084 | } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { | 1084 | } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { |
1085 | memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); | 1085 | memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); |
1086 | dprintk(" defer commit\n"); | 1086 | dprintk(" defer commit\n"); |
1087 | } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) { | 1087 | } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) { |
1088 | set_bit(PG_NEED_RESCHED, &req->wb_flags); | 1088 | set_bit(PG_NEED_RESCHED, &req->wb_flags); |
1089 | clear_bit(PG_NEED_COMMIT, &req->wb_flags); | 1089 | clear_bit(PG_NEED_COMMIT, &req->wb_flags); |
1090 | dprintk(" server reboot detected\n"); | 1090 | dprintk(" server reboot detected\n"); |
1091 | } | 1091 | } |
1092 | spin_unlock(&inode->i_lock); | 1092 | spin_unlock(&inode->i_lock); |
1093 | } else | 1093 | } else |
1094 | dprintk(" OK\n"); | 1094 | dprintk(" OK\n"); |
1095 | 1095 | ||
1096 | out: | 1096 | out: |
1097 | if (atomic_dec_and_test(&req->wb_complete)) | 1097 | if (atomic_dec_and_test(&req->wb_complete)) |
1098 | nfs_writepage_release(req, data); | 1098 | nfs_writepage_release(req, data); |
1099 | nfs_writedata_release(calldata); | 1099 | nfs_writedata_release(calldata); |
1100 | } | 1100 | } |
1101 | 1101 | ||
1102 | #if defined(CONFIG_NFS_V4_1) | 1102 | #if defined(CONFIG_NFS_V4_1) |
1103 | void nfs_write_prepare(struct rpc_task *task, void *calldata) | 1103 | void nfs_write_prepare(struct rpc_task *task, void *calldata) |
1104 | { | 1104 | { |
1105 | struct nfs_write_data *data = calldata; | 1105 | struct nfs_write_data *data = calldata; |
1106 | 1106 | ||
1107 | if (nfs4_setup_sequence(NFS_SERVER(data->inode), | 1107 | if (nfs4_setup_sequence(NFS_SERVER(data->inode), |
1108 | &data->args.seq_args, | 1108 | &data->args.seq_args, |
1109 | &data->res.seq_res, 1, task)) | 1109 | &data->res.seq_res, 1, task)) |
1110 | return; | 1110 | return; |
1111 | rpc_call_start(task); | 1111 | rpc_call_start(task); |
1112 | } | 1112 | } |
1113 | #endif /* CONFIG_NFS_V4_1 */ | 1113 | #endif /* CONFIG_NFS_V4_1 */ |
1114 | 1114 | ||
1115 | static const struct rpc_call_ops nfs_write_partial_ops = { | 1115 | static const struct rpc_call_ops nfs_write_partial_ops = { |
1116 | #if defined(CONFIG_NFS_V4_1) | 1116 | #if defined(CONFIG_NFS_V4_1) |
1117 | .rpc_call_prepare = nfs_write_prepare, | 1117 | .rpc_call_prepare = nfs_write_prepare, |
1118 | #endif /* CONFIG_NFS_V4_1 */ | 1118 | #endif /* CONFIG_NFS_V4_1 */ |
1119 | .rpc_call_done = nfs_writeback_done_partial, | 1119 | .rpc_call_done = nfs_writeback_done_partial, |
1120 | .rpc_release = nfs_writeback_release_partial, | 1120 | .rpc_release = nfs_writeback_release_partial, |
1121 | }; | 1121 | }; |
1122 | 1122 | ||
1123 | /* | 1123 | /* |
1124 | * Handle a write reply that flushes a whole page. | 1124 | * Handle a write reply that flushes a whole page. |
1125 | * | 1125 | * |
1126 | * FIXME: There is an inherent race with invalidate_inode_pages and | 1126 | * FIXME: There is an inherent race with invalidate_inode_pages and |
1127 | * writebacks since the page->count is kept > 1 for as long | 1127 | * writebacks since the page->count is kept > 1 for as long |
1128 | * as the page has a write request pending. | 1128 | * as the page has a write request pending. |
1129 | */ | 1129 | */ |
1130 | static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) | 1130 | static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) |
1131 | { | 1131 | { |
1132 | struct nfs_write_data *data = calldata; | 1132 | struct nfs_write_data *data = calldata; |
1133 | 1133 | ||
1134 | nfs_writeback_done(task, data); | 1134 | nfs_writeback_done(task, data); |
1135 | } | 1135 | } |
1136 | 1136 | ||
1137 | static void nfs_writeback_release_full(void *calldata) | 1137 | static void nfs_writeback_release_full(void *calldata) |
1138 | { | 1138 | { |
1139 | struct nfs_write_data *data = calldata; | 1139 | struct nfs_write_data *data = calldata; |
1140 | int status = data->task.tk_status; | 1140 | int status = data->task.tk_status; |
1141 | 1141 | ||
1142 | /* Update attributes as result of writeback. */ | 1142 | /* Update attributes as result of writeback. */ |
1143 | while (!list_empty(&data->pages)) { | 1143 | while (!list_empty(&data->pages)) { |
1144 | struct nfs_page *req = nfs_list_entry(data->pages.next); | 1144 | struct nfs_page *req = nfs_list_entry(data->pages.next); |
1145 | struct page *page = req->wb_page; | 1145 | struct page *page = req->wb_page; |
1146 | 1146 | ||
1147 | nfs_list_remove_request(req); | 1147 | nfs_list_remove_request(req); |
1148 | 1148 | ||
1149 | dprintk("NFS: %5u write (%s/%lld %d@%lld)", | 1149 | dprintk("NFS: %5u write (%s/%lld %d@%lld)", |
1150 | data->task.tk_pid, | 1150 | data->task.tk_pid, |
1151 | req->wb_context->dentry->d_inode->i_sb->s_id, | 1151 | req->wb_context->dentry->d_inode->i_sb->s_id, |
1152 | (long long)NFS_FILEID(req->wb_context->dentry->d_inode), | 1152 | (long long)NFS_FILEID(req->wb_context->dentry->d_inode), |
1153 | req->wb_bytes, | 1153 | req->wb_bytes, |
1154 | (long long)req_offset(req)); | 1154 | (long long)req_offset(req)); |
1155 | 1155 | ||
1156 | if (status < 0) { | 1156 | if (status < 0) { |
1157 | nfs_set_pageerror(page); | 1157 | nfs_set_pageerror(page); |
1158 | nfs_context_set_write_error(req->wb_context, status); | 1158 | nfs_context_set_write_error(req->wb_context, status); |
1159 | dprintk(", error = %d\n", status); | 1159 | dprintk(", error = %d\n", status); |
1160 | goto remove_request; | 1160 | goto remove_request; |
1161 | } | 1161 | } |
1162 | 1162 | ||
1163 | if (nfs_write_need_commit(data)) { | 1163 | if (nfs_write_need_commit(data)) { |
1164 | memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); | 1164 | memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); |
1165 | nfs_mark_request_commit(req, data->lseg); | 1165 | nfs_mark_request_commit(req, data->lseg); |
1166 | dprintk(" marked for commit\n"); | 1166 | dprintk(" marked for commit\n"); |
1167 | goto next; | 1167 | goto next; |
1168 | } | 1168 | } |
1169 | dprintk(" OK\n"); | 1169 | dprintk(" OK\n"); |
1170 | remove_request: | 1170 | remove_request: |
1171 | nfs_inode_remove_request(req); | 1171 | nfs_inode_remove_request(req); |
1172 | next: | 1172 | next: |
1173 | nfs_clear_page_tag_locked(req); | 1173 | nfs_clear_page_tag_locked(req); |
1174 | nfs_end_page_writeback(page); | 1174 | nfs_end_page_writeback(page); |
1175 | } | 1175 | } |
1176 | nfs_writedata_release(calldata); | 1176 | nfs_writedata_release(calldata); |
1177 | } | 1177 | } |
1178 | 1178 | ||
1179 | static const struct rpc_call_ops nfs_write_full_ops = { | 1179 | static const struct rpc_call_ops nfs_write_full_ops = { |
1180 | #if defined(CONFIG_NFS_V4_1) | 1180 | #if defined(CONFIG_NFS_V4_1) |
1181 | .rpc_call_prepare = nfs_write_prepare, | 1181 | .rpc_call_prepare = nfs_write_prepare, |
1182 | #endif /* CONFIG_NFS_V4_1 */ | 1182 | #endif /* CONFIG_NFS_V4_1 */ |
1183 | .rpc_call_done = nfs_writeback_done_full, | 1183 | .rpc_call_done = nfs_writeback_done_full, |
1184 | .rpc_release = nfs_writeback_release_full, | 1184 | .rpc_release = nfs_writeback_release_full, |
1185 | }; | 1185 | }; |
1186 | 1186 | ||
1187 | 1187 | ||
1188 | /* | 1188 | /* |
1189 | * This function is called when the WRITE call is complete. | 1189 | * This function is called when the WRITE call is complete. |
1190 | */ | 1190 | */ |
1191 | void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) | 1191 | void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) |
1192 | { | 1192 | { |
1193 | struct nfs_writeargs *argp = &data->args; | 1193 | struct nfs_writeargs *argp = &data->args; |
1194 | struct nfs_writeres *resp = &data->res; | 1194 | struct nfs_writeres *resp = &data->res; |
1195 | struct nfs_server *server = NFS_SERVER(data->inode); | 1195 | struct nfs_server *server = NFS_SERVER(data->inode); |
1196 | int status; | 1196 | int status; |
1197 | 1197 | ||
1198 | dprintk("NFS: %5u nfs_writeback_done (status %d)\n", | 1198 | dprintk("NFS: %5u nfs_writeback_done (status %d)\n", |
1199 | task->tk_pid, task->tk_status); | 1199 | task->tk_pid, task->tk_status); |
1200 | 1200 | ||
1201 | /* | 1201 | /* |
1202 | * ->write_done will attempt to use post-op attributes to detect | 1202 | * ->write_done will attempt to use post-op attributes to detect |
1203 | * conflicting writes by other clients. A strict interpretation | 1203 | * conflicting writes by other clients. A strict interpretation |
1204 | * of close-to-open would allow us to continue caching even if | 1204 | * of close-to-open would allow us to continue caching even if |
1205 | * another writer had changed the file, but some applications | 1205 | * another writer had changed the file, but some applications |
1206 | * depend on tighter cache coherency when writing. | 1206 | * depend on tighter cache coherency when writing. |
1207 | */ | 1207 | */ |
1208 | status = NFS_PROTO(data->inode)->write_done(task, data); | 1208 | status = NFS_PROTO(data->inode)->write_done(task, data); |
1209 | if (status != 0) | 1209 | if (status != 0) |
1210 | return; | 1210 | return; |
1211 | nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); | 1211 | nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); |
1212 | 1212 | ||
1213 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) | 1213 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) |
1214 | if (resp->verf->committed < argp->stable && task->tk_status >= 0) { | 1214 | if (resp->verf->committed < argp->stable && task->tk_status >= 0) { |
1215 | /* We tried a write call, but the server did not | 1215 | /* We tried a write call, but the server did not |
1216 | * commit data to stable storage even though we | 1216 | * commit data to stable storage even though we |
1217 | * requested it. | 1217 | * requested it. |
1218 | * Note: There is a known bug in Tru64 < 5.0 in which | 1218 | * Note: There is a known bug in Tru64 < 5.0 in which |
1219 | * the server reports NFS_DATA_SYNC, but performs | 1219 | * the server reports NFS_DATA_SYNC, but performs |
1220 | * NFS_FILE_SYNC. We therefore implement this checking | 1220 | * NFS_FILE_SYNC. We therefore implement this checking |
1221 | * as a dprintk() in order to avoid filling syslog. | 1221 | * as a dprintk() in order to avoid filling syslog. |
1222 | */ | 1222 | */ |
1223 | static unsigned long complain; | 1223 | static unsigned long complain; |
1224 | 1224 | ||
1225 | /* Note this will print the MDS for a DS write */ | 1225 | /* Note this will print the MDS for a DS write */ |
1226 | if (time_before(complain, jiffies)) { | 1226 | if (time_before(complain, jiffies)) { |
1227 | dprintk("NFS: faulty NFS server %s:" | 1227 | dprintk("NFS: faulty NFS server %s:" |
1228 | " (committed = %d) != (stable = %d)\n", | 1228 | " (committed = %d) != (stable = %d)\n", |
1229 | server->nfs_client->cl_hostname, | 1229 | server->nfs_client->cl_hostname, |
1230 | resp->verf->committed, argp->stable); | 1230 | resp->verf->committed, argp->stable); |
1231 | complain = jiffies + 300 * HZ; | 1231 | complain = jiffies + 300 * HZ; |
1232 | } | 1232 | } |
1233 | } | 1233 | } |
1234 | #endif | 1234 | #endif |
1235 | /* Is this a short write? */ | 1235 | /* Is this a short write? */ |
1236 | if (task->tk_status >= 0 && resp->count < argp->count) { | 1236 | if (task->tk_status >= 0 && resp->count < argp->count) { |
1237 | static unsigned long complain; | 1237 | static unsigned long complain; |
1238 | 1238 | ||
1239 | nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); | 1239 | nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); |
1240 | 1240 | ||
1241 | /* Has the server at least made some progress? */ | 1241 | /* Has the server at least made some progress? */ |
1242 | if (resp->count != 0) { | 1242 | if (resp->count != 0) { |
1243 | /* Was this an NFSv2 write or an NFSv3 stable write? */ | 1243 | /* Was this an NFSv2 write or an NFSv3 stable write? */ |
1244 | if (resp->verf->committed != NFS_UNSTABLE) { | 1244 | if (resp->verf->committed != NFS_UNSTABLE) { |
1245 | /* Resend from where the server left off */ | 1245 | /* Resend from where the server left off */ |
1246 | data->mds_offset += resp->count; | 1246 | data->mds_offset += resp->count; |
1247 | argp->offset += resp->count; | 1247 | argp->offset += resp->count; |
1248 | argp->pgbase += resp->count; | 1248 | argp->pgbase += resp->count; |
1249 | argp->count -= resp->count; | 1249 | argp->count -= resp->count; |
1250 | } else { | 1250 | } else { |
1251 | /* Resend as a stable write in order to avoid | 1251 | /* Resend as a stable write in order to avoid |
1252 | * headaches in the case of a server crash. | 1252 | * headaches in the case of a server crash. |
1253 | */ | 1253 | */ |
1254 | argp->stable = NFS_FILE_SYNC; | 1254 | argp->stable = NFS_FILE_SYNC; |
1255 | } | 1255 | } |
1256 | nfs_restart_rpc(task, server->nfs_client); | 1256 | nfs_restart_rpc(task, server->nfs_client); |
1257 | return; | 1257 | return; |
1258 | } | 1258 | } |
1259 | if (time_before(complain, jiffies)) { | 1259 | if (time_before(complain, jiffies)) { |
1260 | printk(KERN_WARNING | 1260 | printk(KERN_WARNING |
1261 | "NFS: Server wrote zero bytes, expected %u.\n", | 1261 | "NFS: Server wrote zero bytes, expected %u.\n", |
1262 | argp->count); | 1262 | argp->count); |
1263 | complain = jiffies + 300 * HZ; | 1263 | complain = jiffies + 300 * HZ; |
1264 | } | 1264 | } |
1265 | /* Can't do anything about it except throw an error. */ | 1265 | /* Can't do anything about it except throw an error. */ |
1266 | task->tk_status = -EIO; | 1266 | task->tk_status = -EIO; |
1267 | } | 1267 | } |
1268 | return; | 1268 | return; |
1269 | } | 1269 | } |
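/*
 * Editor's note: a self-contained model of the short-write resend
 * arithmetic above. When the server accepts only "written" of "count"
 * bytes, the argument window slides forward so the restarted RPC picks
 * up exactly where the server left off. The struct is a hypothetical
 * stand-in whose fields mirror nfs_writeargs.
 */
struct short_write_args {
	unsigned long long offset;	/* file offset of the request */
	unsigned int pgbase;		/* byte offset into the first page */
	unsigned int count;		/* bytes still to be written */
};

static void advance_after_short_write(struct short_write_args *a,
				      unsigned int written)
{
	a->offset += written;	/* cf. argp->offset += resp->count */
	a->pgbase += written;
	a->count  -= written;
}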
1270 | 1270 | ||
1271 | 1271 | ||
1272 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) | 1272 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) |
1273 | static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) | 1273 | static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) |
1274 | { | 1274 | { |
1275 | int ret; | 1275 | int ret; |
1276 | 1276 | ||
1277 | if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) | 1277 | if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) |
1278 | return 1; | 1278 | return 1; |
1279 | if (!may_wait) | 1279 | if (!may_wait) |
1280 | return 0; | 1280 | return 0; |
1281 | ret = out_of_line_wait_on_bit_lock(&nfsi->flags, | 1281 | ret = out_of_line_wait_on_bit_lock(&nfsi->flags, |
1282 | NFS_INO_COMMIT, | 1282 | NFS_INO_COMMIT, |
1283 | nfs_wait_bit_killable, | 1283 | nfs_wait_bit_killable, |
1284 | TASK_KILLABLE); | 1284 | TASK_KILLABLE); |
1285 | return (ret < 0) ? ret : 1; | 1285 | return (ret < 0) ? ret : 1; |
1286 | } | 1286 | } |
1287 | 1287 | ||
1288 | void nfs_commit_clear_lock(struct nfs_inode *nfsi) | 1288 | void nfs_commit_clear_lock(struct nfs_inode *nfsi) |
1289 | { | 1289 | { |
1290 | clear_bit(NFS_INO_COMMIT, &nfsi->flags); | 1290 | clear_bit(NFS_INO_COMMIT, &nfsi->flags); |
1291 | smp_mb__after_clear_bit(); | 1291 | smp_mb__after_clear_bit(); |
1292 | wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); | 1292 | wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); |
1293 | } | 1293 | } |
1294 | EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); | 1294 | EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); |
1295 | 1295 | ||
1296 | void nfs_commitdata_release(void *data) | 1296 | void nfs_commitdata_release(void *data) |
1297 | { | 1297 | { |
1298 | struct nfs_write_data *wdata = data; | 1298 | struct nfs_write_data *wdata = data; |
1299 | 1299 | ||
1300 | put_lseg(wdata->lseg); | 1300 | put_lseg(wdata->lseg); |
1301 | put_nfs_open_context(wdata->args.context); | 1301 | put_nfs_open_context(wdata->args.context); |
1302 | nfs_commit_free(wdata); | 1302 | nfs_commit_free(wdata); |
1303 | } | 1303 | } |
1304 | EXPORT_SYMBOL_GPL(nfs_commitdata_release); | 1304 | EXPORT_SYMBOL_GPL(nfs_commitdata_release); |
1305 | 1305 | ||
1306 | int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, | 1306 | int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, |
1307 | const struct rpc_call_ops *call_ops, | 1307 | const struct rpc_call_ops *call_ops, |
1308 | int how) | 1308 | int how) |
1309 | { | 1309 | { |
1310 | struct rpc_task *task; | 1310 | struct rpc_task *task; |
1311 | int priority = flush_task_priority(how); | 1311 | int priority = flush_task_priority(how); |
1312 | struct rpc_message msg = { | 1312 | struct rpc_message msg = { |
1313 | .rpc_argp = &data->args, | 1313 | .rpc_argp = &data->args, |
1314 | .rpc_resp = &data->res, | 1314 | .rpc_resp = &data->res, |
1315 | .rpc_cred = data->cred, | 1315 | .rpc_cred = data->cred, |
1316 | }; | 1316 | }; |
1317 | struct rpc_task_setup task_setup_data = { | 1317 | struct rpc_task_setup task_setup_data = { |
1318 | .task = &data->task, | 1318 | .task = &data->task, |
1319 | .rpc_client = clnt, | 1319 | .rpc_client = clnt, |
1320 | .rpc_message = &msg, | 1320 | .rpc_message = &msg, |
1321 | .callback_ops = call_ops, | 1321 | .callback_ops = call_ops, |
1322 | .callback_data = data, | 1322 | .callback_data = data, |
1323 | .workqueue = nfsiod_workqueue, | 1323 | .workqueue = nfsiod_workqueue, |
1324 | .flags = RPC_TASK_ASYNC, | 1324 | .flags = RPC_TASK_ASYNC, |
1325 | .priority = priority, | 1325 | .priority = priority, |
1326 | }; | 1326 | }; |
1327 | /* Set up the initial task struct. */ | 1327 | /* Set up the initial task struct. */ |
1328 | NFS_PROTO(data->inode)->commit_setup(data, &msg); | 1328 | NFS_PROTO(data->inode)->commit_setup(data, &msg); |
1329 | 1329 | ||
1330 | dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); | 1330 | dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); |
1331 | 1331 | ||
1332 | task = rpc_run_task(&task_setup_data); | 1332 | task = rpc_run_task(&task_setup_data); |
1333 | if (IS_ERR(task)) | 1333 | if (IS_ERR(task)) |
1334 | return PTR_ERR(task); | 1334 | return PTR_ERR(task); |
1335 | if (how & FLUSH_SYNC) | 1335 | if (how & FLUSH_SYNC) |
1336 | rpc_wait_for_completion_task(task); | 1336 | rpc_wait_for_completion_task(task); |
1337 | rpc_put_task(task); | 1337 | rpc_put_task(task); |
1338 | return 0; | 1338 | return 0; |
1339 | } | 1339 | } |
1340 | EXPORT_SYMBOL_GPL(nfs_initiate_commit); | 1340 | EXPORT_SYMBOL_GPL(nfs_initiate_commit); |
1341 | 1341 | ||
1342 | /* | 1342 | /* |
1343 | * Set up the argument/result storage required for the RPC call. | 1343 | * Set up the argument/result storage required for the RPC call. |
1344 | */ | 1344 | */ |
1345 | void nfs_init_commit(struct nfs_write_data *data, | 1345 | void nfs_init_commit(struct nfs_write_data *data, |
1346 | struct list_head *head, | 1346 | struct list_head *head, |
1347 | struct pnfs_layout_segment *lseg) | 1347 | struct pnfs_layout_segment *lseg) |
1348 | { | 1348 | { |
1349 | struct nfs_page *first = nfs_list_entry(head->next); | 1349 | struct nfs_page *first = nfs_list_entry(head->next); |
1350 | struct inode *inode = first->wb_context->dentry->d_inode; | 1350 | struct inode *inode = first->wb_context->dentry->d_inode; |
1351 | 1351 | ||
1352 | /* Set up the RPC argument and reply structs. | 1352 | /* Set up the RPC argument and reply structs. |
1353 | * NB: take care not to mess about with data->commit et al. */ | 1353 | * NB: take care not to mess about with data->commit et al. */ |
1354 | 1354 | ||
1355 | list_splice_init(head, &data->pages); | 1355 | list_splice_init(head, &data->pages); |
1356 | 1356 | ||
1357 | data->inode = inode; | 1357 | data->inode = inode; |
1358 | data->cred = first->wb_context->cred; | 1358 | data->cred = first->wb_context->cred; |
1359 | data->lseg = lseg; /* reference transferred */ | 1359 | data->lseg = lseg; /* reference transferred */ |
1360 | data->mds_ops = &nfs_commit_ops; | 1360 | data->mds_ops = &nfs_commit_ops; |
1361 | 1361 | ||
1362 | data->args.fh = NFS_FH(data->inode); | 1362 | data->args.fh = NFS_FH(data->inode); |
1363 | /* Note: we always request a commit of the entire inode */ | 1363 | /* Note: we always request a commit of the entire inode */ |
1364 | data->args.offset = 0; | 1364 | data->args.offset = 0; |
1365 | data->args.count = 0; | 1365 | data->args.count = 0; |
1366 | data->args.context = get_nfs_open_context(first->wb_context); | 1366 | data->args.context = get_nfs_open_context(first->wb_context); |
1367 | data->res.count = 0; | 1367 | data->res.count = 0; |
1368 | data->res.fattr = &data->fattr; | 1368 | data->res.fattr = &data->fattr; |
1369 | data->res.verf = &data->verf; | 1369 | data->res.verf = &data->verf; |
1370 | nfs_fattr_init(&data->fattr); | 1370 | nfs_fattr_init(&data->fattr); |
1371 | } | 1371 | } |
1372 | EXPORT_SYMBOL_GPL(nfs_init_commit); | 1372 | EXPORT_SYMBOL_GPL(nfs_init_commit); |
1373 | 1373 | ||
1374 | void nfs_retry_commit(struct list_head *page_list, | 1374 | void nfs_retry_commit(struct list_head *page_list, |
1375 | struct pnfs_layout_segment *lseg) | 1375 | struct pnfs_layout_segment *lseg) |
1376 | { | 1376 | { |
1377 | struct nfs_page *req; | 1377 | struct nfs_page *req; |
1378 | 1378 | ||
1379 | while (!list_empty(page_list)) { | 1379 | while (!list_empty(page_list)) { |
1380 | req = nfs_list_entry(page_list->next); | 1380 | req = nfs_list_entry(page_list->next); |
1381 | nfs_list_remove_request(req); | 1381 | nfs_list_remove_request(req); |
1382 | nfs_mark_request_commit(req, lseg); | 1382 | nfs_mark_request_commit(req, lseg); |
1383 | dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); | 1383 | dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); |
1384 | dec_bdi_stat(req->wb_page->mapping->backing_dev_info, | 1384 | dec_bdi_stat(req->wb_page->mapping->backing_dev_info, |
1385 | BDI_RECLAIMABLE); | 1385 | BDI_RECLAIMABLE); |
1386 | nfs_clear_page_tag_locked(req); | 1386 | nfs_clear_page_tag_locked(req); |
1387 | } | 1387 | } |
1388 | } | 1388 | } |
1389 | EXPORT_SYMBOL_GPL(nfs_retry_commit); | 1389 | EXPORT_SYMBOL_GPL(nfs_retry_commit); |
1390 | 1390 | ||
1391 | /* | 1391 | /* |
1392 | * Commit dirty pages | 1392 | * Commit dirty pages |
1393 | */ | 1393 | */ |
1394 | static int | 1394 | static int |
1395 | nfs_commit_list(struct inode *inode, struct list_head *head, int how) | 1395 | nfs_commit_list(struct inode *inode, struct list_head *head, int how) |
1396 | { | 1396 | { |
1397 | struct nfs_write_data *data; | 1397 | struct nfs_write_data *data; |
1398 | 1398 | ||
1399 | data = nfs_commitdata_alloc(); | 1399 | data = nfs_commitdata_alloc(); |
1400 | 1400 | ||
1401 | if (!data) | 1401 | if (!data) |
1402 | goto out_bad; | 1402 | goto out_bad; |
1403 | 1403 | ||
1404 | /* Set up the argument struct */ | 1404 | /* Set up the argument struct */ |
1405 | nfs_init_commit(data, head, NULL); | 1405 | nfs_init_commit(data, head, NULL); |
1406 | return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); | 1406 | return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); |
1407 | out_bad: | 1407 | out_bad: |
1408 | nfs_retry_commit(head, NULL); | 1408 | nfs_retry_commit(head, NULL); |
1409 | nfs_commit_clear_lock(NFS_I(inode)); | 1409 | nfs_commit_clear_lock(NFS_I(inode)); |
1410 | return -ENOMEM; | 1410 | return -ENOMEM; |
1411 | } | 1411 | } |
1412 | 1412 | ||
1413 | /* | 1413 | /* |
1414 | * COMMIT call returned | 1414 | * COMMIT call returned |
1415 | */ | 1415 | */ |
1416 | static void nfs_commit_done(struct rpc_task *task, void *calldata) | 1416 | static void nfs_commit_done(struct rpc_task *task, void *calldata) |
1417 | { | 1417 | { |
1418 | struct nfs_write_data *data = calldata; | 1418 | struct nfs_write_data *data = calldata; |
1419 | 1419 | ||
1420 | dprintk("NFS: %5u nfs_commit_done (status %d)\n", | 1420 | dprintk("NFS: %5u nfs_commit_done (status %d)\n", |
1421 | task->tk_pid, task->tk_status); | 1421 | task->tk_pid, task->tk_status); |
1422 | 1422 | ||
1423 | /* Call the NFS version-specific code */ | 1423 | /* Call the NFS version-specific code */ |
1424 | NFS_PROTO(data->inode)->commit_done(task, data); | 1424 | NFS_PROTO(data->inode)->commit_done(task, data); |
1425 | } | 1425 | } |
1426 | 1426 | ||
1427 | void nfs_commit_release_pages(struct nfs_write_data *data) | 1427 | void nfs_commit_release_pages(struct nfs_write_data *data) |
1428 | { | 1428 | { |
1429 | struct nfs_page *req; | 1429 | struct nfs_page *req; |
1430 | int status = data->task.tk_status; | 1430 | int status = data->task.tk_status; |
1431 | 1431 | ||
1432 | while (!list_empty(&data->pages)) { | 1432 | while (!list_empty(&data->pages)) { |
1433 | req = nfs_list_entry(data->pages.next); | 1433 | req = nfs_list_entry(data->pages.next); |
1434 | nfs_list_remove_request(req); | 1434 | nfs_list_remove_request(req); |
1435 | nfs_clear_request_commit(req); | 1435 | nfs_clear_request_commit(req); |
1436 | 1436 | ||
1437 | dprintk("NFS: commit (%s/%lld %d@%lld)", | 1437 | dprintk("NFS: commit (%s/%lld %d@%lld)", |
1438 | req->wb_context->dentry->d_sb->s_id, | 1438 | req->wb_context->dentry->d_sb->s_id, |
1439 | (long long)NFS_FILEID(req->wb_context->dentry->d_inode), | 1439 | (long long)NFS_FILEID(req->wb_context->dentry->d_inode), |
1440 | req->wb_bytes, | 1440 | req->wb_bytes, |
1441 | (long long)req_offset(req)); | 1441 | (long long)req_offset(req)); |
1442 | if (status < 0) { | 1442 | if (status < 0) { |
1443 | nfs_context_set_write_error(req->wb_context, status); | 1443 | nfs_context_set_write_error(req->wb_context, status); |
1444 | nfs_inode_remove_request(req); | 1444 | nfs_inode_remove_request(req); |
1445 | dprintk(", error = %d\n", status); | 1445 | dprintk(", error = %d\n", status); |
1446 | goto next; | 1446 | goto next; |
1447 | } | 1447 | } |
1448 | 1448 | ||
1449 | /* Okay, COMMIT succeeded, apparently. Check the verifier | 1449 | /* Okay, COMMIT succeeded, apparently. Check the verifier |
1450 | * returned by the server against all stored verfs. */ | 1450 | * returned by the server against all stored verfs. */ |
1451 | if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { | 1451 | if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { |
1452 | /* We have a match */ | 1452 | /* We have a match */ |
1453 | nfs_inode_remove_request(req); | 1453 | nfs_inode_remove_request(req); |
1454 | dprintk(" OK\n"); | 1454 | dprintk(" OK\n"); |
1455 | goto next; | 1455 | goto next; |
1456 | } | 1456 | } |
1457 | /* We have a mismatch. Write the page again */ | 1457 | /* We have a mismatch. Write the page again */ |
1458 | dprintk(" mismatch\n"); | 1458 | dprintk(" mismatch\n"); |
1459 | nfs_mark_request_dirty(req); | 1459 | nfs_mark_request_dirty(req); |
1460 | next: | 1460 | next: |
1461 | nfs_clear_page_tag_locked(req); | 1461 | nfs_clear_page_tag_locked(req); |
1462 | } | 1462 | } |
1463 | } | 1463 | } |
1464 | EXPORT_SYMBOL_GPL(nfs_commit_release_pages); | 1464 | EXPORT_SYMBOL_GPL(nfs_commit_release_pages); |
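/*
 * Editor's note: the verifier check above, standalone. A COMMIT reply
 * carries the server's write verifier; if it no longer matches the one
 * stored with the request at WRITE time, the server has rebooted since
 * the unstable WRITE and that data may have been lost, so the page is
 * marked dirty and written again. Illustrative user-space helper only:
 */
#include <string.h>

static int commit_verified(const unsigned char *req_verf,
			   const unsigned char *commit_verf, size_t len)
{
	return memcmp(req_verf, commit_verf, len) == 0;	/* match: data is safe */
}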
1465 | 1465 | ||
1466 | static void nfs_commit_release(void *calldata) | 1466 | static void nfs_commit_release(void *calldata) |
1467 | { | 1467 | { |
1468 | struct nfs_write_data *data = calldata; | 1468 | struct nfs_write_data *data = calldata; |
1469 | 1469 | ||
1470 | nfs_commit_release_pages(data); | 1470 | nfs_commit_release_pages(data); |
1471 | nfs_commit_clear_lock(NFS_I(data->inode)); | 1471 | nfs_commit_clear_lock(NFS_I(data->inode)); |
1472 | nfs_commitdata_release(calldata); | 1472 | nfs_commitdata_release(calldata); |
1473 | } | 1473 | } |
1474 | 1474 | ||
1475 | static const struct rpc_call_ops nfs_commit_ops = { | 1475 | static const struct rpc_call_ops nfs_commit_ops = { |
1476 | #if defined(CONFIG_NFS_V4_1) | 1476 | #if defined(CONFIG_NFS_V4_1) |
1477 | .rpc_call_prepare = nfs_write_prepare, | 1477 | .rpc_call_prepare = nfs_write_prepare, |
1478 | #endif /* CONFIG_NFS_V4_1 */ | 1478 | #endif /* CONFIG_NFS_V4_1 */ |
1479 | .rpc_call_done = nfs_commit_done, | 1479 | .rpc_call_done = nfs_commit_done, |
1480 | .rpc_release = nfs_commit_release, | 1480 | .rpc_release = nfs_commit_release, |
1481 | }; | 1481 | }; |
1482 | 1482 | ||
1483 | int nfs_commit_inode(struct inode *inode, int how) | 1483 | int nfs_commit_inode(struct inode *inode, int how) |
1484 | { | 1484 | { |
1485 | LIST_HEAD(head); | 1485 | LIST_HEAD(head); |
1486 | int may_wait = how & FLUSH_SYNC; | 1486 | int may_wait = how & FLUSH_SYNC; |
1487 | int res; | 1487 | int res; |
1488 | 1488 | ||
1489 | res = nfs_commit_set_lock(NFS_I(inode), may_wait); | 1489 | res = nfs_commit_set_lock(NFS_I(inode), may_wait); |
1490 | if (res <= 0) | 1490 | if (res <= 0) |
1491 | goto out_mark_dirty; | 1491 | goto out_mark_dirty; |
1492 | res = nfs_scan_commit(inode, &head, 0, 0); | 1492 | res = nfs_scan_commit(inode, &head, 0, 0); |
1493 | if (res) { | 1493 | if (res) { |
1494 | int error; | 1494 | int error; |
1495 | 1495 | ||
1496 | error = pnfs_commit_list(inode, &head, how); | 1496 | error = pnfs_commit_list(inode, &head, how); |
1497 | if (error == PNFS_NOT_ATTEMPTED) | 1497 | if (error == PNFS_NOT_ATTEMPTED) |
1498 | error = nfs_commit_list(inode, &head, how); | 1498 | error = nfs_commit_list(inode, &head, how); |
1499 | if (error < 0) | 1499 | if (error < 0) |
1500 | return error; | 1500 | return error; |
1501 | if (!may_wait) | 1501 | if (!may_wait) |
1502 | goto out_mark_dirty; | 1502 | goto out_mark_dirty; |
1503 | error = wait_on_bit(&NFS_I(inode)->flags, | 1503 | error = wait_on_bit(&NFS_I(inode)->flags, |
1504 | NFS_INO_COMMIT, | 1504 | NFS_INO_COMMIT, |
1505 | nfs_wait_bit_killable, | 1505 | nfs_wait_bit_killable, |
1506 | TASK_KILLABLE); | 1506 | TASK_KILLABLE); |
1507 | if (error < 0) | 1507 | if (error < 0) |
1508 | return error; | 1508 | return error; |
1509 | } else | 1509 | } else |
1510 | nfs_commit_clear_lock(NFS_I(inode)); | 1510 | nfs_commit_clear_lock(NFS_I(inode)); |
1511 | return res; | 1511 | return res; |
1512 | /* Note: If we exit without ensuring that the commit is complete, | 1512 | /* Note: If we exit without ensuring that the commit is complete, |
1513 | * we must mark the inode as dirty. Otherwise, future calls to | 1513 | * we must mark the inode as dirty. Otherwise, future calls to |
1514 | * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure | 1514 | * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure |
1515 | * that the data is on the disk. | 1515 | * that the data is on the disk. |
1516 | */ | 1516 | */ |
1517 | out_mark_dirty: | 1517 | out_mark_dirty: |
1518 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); | 1518 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); |
1519 | return res; | 1519 | return res; |
1520 | } | 1520 | } |
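
The NFS_INO_COMMIT flag used above behaves like a per-inode commit lock: nfs_commit_set_lock() takes it (optionally waiting), and waiters sleep killably on the bit until nfs_commit_clear_lock() wakes them. A minimal userspace sketch of the same try-or-wait lock shape, using pthreads instead of kernel bit-waitqueues (the names here are illustrative, not kernel API):

    #include <pthread.h>
    #include <stdbool.h>

    /* Userspace stand-in for the NFS_INO_COMMIT bit lock; illustrative only. */
    struct commit_lock {
        pthread_mutex_t mu;
        pthread_cond_t cv;
        bool held;                       /* plays the NFS_INO_COMMIT bit */
    };

    /* nfs_commit_set_lock() analogue: > 0 on acquire, 0 when busy and the
     * caller did not ask to wait (mirrors the res <= 0 early-out above). */
    static int commit_trylock(struct commit_lock *l, bool may_wait)
    {
        int res = 0;

        pthread_mutex_lock(&l->mu);
        while (l->held && may_wait)
            pthread_cond_wait(&l->cv, &l->mu);
        if (!l->held) {
            l->held = true;
            res = 1;
        }
        pthread_mutex_unlock(&l->mu);
        return res;
    }

    /* nfs_commit_clear_lock() analogue: release and wake every waiter. */
    static void commit_unlock(struct commit_lock *l)
    {
        pthread_mutex_lock(&l->mu);
        l->held = false;
        pthread_cond_broadcast(&l->cv);
        pthread_mutex_unlock(&l->mu);
    }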
1521 | 1521 | ||
1522 | static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) | 1522 | static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) |
1523 | { | 1523 | { |
1524 | struct nfs_inode *nfsi = NFS_I(inode); | 1524 | struct nfs_inode *nfsi = NFS_I(inode); |
1525 | int flags = FLUSH_SYNC; | 1525 | int flags = FLUSH_SYNC; |
1526 | int ret = 0; | 1526 | int ret = 0; |
1527 | 1527 | ||
1528 | if (wbc->sync_mode == WB_SYNC_NONE) { | 1528 | if (wbc->sync_mode == WB_SYNC_NONE) { |
1529 | /* Don't commit yet if this is a non-blocking flush and there | 1529 | /* Don't commit yet if this is a non-blocking flush and there |
1530 | * are a lot of outstanding writes for this mapping. | 1530 | * are a lot of outstanding writes for this mapping. |
1531 | */ | 1531 | */ |
1532 | if (nfsi->ncommit <= (nfsi->npages >> 1)) | 1532 | if (nfsi->ncommit <= (nfsi->npages >> 1)) |
1533 | goto out_mark_dirty; | 1533 | goto out_mark_dirty; |
1534 | 1534 | ||
1535 | /* don't wait for the COMMIT response */ | 1535 | /* don't wait for the COMMIT response */ |
1536 | flags = 0; | 1536 | flags = 0; |
1537 | } | 1537 | } |
1538 | 1538 | ||
1539 | ret = nfs_commit_inode(inode, flags); | 1539 | ret = nfs_commit_inode(inode, flags); |
1540 | if (ret >= 0) { | 1540 | if (ret >= 0) { |
1541 | if (wbc->sync_mode == WB_SYNC_NONE) { | 1541 | if (wbc->sync_mode == WB_SYNC_NONE) { |
1542 | if (ret < wbc->nr_to_write) | 1542 | if (ret < wbc->nr_to_write) |
1543 | wbc->nr_to_write -= ret; | 1543 | wbc->nr_to_write -= ret; |
1544 | else | 1544 | else |
1545 | wbc->nr_to_write = 0; | 1545 | wbc->nr_to_write = 0; |
1546 | } | 1546 | } |
1547 | return 0; | 1547 | return 0; |
1548 | } | 1548 | } |
1549 | out_mark_dirty: | 1549 | out_mark_dirty: |
1550 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); | 1550 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); |
1551 | return ret; | 1551 | return ret; |
1552 | } | 1552 | } |
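
For a WB_SYNC_NONE flush, the function only issues a COMMIT once commit-pending pages outnumber half of the inode's cached pages; below that it re-marks the inode dirty and defers. A worked check of the heuristic (the field names mirror nfs_inode, but this is not kernel code): with npages = 1000, ncommit = 400 gives 400 <= 500 and defers, while ncommit = 600 sends an asynchronous COMMIT (flags = 0).

    /* Heuristic check mirrored from above; not kernel code. */
    static int should_commit_now(unsigned long ncommit, unsigned long npages)
    {
        return ncommit > (npages >> 1);  /* commit only when > 50% pending */
    }
    /* should_commit_now(400, 1000) == 0 -> defer and re-mark dirty
     * should_commit_now(600, 1000) == 1 -> send an async COMMIT    */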
1553 | #else | 1553 | #else |
1554 | static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) | 1554 | static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) |
1555 | { | 1555 | { |
1556 | return 0; | 1556 | return 0; |
1557 | } | 1557 | } |
1558 | #endif | 1558 | #endif |
1559 | 1559 | ||
1560 | int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) | 1560 | int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) |
1561 | { | 1561 | { |
1562 | int ret; | 1562 | int ret; |
1563 | 1563 | ||
1564 | ret = nfs_commit_unstable_pages(inode, wbc); | 1564 | ret = nfs_commit_unstable_pages(inode, wbc); |
1565 | if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { | 1565 | if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { |
1566 | int status; | 1566 | int status; |
1567 | bool sync = true; | 1567 | bool sync = true; |
1568 | 1568 | ||
1569 | if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || | 1569 | if (wbc->sync_mode == WB_SYNC_NONE) |
1570 | wbc->for_background) | ||
1571 | sync = false; | 1570 | sync = false; |
1572 | 1571 | ||
1573 | status = pnfs_layoutcommit_inode(inode, sync); | 1572 | status = pnfs_layoutcommit_inode(inode, sync); |
1574 | if (status < 0) | 1573 | if (status < 0) |
1575 | return status; | 1574 | return status; |
1576 | } | 1575 | } |
1577 | return ret; | 1576 | return ret; |
1578 | } | 1577 | } |
1579 | 1578 | ||
1580 | /* | 1579 | /* |
1581 | * flush the inode to disk. | 1580 | * flush the inode to disk. |
1582 | */ | 1581 | */ |
1583 | int nfs_wb_all(struct inode *inode) | 1582 | int nfs_wb_all(struct inode *inode) |
1584 | { | 1583 | { |
1585 | struct writeback_control wbc = { | 1584 | struct writeback_control wbc = { |
1586 | .sync_mode = WB_SYNC_ALL, | 1585 | .sync_mode = WB_SYNC_ALL, |
1587 | .nr_to_write = LONG_MAX, | 1586 | .nr_to_write = LONG_MAX, |
1588 | .range_start = 0, | 1587 | .range_start = 0, |
1589 | .range_end = LLONG_MAX, | 1588 | .range_end = LLONG_MAX, |
1590 | }; | 1589 | }; |
1591 | 1590 | ||
1592 | return sync_inode(inode, &wbc); | 1591 | return sync_inode(inode, &wbc); |
1593 | } | 1592 | } |
1594 | 1593 | ||
1595 | int nfs_wb_page_cancel(struct inode *inode, struct page *page) | 1594 | int nfs_wb_page_cancel(struct inode *inode, struct page *page) |
1596 | { | 1595 | { |
1597 | struct nfs_page *req; | 1596 | struct nfs_page *req; |
1598 | int ret = 0; | 1597 | int ret = 0; |
1599 | 1598 | ||
1600 | BUG_ON(!PageLocked(page)); | 1599 | BUG_ON(!PageLocked(page)); |
1601 | for (;;) { | 1600 | for (;;) { |
1602 | wait_on_page_writeback(page); | 1601 | wait_on_page_writeback(page); |
1603 | req = nfs_page_find_request(page); | 1602 | req = nfs_page_find_request(page); |
1604 | if (req == NULL) | 1603 | if (req == NULL) |
1605 | break; | 1604 | break; |
1606 | if (nfs_lock_request_dontget(req)) { | 1605 | if (nfs_lock_request_dontget(req)) { |
1607 | nfs_inode_remove_request(req); | 1606 | nfs_inode_remove_request(req); |
1608 | /* | 1607 | /* |
1609 | * In case nfs_inode_remove_request has marked the | 1608 | * In case nfs_inode_remove_request has marked the |
1610 | * page as being dirty | 1609 | * page as being dirty |
1611 | */ | 1610 | */ |
1612 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 1611 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
1613 | nfs_unlock_request(req); | 1612 | nfs_unlock_request(req); |
1614 | break; | 1613 | break; |
1615 | } | 1614 | } |
1616 | ret = nfs_wait_on_request(req); | 1615 | ret = nfs_wait_on_request(req); |
1617 | nfs_release_request(req); | 1616 | nfs_release_request(req); |
1618 | if (ret < 0) | 1617 | if (ret < 0) |
1619 | break; | 1618 | break; |
1620 | } | 1619 | } |
1621 | return ret; | 1620 | return ret; |
1622 | } | 1621 | } |
1623 | 1622 | ||
1624 | /* | 1623 | /* |
1625 | * Write back all requests on one page - we do this before reading it. | 1624 | * Write back all requests on one page - we do this before reading it. |
1626 | */ | 1625 | */ |
1627 | int nfs_wb_page(struct inode *inode, struct page *page) | 1626 | int nfs_wb_page(struct inode *inode, struct page *page) |
1628 | { | 1627 | { |
1629 | loff_t range_start = page_offset(page); | 1628 | loff_t range_start = page_offset(page); |
1630 | loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); | 1629 | loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); |
1631 | struct writeback_control wbc = { | 1630 | struct writeback_control wbc = { |
1632 | .sync_mode = WB_SYNC_ALL, | 1631 | .sync_mode = WB_SYNC_ALL, |
1633 | .nr_to_write = 0, | 1632 | .nr_to_write = 0, |
1634 | .range_start = range_start, | 1633 | .range_start = range_start, |
1635 | .range_end = range_end, | 1634 | .range_end = range_end, |
1636 | }; | 1635 | }; |
1637 | int ret; | 1636 | int ret; |
1638 | 1637 | ||
1639 | for (;;) { | 1638 | for (;;) { |
1640 | wait_on_page_writeback(page); | 1639 | wait_on_page_writeback(page); |
1641 | if (clear_page_dirty_for_io(page)) { | 1640 | if (clear_page_dirty_for_io(page)) { |
1642 | ret = nfs_writepage_locked(page, &wbc); | 1641 | ret = nfs_writepage_locked(page, &wbc); |
1643 | if (ret < 0) | 1642 | if (ret < 0) |
1644 | goto out_error; | 1643 | goto out_error; |
1645 | continue; | 1644 | continue; |
1646 | } | 1645 | } |
1647 | if (!PagePrivate(page)) | 1646 | if (!PagePrivate(page)) |
1648 | break; | 1647 | break; |
1649 | ret = nfs_commit_inode(inode, FLUSH_SYNC); | 1648 | ret = nfs_commit_inode(inode, FLUSH_SYNC); |
1650 | if (ret < 0) | 1649 | if (ret < 0) |
1651 | goto out_error; | 1650 | goto out_error; |
1652 | } | 1651 | } |
1653 | return 0; | 1652 | return 0; |
1654 | out_error: | 1653 | out_error: |
1655 | return ret; | 1654 | return ret; |
1656 | } | 1655 | } |
1657 | 1656 | ||
1658 | #ifdef CONFIG_MIGRATION | 1657 | #ifdef CONFIG_MIGRATION |
1659 | int nfs_migrate_page(struct address_space *mapping, struct page *newpage, | 1658 | int nfs_migrate_page(struct address_space *mapping, struct page *newpage, |
1660 | struct page *page) | 1659 | struct page *page) |
1661 | { | 1660 | { |
1662 | struct nfs_page *req; | 1661 | struct nfs_page *req; |
1663 | int ret; | 1662 | int ret; |
1664 | 1663 | ||
1665 | nfs_fscache_release_page(page, GFP_KERNEL); | 1664 | nfs_fscache_release_page(page, GFP_KERNEL); |
1666 | 1665 | ||
1667 | req = nfs_find_and_lock_request(page, false); | 1666 | req = nfs_find_and_lock_request(page, false); |
1668 | ret = PTR_ERR(req); | 1667 | ret = PTR_ERR(req); |
1669 | if (IS_ERR(req)) | 1668 | if (IS_ERR(req)) |
1670 | goto out; | 1669 | goto out; |
1671 | 1670 | ||
1672 | ret = migrate_page(mapping, newpage, page); | 1671 | ret = migrate_page(mapping, newpage, page); |
1673 | if (!req) | 1672 | if (!req) |
1674 | goto out; | 1673 | goto out; |
1675 | if (ret) | 1674 | if (ret) |
1676 | goto out_unlock; | 1675 | goto out_unlock; |
1677 | page_cache_get(newpage); | 1676 | page_cache_get(newpage); |
1678 | spin_lock(&mapping->host->i_lock); | 1677 | spin_lock(&mapping->host->i_lock); |
1679 | req->wb_page = newpage; | 1678 | req->wb_page = newpage; |
1680 | SetPagePrivate(newpage); | 1679 | SetPagePrivate(newpage); |
1681 | set_page_private(newpage, (unsigned long)req); | 1680 | set_page_private(newpage, (unsigned long)req); |
1682 | ClearPagePrivate(page); | 1681 | ClearPagePrivate(page); |
1683 | set_page_private(page, 0); | 1682 | set_page_private(page, 0); |
1684 | spin_unlock(&mapping->host->i_lock); | 1683 | spin_unlock(&mapping->host->i_lock); |
1685 | page_cache_release(page); | 1684 | page_cache_release(page); |
1686 | out_unlock: | 1685 | out_unlock: |
1687 | nfs_clear_page_tag_locked(req); | 1686 | nfs_clear_page_tag_locked(req); |
1688 | out: | 1687 | out: |
1689 | return ret; | 1688 | return ret; |
1690 | } | 1689 | } |
1691 | #endif | 1690 | #endif |
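
The migration path hands ownership of page->private across to the new page under the inode lock, taking a reference on the new page before dropping the one on the old. A compressed analogue of just that handoff, with an invented fake_page type standing in for struct page:

    /* Compressed analogue of the private-data handoff above; fake_page and
     * its fields are invented for illustration. */
    struct fake_page {
        int refcount;
        unsigned long private;
        int has_private;
    };

    static void move_private(struct fake_page *newp, struct fake_page *oldp,
                             unsigned long req)
    {
        newp->refcount++;        /* page_cache_get(newpage)        */
        newp->private = req;     /* set_page_private(newpage, req) */
        newp->has_private = 1;   /* SetPagePrivate(newpage)        */
        oldp->has_private = 0;   /* ClearPagePrivate(page)         */
        oldp->private = 0;       /* set_page_private(page, 0)      */
        oldp->refcount--;        /* page_cache_release(page)       */
    }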
1692 | 1691 | ||
1693 | int __init nfs_init_writepagecache(void) | 1692 | int __init nfs_init_writepagecache(void) |
1694 | { | 1693 | { |
1695 | nfs_wdata_cachep = kmem_cache_create("nfs_write_data", | 1694 | nfs_wdata_cachep = kmem_cache_create("nfs_write_data", |
1696 | sizeof(struct nfs_write_data), | 1695 | sizeof(struct nfs_write_data), |
1697 | 0, SLAB_HWCACHE_ALIGN, | 1696 | 0, SLAB_HWCACHE_ALIGN, |
1698 | NULL); | 1697 | NULL); |
1699 | if (nfs_wdata_cachep == NULL) | 1698 | if (nfs_wdata_cachep == NULL) |
1700 | return -ENOMEM; | 1699 | return -ENOMEM; |
1701 | 1700 | ||
1702 | nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, | 1701 | nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, |
1703 | nfs_wdata_cachep); | 1702 | nfs_wdata_cachep); |
1704 | if (nfs_wdata_mempool == NULL) | 1703 | if (nfs_wdata_mempool == NULL) |
1705 | return -ENOMEM; | 1704 | return -ENOMEM; |
1706 | 1705 | ||
1707 | nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, | 1706 | nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, |
1708 | nfs_wdata_cachep); | 1707 | nfs_wdata_cachep); |
1709 | if (nfs_commit_mempool == NULL) | 1708 | if (nfs_commit_mempool == NULL) |
1710 | return -ENOMEM; | 1709 | return -ENOMEM; |
1711 | 1710 | ||
1712 | /* | 1711 | /* |
1713 | * NFS congestion size, scale with available memory. | 1712 | * NFS congestion size, scale with available memory. |
1714 | * | 1713 | * |
1715 | * 64MB: 8192k | 1714 | * 64MB: 8192k |
1716 | * 128MB: 11585k | 1715 | * 128MB: 11585k |
1717 | * 256MB: 16384k | 1716 | * 256MB: 16384k |
1718 | * 512MB: 23170k | 1717 | * 512MB: 23170k |
1719 | * 1GB: 32768k | 1718 | * 1GB: 32768k |
1720 | * 2GB: 46340k | 1719 | * 2GB: 46340k |
1721 | * 4GB: 65536k | 1720 | * 4GB: 65536k |
1722 | * 8GB: 92681k | 1721 | * 8GB: 92681k |
1723 | * 16GB: 131072k | 1722 | * 16GB: 131072k |
1724 | * | 1723 | * |
1725 | * This allows larger machines to have larger/more transfers. | 1724 | * This allows larger machines to have larger/more transfers. |
1726 | * Limit the default to 256M | 1725 | * Limit the default to 256M |
1727 | */ | 1726 | */ |
1728 | nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); | 1727 | nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); |
1729 | if (nfs_congestion_kb > 256*1024) | 1728 | if (nfs_congestion_kb > 256*1024) |
1730 | nfs_congestion_kb = 256*1024; | 1729 | nfs_congestion_kb = 256*1024; |
1731 | 1730 | ||
1732 | return 0; | 1731 | return 0; |
1733 | } | 1732 | } |
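
The sqrt scaling can be checked against the table in the comment. Assuming 4 KiB pages (PAGE_SHIFT = 12), 256 MiB of RAM gives totalram_pages = 65536, int_sqrt(65536) = 256, and (16 * 256) << 2 = 16384k, matching the table; rows whose page count is not a perfect square land a few k below the comment's figures (e.g. 11584k vs 11585k) because int_sqrt() truncates where the table rounds a real square root. A standalone check:

    #include <stdio.h>

    /* Truncating integer square root, same contract as int_sqrt(). */
    static unsigned long isqrt(unsigned long x)
    {
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
            r++;
        return r;
    }

    int main(void)
    {
        const int page_shift = 12;       /* assumption: 4 KiB pages */
        unsigned long mb[] = { 64, 128, 256, 512, 1024,
                               2048, 4096, 8192, 16384 };

        for (unsigned int i = 0; i < sizeof(mb) / sizeof(mb[0]); i++) {
            unsigned long pages = mb[i] << (20 - page_shift);
            unsigned long kb = (16 * isqrt(pages)) << (page_shift - 10);

            printf("%6luMB: %luk\n", mb[i], kb);  /* 256MB -> 16384k */
        }
        return 0;
    }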
1734 | 1733 | ||
1735 | void nfs_destroy_writepagecache(void) | 1734 | void nfs_destroy_writepagecache(void) |
1736 | { | 1735 | { |
1737 | mempool_destroy(nfs_commit_mempool); | 1736 | mempool_destroy(nfs_commit_mempool); |
1738 | mempool_destroy(nfs_wdata_mempool); | 1737 | mempool_destroy(nfs_wdata_mempool); |
1739 | kmem_cache_destroy(nfs_wdata_cachep); | 1738 | kmem_cache_destroy(nfs_wdata_cachep); |
1740 | } | 1739 | } |
1741 | 1740 | ||
1742 | 1741 |
include/linux/backing-dev.h
1 | /* | 1 | /* |
2 | * include/linux/backing-dev.h | 2 | * include/linux/backing-dev.h |
3 | * | 3 | * |
4 | * low-level device information and state which is propagated up through | 4 | * low-level device information and state which is propagated up through |
5 | * to high-level code. | 5 | * to high-level code. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #ifndef _LINUX_BACKING_DEV_H | 8 | #ifndef _LINUX_BACKING_DEV_H |
9 | #define _LINUX_BACKING_DEV_H | 9 | #define _LINUX_BACKING_DEV_H |
10 | 10 | ||
11 | #include <linux/percpu_counter.h> | 11 | #include <linux/percpu_counter.h> |
12 | #include <linux/log2.h> | 12 | #include <linux/log2.h> |
13 | #include <linux/proportions.h> | 13 | #include <linux/proportions.h> |
14 | #include <linux/kernel.h> | 14 | #include <linux/kernel.h> |
15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/timer.h> | 17 | #include <linux/timer.h> |
18 | #include <linux/writeback.h> | 18 | #include <linux/writeback.h> |
19 | #include <asm/atomic.h> | 19 | #include <asm/atomic.h> |
20 | 20 | ||
21 | struct page; | 21 | struct page; |
22 | struct device; | 22 | struct device; |
23 | struct dentry; | 23 | struct dentry; |
24 | 24 | ||
25 | /* | 25 | /* |
26 | * Bits in backing_dev_info.state | 26 | * Bits in backing_dev_info.state |
27 | */ | 27 | */ |
28 | enum bdi_state { | 28 | enum bdi_state { |
29 | BDI_pending, /* On its way to being activated */ | 29 | BDI_pending, /* On its way to being activated */ |
30 | BDI_wb_alloc, /* Default embedded wb allocated */ | 30 | BDI_wb_alloc, /* Default embedded wb allocated */ |
31 | BDI_async_congested, /* The async (write) queue is getting full */ | 31 | BDI_async_congested, /* The async (write) queue is getting full */ |
32 | BDI_sync_congested, /* The sync queue is getting full */ | 32 | BDI_sync_congested, /* The sync queue is getting full */ |
33 | BDI_registered, /* bdi_register() was done */ | 33 | BDI_registered, /* bdi_register() was done */ |
34 | BDI_writeback_running, /* Writeback is in progress */ | 34 | BDI_writeback_running, /* Writeback is in progress */ |
35 | BDI_unused, /* Available bits start here */ | 35 | BDI_unused, /* Available bits start here */ |
36 | }; | 36 | }; |
37 | 37 | ||
38 | typedef int (congested_fn)(void *, int); | 38 | typedef int (congested_fn)(void *, int); |
39 | 39 | ||
40 | enum bdi_stat_item { | 40 | enum bdi_stat_item { |
41 | BDI_RECLAIMABLE, | 41 | BDI_RECLAIMABLE, |
42 | BDI_WRITEBACK, | 42 | BDI_WRITEBACK, |
43 | BDI_WRITTEN, | ||
43 | NR_BDI_STAT_ITEMS | 44 | NR_BDI_STAT_ITEMS |
44 | }; | 45 | }; |
45 | 46 | ||
46 | #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) | 47 | #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) |
47 | 48 | ||
48 | struct bdi_writeback { | 49 | struct bdi_writeback { |
49 | struct backing_dev_info *bdi; /* our parent bdi */ | 50 | struct backing_dev_info *bdi; /* our parent bdi */ |
50 | unsigned int nr; | 51 | unsigned int nr; |
51 | 52 | ||
52 | unsigned long last_old_flush; /* last old data flush */ | 53 | unsigned long last_old_flush; /* last old data flush */ |
53 | unsigned long last_active; /* last time bdi thread was active */ | 54 | unsigned long last_active; /* last time bdi thread was active */ |
54 | 55 | ||
55 | struct task_struct *task; /* writeback thread */ | 56 | struct task_struct *task; /* writeback thread */ |
56 | struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ | 57 | struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ |
57 | struct list_head b_dirty; /* dirty inodes */ | 58 | struct list_head b_dirty; /* dirty inodes */ |
58 | struct list_head b_io; /* parked for writeback */ | 59 | struct list_head b_io; /* parked for writeback */ |
59 | struct list_head b_more_io; /* parked for more writeback */ | 60 | struct list_head b_more_io; /* parked for more writeback */ |
61 | spinlock_t list_lock; /* protects the b_* lists */ | ||
60 | }; | 62 | }; |
61 | 63 | ||
62 | struct backing_dev_info { | 64 | struct backing_dev_info { |
63 | struct list_head bdi_list; | 65 | struct list_head bdi_list; |
64 | unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ | 66 | unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ |
65 | unsigned long state; /* Always use atomic bitops on this */ | 67 | unsigned long state; /* Always use atomic bitops on this */ |
66 | unsigned int capabilities; /* Device capabilities */ | 68 | unsigned int capabilities; /* Device capabilities */ |
67 | congested_fn *congested_fn; /* Function pointer if device is md/dm */ | 69 | congested_fn *congested_fn; /* Function pointer if device is md/dm */ |
68 | void *congested_data; /* Pointer to aux data for congested func */ | 70 | void *congested_data; /* Pointer to aux data for congested func */ |
69 | 71 | ||
70 | char *name; | 72 | char *name; |
71 | 73 | ||
72 | struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; | 74 | struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; |
73 | 75 | ||
76 | unsigned long bw_time_stamp; /* last time write bw is updated */ | ||
77 | unsigned long written_stamp; /* pages written at bw_time_stamp */ | ||
78 | unsigned long write_bandwidth; /* the estimated write bandwidth */ | ||
79 | unsigned long avg_write_bandwidth; /* further smoothed write bw */ | ||
80 | |||
74 | struct prop_local_percpu completions; | 81 | struct prop_local_percpu completions; |
75 | int dirty_exceeded; | 82 | int dirty_exceeded; |
76 | 83 | ||
77 | unsigned int min_ratio; | 84 | unsigned int min_ratio; |
78 | unsigned int max_ratio, max_prop_frac; | 85 | unsigned int max_ratio, max_prop_frac; |
79 | 86 | ||
80 | struct bdi_writeback wb; /* default writeback info for this bdi */ | 87 | struct bdi_writeback wb; /* default writeback info for this bdi */ |
81 | spinlock_t wb_lock; /* protects work_list */ | 88 | spinlock_t wb_lock; /* protects work_list */ |
82 | 89 | ||
83 | struct list_head work_list; | 90 | struct list_head work_list; |
84 | 91 | ||
85 | struct device *dev; | 92 | struct device *dev; |
86 | 93 | ||
87 | struct timer_list laptop_mode_wb_timer; | 94 | struct timer_list laptop_mode_wb_timer; |
88 | 95 | ||
89 | #ifdef CONFIG_DEBUG_FS | 96 | #ifdef CONFIG_DEBUG_FS |
90 | struct dentry *debug_dir; | 97 | struct dentry *debug_dir; |
91 | struct dentry *debug_stats; | 98 | struct dentry *debug_stats; |
92 | #endif | 99 | #endif |
93 | }; | 100 | }; |
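
The four bandwidth fields added above carry a timestamped sample (written_stamp taken at bw_time_stamp) plus a latest and a smoothed estimate. One plausible estimator over those fields, shown as a sketch only and not as the kernel's __bdi_update_bandwidth() implementation (SKETCH_HZ is an assumed jiffies rate):

    #define SKETCH_HZ 100UL                 /* assumption: jiffies per second */

    struct bw_state {
        unsigned long bw_time_stamp;        /* jiffies of last sample     */
        unsigned long written_stamp;        /* pages written at that time */
        unsigned long write_bandwidth;      /* latest estimate, pages/s   */
        unsigned long avg_write_bandwidth;  /* smoothed estimate          */
    };

    static void update_bandwidth(struct bw_state *s, unsigned long now,
                                 unsigned long written)
    {
        unsigned long elapsed = now - s->bw_time_stamp;
        unsigned long bw;

        if (elapsed < SKETCH_HZ / 10)       /* sample window too short */
            return;
        bw = (written - s->written_stamp) * SKETCH_HZ / elapsed;
        s->write_bandwidth = bw;
        /* 7/8 old + 1/8 new: a cheap exponential moving average */
        s->avg_write_bandwidth = (s->avg_write_bandwidth * 7 + bw) / 8;
        s->written_stamp = written;
        s->bw_time_stamp = now;
    }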
94 | 101 | ||
95 | int bdi_init(struct backing_dev_info *bdi); | 102 | int bdi_init(struct backing_dev_info *bdi); |
96 | void bdi_destroy(struct backing_dev_info *bdi); | 103 | void bdi_destroy(struct backing_dev_info *bdi); |
97 | 104 | ||
98 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, | 105 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, |
99 | const char *fmt, ...); | 106 | const char *fmt, ...); |
100 | int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); | 107 | int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); |
101 | void bdi_unregister(struct backing_dev_info *bdi); | 108 | void bdi_unregister(struct backing_dev_info *bdi); |
102 | int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); | 109 | int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); |
103 | void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); | 110 | void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); |
104 | void bdi_start_background_writeback(struct backing_dev_info *bdi); | 111 | void bdi_start_background_writeback(struct backing_dev_info *bdi); |
105 | int bdi_writeback_thread(void *data); | 112 | int bdi_writeback_thread(void *data); |
106 | int bdi_has_dirty_io(struct backing_dev_info *bdi); | 113 | int bdi_has_dirty_io(struct backing_dev_info *bdi); |
107 | void bdi_arm_supers_timer(void); | 114 | void bdi_arm_supers_timer(void); |
108 | void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); | 115 | void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); |
116 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); | ||
109 | 117 | ||
110 | extern spinlock_t bdi_lock; | 118 | extern spinlock_t bdi_lock; |
111 | extern struct list_head bdi_list; | 119 | extern struct list_head bdi_list; |
112 | extern struct list_head bdi_pending_list; | 120 | extern struct list_head bdi_pending_list; |
113 | 121 | ||
114 | static inline int wb_has_dirty_io(struct bdi_writeback *wb) | 122 | static inline int wb_has_dirty_io(struct bdi_writeback *wb) |
115 | { | 123 | { |
116 | return !list_empty(&wb->b_dirty) || | 124 | return !list_empty(&wb->b_dirty) || |
117 | !list_empty(&wb->b_io) || | 125 | !list_empty(&wb->b_io) || |
118 | !list_empty(&wb->b_more_io); | 126 | !list_empty(&wb->b_more_io); |
119 | } | 127 | } |
120 | 128 | ||
121 | static inline void __add_bdi_stat(struct backing_dev_info *bdi, | 129 | static inline void __add_bdi_stat(struct backing_dev_info *bdi, |
122 | enum bdi_stat_item item, s64 amount) | 130 | enum bdi_stat_item item, s64 amount) |
123 | { | 131 | { |
124 | __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); | 132 | __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); |
125 | } | 133 | } |
126 | 134 | ||
127 | static inline void __inc_bdi_stat(struct backing_dev_info *bdi, | 135 | static inline void __inc_bdi_stat(struct backing_dev_info *bdi, |
128 | enum bdi_stat_item item) | 136 | enum bdi_stat_item item) |
129 | { | 137 | { |
130 | __add_bdi_stat(bdi, item, 1); | 138 | __add_bdi_stat(bdi, item, 1); |
131 | } | 139 | } |
132 | 140 | ||
133 | static inline void inc_bdi_stat(struct backing_dev_info *bdi, | 141 | static inline void inc_bdi_stat(struct backing_dev_info *bdi, |
134 | enum bdi_stat_item item) | 142 | enum bdi_stat_item item) |
135 | { | 143 | { |
136 | unsigned long flags; | 144 | unsigned long flags; |
137 | 145 | ||
138 | local_irq_save(flags); | 146 | local_irq_save(flags); |
139 | __inc_bdi_stat(bdi, item); | 147 | __inc_bdi_stat(bdi, item); |
140 | local_irq_restore(flags); | 148 | local_irq_restore(flags); |
141 | } | 149 | } |
142 | 150 | ||
143 | static inline void __dec_bdi_stat(struct backing_dev_info *bdi, | 151 | static inline void __dec_bdi_stat(struct backing_dev_info *bdi, |
144 | enum bdi_stat_item item) | 152 | enum bdi_stat_item item) |
145 | { | 153 | { |
146 | __add_bdi_stat(bdi, item, -1); | 154 | __add_bdi_stat(bdi, item, -1); |
147 | } | 155 | } |
148 | 156 | ||
149 | static inline void dec_bdi_stat(struct backing_dev_info *bdi, | 157 | static inline void dec_bdi_stat(struct backing_dev_info *bdi, |
150 | enum bdi_stat_item item) | 158 | enum bdi_stat_item item) |
151 | { | 159 | { |
152 | unsigned long flags; | 160 | unsigned long flags; |
153 | 161 | ||
154 | local_irq_save(flags); | 162 | local_irq_save(flags); |
155 | __dec_bdi_stat(bdi, item); | 163 | __dec_bdi_stat(bdi, item); |
156 | local_irq_restore(flags); | 164 | local_irq_restore(flags); |
157 | } | 165 | } |
158 | 166 | ||
159 | static inline s64 bdi_stat(struct backing_dev_info *bdi, | 167 | static inline s64 bdi_stat(struct backing_dev_info *bdi, |
160 | enum bdi_stat_item item) | 168 | enum bdi_stat_item item) |
161 | { | 169 | { |
162 | return percpu_counter_read_positive(&bdi->bdi_stat[item]); | 170 | return percpu_counter_read_positive(&bdi->bdi_stat[item]); |
163 | } | 171 | } |
164 | 172 | ||
165 | static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, | 173 | static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, |
166 | enum bdi_stat_item item) | 174 | enum bdi_stat_item item) |
167 | { | 175 | { |
168 | return percpu_counter_sum_positive(&bdi->bdi_stat[item]); | 176 | return percpu_counter_sum_positive(&bdi->bdi_stat[item]); |
169 | } | 177 | } |
170 | 178 | ||
171 | static inline s64 bdi_stat_sum(struct backing_dev_info *bdi, | 179 | static inline s64 bdi_stat_sum(struct backing_dev_info *bdi, |
172 | enum bdi_stat_item item) | 180 | enum bdi_stat_item item) |
173 | { | 181 | { |
174 | s64 sum; | 182 | s64 sum; |
175 | unsigned long flags; | 183 | unsigned long flags; |
176 | 184 | ||
177 | local_irq_save(flags); | 185 | local_irq_save(flags); |
178 | sum = __bdi_stat_sum(bdi, item); | 186 | sum = __bdi_stat_sum(bdi, item); |
179 | local_irq_restore(flags); | 187 | local_irq_restore(flags); |
180 | 188 | ||
181 | return sum; | 189 | return sum; |
182 | } | 190 | } |
183 | 191 | ||
184 | extern void bdi_writeout_inc(struct backing_dev_info *bdi); | 192 | extern void bdi_writeout_inc(struct backing_dev_info *bdi); |
185 | 193 | ||
186 | /* | 194 | /* |
187 | * maximal error of a stat counter. | 195 | * maximal error of a stat counter. |
188 | */ | 196 | */ |
189 | static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) | 197 | static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) |
190 | { | 198 | { |
191 | #ifdef CONFIG_SMP | 199 | #ifdef CONFIG_SMP |
192 | return nr_cpu_ids * BDI_STAT_BATCH; | 200 | return nr_cpu_ids * BDI_STAT_BATCH; |
193 | #else | 201 | #else |
194 | return 1; | 202 | return 1; |
195 | #endif | 203 | #endif |
196 | } | 204 | } |
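
Because bdi_stat() reads the percpu counter without folding in every CPU's unbatched local delta, it can lag the true value by up to nr_cpu_ids * BDI_STAT_BATCH. Assuming 8 possible CPUs: BDI_STAT_BATCH = 8 * (1 + ilog2(8)) = 32, so the maximal error is 8 * 32 = 256 pages, about 1 MiB with 4 KiB pages. A standalone check:

    #include <stdio.h>

    static int ilog2_ul(unsigned long x)    /* floor(log2(x)), x > 0 */
    {
        int r = -1;

        while (x) {
            x >>= 1;
            r++;
        }
        return r;
    }

    int main(void)
    {
        unsigned long nr_cpu_ids = 8;       /* assumption */
        unsigned long batch = 8 * (1 + ilog2_ul(nr_cpu_ids));

        printf("batch=%lu max_error=%lu pages\n", batch, nr_cpu_ids * batch);
        return 0;                           /* batch=32 max_error=256 pages */
    }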
197 | 205 | ||
198 | int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); | 206 | int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); |
199 | int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); | 207 | int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); |
200 | 208 | ||
201 | /* | 209 | /* |
202 | * Flags in backing_dev_info::capabilities | 210 | * Flags in backing_dev_info::capabilities |
203 | * | 211 | * |
204 | * The first three flags control whether dirty pages will contribute to the | 212 | * The first three flags control whether dirty pages will contribute to the |
205 | * VM's accounting and whether writepages() should be called for dirty pages | 213 | * VM's accounting and whether writepages() should be called for dirty pages |
206 | * (something that would not, for example, be appropriate for ramfs) | 214 | * (something that would not, for example, be appropriate for ramfs) |
207 | * | 215 | * |
208 | * WARNING: these flags are closely related and should not normally be | 216 | * WARNING: these flags are closely related and should not normally be |
209 | * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these | 217 | * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these |
210 | * three flags into a single convenience macro. | 218 | * three flags into a single convenience macro. |
211 | * | 219 | * |
212 | * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting | 220 | * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting |
213 | * BDI_CAP_NO_WRITEBACK: Don't write pages back | 221 | * BDI_CAP_NO_WRITEBACK: Don't write pages back |
214 | * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages | 222 | * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages |
215 | * | 223 | * |
216 | * These flags let !MMU mmap() govern direct device mapping vs immediate | 224 | * These flags let !MMU mmap() govern direct device mapping vs immediate |
217 | * copying more easily for MAP_PRIVATE, especially for ROM filesystems. | 225 | * copying more easily for MAP_PRIVATE, especially for ROM filesystems. |
218 | * | 226 | * |
219 | * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE) | 227 | * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE) |
220 | * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED) | 228 | * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED) |
221 | * BDI_CAP_READ_MAP: Can be mapped for reading | 229 | * BDI_CAP_READ_MAP: Can be mapped for reading |
222 | * BDI_CAP_WRITE_MAP: Can be mapped for writing | 230 | * BDI_CAP_WRITE_MAP: Can be mapped for writing |
223 | * BDI_CAP_EXEC_MAP: Can be mapped for execution | 231 | * BDI_CAP_EXEC_MAP: Can be mapped for execution |
224 | * | 232 | * |
225 | * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. | 233 | * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. |
226 | */ | 234 | */ |
227 | #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 | 235 | #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 |
228 | #define BDI_CAP_NO_WRITEBACK 0x00000002 | 236 | #define BDI_CAP_NO_WRITEBACK 0x00000002 |
229 | #define BDI_CAP_MAP_COPY 0x00000004 | 237 | #define BDI_CAP_MAP_COPY 0x00000004 |
230 | #define BDI_CAP_MAP_DIRECT 0x00000008 | 238 | #define BDI_CAP_MAP_DIRECT 0x00000008 |
231 | #define BDI_CAP_READ_MAP 0x00000010 | 239 | #define BDI_CAP_READ_MAP 0x00000010 |
232 | #define BDI_CAP_WRITE_MAP 0x00000020 | 240 | #define BDI_CAP_WRITE_MAP 0x00000020 |
233 | #define BDI_CAP_EXEC_MAP 0x00000040 | 241 | #define BDI_CAP_EXEC_MAP 0x00000040 |
234 | #define BDI_CAP_NO_ACCT_WB 0x00000080 | 242 | #define BDI_CAP_NO_ACCT_WB 0x00000080 |
235 | #define BDI_CAP_SWAP_BACKED 0x00000100 | 243 | #define BDI_CAP_SWAP_BACKED 0x00000100 |
236 | 244 | ||
237 | #define BDI_CAP_VMFLAGS \ | 245 | #define BDI_CAP_VMFLAGS \ |
238 | (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) | 246 | (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) |
239 | 247 | ||
240 | #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ | 248 | #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ |
241 | (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) | 249 | (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) |
242 | 250 | ||
243 | #if defined(VM_MAYREAD) && \ | 251 | #if defined(VM_MAYREAD) && \ |
244 | (BDI_CAP_READ_MAP != VM_MAYREAD || \ | 252 | (BDI_CAP_READ_MAP != VM_MAYREAD || \ |
245 | BDI_CAP_WRITE_MAP != VM_MAYWRITE || \ | 253 | BDI_CAP_WRITE_MAP != VM_MAYWRITE || \ |
246 | BDI_CAP_EXEC_MAP != VM_MAYEXEC) | 254 | BDI_CAP_EXEC_MAP != VM_MAYEXEC) |
247 | #error please change backing_dev_info::capabilities flags | 255 | #error please change backing_dev_info::capabilities flags |
248 | #endif | 256 | #endif |
249 | 257 | ||
250 | extern struct backing_dev_info default_backing_dev_info; | 258 | extern struct backing_dev_info default_backing_dev_info; |
251 | extern struct backing_dev_info noop_backing_dev_info; | 259 | extern struct backing_dev_info noop_backing_dev_info; |
252 | 260 | ||
253 | int writeback_in_progress(struct backing_dev_info *bdi); | 261 | int writeback_in_progress(struct backing_dev_info *bdi); |
254 | 262 | ||
255 | static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) | 263 | static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) |
256 | { | 264 | { |
257 | if (bdi->congested_fn) | 265 | if (bdi->congested_fn) |
258 | return bdi->congested_fn(bdi->congested_data, bdi_bits); | 266 | return bdi->congested_fn(bdi->congested_data, bdi_bits); |
259 | return (bdi->state & bdi_bits); | 267 | return (bdi->state & bdi_bits); |
260 | } | 268 | } |
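
Stacked drivers such as md/dm have no single request queue to report on, so they install a congested_fn that receives congested_data plus the queried bits and returns the congested subset. A hypothetical callback for a two-leg mirror (struct my_mirror and the "either leg congested" policy are illustrative assumptions, not taken from md/dm):

    struct my_mirror {
        struct backing_dev_info *leg[2];
    };

    static int my_mirror_congested(void *congested_data, int bdi_bits)
    {
        struct my_mirror *m = congested_data;

        return bdi_congested(m->leg[0], bdi_bits) |
               bdi_congested(m->leg[1], bdi_bits);
    }
    /* installed as:  bdi->congested_fn   = my_mirror_congested;
     *                bdi->congested_data = m;                    */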
261 | 269 | ||
262 | static inline int bdi_read_congested(struct backing_dev_info *bdi) | 270 | static inline int bdi_read_congested(struct backing_dev_info *bdi) |
263 | { | 271 | { |
264 | return bdi_congested(bdi, 1 << BDI_sync_congested); | 272 | return bdi_congested(bdi, 1 << BDI_sync_congested); |
265 | } | 273 | } |
266 | 274 | ||
267 | static inline int bdi_write_congested(struct backing_dev_info *bdi) | 275 | static inline int bdi_write_congested(struct backing_dev_info *bdi) |
268 | { | 276 | { |
269 | return bdi_congested(bdi, 1 << BDI_async_congested); | 277 | return bdi_congested(bdi, 1 << BDI_async_congested); |
270 | } | 278 | } |
271 | 279 | ||
272 | static inline int bdi_rw_congested(struct backing_dev_info *bdi) | 280 | static inline int bdi_rw_congested(struct backing_dev_info *bdi) |
273 | { | 281 | { |
274 | return bdi_congested(bdi, (1 << BDI_sync_congested) | | 282 | return bdi_congested(bdi, (1 << BDI_sync_congested) | |
275 | (1 << BDI_async_congested)); | 283 | (1 << BDI_async_congested)); |
276 | } | 284 | } |
277 | 285 | ||
278 | enum { | 286 | enum { |
279 | BLK_RW_ASYNC = 0, | 287 | BLK_RW_ASYNC = 0, |
280 | BLK_RW_SYNC = 1, | 288 | BLK_RW_SYNC = 1, |
281 | }; | 289 | }; |
282 | 290 | ||
283 | void clear_bdi_congested(struct backing_dev_info *bdi, int sync); | 291 | void clear_bdi_congested(struct backing_dev_info *bdi, int sync); |
284 | void set_bdi_congested(struct backing_dev_info *bdi, int sync); | 292 | void set_bdi_congested(struct backing_dev_info *bdi, int sync); |
285 | long congestion_wait(int sync, long timeout); | 293 | long congestion_wait(int sync, long timeout); |
286 | long wait_iff_congested(struct zone *zone, int sync, long timeout); | 294 | long wait_iff_congested(struct zone *zone, int sync, long timeout); |
287 | 295 | ||
288 | static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) | 296 | static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) |
289 | { | 297 | { |
290 | return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK); | 298 | return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK); |
291 | } | 299 | } |
292 | 300 | ||
293 | static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi) | 301 | static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi) |
294 | { | 302 | { |
295 | return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY); | 303 | return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY); |
296 | } | 304 | } |
297 | 305 | ||
298 | static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) | 306 | static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) |
299 | { | 307 | { |
300 | /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */ | 308 | /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */ |
301 | return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB | | 309 | return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB | |
302 | BDI_CAP_NO_WRITEBACK)); | 310 | BDI_CAP_NO_WRITEBACK)); |
303 | } | 311 | } |
304 | 312 | ||
305 | static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) | 313 | static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) |
306 | { | 314 | { |
307 | return bdi->capabilities & BDI_CAP_SWAP_BACKED; | 315 | return bdi->capabilities & BDI_CAP_SWAP_BACKED; |
308 | } | 316 | } |
309 | 317 | ||
310 | static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) | 318 | static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) |
311 | { | 319 | { |
312 | return bdi == &default_backing_dev_info; | 320 | return bdi == &default_backing_dev_info; |
313 | } | 321 | } |
314 | 322 | ||
315 | static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) | 323 | static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) |
316 | { | 324 | { |
317 | return bdi_cap_writeback_dirty(mapping->backing_dev_info); | 325 | return bdi_cap_writeback_dirty(mapping->backing_dev_info); |
318 | } | 326 | } |
319 | 327 | ||
320 | static inline bool mapping_cap_account_dirty(struct address_space *mapping) | 328 | static inline bool mapping_cap_account_dirty(struct address_space *mapping) |
321 | { | 329 | { |
322 | return bdi_cap_account_dirty(mapping->backing_dev_info); | 330 | return bdi_cap_account_dirty(mapping->backing_dev_info); |
323 | } | 331 | } |
324 | 332 | ||
325 | static inline bool mapping_cap_swap_backed(struct address_space *mapping) | 333 | static inline bool mapping_cap_swap_backed(struct address_space *mapping) |
326 | { | 334 | { |
327 | return bdi_cap_swap_backed(mapping->backing_dev_info); | 335 | return bdi_cap_swap_backed(mapping->backing_dev_info); |
328 | } | 336 | } |
329 | 337 | ||
330 | static inline int bdi_sched_wait(void *word) | 338 | static inline int bdi_sched_wait(void *word) |
331 | { | 339 | { |
332 | schedule(); | 340 | schedule(); |
333 | return 0; | 341 | return 0; |
334 | } | 342 | } |
335 | 343 | ||
336 | #endif /* _LINUX_BACKING_DEV_H */ | 344 | #endif /* _LINUX_BACKING_DEV_H */ |
337 | 345 |
include/linux/writeback.h
1 | /* | 1 | /* |
2 | * include/linux/writeback.h | 2 | * include/linux/writeback.h |
3 | */ | 3 | */ |
4 | #ifndef WRITEBACK_H | 4 | #ifndef WRITEBACK_H |
5 | #define WRITEBACK_H | 5 | #define WRITEBACK_H |
6 | 6 | ||
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | #include <linux/fs.h> | 8 | #include <linux/fs.h> |
9 | 9 | ||
10 | struct backing_dev_info; | 10 | /* |
11 | * The 1/4 region under the global dirty thresh is for smooth dirty throttling: | ||
12 | * | ||
13 | * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) | ||
14 | * | ||
15 | * The 1/16 region above the global dirty limit will be put to maximum pauses: | ||
16 | * | ||
17 | * (limit, limit + limit/DIRTY_MAXPAUSE_AREA) | ||
18 | * | ||
19 | * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put | ||
20 | * to loops: | ||
21 | * | ||
22 | * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA) | ||
23 | * | ||
24 | * Further beyond, all dirtier tasks will enter a loop, waiting (possibly for a | ||
25 | * long time) for the dirty pages to drop, unless they have written enough pages. | ||
26 | * | ||
27 | * The global dirty threshold is normally equal to the global dirty limit, | ||
28 | * except when the system suddenly allocates a lot of anonymous memory and | ||
29 | * knocks down the global dirty threshold quickly, in which case the global | ||
30 | * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. | ||
31 | */ | ||
32 | #define DIRTY_SCOPE 8 | ||
33 | #define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) | ||
34 | #define DIRTY_MAXPAUSE_AREA 16 | ||
35 | #define DIRTY_PASSGOOD_AREA 8 | ||
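
Plugging a hypothetical limit of 1600 pages (with thresh == limit) into the constants above: the throttling region is (1600 - 1600/4, 1600) = (1200, 1600), the max-pause region is (1600, 1600 + 1600/16) = (1600, 1700), and the pass-good region is (1700, 1600 + 1600/8) = (1700, 1800); beyond 1800 pages every dirtier waits.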
11 | 36 | ||
12 | extern spinlock_t inode_wb_list_lock; | 37 | /* |
38 | * 4MB minimal write chunk size | ||
39 | */ | ||
40 | #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) | ||
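
Worked check, assuming 4 KiB pages (PAGE_CACHE_SHIFT = 12): 4096UL >> (12 - 10) = 1024 pages, and 1024 pages * 4 KiB = 4 MiB, matching the comment. Expressing the constant as 4096 KiB scaled by the page size keeps the chunk at 4 MiB on architectures with larger pages.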
13 | 41 | ||
42 | struct backing_dev_info; | ||
43 | |||
14 | /* | 44 | /* |
15 | * fs/fs-writeback.c | 45 | * fs/fs-writeback.c |
16 | */ | 46 | */ |
17 | enum writeback_sync_modes { | 47 | enum writeback_sync_modes { |
18 | WB_SYNC_NONE, /* Don't wait on anything */ | 48 | WB_SYNC_NONE, /* Don't wait on anything */ |
19 | WB_SYNC_ALL, /* Wait on every mapping */ | 49 | WB_SYNC_ALL, /* Wait on every mapping */ |
20 | }; | 50 | }; |
21 | 51 | ||
22 | /* | 52 | /* |
23 | * A control structure which tells the writeback code what to do. These are | 53 | * A control structure which tells the writeback code what to do. These are |
24 | * always on the stack, and hence need no locking. They are always initialised | 54 | * always on the stack, and hence need no locking. They are always initialised |
25 | * in a manner such that unspecified fields are set to zero. | 55 | * in a manner such that unspecified fields are set to zero. |
26 | */ | 56 | */ |
27 | struct writeback_control { | 57 | struct writeback_control { |
28 | enum writeback_sync_modes sync_mode; | 58 | enum writeback_sync_modes sync_mode; |
29 | unsigned long *older_than_this; /* If !NULL, only write back inodes | ||
30 | older than this */ | ||
31 | unsigned long wb_start; /* Time writeback_inodes_wb was | ||
32 | called. This is needed to avoid | ||
33 | extra jobs and livelock */ | ||
34 | long nr_to_write; /* Write this many pages, and decrement | 59 | long nr_to_write; /* Write this many pages, and decrement |
35 | this for each page written */ | 60 | this for each page written */ |
36 | long pages_skipped; /* Pages which were not written */ | 61 | long pages_skipped; /* Pages which were not written */ |
37 | 62 | ||
38 | /* | 63 | /* |
39 | * For a_ops->writepages(): if start or end is non-zero then this is | 64 | * For a_ops->writepages(): if start or end is non-zero then this is |
40 | * a hint that the filesystem need only write out the pages inside that | 65 | * a hint that the filesystem need only write out the pages inside that |
41 | * byterange. The byte at `end' is included in the writeout request. | 66 | * byterange. The byte at `end' is included in the writeout request. |
42 | */ | 67 | */ |
43 | loff_t range_start; | 68 | loff_t range_start; |
44 | loff_t range_end; | 69 | loff_t range_end; |
45 | 70 | ||
46 | unsigned nonblocking:1; /* Don't get stuck on request queues */ | ||
47 | unsigned encountered_congestion:1; /* An output: a queue is full */ | ||
48 | unsigned for_kupdate:1; /* A kupdate writeback */ | 71 | unsigned for_kupdate:1; /* A kupdate writeback */ |
49 | unsigned for_background:1; /* A background writeback */ | 72 | unsigned for_background:1; /* A background writeback */ |
73 | unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ | ||
50 | unsigned for_reclaim:1; /* Invoked from the page allocator */ | 74 | unsigned for_reclaim:1; /* Invoked from the page allocator */ |
51 | unsigned range_cyclic:1; /* range_start is cyclic */ | 75 | unsigned range_cyclic:1; /* range_start is cyclic */ |
52 | unsigned more_io:1; /* more io to be dispatched */ | ||
53 | }; | 76 | }; |
54 | 77 | ||
55 | /* | 78 | /* |
56 | * fs/fs-writeback.c | 79 | * fs/fs-writeback.c |
57 | */ | 80 | */ |
58 | struct bdi_writeback; | 81 | struct bdi_writeback; |
59 | int inode_wait(void *); | 82 | int inode_wait(void *); |
60 | void writeback_inodes_sb(struct super_block *); | 83 | void writeback_inodes_sb(struct super_block *); |
61 | void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); | 84 | void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); |
62 | int writeback_inodes_sb_if_idle(struct super_block *); | 85 | int writeback_inodes_sb_if_idle(struct super_block *); |
63 | int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); | 86 | int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); |
64 | void sync_inodes_sb(struct super_block *); | 87 | void sync_inodes_sb(struct super_block *); |
65 | void writeback_inodes_wb(struct bdi_writeback *wb, | 88 | long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); |
66 | struct writeback_control *wbc); | ||
67 | long wb_do_writeback(struct bdi_writeback *wb, int force_wait); | 89 | long wb_do_writeback(struct bdi_writeback *wb, int force_wait); |
68 | void wakeup_flusher_threads(long nr_pages); | 90 | void wakeup_flusher_threads(long nr_pages); |
69 | 91 | ||
70 | /* writeback.h requires fs.h; it, too, is not included from here. */ | 92 | /* writeback.h requires fs.h; it, too, is not included from here. */ |
71 | static inline void wait_on_inode(struct inode *inode) | 93 | static inline void wait_on_inode(struct inode *inode) |
72 | { | 94 | { |
73 | might_sleep(); | 95 | might_sleep(); |
74 | wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); | 96 | wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); |
75 | } | 97 | } |
76 | static inline void inode_sync_wait(struct inode *inode) | 98 | static inline void inode_sync_wait(struct inode *inode) |
77 | { | 99 | { |
78 | might_sleep(); | 100 | might_sleep(); |
79 | wait_on_bit(&inode->i_state, __I_SYNC, inode_wait, | 101 | wait_on_bit(&inode->i_state, __I_SYNC, inode_wait, |
80 | TASK_UNINTERRUPTIBLE); | 102 | TASK_UNINTERRUPTIBLE); |
81 | } | 103 | } |
82 | 104 | ||
83 | 105 | ||
84 | /* | 106 | /* |
85 | * mm/page-writeback.c | 107 | * mm/page-writeback.c |
86 | */ | 108 | */ |
87 | #ifdef CONFIG_BLOCK | 109 | #ifdef CONFIG_BLOCK |
88 | void laptop_io_completion(struct backing_dev_info *info); | 110 | void laptop_io_completion(struct backing_dev_info *info); |
89 | void laptop_sync_completion(void); | 111 | void laptop_sync_completion(void); |
90 | void laptop_mode_sync(struct work_struct *work); | 112 | void laptop_mode_sync(struct work_struct *work); |
91 | void laptop_mode_timer_fn(unsigned long data); | 113 | void laptop_mode_timer_fn(unsigned long data); |
92 | #else | 114 | #else |
93 | static inline void laptop_sync_completion(void) { } | 115 | static inline void laptop_sync_completion(void) { } |
94 | #endif | 116 | #endif |
95 | void throttle_vm_writeout(gfp_t gfp_mask); | 117 | void throttle_vm_writeout(gfp_t gfp_mask); |
96 | 118 | ||
119 | extern unsigned long global_dirty_limit; | ||
120 | |||
97 | /* These are exported to sysctl. */ | 121 | /* These are exported to sysctl. */ |
98 | extern int dirty_background_ratio; | 122 | extern int dirty_background_ratio; |
99 | extern unsigned long dirty_background_bytes; | 123 | extern unsigned long dirty_background_bytes; |
100 | extern int vm_dirty_ratio; | 124 | extern int vm_dirty_ratio; |
101 | extern unsigned long vm_dirty_bytes; | 125 | extern unsigned long vm_dirty_bytes; |
102 | extern unsigned int dirty_writeback_interval; | 126 | extern unsigned int dirty_writeback_interval; |
103 | extern unsigned int dirty_expire_interval; | 127 | extern unsigned int dirty_expire_interval; |
104 | extern int vm_highmem_is_dirtyable; | 128 | extern int vm_highmem_is_dirtyable; |
105 | extern int block_dump; | 129 | extern int block_dump; |
106 | extern int laptop_mode; | 130 | extern int laptop_mode; |
107 | 131 | ||
108 | extern unsigned long determine_dirtyable_memory(void); | 132 | extern unsigned long determine_dirtyable_memory(void); |
109 | 133 | ||
110 | extern int dirty_background_ratio_handler(struct ctl_table *table, int write, | 134 | extern int dirty_background_ratio_handler(struct ctl_table *table, int write, |
111 | void __user *buffer, size_t *lenp, | 135 | void __user *buffer, size_t *lenp, |
112 | loff_t *ppos); | 136 | loff_t *ppos); |
113 | extern int dirty_background_bytes_handler(struct ctl_table *table, int write, | 137 | extern int dirty_background_bytes_handler(struct ctl_table *table, int write, |
114 | void __user *buffer, size_t *lenp, | 138 | void __user *buffer, size_t *lenp, |
115 | loff_t *ppos); | 139 | loff_t *ppos); |
116 | extern int dirty_ratio_handler(struct ctl_table *table, int write, | 140 | extern int dirty_ratio_handler(struct ctl_table *table, int write, |
117 | void __user *buffer, size_t *lenp, | 141 | void __user *buffer, size_t *lenp, |
118 | loff_t *ppos); | 142 | loff_t *ppos); |
119 | extern int dirty_bytes_handler(struct ctl_table *table, int write, | 143 | extern int dirty_bytes_handler(struct ctl_table *table, int write, |
120 | void __user *buffer, size_t *lenp, | 144 | void __user *buffer, size_t *lenp, |
121 | loff_t *ppos); | 145 | loff_t *ppos); |
122 | 146 | ||
123 | struct ctl_table; | 147 | struct ctl_table; |
124 | int dirty_writeback_centisecs_handler(struct ctl_table *, int, | 148 | int dirty_writeback_centisecs_handler(struct ctl_table *, int, |
125 | void __user *, size_t *, loff_t *); | 149 | void __user *, size_t *, loff_t *); |
126 | 150 | ||
127 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); | 151 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); |
128 | unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, | 152 | unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, |
129 | unsigned long dirty); | 153 | unsigned long dirty); |
154 | |||
155 | void __bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
156 | unsigned long thresh, | ||
157 | unsigned long dirty, | ||
158 | unsigned long bdi_thresh, | ||
159 | unsigned long bdi_dirty, | ||
160 | unsigned long start_time); | ||
130 | 161 | ||
131 | void page_writeback_init(void); | 162 | void page_writeback_init(void); |
132 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 163 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
133 | unsigned long nr_pages_dirtied); | 164 | unsigned long nr_pages_dirtied); |
134 | 165 | ||
135 | static inline void | 166 | static inline void |
136 | balance_dirty_pages_ratelimited(struct address_space *mapping) | 167 | balance_dirty_pages_ratelimited(struct address_space *mapping) |
137 | { | 168 | { |
138 | balance_dirty_pages_ratelimited_nr(mapping, 1); | 169 | balance_dirty_pages_ratelimited_nr(mapping, 1); |
139 | } | 170 | } |
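
The inline wrapper covers the common one-page-at-a-time case; a writer that dirties a batch should pass the true count so the per-task ratelimit accounting sees the full volume. A hedged usage sketch (dirty_many() is hypothetical, not a kernel function):

    static void dirty_many(struct address_space *mapping, unsigned long nr)
    {
        /* ... mark nr pages of mapping dirty ... */
        balance_dirty_pages_ratelimited_nr(mapping, nr);  /* one batched call */
    }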
140 | 171 | ||
141 | typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, | 172 | typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, |
142 | void *data); | 173 | void *data); |
143 | 174 | ||
144 | int generic_writepages(struct address_space *mapping, | 175 | int generic_writepages(struct address_space *mapping, |
145 | struct writeback_control *wbc); | 176 | struct writeback_control *wbc); |
146 | void tag_pages_for_writeback(struct address_space *mapping, | 177 | void tag_pages_for_writeback(struct address_space *mapping, |
147 | pgoff_t start, pgoff_t end); | 178 | pgoff_t start, pgoff_t end); |
148 | int write_cache_pages(struct address_space *mapping, | 179 | int write_cache_pages(struct address_space *mapping, |
149 | struct writeback_control *wbc, writepage_t writepage, | 180 | struct writeback_control *wbc, writepage_t writepage, |
150 | void *data); | 181 | void *data); |
151 | int do_writepages(struct address_space *mapping, struct writeback_control *wbc); | 182 | int do_writepages(struct address_space *mapping, struct writeback_control *wbc); |
152 | void set_page_dirty_balance(struct page *page, int page_mkwrite); | 183 | void set_page_dirty_balance(struct page *page, int page_mkwrite); |
153 | void writeback_set_ratelimit(void); | 184 | void writeback_set_ratelimit(void); |
154 | void tag_pages_for_writeback(struct address_space *mapping, | 185 | void tag_pages_for_writeback(struct address_space *mapping, |
include/trace/events/btrfs.h
1 | #undef TRACE_SYSTEM | 1 | #undef TRACE_SYSTEM |
2 | #define TRACE_SYSTEM btrfs | 2 | #define TRACE_SYSTEM btrfs |
3 | 3 | ||
4 | #if !defined(_TRACE_BTRFS_H) || defined(TRACE_HEADER_MULTI_READ) | 4 | #if !defined(_TRACE_BTRFS_H) || defined(TRACE_HEADER_MULTI_READ) |
5 | #define _TRACE_BTRFS_H | 5 | #define _TRACE_BTRFS_H |
6 | 6 | ||
7 | #include <linux/writeback.h> | 7 | #include <linux/writeback.h> |
8 | #include <linux/tracepoint.h> | 8 | #include <linux/tracepoint.h> |
9 | 9 | ||
10 | struct btrfs_root; | 10 | struct btrfs_root; |
11 | struct btrfs_fs_info; | 11 | struct btrfs_fs_info; |
12 | struct btrfs_inode; | 12 | struct btrfs_inode; |
13 | struct extent_map; | 13 | struct extent_map; |
14 | struct btrfs_ordered_extent; | 14 | struct btrfs_ordered_extent; |
15 | struct btrfs_delayed_ref_node; | 15 | struct btrfs_delayed_ref_node; |
16 | struct btrfs_delayed_tree_ref; | 16 | struct btrfs_delayed_tree_ref; |
17 | struct btrfs_delayed_data_ref; | 17 | struct btrfs_delayed_data_ref; |
18 | struct btrfs_delayed_ref_head; | 18 | struct btrfs_delayed_ref_head; |
19 | struct map_lookup; | 19 | struct map_lookup; |
20 | struct extent_buffer; | 20 | struct extent_buffer; |
21 | 21 | ||
22 | #define show_ref_type(type) \ | 22 | #define show_ref_type(type) \ |
23 | __print_symbolic(type, \ | 23 | __print_symbolic(type, \ |
24 | { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ | 24 | { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ |
25 | { BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \ | 25 | { BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \ |
26 | { BTRFS_EXTENT_REF_V0_KEY, "EXTENT_REF_V0" }, \ | 26 | { BTRFS_EXTENT_REF_V0_KEY, "EXTENT_REF_V0" }, \ |
27 | { BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \ | 27 | { BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \ |
28 | { BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" }) | 28 | { BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" }) |
29 | 29 | ||
30 | #define __show_root_type(obj) \ | 30 | #define __show_root_type(obj) \ |
31 | __print_symbolic_u64(obj, \ | 31 | __print_symbolic_u64(obj, \ |
32 | { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, \ | 32 | { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, \ |
33 | { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, \ | 33 | { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, \ |
34 | { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, \ | 34 | { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, \ |
35 | { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, \ | 35 | { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, \ |
36 | { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, \ | 36 | { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, \ |
37 | { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" }, \ | 37 | { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" }, \ |
38 | { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, \ | 38 | { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, \ |
39 | { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, \ | 39 | { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, \ |
40 | { BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \ | 40 | { BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \ |
41 | { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }) | 41 | { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }) |
42 | 42 | ||
43 | #define show_root_type(obj) \ | 43 | #define show_root_type(obj) \ |
44 | obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ | 44 | obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ |
45 | (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" | 45 | (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" |
46 | 46 | ||
47 | TRACE_EVENT(btrfs_transaction_commit, | 47 | TRACE_EVENT(btrfs_transaction_commit, |
48 | 48 | ||
49 | TP_PROTO(struct btrfs_root *root), | 49 | TP_PROTO(struct btrfs_root *root), |
50 | 50 | ||
51 | TP_ARGS(root), | 51 | TP_ARGS(root), |
52 | 52 | ||
53 | TP_STRUCT__entry( | 53 | TP_STRUCT__entry( |
54 | __field( u64, generation ) | 54 | __field( u64, generation ) |
55 | __field( u64, root_objectid ) | 55 | __field( u64, root_objectid ) |
56 | ), | 56 | ), |
57 | 57 | ||
58 | TP_fast_assign( | 58 | TP_fast_assign( |
59 | __entry->generation = root->fs_info->generation; | 59 | __entry->generation = root->fs_info->generation; |
60 | __entry->root_objectid = root->root_key.objectid; | 60 | __entry->root_objectid = root->root_key.objectid; |
61 | ), | 61 | ), |
62 | 62 | ||
63 | TP_printk("root = %llu(%s), gen = %llu", | 63 | TP_printk("root = %llu(%s), gen = %llu", |
64 | show_root_type(__entry->root_objectid), | 64 | show_root_type(__entry->root_objectid), |
65 | (unsigned long long)__entry->generation) | 65 | (unsigned long long)__entry->generation) |
66 | ); | 66 | ); |
67 | 67 | ||
68 | DECLARE_EVENT_CLASS(btrfs__inode, | 68 | DECLARE_EVENT_CLASS(btrfs__inode, |
69 | 69 | ||
70 | TP_PROTO(struct inode *inode), | 70 | TP_PROTO(struct inode *inode), |
71 | 71 | ||
72 | TP_ARGS(inode), | 72 | TP_ARGS(inode), |
73 | 73 | ||
74 | TP_STRUCT__entry( | 74 | TP_STRUCT__entry( |
75 | __field( ino_t, ino ) | 75 | __field( ino_t, ino ) |
76 | __field( blkcnt_t, blocks ) | 76 | __field( blkcnt_t, blocks ) |
77 | __field( u64, disk_i_size ) | 77 | __field( u64, disk_i_size ) |
78 | __field( u64, generation ) | 78 | __field( u64, generation ) |
79 | __field( u64, last_trans ) | 79 | __field( u64, last_trans ) |
80 | __field( u64, logged_trans ) | 80 | __field( u64, logged_trans ) |
81 | __field( u64, root_objectid ) | 81 | __field( u64, root_objectid ) |
82 | ), | 82 | ), |
83 | 83 | ||
84 | TP_fast_assign( | 84 | TP_fast_assign( |
85 | __entry->ino = inode->i_ino; | 85 | __entry->ino = inode->i_ino; |
86 | __entry->blocks = inode->i_blocks; | 86 | __entry->blocks = inode->i_blocks; |
87 | __entry->disk_i_size = BTRFS_I(inode)->disk_i_size; | 87 | __entry->disk_i_size = BTRFS_I(inode)->disk_i_size; |
88 | __entry->generation = BTRFS_I(inode)->generation; | 88 | __entry->generation = BTRFS_I(inode)->generation; |
89 | __entry->last_trans = BTRFS_I(inode)->last_trans; | 89 | __entry->last_trans = BTRFS_I(inode)->last_trans; |
90 | __entry->logged_trans = BTRFS_I(inode)->logged_trans; | 90 | __entry->logged_trans = BTRFS_I(inode)->logged_trans; |
91 | __entry->root_objectid = | 91 | __entry->root_objectid = |
92 | BTRFS_I(inode)->root->root_key.objectid; | 92 | BTRFS_I(inode)->root->root_key.objectid; |
93 | ), | 93 | ), |
94 | 94 | ||
95 | TP_printk("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, " | 95 | TP_printk("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, " |
96 | "disk_i_size = %llu, last_trans = %llu, logged_trans = %llu", | 96 | "disk_i_size = %llu, last_trans = %llu, logged_trans = %llu", |
97 | show_root_type(__entry->root_objectid), | 97 | show_root_type(__entry->root_objectid), |
98 | (unsigned long long)__entry->generation, | 98 | (unsigned long long)__entry->generation, |
99 | (unsigned long)__entry->ino, | 99 | (unsigned long)__entry->ino, |
100 | (unsigned long long)__entry->blocks, | 100 | (unsigned long long)__entry->blocks, |
101 | (unsigned long long)__entry->disk_i_size, | 101 | (unsigned long long)__entry->disk_i_size, |
102 | (unsigned long long)__entry->last_trans, | 102 | (unsigned long long)__entry->last_trans, |
103 | (unsigned long long)__entry->logged_trans) | 103 | (unsigned long long)__entry->logged_trans) |
104 | ); | 104 | ); |
105 | 105 | ||
106 | DEFINE_EVENT(btrfs__inode, btrfs_inode_new, | 106 | DEFINE_EVENT(btrfs__inode, btrfs_inode_new, |
107 | 107 | ||
108 | TP_PROTO(struct inode *inode), | 108 | TP_PROTO(struct inode *inode), |
109 | 109 | ||
110 | TP_ARGS(inode) | 110 | TP_ARGS(inode) |
111 | ); | 111 | ); |
112 | 112 | ||
113 | DEFINE_EVENT(btrfs__inode, btrfs_inode_request, | 113 | DEFINE_EVENT(btrfs__inode, btrfs_inode_request, |
114 | 114 | ||
115 | TP_PROTO(struct inode *inode), | 115 | TP_PROTO(struct inode *inode), |
116 | 116 | ||
117 | TP_ARGS(inode) | 117 | TP_ARGS(inode) |
118 | ); | 118 | ); |
119 | 119 | ||
120 | DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, | 120 | DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, |
121 | 121 | ||
122 | TP_PROTO(struct inode *inode), | 122 | TP_PROTO(struct inode *inode), |
123 | 123 | ||
124 | TP_ARGS(inode) | 124 | TP_ARGS(inode) |
125 | ); | 125 | ); |
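DECLARE_EVENT_CLASS() factors the entry layout, assignment, and format string out once; each DEFINE_EVENT() then stamps out only a thin per-event registration, which is why three inode events cost little more than one. Extending the class is a single stanza; the event name below is hypothetical, purely to show the shape:

    /* hypothetical fourth consumer of the btrfs__inode class */
    DEFINE_EVENT(btrfs__inode, btrfs_inode_example,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
    );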
126 | 126 | ||
127 | #define __show_map_type(type) \ | 127 | #define __show_map_type(type) \ |
128 | __print_symbolic_u64(type, \ | 128 | __print_symbolic_u64(type, \ |
129 | { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ | 129 | { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ |
130 | { EXTENT_MAP_HOLE, "HOLE" }, \ | 130 | { EXTENT_MAP_HOLE, "HOLE" }, \ |
131 | { EXTENT_MAP_INLINE, "INLINE" }, \ | 131 | { EXTENT_MAP_INLINE, "INLINE" }, \ |
132 | { EXTENT_MAP_DELALLOC, "DELALLOC" }) | 132 | { EXTENT_MAP_DELALLOC, "DELALLOC" }) |
133 | 133 | ||
134 | #define show_map_type(type) \ | 134 | #define show_map_type(type) \ |
135 | type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type) | 135 | type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type) |
136 | 136 | ||
137 | #define show_map_flags(flag) \ | 137 | #define show_map_flags(flag) \ |
138 | __print_flags(flag, "|", \ | 138 | __print_flags(flag, "|", \ |
139 | { EXTENT_FLAG_PINNED, "PINNED" }, \ | 139 | { EXTENT_FLAG_PINNED, "PINNED" }, \ |
140 | { EXTENT_FLAG_COMPRESSED, "COMPRESSED" }, \ | 140 | { EXTENT_FLAG_COMPRESSED, "COMPRESSED" }, \ |
141 | { EXTENT_FLAG_VACANCY, "VACANCY" }, \ | 141 | { EXTENT_FLAG_VACANCY, "VACANCY" }, \ |
142 | { EXTENT_FLAG_PREALLOC, "PREALLOC" }) | 142 | { EXTENT_FLAG_PREALLOC, "PREALLOC" }) |
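Two decoders are in play in these helpers: __print_symbolic() and its __print_symbolic_u64() variant match one exact value and emit one name (the EXTENT_MAP_* sentinels are 64-bit, hence the _u64 form), while __print_flags() treats the value as a bitmask and joins the name of every set bit with the given delimiter. An illustrative decode, assuming two bits are set:

    unsigned long flags = EXTENT_FLAG_PINNED | EXTENT_FLAG_COMPRESSED;
    /* show_map_flags(flags) renders as "PINNED|COMPRESSED" */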
143 | 143 | ||
144 | TRACE_EVENT(btrfs_get_extent, | 144 | TRACE_EVENT(btrfs_get_extent, |
145 | 145 | ||
146 | TP_PROTO(struct btrfs_root *root, struct extent_map *map), | 146 | TP_PROTO(struct btrfs_root *root, struct extent_map *map), |
147 | 147 | ||
148 | TP_ARGS(root, map), | 148 | TP_ARGS(root, map), |
149 | 149 | ||
150 | TP_STRUCT__entry( | 150 | TP_STRUCT__entry( |
151 | __field( u64, root_objectid ) | 151 | __field( u64, root_objectid ) |
152 | __field( u64, start ) | 152 | __field( u64, start ) |
153 | __field( u64, len ) | 153 | __field( u64, len ) |
154 | __field( u64, orig_start ) | 154 | __field( u64, orig_start ) |
155 | __field( u64, block_start ) | 155 | __field( u64, block_start ) |
156 | __field( u64, block_len ) | 156 | __field( u64, block_len ) |
157 | __field( unsigned long, flags ) | 157 | __field( unsigned long, flags ) |
158 | __field( int, refs ) | 158 | __field( int, refs ) |
159 | __field( unsigned int, compress_type ) | 159 | __field( unsigned int, compress_type ) |
160 | ), | 160 | ), |
161 | 161 | ||
162 | TP_fast_assign( | 162 | TP_fast_assign( |
163 | __entry->root_objectid = root->root_key.objectid; | 163 | __entry->root_objectid = root->root_key.objectid; |
164 | __entry->start = map->start; | 164 | __entry->start = map->start; |
165 | __entry->len = map->len; | 165 | __entry->len = map->len; |
166 | __entry->orig_start = map->orig_start; | 166 | __entry->orig_start = map->orig_start; |
167 | __entry->block_start = map->block_start; | 167 | __entry->block_start = map->block_start; |
168 | __entry->block_len = map->block_len; | 168 | __entry->block_len = map->block_len; |
169 | __entry->flags = map->flags; | 169 | __entry->flags = map->flags; |
170 | __entry->refs = atomic_read(&map->refs); | 170 | __entry->refs = atomic_read(&map->refs); |
171 | __entry->compress_type = map->compress_type; | 171 | __entry->compress_type = map->compress_type; |
172 | ), | 172 | ), |
173 | 173 | ||
174 | TP_printk("root = %llu(%s), start = %llu, len = %llu, " | 174 | TP_printk("root = %llu(%s), start = %llu, len = %llu, " |
175 | "orig_start = %llu, block_start = %llu(%s), " | 175 | "orig_start = %llu, block_start = %llu(%s), " |
176 | "block_len = %llu, flags = %s, refs = %u, " | 176 | "block_len = %llu, flags = %s, refs = %u, " |
177 | "compress_type = %u", | 177 | "compress_type = %u", |
178 | show_root_type(__entry->root_objectid), | 178 | show_root_type(__entry->root_objectid), |
179 | (unsigned long long)__entry->start, | 179 | (unsigned long long)__entry->start, |
180 | (unsigned long long)__entry->len, | 180 | (unsigned long long)__entry->len, |
181 | (unsigned long long)__entry->orig_start, | 181 | (unsigned long long)__entry->orig_start, |
182 | show_map_type(__entry->block_start), | 182 | show_map_type(__entry->block_start), |
183 | (unsigned long long)__entry->block_len, | 183 | (unsigned long long)__entry->block_len, |
184 | show_map_flags(__entry->flags), | 184 | show_map_flags(__entry->flags), |
185 | __entry->refs, __entry->compress_type) | 185 | __entry->refs, __entry->compress_type) |
186 | ); | 186 | ); |
187 | 187 | ||
188 | #define show_ordered_flags(flags) \ | 188 | #define show_ordered_flags(flags) \ |
189 | __print_symbolic(flags, \ | 189 | __print_symbolic(flags, \ |
190 | { BTRFS_ORDERED_IO_DONE, "IO_DONE" }, \ | 190 | { BTRFS_ORDERED_IO_DONE, "IO_DONE" }, \ |
191 | { BTRFS_ORDERED_COMPLETE, "COMPLETE" }, \ | 191 | { BTRFS_ORDERED_COMPLETE, "COMPLETE" }, \ |
192 | { BTRFS_ORDERED_NOCOW, "NOCOW" }, \ | 192 | { BTRFS_ORDERED_NOCOW, "NOCOW" }, \ |
193 | { BTRFS_ORDERED_COMPRESSED, "COMPRESSED" }, \ | 193 | { BTRFS_ORDERED_COMPRESSED, "COMPRESSED" }, \ |
194 | { BTRFS_ORDERED_PREALLOC, "PREALLOC" }, \ | 194 | { BTRFS_ORDERED_PREALLOC, "PREALLOC" }, \ |
195 | { BTRFS_ORDERED_DIRECT, "DIRECT" }) | 195 | { BTRFS_ORDERED_DIRECT, "DIRECT" }) |
196 | 196 | ||
197 | DECLARE_EVENT_CLASS(btrfs__ordered_extent, | 197 | DECLARE_EVENT_CLASS(btrfs__ordered_extent, |
198 | 198 | ||
199 | TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), | 199 | TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), |
200 | 200 | ||
201 | TP_ARGS(inode, ordered), | 201 | TP_ARGS(inode, ordered), |
202 | 202 | ||
203 | TP_STRUCT__entry( | 203 | TP_STRUCT__entry( |
204 | __field( ino_t, ino ) | 204 | __field( ino_t, ino ) |
205 | __field( u64, file_offset ) | 205 | __field( u64, file_offset ) |
206 | __field( u64, start ) | 206 | __field( u64, start ) |
207 | __field( u64, len ) | 207 | __field( u64, len ) |
208 | __field( u64, disk_len ) | 208 | __field( u64, disk_len ) |
209 | __field( u64, bytes_left ) | 209 | __field( u64, bytes_left ) |
210 | __field( unsigned long, flags ) | 210 | __field( unsigned long, flags ) |
211 | __field( int, compress_type ) | 211 | __field( int, compress_type ) |
212 | __field( int, refs ) | 212 | __field( int, refs ) |
213 | __field( u64, root_objectid ) | 213 | __field( u64, root_objectid ) |
214 | ), | 214 | ), |
215 | 215 | ||
216 | TP_fast_assign( | 216 | TP_fast_assign( |
217 | __entry->ino = inode->i_ino; | 217 | __entry->ino = inode->i_ino; |
218 | __entry->file_offset = ordered->file_offset; | 218 | __entry->file_offset = ordered->file_offset; |
219 | __entry->start = ordered->start; | 219 | __entry->start = ordered->start; |
220 | __entry->len = ordered->len; | 220 | __entry->len = ordered->len; |
221 | __entry->disk_len = ordered->disk_len; | 221 | __entry->disk_len = ordered->disk_len; |
222 | __entry->bytes_left = ordered->bytes_left; | 222 | __entry->bytes_left = ordered->bytes_left; |
223 | __entry->flags = ordered->flags; | 223 | __entry->flags = ordered->flags; |
224 | __entry->compress_type = ordered->compress_type; | 224 | __entry->compress_type = ordered->compress_type; |
225 | __entry->refs = atomic_read(&ordered->refs); | 225 | __entry->refs = atomic_read(&ordered->refs); |
226 | __entry->root_objectid = | 226 | __entry->root_objectid = |
227 | BTRFS_I(inode)->root->root_key.objectid; | 227 | BTRFS_I(inode)->root->root_key.objectid; |
228 | ), | 228 | ), |
229 | 229 | ||
230 | TP_printk("root = %llu(%s), ino = %llu, file_offset = %llu, " | 230 | TP_printk("root = %llu(%s), ino = %llu, file_offset = %llu, " |
231 | "start = %llu, len = %llu, disk_len = %llu, " | 231 | "start = %llu, len = %llu, disk_len = %llu, " |
232 | "bytes_left = %llu, flags = %s, compress_type = %d, " | 232 | "bytes_left = %llu, flags = %s, compress_type = %d, " |
233 | "refs = %d", | 233 | "refs = %d", |
234 | show_root_type(__entry->root_objectid), | 234 | show_root_type(__entry->root_objectid), |
235 | (unsigned long long)__entry->ino, | 235 | (unsigned long long)__entry->ino, |
236 | (unsigned long long)__entry->file_offset, | 236 | (unsigned long long)__entry->file_offset, |
237 | (unsigned long long)__entry->start, | 237 | (unsigned long long)__entry->start, |
238 | (unsigned long long)__entry->len, | 238 | (unsigned long long)__entry->len, |
239 | (unsigned long long)__entry->disk_len, | 239 | (unsigned long long)__entry->disk_len, |
240 | (unsigned long long)__entry->bytes_left, | 240 | (unsigned long long)__entry->bytes_left, |
241 | show_ordered_flags(__entry->flags), | 241 | show_ordered_flags(__entry->flags), |
242 | __entry->compress_type, __entry->refs) | 242 | __entry->compress_type, __entry->refs) |
243 | ); | 243 | ); |
244 | 244 | ||
245 | DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, | 245 | DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, |
246 | 246 | ||
247 | TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), | 247 | TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), |
248 | 248 | ||
249 | TP_ARGS(inode, ordered) | 249 | TP_ARGS(inode, ordered) |
250 | ); | 250 | ); |
251 | 251 | ||
252 | DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, | 252 | DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, |
253 | 253 | ||
254 | TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), | 254 | TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), |
255 | 255 | ||
256 | TP_ARGS(inode, ordered) | 256 | TP_ARGS(inode, ordered) |
257 | ); | 257 | ); |
258 | 258 | ||
259 | DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, | 259 | DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, |
260 | 260 | ||
261 | TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), | 261 | TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), |
262 | 262 | ||
263 | TP_ARGS(inode, ordered) | 263 | TP_ARGS(inode, ordered) |
264 | ); | 264 | ); |
265 | 265 | ||
266 | DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, | 266 | DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, |
267 | 267 | ||
268 | TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), | 268 | TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), |
269 | 269 | ||
270 | TP_ARGS(inode, ordered) | 270 | TP_ARGS(inode, ordered) |
271 | ); | 271 | ); |
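The four events defined from btrfs__ordered_extent bracket an ordered extent's lifetime (add, start, remove, put), so enabling all four shows the path from creation to the final reference drop, with refs snapshotted via atomic_read() at each step. A hedged call-site sketch; the real hooks sit in the btrfs ordered-data code, not in this hunk:

    /* illustrative: fired when an ordered extent is queued for an inode */
    trace_btrfs_ordered_extent_add(inode, ordered);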
272 | 272 | ||
273 | DECLARE_EVENT_CLASS(btrfs__writepage, | 273 | DECLARE_EVENT_CLASS(btrfs__writepage, |
274 | 274 | ||
275 | TP_PROTO(struct page *page, struct inode *inode, | 275 | TP_PROTO(struct page *page, struct inode *inode, |
276 | struct writeback_control *wbc), | 276 | struct writeback_control *wbc), |
277 | 277 | ||
278 | TP_ARGS(page, inode, wbc), | 278 | TP_ARGS(page, inode, wbc), |
279 | 279 | ||
280 | TP_STRUCT__entry( | 280 | TP_STRUCT__entry( |
281 | __field( ino_t, ino ) | 281 | __field( ino_t, ino ) |
282 | __field( pgoff_t, index ) | 282 | __field( pgoff_t, index ) |
283 | __field( long, nr_to_write ) | 283 | __field( long, nr_to_write ) |
284 | __field( long, pages_skipped ) | 284 | __field( long, pages_skipped ) |
285 | __field( loff_t, range_start ) | 285 | __field( loff_t, range_start ) |
286 | __field( loff_t, range_end ) | 286 | __field( loff_t, range_end ) |
287 | __field( char, nonblocking ) | ||
288 | __field( char, for_kupdate ) | 287 | __field( char, for_kupdate ) |
289 | __field( char, for_reclaim ) | 288 | __field( char, for_reclaim ) |
290 | __field( char, range_cyclic ) | 289 | __field( char, range_cyclic ) |
291 | __field( pgoff_t, writeback_index ) | 290 | __field( pgoff_t, writeback_index ) |
292 | __field( u64, root_objectid ) | 291 | __field( u64, root_objectid ) |
293 | ), | 292 | ), |
294 | 293 | ||
295 | TP_fast_assign( | 294 | TP_fast_assign( |
296 | __entry->ino = inode->i_ino; | 295 | __entry->ino = inode->i_ino; |
297 | __entry->index = page->index; | 296 | __entry->index = page->index; |
298 | __entry->nr_to_write = wbc->nr_to_write; | 297 | __entry->nr_to_write = wbc->nr_to_write; |
299 | __entry->pages_skipped = wbc->pages_skipped; | 298 | __entry->pages_skipped = wbc->pages_skipped; |
300 | __entry->range_start = wbc->range_start; | 299 | __entry->range_start = wbc->range_start; |
301 | __entry->range_end = wbc->range_end; | 300 | __entry->range_end = wbc->range_end; |
302 | __entry->nonblocking = wbc->nonblocking; | ||
303 | __entry->for_kupdate = wbc->for_kupdate; | 301 | __entry->for_kupdate = wbc->for_kupdate; |
304 | __entry->for_reclaim = wbc->for_reclaim; | 302 | __entry->for_reclaim = wbc->for_reclaim; |
305 | __entry->range_cyclic = wbc->range_cyclic; | 303 | __entry->range_cyclic = wbc->range_cyclic; |
306 | __entry->writeback_index = inode->i_mapping->writeback_index; | 304 | __entry->writeback_index = inode->i_mapping->writeback_index; |
307 | __entry->root_objectid = | 305 | __entry->root_objectid = |
308 | BTRFS_I(inode)->root->root_key.objectid; | 306 | BTRFS_I(inode)->root->root_key.objectid; |
309 | ), | 307 | ), |
310 | 308 | ||
311 | TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " | 309 | TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " |
312 | "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " | 310 | "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " |
313 | "range_end = %llu, nonblocking = %d, for_kupdate = %d, " | 311 | "range_end = %llu, for_kupdate = %d, " |
314 | "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", | 312 | "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", |
315 | show_root_type(__entry->root_objectid), | 313 | show_root_type(__entry->root_objectid), |
316 | (unsigned long)__entry->ino, __entry->index, | 314 | (unsigned long)__entry->ino, __entry->index, |
317 | __entry->nr_to_write, __entry->pages_skipped, | 315 | __entry->nr_to_write, __entry->pages_skipped, |
318 | __entry->range_start, __entry->range_end, | 316 | __entry->range_start, __entry->range_end, |
319 | __entry->nonblocking, __entry->for_kupdate, | 317 | __entry->for_kupdate, |
320 | __entry->for_reclaim, __entry->range_cyclic, | 318 | __entry->for_reclaim, __entry->range_cyclic, |
321 | (unsigned long)__entry->writeback_index) | 319 | (unsigned long)__entry->writeback_index) |
322 | ); | 320 | ); |
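As the left-hand column shows, this hunk drops the nonblocking field from the btrfs__writepage class: the entry no longer captures wbc->nonblocking and the format string loses its "nonblocking = %d" item. Note also that a DEFINE_EVENT name need not carry the subsystem prefix; __extent_writepage below yields a trace___extent_writepage() inline. A sketch of its call site (location assumed, not shown in this diff):

    /* illustrative: invoked from the btrfs writepage path with the live wbc */
    trace___extent_writepage(page, inode, wbc);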
323 | 321 | ||
324 | DEFINE_EVENT(btrfs__writepage, __extent_writepage, | 322 | DEFINE_EVENT(btrfs__writepage, __extent_writepage, |
325 | 323 | ||
326 | TP_PROTO(struct page *page, struct inode *inode, | 324 | TP_PROTO(struct page *page, struct inode *inode, |
327 | struct writeback_control *wbc), | 325 | struct writeback_control *wbc), |
328 | 326 | ||
329 | TP_ARGS(page, inode, wbc) | 327 | TP_ARGS(page, inode, wbc) |
330 | ); | 328 | ); |
331 | 329 | ||
332 | TRACE_EVENT(btrfs_writepage_end_io_hook, | 330 | TRACE_EVENT(btrfs_writepage_end_io_hook, |
333 | 331 | ||
334 | TP_PROTO(struct page *page, u64 start, u64 end, int uptodate), | 332 | TP_PROTO(struct page *page, u64 start, u64 end, int uptodate), |
335 | 333 | ||
336 | TP_ARGS(page, start, end, uptodate), | 334 | TP_ARGS(page, start, end, uptodate), |
337 | 335 | ||
338 | TP_STRUCT__entry( | 336 | TP_STRUCT__entry( |
339 | __field( ino_t, ino ) | 337 | __field( ino_t, ino ) |
340 | __field( pgoff_t, index ) | 338 | __field( pgoff_t, index ) |
341 | __field( u64, start ) | 339 | __field( u64, start ) |
342 | __field( u64, end ) | 340 | __field( u64, end ) |
343 | __field( int, uptodate ) | 341 | __field( int, uptodate ) |
344 | __field( u64, root_objectid ) | 342 | __field( u64, root_objectid ) |
345 | ), | 343 | ), |
346 | 344 | ||
347 | TP_fast_assign( | 345 | TP_fast_assign( |
348 | __entry->ino = page->mapping->host->i_ino; | 346 | __entry->ino = page->mapping->host->i_ino; |
349 | __entry->index = page->index; | 347 | __entry->index = page->index; |
350 | __entry->start = start; | 348 | __entry->start = start; |
351 | __entry->end = end; | 349 | __entry->end = end; |
352 | __entry->uptodate = uptodate; | 350 | __entry->uptodate = uptodate; |
353 | __entry->root_objectid = | 351 | __entry->root_objectid = |
354 | BTRFS_I(page->mapping->host)->root->root_key.objectid; | 352 | BTRFS_I(page->mapping->host)->root->root_key.objectid; |
355 | ), | 353 | ), |
356 | 354 | ||
357 | TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, " | 355 | TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, " |
358 | "end = %llu, uptodate = %d", | 356 | "end = %llu, uptodate = %d", |
359 | show_root_type(__entry->root_objectid), | 357 | show_root_type(__entry->root_objectid), |
360 | (unsigned long)__entry->ino, (unsigned long)__entry->index, | 358 | (unsigned long)__entry->ino, (unsigned long)__entry->index, |
361 | (unsigned long long)__entry->start, | 359 | (unsigned long long)__entry->start, |
362 | (unsigned long long)__entry->end, __entry->uptodate) | 360 | (unsigned long long)__entry->end, __entry->uptodate) |
363 | ); | 361 | ); |
364 | 362 | ||
365 | TRACE_EVENT(btrfs_sync_file, | 363 | TRACE_EVENT(btrfs_sync_file, |
366 | 364 | ||
367 | TP_PROTO(struct file *file, int datasync), | 365 | TP_PROTO(struct file *file, int datasync), |
368 | 366 | ||
369 | TP_ARGS(file, datasync), | 367 | TP_ARGS(file, datasync), |
370 | 368 | ||
371 | TP_STRUCT__entry( | 369 | TP_STRUCT__entry( |
372 | __field( ino_t, ino ) | 370 | __field( ino_t, ino ) |
373 | __field( ino_t, parent ) | 371 | __field( ino_t, parent ) |
374 | __field( int, datasync ) | 372 | __field( int, datasync ) |
375 | __field( u64, root_objectid ) | 373 | __field( u64, root_objectid ) |
376 | ), | 374 | ), |
377 | 375 | ||
378 | TP_fast_assign( | 376 | TP_fast_assign( |
379 | struct dentry *dentry = file->f_path.dentry; | 377 | struct dentry *dentry = file->f_path.dentry; |
380 | struct inode *inode = dentry->d_inode; | 378 | struct inode *inode = dentry->d_inode; |
381 | 379 | ||
382 | __entry->ino = inode->i_ino; | 380 | __entry->ino = inode->i_ino; |
383 | __entry->parent = dentry->d_parent->d_inode->i_ino; | 381 | __entry->parent = dentry->d_parent->d_inode->i_ino; |
384 | __entry->datasync = datasync; | 382 | __entry->datasync = datasync; |
385 | __entry->root_objectid = | 383 | __entry->root_objectid = |
386 | BTRFS_I(inode)->root->root_key.objectid; | 384 | BTRFS_I(inode)->root->root_key.objectid; |
387 | ), | 385 | ), |
388 | 386 | ||
389 | TP_printk("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d", | 387 | TP_printk("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d", |
390 | show_root_type(__entry->root_objectid), | 388 | show_root_type(__entry->root_objectid), |
391 | (unsigned long)__entry->ino, (unsigned long)__entry->parent, | 389 | (unsigned long)__entry->ino, (unsigned long)__entry->parent, |
392 | __entry->datasync) | 390 | __entry->datasync) |
393 | ); | 391 | ); |
394 | 392 | ||
395 | TRACE_EVENT(btrfs_sync_fs, | 393 | TRACE_EVENT(btrfs_sync_fs, |
396 | 394 | ||
397 | TP_PROTO(int wait), | 395 | TP_PROTO(int wait), |
398 | 396 | ||
399 | TP_ARGS(wait), | 397 | TP_ARGS(wait), |
400 | 398 | ||
401 | TP_STRUCT__entry( | 399 | TP_STRUCT__entry( |
402 | __field( int, wait ) | 400 | __field( int, wait ) |
403 | ), | 401 | ), |
404 | 402 | ||
405 | TP_fast_assign( | 403 | TP_fast_assign( |
406 | __entry->wait = wait; | 404 | __entry->wait = wait; |
407 | ), | 405 | ), |
408 | 406 | ||
409 | TP_printk("wait = %d", __entry->wait) | 407 | TP_printk("wait = %d", __entry->wait) |
410 | ); | 408 | ); |
411 | 409 | ||
412 | #define show_ref_action(action) \ | 410 | #define show_ref_action(action) \ |
413 | __print_symbolic(action, \ | 411 | __print_symbolic(action, \ |
414 | { BTRFS_ADD_DELAYED_REF, "ADD_DELAYED_REF" }, \ | 412 | { BTRFS_ADD_DELAYED_REF, "ADD_DELAYED_REF" }, \ |
415 | { BTRFS_DROP_DELAYED_REF, "DROP_DELAYED_REF" }, \ | 413 | { BTRFS_DROP_DELAYED_REF, "DROP_DELAYED_REF" }, \ |
416 | { BTRFS_ADD_DELAYED_EXTENT, "ADD_DELAYED_EXTENT" }, \ | 414 | { BTRFS_ADD_DELAYED_EXTENT, "ADD_DELAYED_EXTENT" }, \ |
417 | { BTRFS_UPDATE_DELAYED_HEAD, "UPDATE_DELAYED_HEAD" }) | 415 | { BTRFS_UPDATE_DELAYED_HEAD, "UPDATE_DELAYED_HEAD" }) |
418 | 416 | ||
419 | 417 | ||
420 | TRACE_EVENT(btrfs_delayed_tree_ref, | 418 | TRACE_EVENT(btrfs_delayed_tree_ref, |
421 | 419 | ||
422 | TP_PROTO(struct btrfs_delayed_ref_node *ref, | 420 | TP_PROTO(struct btrfs_delayed_ref_node *ref, |
423 | struct btrfs_delayed_tree_ref *full_ref, | 421 | struct btrfs_delayed_tree_ref *full_ref, |
424 | int action), | 422 | int action), |
425 | 423 | ||
426 | TP_ARGS(ref, full_ref, action), | 424 | TP_ARGS(ref, full_ref, action), |
427 | 425 | ||
428 | TP_STRUCT__entry( | 426 | TP_STRUCT__entry( |
429 | __field( u64, bytenr ) | 427 | __field( u64, bytenr ) |
430 | __field( u64, num_bytes ) | 428 | __field( u64, num_bytes ) |
431 | __field( int, action ) | 429 | __field( int, action ) |
432 | __field( u64, parent ) | 430 | __field( u64, parent ) |
433 | __field( u64, ref_root ) | 431 | __field( u64, ref_root ) |
434 | __field( int, level ) | 432 | __field( int, level ) |
435 | __field( int, type ) | 433 | __field( int, type ) |
436 | ), | 434 | ), |
437 | 435 | ||
438 | TP_fast_assign( | 436 | TP_fast_assign( |
439 | __entry->bytenr = ref->bytenr; | 437 | __entry->bytenr = ref->bytenr; |
440 | __entry->num_bytes = ref->num_bytes; | 438 | __entry->num_bytes = ref->num_bytes; |
441 | __entry->action = action; | 439 | __entry->action = action; |
442 | __entry->parent = full_ref->parent; | 440 | __entry->parent = full_ref->parent; |
443 | __entry->ref_root = full_ref->root; | 441 | __entry->ref_root = full_ref->root; |
444 | __entry->level = full_ref->level; | 442 | __entry->level = full_ref->level; |
445 | __entry->type = ref->type; | 443 | __entry->type = ref->type; |
446 | ), | 444 | ), |
447 | 445 | ||
448 | TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " | 446 | TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " |
449 | "parent = %llu(%s), ref_root = %llu(%s), level = %d, " | 447 | "parent = %llu(%s), ref_root = %llu(%s), level = %d, " |
450 | "type = %s", | 448 | "type = %s", |
451 | (unsigned long long)__entry->bytenr, | 449 | (unsigned long long)__entry->bytenr, |
452 | (unsigned long long)__entry->num_bytes, | 450 | (unsigned long long)__entry->num_bytes, |
453 | show_ref_action(__entry->action), | 451 | show_ref_action(__entry->action), |
454 | show_root_type(__entry->parent), | 452 | show_root_type(__entry->parent), |
455 | show_root_type(__entry->ref_root), | 453 | show_root_type(__entry->ref_root), |
456 | __entry->level, show_ref_type(__entry->type)) | 454 | __entry->level, show_ref_type(__entry->type)) |
457 | ); | 455 | ); |
458 | 456 | ||
459 | TRACE_EVENT(btrfs_delayed_data_ref, | 457 | TRACE_EVENT(btrfs_delayed_data_ref, |
460 | 458 | ||
461 | TP_PROTO(struct btrfs_delayed_ref_node *ref, | 459 | TP_PROTO(struct btrfs_delayed_ref_node *ref, |
462 | struct btrfs_delayed_data_ref *full_ref, | 460 | struct btrfs_delayed_data_ref *full_ref, |
463 | int action), | 461 | int action), |
464 | 462 | ||
465 | TP_ARGS(ref, full_ref, action), | 463 | TP_ARGS(ref, full_ref, action), |
466 | 464 | ||
467 | TP_STRUCT__entry( | 465 | TP_STRUCT__entry( |
468 | __field( u64, bytenr ) | 466 | __field( u64, bytenr ) |
469 | __field( u64, num_bytes ) | 467 | __field( u64, num_bytes ) |
470 | __field( int, action ) | 468 | __field( int, action ) |
471 | __field( u64, parent ) | 469 | __field( u64, parent ) |
472 | __field( u64, ref_root ) | 470 | __field( u64, ref_root ) |
473 | __field( u64, owner ) | 471 | __field( u64, owner ) |
474 | __field( u64, offset ) | 472 | __field( u64, offset ) |
475 | __field( int, type ) | 473 | __field( int, type ) |
476 | ), | 474 | ), |
477 | 475 | ||
478 | TP_fast_assign( | 476 | TP_fast_assign( |
479 | __entry->bytenr = ref->bytenr; | 477 | __entry->bytenr = ref->bytenr; |
480 | __entry->num_bytes = ref->num_bytes; | 478 | __entry->num_bytes = ref->num_bytes; |
481 | __entry->action = action; | 479 | __entry->action = action; |
482 | __entry->parent = full_ref->parent; | 480 | __entry->parent = full_ref->parent; |
483 | __entry->ref_root = full_ref->root; | 481 | __entry->ref_root = full_ref->root; |
484 | __entry->owner = full_ref->objectid; | 482 | __entry->owner = full_ref->objectid; |
485 | __entry->offset = full_ref->offset; | 483 | __entry->offset = full_ref->offset; |
486 | __entry->type = ref->type; | 484 | __entry->type = ref->type; |
487 | ), | 485 | ), |
488 | 486 | ||
489 | TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " | 487 | TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " |
490 | "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, " | 488 | "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, " |
491 | "offset = %llu, type = %s", | 489 | "offset = %llu, type = %s", |
492 | (unsigned long long)__entry->bytenr, | 490 | (unsigned long long)__entry->bytenr, |
493 | (unsigned long long)__entry->num_bytes, | 491 | (unsigned long long)__entry->num_bytes, |
494 | show_ref_action(__entry->action), | 492 | show_ref_action(__entry->action), |
495 | show_root_type(__entry->parent), | 493 | show_root_type(__entry->parent), |
496 | show_root_type(__entry->ref_root), | 494 | show_root_type(__entry->ref_root), |
497 | (unsigned long long)__entry->owner, | 495 | (unsigned long long)__entry->owner, |
498 | (unsigned long long)__entry->offset, | 496 | (unsigned long long)__entry->offset, |
499 | show_ref_type(__entry->type)) | 497 | show_ref_type(__entry->type)) |
500 | ); | 498 | ); |
501 | 499 | ||
502 | TRACE_EVENT(btrfs_delayed_ref_head, | 500 | TRACE_EVENT(btrfs_delayed_ref_head, |
503 | 501 | ||
504 | TP_PROTO(struct btrfs_delayed_ref_node *ref, | 502 | TP_PROTO(struct btrfs_delayed_ref_node *ref, |
505 | struct btrfs_delayed_ref_head *head_ref, | 503 | struct btrfs_delayed_ref_head *head_ref, |
506 | int action), | 504 | int action), |
507 | 505 | ||
508 | TP_ARGS(ref, head_ref, action), | 506 | TP_ARGS(ref, head_ref, action), |
509 | 507 | ||
510 | TP_STRUCT__entry( | 508 | TP_STRUCT__entry( |
511 | __field( u64, bytenr ) | 509 | __field( u64, bytenr ) |
512 | __field( u64, num_bytes ) | 510 | __field( u64, num_bytes ) |
513 | __field( int, action ) | 511 | __field( int, action ) |
514 | __field( int, is_data ) | 512 | __field( int, is_data ) |
515 | ), | 513 | ), |
516 | 514 | ||
517 | TP_fast_assign( | 515 | TP_fast_assign( |
518 | __entry->bytenr = ref->bytenr; | 516 | __entry->bytenr = ref->bytenr; |
519 | __entry->num_bytes = ref->num_bytes; | 517 | __entry->num_bytes = ref->num_bytes; |
520 | __entry->action = action; | 518 | __entry->action = action; |
521 | __entry->is_data = head_ref->is_data; | 519 | __entry->is_data = head_ref->is_data; |
522 | ), | 520 | ), |
523 | 521 | ||
524 | TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d", | 522 | TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d", |
525 | (unsigned long long)__entry->bytenr, | 523 | (unsigned long long)__entry->bytenr, |
526 | (unsigned long long)__entry->num_bytes, | 524 | (unsigned long long)__entry->num_bytes, |
527 | show_ref_action(__entry->action), | 525 | show_ref_action(__entry->action), |
528 | __entry->is_data) | 526 | __entry->is_data) |
529 | ); | 527 | ); |
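The three delayed-ref events share show_ref_action() to decode the action code, while the tree and data variants add the fields specific to their ref type (level for tree refs, owner/offset for data refs). For a sense of the output, one decoded head line might look like the following; the values are invented for illustration:

    /* e.g. a btrfs_delayed_ref_head line could render as:   */
    /*   bytenr = 12582912, num_bytes = 4096,                */
    /*   action = ADD_DELAYED_REF, is_data = 0               */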
530 | 528 | ||
531 | #define show_chunk_type(type) \ | 529 | #define show_chunk_type(type) \ |
532 | __print_flags(type, "|", \ | 530 | __print_flags(type, "|", \ |
533 | { BTRFS_BLOCK_GROUP_DATA, "DATA" }, \ | 531 | { BTRFS_BLOCK_GROUP_DATA, "DATA" }, \ |
534 | { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ | 532 | { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ |
535 | { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ | 533 | { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ |
536 | { BTRFS_BLOCK_GROUP_RAID0, "RAID0" }, \ | 534 | { BTRFS_BLOCK_GROUP_RAID0, "RAID0" }, \ |
537 | { BTRFS_BLOCK_GROUP_RAID1, "RAID1" }, \ | 535 | { BTRFS_BLOCK_GROUP_RAID1, "RAID1" }, \ |
538 | { BTRFS_BLOCK_GROUP_DUP, "DUP" }, \ | 536 | { BTRFS_BLOCK_GROUP_DUP, "DUP" }, \ |
539 | { BTRFS_BLOCK_GROUP_RAID10, "RAID10"}) | 537 | { BTRFS_BLOCK_GROUP_RAID10, "RAID10"}) |
540 | 538 | ||
541 | DECLARE_EVENT_CLASS(btrfs__chunk, | 539 | DECLARE_EVENT_CLASS(btrfs__chunk, |
542 | 540 | ||
543 | TP_PROTO(struct btrfs_root *root, struct map_lookup *map, | 541 | TP_PROTO(struct btrfs_root *root, struct map_lookup *map, |
544 | u64 offset, u64 size), | 542 | u64 offset, u64 size), |
545 | 543 | ||
546 | TP_ARGS(root, map, offset, size), | 544 | TP_ARGS(root, map, offset, size), |
547 | 545 | ||
548 | TP_STRUCT__entry( | 546 | TP_STRUCT__entry( |
549 | __field( int, num_stripes ) | 547 | __field( int, num_stripes ) |
550 | __field( u64, type ) | 548 | __field( u64, type ) |
551 | __field( int, sub_stripes ) | 549 | __field( int, sub_stripes ) |
552 | __field( u64, offset ) | 550 | __field( u64, offset ) |
553 | __field( u64, size ) | 551 | __field( u64, size ) |
554 | __field( u64, root_objectid ) | 552 | __field( u64, root_objectid ) |
555 | ), | 553 | ), |
556 | 554 | ||
557 | TP_fast_assign( | 555 | TP_fast_assign( |
558 | __entry->num_stripes = map->num_stripes; | 556 | __entry->num_stripes = map->num_stripes; |
559 | __entry->type = map->type; | 557 | __entry->type = map->type; |
560 | __entry->sub_stripes = map->sub_stripes; | 558 | __entry->sub_stripes = map->sub_stripes; |
561 | __entry->offset = offset; | 559 | __entry->offset = offset; |
562 | __entry->size = size; | 560 | __entry->size = size; |
563 | __entry->root_objectid = root->root_key.objectid; | 561 | __entry->root_objectid = root->root_key.objectid; |
564 | ), | 562 | ), |
565 | 563 | ||
566 | TP_printk("root = %llu(%s), offset = %llu, size = %llu, " | 564 | TP_printk("root = %llu(%s), offset = %llu, size = %llu, " |
567 | "num_stripes = %d, sub_stripes = %d, type = %s", | 565 | "num_stripes = %d, sub_stripes = %d, type = %s", |
568 | show_root_type(__entry->root_objectid), | 566 | show_root_type(__entry->root_objectid), |
569 | (unsigned long long)__entry->offset, | 567 | (unsigned long long)__entry->offset, |
570 | (unsigned long long)__entry->size, | 568 | (unsigned long long)__entry->size, |
571 | __entry->num_stripes, __entry->sub_stripes, | 569 | __entry->num_stripes, __entry->sub_stripes, |
572 | show_chunk_type(__entry->type)) | 570 | show_chunk_type(__entry->type)) |
573 | ); | 571 | ); |
574 | 572 | ||
575 | DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc, | 573 | DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc, |
576 | 574 | ||
577 | TP_PROTO(struct btrfs_root *root, struct map_lookup *map, | 575 | TP_PROTO(struct btrfs_root *root, struct map_lookup *map, |
578 | u64 offset, u64 size), | 576 | u64 offset, u64 size), |
579 | 577 | ||
580 | TP_ARGS(root, map, offset, size) | 578 | TP_ARGS(root, map, offset, size) |
581 | ); | 579 | ); |
582 | 580 | ||
583 | DEFINE_EVENT(btrfs__chunk, btrfs_chunk_free, | 581 | DEFINE_EVENT(btrfs__chunk, btrfs_chunk_free, |
584 | 582 | ||
585 | TP_PROTO(struct btrfs_root *root, struct map_lookup *map, | 583 | TP_PROTO(struct btrfs_root *root, struct map_lookup *map, |
586 | u64 offset, u64 size), | 584 | u64 offset, u64 size), |
587 | 585 | ||
588 | TP_ARGS(root, map, offset, size) | 586 | TP_ARGS(root, map, offset, size) |
589 | ); | 587 | ); |
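Because each BTRFS_BLOCK_GROUP_* value is a single bit, show_chunk_type() uses __print_flags(), so a chunk that is both metadata and mirrored decodes as a joined list rather than a single name. Illustrative:

    u64 type = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_RAID1;
    /* show_chunk_type(type) renders as "METADATA|RAID1" */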
590 | 588 | ||
591 | TRACE_EVENT(btrfs_cow_block, | 589 | TRACE_EVENT(btrfs_cow_block, |
592 | 590 | ||
593 | TP_PROTO(struct btrfs_root *root, struct extent_buffer *buf, | 591 | TP_PROTO(struct btrfs_root *root, struct extent_buffer *buf, |
594 | struct extent_buffer *cow), | 592 | struct extent_buffer *cow), |
595 | 593 | ||
596 | TP_ARGS(root, buf, cow), | 594 | TP_ARGS(root, buf, cow), |
597 | 595 | ||
598 | TP_STRUCT__entry( | 596 | TP_STRUCT__entry( |
599 | __field( u64, root_objectid ) | 597 | __field( u64, root_objectid ) |
600 | __field( u64, buf_start ) | 598 | __field( u64, buf_start ) |
601 | __field( int, refs ) | 599 | __field( int, refs ) |
602 | __field( u64, cow_start ) | 600 | __field( u64, cow_start ) |
603 | __field( int, buf_level ) | 601 | __field( int, buf_level ) |
604 | __field( int, cow_level ) | 602 | __field( int, cow_level ) |
605 | ), | 603 | ), |
606 | 604 | ||
607 | TP_fast_assign( | 605 | TP_fast_assign( |
608 | __entry->root_objectid = root->root_key.objectid; | 606 | __entry->root_objectid = root->root_key.objectid; |
609 | __entry->buf_start = buf->start; | 607 | __entry->buf_start = buf->start; |
610 | __entry->refs = atomic_read(&buf->refs); | 608 | __entry->refs = atomic_read(&buf->refs); |
611 | __entry->cow_start = cow->start; | 609 | __entry->cow_start = cow->start; |
612 | __entry->buf_level = btrfs_header_level(buf); | 610 | __entry->buf_level = btrfs_header_level(buf); |
613 | __entry->cow_level = btrfs_header_level(cow); | 611 | __entry->cow_level = btrfs_header_level(cow); |
614 | ), | 612 | ), |
615 | 613 | ||
616 | TP_printk("root = %llu(%s), refs = %d, orig_buf = %llu " | 614 | TP_printk("root = %llu(%s), refs = %d, orig_buf = %llu " |
617 | "(orig_level = %d), cow_buf = %llu (cow_level = %d)", | 615 | "(orig_level = %d), cow_buf = %llu (cow_level = %d)", |
618 | show_root_type(__entry->root_objectid), | 616 | show_root_type(__entry->root_objectid), |
619 | __entry->refs, | 617 | __entry->refs, |
620 | (unsigned long long)__entry->buf_start, | 618 | (unsigned long long)__entry->buf_start, |
621 | __entry->buf_level, | 619 | __entry->buf_level, |
622 | (unsigned long long)__entry->cow_start, | 620 | (unsigned long long)__entry->cow_start, |
623 | __entry->cow_level) | 621 | __entry->cow_level) |
624 | ); | 622 | ); |
625 | 623 | ||
626 | DECLARE_EVENT_CLASS(btrfs__reserved_extent, | 624 | DECLARE_EVENT_CLASS(btrfs__reserved_extent, |
627 | 625 | ||
628 | TP_PROTO(struct btrfs_root *root, u64 start, u64 len), | 626 | TP_PROTO(struct btrfs_root *root, u64 start, u64 len), |
629 | 627 | ||
630 | TP_ARGS(root, start, len), | 628 | TP_ARGS(root, start, len), |
631 | 629 | ||
632 | TP_STRUCT__entry( | 630 | TP_STRUCT__entry( |
633 | __field( u64, root_objectid ) | 631 | __field( u64, root_objectid ) |
634 | __field( u64, start ) | 632 | __field( u64, start ) |
635 | __field( u64, len ) | 633 | __field( u64, len ) |
636 | ), | 634 | ), |
637 | 635 | ||
638 | TP_fast_assign( | 636 | TP_fast_assign( |
639 | __entry->root_objectid = root->root_key.objectid; | 637 | __entry->root_objectid = root->root_key.objectid; |
640 | __entry->start = start; | 638 | __entry->start = start; |
641 | __entry->len = len; | 639 | __entry->len = len; |
642 | ), | 640 | ), |
643 | 641 | ||
644 | TP_printk("root = %llu(%s), start = %llu, len = %llu", | 642 | TP_printk("root = %llu(%s), start = %llu, len = %llu", |
645 | show_root_type(__entry->root_objectid), | 643 | show_root_type(__entry->root_objectid), |
646 | (unsigned long long)__entry->start, | 644 | (unsigned long long)__entry->start, |
647 | (unsigned long long)__entry->len) | 645 | (unsigned long long)__entry->len) |
648 | ); | 646 | ); |
649 | 647 | ||
650 | DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_alloc, | 648 | DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_alloc, |
651 | 649 | ||
652 | TP_PROTO(struct btrfs_root *root, u64 start, u64 len), | 650 | TP_PROTO(struct btrfs_root *root, u64 start, u64 len), |
653 | 651 | ||
654 | TP_ARGS(root, start, len) | 652 | TP_ARGS(root, start, len) |
655 | ); | 653 | ); |
656 | 654 | ||
657 | DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, | 655 | DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, |
658 | 656 | ||
659 | TP_PROTO(struct btrfs_root *root, u64 start, u64 len), | 657 | TP_PROTO(struct btrfs_root *root, u64 start, u64 len), |
660 | 658 | ||
661 | TP_ARGS(root, start, len) | 659 | TP_ARGS(root, start, len) |
662 | ); | 660 | ); |
663 | 661 | ||
664 | #endif /* _TRACE_BTRFS_H */ | 662 | #endif /* _TRACE_BTRFS_H */ |
665 | 663 | ||
666 | /* This part must be outside protection */ | 664 | /* This part must be outside protection */ |
667 | #include <trace/define_trace.h> | 665 | #include <trace/define_trace.h> |
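The closing #endif plus the unguarded #include is the standard trace-header idiom: _TRACE_BTRFS_H guards ordinary inclusion, but TRACE_HEADER_MULTI_READ lets trace/define_trace.h re-read this file several times, each pass expanding the TRACE_EVENT() macros into a different artifact (entry structs, registration code, format strings). Exactly one .c file per subsystem instantiates the definitions; a minimal sketch of that consumer, with the include path as consumed by the kernel's trace machinery:

    /* in a single .c file, before including the event header: */
    #define CREATE_TRACE_POINTS
    #include <trace/events/btrfs.h>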
668 | 666 |
include/trace/events/ext4.h
1 | #undef TRACE_SYSTEM | 1 | #undef TRACE_SYSTEM |
2 | #define TRACE_SYSTEM ext4 | 2 | #define TRACE_SYSTEM ext4 |
3 | 3 | ||
4 | #if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) | 4 | #if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) |
5 | #define _TRACE_EXT4_H | 5 | #define _TRACE_EXT4_H |
6 | 6 | ||
7 | #include <linux/writeback.h> | 7 | #include <linux/writeback.h> |
8 | #include <linux/tracepoint.h> | 8 | #include <linux/tracepoint.h> |
9 | 9 | ||
10 | struct ext4_allocation_context; | 10 | struct ext4_allocation_context; |
11 | struct ext4_allocation_request; | 11 | struct ext4_allocation_request; |
12 | struct ext4_prealloc_space; | 12 | struct ext4_prealloc_space; |
13 | struct ext4_inode_info; | 13 | struct ext4_inode_info; |
14 | struct mpage_da_data; | 14 | struct mpage_da_data; |
15 | 15 | ||
16 | #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) | 16 | #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) |
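EXT4_I() is the usual container_of() downcast: ext4's in-memory inode embeds its struct inode as the vfs_inode member, so subtracting that member's offset recovers the enclosing ext4_inode_info from a bare VFS pointer with no extra storage. A self-contained illustration of the same idiom, with generic names rather than ext4's:

    #include <stddef.h>

    struct inner { int x; };
    struct outer { int extra; struct inner member; };

    #define my_container_of(ptr, type, field) \
        ((type *)((char *)(ptr) - offsetof(type, field)))

    /* given a struct inner *ip that lives inside a struct outer: */
    /* struct outer *op = my_container_of(ip, struct outer, member); */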
17 | 17 | ||
18 | TRACE_EVENT(ext4_free_inode, | 18 | TRACE_EVENT(ext4_free_inode, |
19 | TP_PROTO(struct inode *inode), | 19 | TP_PROTO(struct inode *inode), |
20 | 20 | ||
21 | TP_ARGS(inode), | 21 | TP_ARGS(inode), |
22 | 22 | ||
23 | TP_STRUCT__entry( | 23 | TP_STRUCT__entry( |
24 | __field( dev_t, dev ) | 24 | __field( dev_t, dev ) |
25 | __field( ino_t, ino ) | 25 | __field( ino_t, ino ) |
26 | __field( umode_t, mode ) | 26 | __field( umode_t, mode ) |
27 | __field( uid_t, uid ) | 27 | __field( uid_t, uid ) |
28 | __field( gid_t, gid ) | 28 | __field( gid_t, gid ) |
29 | __field( __u64, blocks ) | 29 | __field( __u64, blocks ) |
30 | ), | 30 | ), |
31 | 31 | ||
32 | TP_fast_assign( | 32 | TP_fast_assign( |
33 | __entry->dev = inode->i_sb->s_dev; | 33 | __entry->dev = inode->i_sb->s_dev; |
34 | __entry->ino = inode->i_ino; | 34 | __entry->ino = inode->i_ino; |
35 | __entry->mode = inode->i_mode; | 35 | __entry->mode = inode->i_mode; |
36 | __entry->uid = inode->i_uid; | 36 | __entry->uid = inode->i_uid; |
37 | __entry->gid = inode->i_gid; | 37 | __entry->gid = inode->i_gid; |
38 | __entry->blocks = inode->i_blocks; | 38 | __entry->blocks = inode->i_blocks; |
39 | ), | 39 | ), |
40 | 40 | ||
41 | TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", | 41 | TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", |
42 | MAJOR(__entry->dev), MINOR(__entry->dev), | 42 | MAJOR(__entry->dev), MINOR(__entry->dev), |
43 | (unsigned long) __entry->ino, __entry->mode, | 43 | (unsigned long) __entry->ino, __entry->mode, |
44 | __entry->uid, __entry->gid, __entry->blocks) | 44 | __entry->uid, __entry->gid, __entry->blocks) |
45 | ); | 45 | ); |
46 | 46 | ||
47 | TRACE_EVENT(ext4_request_inode, | 47 | TRACE_EVENT(ext4_request_inode, |
48 | TP_PROTO(struct inode *dir, int mode), | 48 | TP_PROTO(struct inode *dir, int mode), |
49 | 49 | ||
50 | TP_ARGS(dir, mode), | 50 | TP_ARGS(dir, mode), |
51 | 51 | ||
52 | TP_STRUCT__entry( | 52 | TP_STRUCT__entry( |
53 | __field( dev_t, dev ) | 53 | __field( dev_t, dev ) |
54 | __field( ino_t, dir ) | 54 | __field( ino_t, dir ) |
55 | __field( umode_t, mode ) | 55 | __field( umode_t, mode ) |
56 | ), | 56 | ), |
57 | 57 | ||
58 | TP_fast_assign( | 58 | TP_fast_assign( |
59 | __entry->dev = dir->i_sb->s_dev; | 59 | __entry->dev = dir->i_sb->s_dev; |
60 | __entry->dir = dir->i_ino; | 60 | __entry->dir = dir->i_ino; |
61 | __entry->mode = mode; | 61 | __entry->mode = mode; |
62 | ), | 62 | ), |
63 | 63 | ||
64 | TP_printk("dev %d,%d dir %lu mode 0%o", | 64 | TP_printk("dev %d,%d dir %lu mode 0%o", |
65 | MAJOR(__entry->dev), MINOR(__entry->dev), | 65 | MAJOR(__entry->dev), MINOR(__entry->dev), |
66 | (unsigned long) __entry->dir, __entry->mode) | 66 | (unsigned long) __entry->dir, __entry->mode) |
67 | ); | 67 | ); |
68 | 68 | ||
69 | TRACE_EVENT(ext4_allocate_inode, | 69 | TRACE_EVENT(ext4_allocate_inode, |
70 | TP_PROTO(struct inode *inode, struct inode *dir, int mode), | 70 | TP_PROTO(struct inode *inode, struct inode *dir, int mode), |
71 | 71 | ||
72 | TP_ARGS(inode, dir, mode), | 72 | TP_ARGS(inode, dir, mode), |
73 | 73 | ||
74 | TP_STRUCT__entry( | 74 | TP_STRUCT__entry( |
75 | __field( dev_t, dev ) | 75 | __field( dev_t, dev ) |
76 | __field( ino_t, ino ) | 76 | __field( ino_t, ino ) |
77 | __field( ino_t, dir ) | 77 | __field( ino_t, dir ) |
78 | __field( umode_t, mode ) | 78 | __field( umode_t, mode ) |
79 | ), | 79 | ), |
80 | 80 | ||
81 | TP_fast_assign( | 81 | TP_fast_assign( |
82 | __entry->dev = inode->i_sb->s_dev; | 82 | __entry->dev = inode->i_sb->s_dev; |
83 | __entry->ino = inode->i_ino; | 83 | __entry->ino = inode->i_ino; |
84 | __entry->dir = dir->i_ino; | 84 | __entry->dir = dir->i_ino; |
85 | __entry->mode = mode; | 85 | __entry->mode = mode; |
86 | ), | 86 | ), |
87 | 87 | ||
88 | TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", | 88 | TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", |
89 | MAJOR(__entry->dev), MINOR(__entry->dev), | 89 | MAJOR(__entry->dev), MINOR(__entry->dev), |
90 | (unsigned long) __entry->ino, | 90 | (unsigned long) __entry->ino, |
91 | (unsigned long) __entry->dir, __entry->mode) | 91 | (unsigned long) __entry->dir, __entry->mode) |
92 | ); | 92 | ); |
93 | 93 | ||
94 | TRACE_EVENT(ext4_evict_inode, | 94 | TRACE_EVENT(ext4_evict_inode, |
95 | TP_PROTO(struct inode *inode), | 95 | TP_PROTO(struct inode *inode), |
96 | 96 | ||
97 | TP_ARGS(inode), | 97 | TP_ARGS(inode), |
98 | 98 | ||
99 | TP_STRUCT__entry( | 99 | TP_STRUCT__entry( |
100 | __field( dev_t, dev ) | 100 | __field( dev_t, dev ) |
101 | __field( ino_t, ino ) | 101 | __field( ino_t, ino ) |
102 | __field( int, nlink ) | 102 | __field( int, nlink ) |
103 | ), | 103 | ), |
104 | 104 | ||
105 | TP_fast_assign( | 105 | TP_fast_assign( |
106 | __entry->dev = inode->i_sb->s_dev; | 106 | __entry->dev = inode->i_sb->s_dev; |
107 | __entry->ino = inode->i_ino; | 107 | __entry->ino = inode->i_ino; |
108 | __entry->nlink = inode->i_nlink; | 108 | __entry->nlink = inode->i_nlink; |
109 | ), | 109 | ), |
110 | 110 | ||
111 | TP_printk("dev %d,%d ino %lu nlink %d", | 111 | TP_printk("dev %d,%d ino %lu nlink %d", |
112 | MAJOR(__entry->dev), MINOR(__entry->dev), | 112 | MAJOR(__entry->dev), MINOR(__entry->dev), |
113 | (unsigned long) __entry->ino, __entry->nlink) | 113 | (unsigned long) __entry->ino, __entry->nlink) |
114 | ); | 114 | ); |
115 | 115 | ||
116 | TRACE_EVENT(ext4_drop_inode, | 116 | TRACE_EVENT(ext4_drop_inode, |
117 | TP_PROTO(struct inode *inode, int drop), | 117 | TP_PROTO(struct inode *inode, int drop), |
118 | 118 | ||
119 | TP_ARGS(inode, drop), | 119 | TP_ARGS(inode, drop), |
120 | 120 | ||
121 | TP_STRUCT__entry( | 121 | TP_STRUCT__entry( |
122 | __field( dev_t, dev ) | 122 | __field( dev_t, dev ) |
123 | __field( ino_t, ino ) | 123 | __field( ino_t, ino ) |
124 | __field( int, drop ) | 124 | __field( int, drop ) |
125 | ), | 125 | ), |
126 | 126 | ||
127 | TP_fast_assign( | 127 | TP_fast_assign( |
128 | __entry->dev = inode->i_sb->s_dev; | 128 | __entry->dev = inode->i_sb->s_dev; |
129 | __entry->ino = inode->i_ino; | 129 | __entry->ino = inode->i_ino; |
130 | __entry->drop = drop; | 130 | __entry->drop = drop; |
131 | ), | 131 | ), |
132 | 132 | ||
133 | TP_printk("dev %d,%d ino %lu drop %d", | 133 | TP_printk("dev %d,%d ino %lu drop %d", |
134 | MAJOR(__entry->dev), MINOR(__entry->dev), | 134 | MAJOR(__entry->dev), MINOR(__entry->dev), |
135 | (unsigned long) __entry->ino, __entry->drop) | 135 | (unsigned long) __entry->ino, __entry->drop) |
136 | ); | 136 | ); |
137 | 137 | ||
138 | TRACE_EVENT(ext4_mark_inode_dirty, | 138 | TRACE_EVENT(ext4_mark_inode_dirty, |
139 | TP_PROTO(struct inode *inode, unsigned long IP), | 139 | TP_PROTO(struct inode *inode, unsigned long IP), |
140 | 140 | ||
141 | TP_ARGS(inode, IP), | 141 | TP_ARGS(inode, IP), |
142 | 142 | ||
143 | TP_STRUCT__entry( | 143 | TP_STRUCT__entry( |
144 | __field( dev_t, dev ) | 144 | __field( dev_t, dev ) |
145 | __field( ino_t, ino ) | 145 | __field( ino_t, ino ) |
146 | __field(unsigned long, ip ) | 146 | __field(unsigned long, ip ) |
147 | ), | 147 | ), |
148 | 148 | ||
149 | TP_fast_assign( | 149 | TP_fast_assign( |
150 | __entry->dev = inode->i_sb->s_dev; | 150 | __entry->dev = inode->i_sb->s_dev; |
151 | __entry->ino = inode->i_ino; | 151 | __entry->ino = inode->i_ino; |
152 | __entry->ip = IP; | 152 | __entry->ip = IP; |
153 | ), | 153 | ), |
154 | 154 | ||
155 | TP_printk("dev %d,%d ino %lu caller %pF", | 155 | TP_printk("dev %d,%d ino %lu caller %pF", |
156 | MAJOR(__entry->dev), MINOR(__entry->dev), | 156 | MAJOR(__entry->dev), MINOR(__entry->dev), |
157 | (unsigned long) __entry->ino, (void *)__entry->ip) | 157 | (unsigned long) __entry->ino, (void *)__entry->ip) |
158 | ); | 158 | ); |
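The %pF extension asks the kernel's vsprintf to resolve the saved instruction pointer to a symbol name plus offset, so the trace line identifies which function dirtied the inode instead of printing a raw address. The caller is expected to pass its own return address; a hedged sketch of the hook (the real call sits in fs/ext4, outside this hunk):

    /* illustrative: record who called ext4_mark_inode_dirty() */
    trace_ext4_mark_inode_dirty(inode, _RET_IP_);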
159 | 159 | ||
160 | TRACE_EVENT(ext4_begin_ordered_truncate, | 160 | TRACE_EVENT(ext4_begin_ordered_truncate, |
161 | TP_PROTO(struct inode *inode, loff_t new_size), | 161 | TP_PROTO(struct inode *inode, loff_t new_size), |
162 | 162 | ||
163 | TP_ARGS(inode, new_size), | 163 | TP_ARGS(inode, new_size), |
164 | 164 | ||
165 | TP_STRUCT__entry( | 165 | TP_STRUCT__entry( |
166 | __field( dev_t, dev ) | 166 | __field( dev_t, dev ) |
167 | __field( ino_t, ino ) | 167 | __field( ino_t, ino ) |
168 | __field( loff_t, new_size ) | 168 | __field( loff_t, new_size ) |
169 | ), | 169 | ), |
170 | 170 | ||
171 | TP_fast_assign( | 171 | TP_fast_assign( |
172 | __entry->dev = inode->i_sb->s_dev; | 172 | __entry->dev = inode->i_sb->s_dev; |
173 | __entry->ino = inode->i_ino; | 173 | __entry->ino = inode->i_ino; |
174 | __entry->new_size = new_size; | 174 | __entry->new_size = new_size; |
175 | ), | 175 | ), |
176 | 176 | ||
177 | TP_printk("dev %d,%d ino %lu new_size %lld", | 177 | TP_printk("dev %d,%d ino %lu new_size %lld", |
178 | MAJOR(__entry->dev), MINOR(__entry->dev), | 178 | MAJOR(__entry->dev), MINOR(__entry->dev), |
179 | (unsigned long) __entry->ino, | 179 | (unsigned long) __entry->ino, |
180 | __entry->new_size) | 180 | __entry->new_size) |
181 | ); | 181 | ); |
182 | 182 | ||
183 | DECLARE_EVENT_CLASS(ext4__write_begin, | 183 | DECLARE_EVENT_CLASS(ext4__write_begin, |
184 | 184 | ||
185 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, | 185 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, |
186 | unsigned int flags), | 186 | unsigned int flags), |
187 | 187 | ||
188 | TP_ARGS(inode, pos, len, flags), | 188 | TP_ARGS(inode, pos, len, flags), |
189 | 189 | ||
190 | TP_STRUCT__entry( | 190 | TP_STRUCT__entry( |
191 | __field( dev_t, dev ) | 191 | __field( dev_t, dev ) |
192 | __field( ino_t, ino ) | 192 | __field( ino_t, ino ) |
193 | __field( loff_t, pos ) | 193 | __field( loff_t, pos ) |
194 | __field( unsigned int, len ) | 194 | __field( unsigned int, len ) |
195 | __field( unsigned int, flags ) | 195 | __field( unsigned int, flags ) |
196 | ), | 196 | ), |
197 | 197 | ||
198 | TP_fast_assign( | 198 | TP_fast_assign( |
199 | __entry->dev = inode->i_sb->s_dev; | 199 | __entry->dev = inode->i_sb->s_dev; |
200 | __entry->ino = inode->i_ino; | 200 | __entry->ino = inode->i_ino; |
201 | __entry->pos = pos; | 201 | __entry->pos = pos; |
202 | __entry->len = len; | 202 | __entry->len = len; |
203 | __entry->flags = flags; | 203 | __entry->flags = flags; |
204 | ), | 204 | ), |
205 | 205 | ||
206 | TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u", | 206 | TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u", |
207 | MAJOR(__entry->dev), MINOR(__entry->dev), | 207 | MAJOR(__entry->dev), MINOR(__entry->dev), |
208 | (unsigned long) __entry->ino, | 208 | (unsigned long) __entry->ino, |
209 | __entry->pos, __entry->len, __entry->flags) | 209 | __entry->pos, __entry->len, __entry->flags) |
210 | ); | 210 | ); |
211 | 211 | ||
212 | DEFINE_EVENT(ext4__write_begin, ext4_write_begin, | 212 | DEFINE_EVENT(ext4__write_begin, ext4_write_begin, |
213 | 213 | ||
214 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, | 214 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, |
215 | unsigned int flags), | 215 | unsigned int flags), |
216 | 216 | ||
217 | TP_ARGS(inode, pos, len, flags) | 217 | TP_ARGS(inode, pos, len, flags) |
218 | ); | 218 | ); |
219 | 219 | ||
220 | DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin, | 220 | DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin, |
221 | 221 | ||
222 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, | 222 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, |
223 | unsigned int flags), | 223 | unsigned int flags), |
224 | 224 | ||
225 | TP_ARGS(inode, pos, len, flags) | 225 | TP_ARGS(inode, pos, len, flags) |
226 | ); | 226 | ); |
227 | 227 | ||
228 | DECLARE_EVENT_CLASS(ext4__write_end, | 228 | DECLARE_EVENT_CLASS(ext4__write_end, |
229 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, | 229 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, |
230 | unsigned int copied), | 230 | unsigned int copied), |
231 | 231 | ||
232 | TP_ARGS(inode, pos, len, copied), | 232 | TP_ARGS(inode, pos, len, copied), |
233 | 233 | ||
234 | TP_STRUCT__entry( | 234 | TP_STRUCT__entry( |
235 | __field( dev_t, dev ) | 235 | __field( dev_t, dev ) |
236 | __field( ino_t, ino ) | 236 | __field( ino_t, ino ) |
237 | __field( loff_t, pos ) | 237 | __field( loff_t, pos ) |
238 | __field( unsigned int, len ) | 238 | __field( unsigned int, len ) |
239 | __field( unsigned int, copied ) | 239 | __field( unsigned int, copied ) |
240 | ), | 240 | ), |
241 | 241 | ||
242 | TP_fast_assign( | 242 | TP_fast_assign( |
243 | __entry->dev = inode->i_sb->s_dev; | 243 | __entry->dev = inode->i_sb->s_dev; |
244 | __entry->ino = inode->i_ino; | 244 | __entry->ino = inode->i_ino; |
245 | __entry->pos = pos; | 245 | __entry->pos = pos; |
246 | __entry->len = len; | 246 | __entry->len = len; |
247 | __entry->copied = copied; | 247 | __entry->copied = copied; |
248 | ), | 248 | ), |
249 | 249 | ||
250 | TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u", | 250 | TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u", |
251 | MAJOR(__entry->dev), MINOR(__entry->dev), | 251 | MAJOR(__entry->dev), MINOR(__entry->dev), |
252 | (unsigned long) __entry->ino, | 252 | (unsigned long) __entry->ino, |
253 | __entry->pos, __entry->len, __entry->copied) | 253 | __entry->pos, __entry->len, __entry->copied) |
254 | ); | 254 | ); |
255 | 255 | ||
256 | DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, | 256 | DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, |
257 | 257 | ||
258 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, | 258 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, |
259 | unsigned int copied), | 259 | unsigned int copied), |
260 | 260 | ||
261 | TP_ARGS(inode, pos, len, copied) | 261 | TP_ARGS(inode, pos, len, copied) |
262 | ); | 262 | ); |
263 | 263 | ||
264 | DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end, | 264 | DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end, |
265 | 265 | ||
266 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, | 266 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, |
267 | unsigned int copied), | 267 | unsigned int copied), |
268 | 268 | ||
269 | TP_ARGS(inode, pos, len, copied) | 269 | TP_ARGS(inode, pos, len, copied) |
270 | ); | 270 | ); |
271 | 271 | ||
272 | DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end, | 272 | DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end, |
273 | 273 | ||
274 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, | 274 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, |
275 | unsigned int copied), | 275 | unsigned int copied), |
276 | 276 | ||
277 | TP_ARGS(inode, pos, len, copied) | 277 | TP_ARGS(inode, pos, len, copied) |
278 | ); | 278 | ); |
279 | 279 | ||
280 | DEFINE_EVENT(ext4__write_end, ext4_da_write_end, | 280 | DEFINE_EVENT(ext4__write_end, ext4_da_write_end, |
281 | 281 | ||
282 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, | 282 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, |
283 | unsigned int copied), | 283 | unsigned int copied), |
284 | 284 | ||
285 | TP_ARGS(inode, pos, len, copied) | 285 | TP_ARGS(inode, pos, len, copied) |
286 | ); | 286 | ); |
287 | 287 | ||
288 | TRACE_EVENT(ext4_da_writepages, | 288 | TRACE_EVENT(ext4_da_writepages, |
289 | TP_PROTO(struct inode *inode, struct writeback_control *wbc), | 289 | TP_PROTO(struct inode *inode, struct writeback_control *wbc), |
290 | 290 | ||
291 | TP_ARGS(inode, wbc), | 291 | TP_ARGS(inode, wbc), |
292 | 292 | ||
293 | TP_STRUCT__entry( | 293 | TP_STRUCT__entry( |
294 | __field( dev_t, dev ) | 294 | __field( dev_t, dev ) |
295 | __field( ino_t, ino ) | 295 | __field( ino_t, ino ) |
296 | __field( long, nr_to_write ) | 296 | __field( long, nr_to_write ) |
297 | __field( long, pages_skipped ) | 297 | __field( long, pages_skipped ) |
298 | __field( loff_t, range_start ) | 298 | __field( loff_t, range_start ) |
299 | __field( loff_t, range_end ) | 299 | __field( loff_t, range_end ) |
300 | __field( int, sync_mode ) | 300 | __field( int, sync_mode ) |
301 | __field( char, for_kupdate ) | 301 | __field( char, for_kupdate ) |
302 | __field( char, range_cyclic ) | 302 | __field( char, range_cyclic ) |
303 | __field( pgoff_t, writeback_index ) | 303 | __field( pgoff_t, writeback_index ) |
304 | ), | 304 | ), |
305 | 305 | ||
306 | TP_fast_assign( | 306 | TP_fast_assign( |
307 | __entry->dev = inode->i_sb->s_dev; | 307 | __entry->dev = inode->i_sb->s_dev; |
308 | __entry->ino = inode->i_ino; | 308 | __entry->ino = inode->i_ino; |
309 | __entry->nr_to_write = wbc->nr_to_write; | 309 | __entry->nr_to_write = wbc->nr_to_write; |
310 | __entry->pages_skipped = wbc->pages_skipped; | 310 | __entry->pages_skipped = wbc->pages_skipped; |
311 | __entry->range_start = wbc->range_start; | 311 | __entry->range_start = wbc->range_start; |
312 | __entry->range_end = wbc->range_end; | 312 | __entry->range_end = wbc->range_end; |
313 | __entry->sync_mode = wbc->sync_mode; | 313 | __entry->sync_mode = wbc->sync_mode; |
314 | __entry->for_kupdate = wbc->for_kupdate; | 314 | __entry->for_kupdate = wbc->for_kupdate; |
315 | __entry->range_cyclic = wbc->range_cyclic; | 315 | __entry->range_cyclic = wbc->range_cyclic; |
316 | __entry->writeback_index = inode->i_mapping->writeback_index; | 316 | __entry->writeback_index = inode->i_mapping->writeback_index; |
317 | ), | 317 | ), |
318 | 318 | ||
319 | TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " | 319 | TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " |
320 | "range_start %lld range_end %lld sync_mode %d" | 320 | "range_start %lld range_end %lld sync_mode %d" |
321 | "for_kupdate %d range_cyclic %d writeback_index %lu", | 321 | "for_kupdate %d range_cyclic %d writeback_index %lu", |
322 | MAJOR(__entry->dev), MINOR(__entry->dev), | 322 | MAJOR(__entry->dev), MINOR(__entry->dev), |
323 | (unsigned long) __entry->ino, __entry->nr_to_write, | 323 | (unsigned long) __entry->ino, __entry->nr_to_write, |
324 | __entry->pages_skipped, __entry->range_start, | 324 | __entry->pages_skipped, __entry->range_start, |
325 | __entry->range_end, __entry->sync_mode, | 325 | __entry->range_end, __entry->sync_mode, |
326 | __entry->for_kupdate, __entry->range_cyclic, | 326 | __entry->for_kupdate, __entry->range_cyclic, |
327 | (unsigned long) __entry->writeback_index) | 327 | (unsigned long) __entry->writeback_index) |
328 | ); | 328 | ); |
329 | 329 | ||
330 | TRACE_EVENT(ext4_da_write_pages, | 330 | TRACE_EVENT(ext4_da_write_pages, |
331 | TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), | 331 | TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), |
332 | 332 | ||
333 | TP_ARGS(inode, mpd), | 333 | TP_ARGS(inode, mpd), |
334 | 334 | ||
335 | TP_STRUCT__entry( | 335 | TP_STRUCT__entry( |
336 | __field( dev_t, dev ) | 336 | __field( dev_t, dev ) |
337 | __field( ino_t, ino ) | 337 | __field( ino_t, ino ) |
338 | __field( __u64, b_blocknr ) | 338 | __field( __u64, b_blocknr ) |
339 | __field( __u32, b_size ) | 339 | __field( __u32, b_size ) |
340 | __field( __u32, b_state ) | 340 | __field( __u32, b_state ) |
341 | __field( unsigned long, first_page ) | 341 | __field( unsigned long, first_page ) |
342 | __field( int, io_done ) | 342 | __field( int, io_done ) |
343 | __field( int, pages_written ) | 343 | __field( int, pages_written ) |
344 | __field( int, sync_mode ) | 344 | __field( int, sync_mode ) |
345 | ), | 345 | ), |
346 | 346 | ||
347 | TP_fast_assign( | 347 | TP_fast_assign( |
348 | __entry->dev = inode->i_sb->s_dev; | 348 | __entry->dev = inode->i_sb->s_dev; |
349 | __entry->ino = inode->i_ino; | 349 | __entry->ino = inode->i_ino; |
350 | __entry->b_blocknr = mpd->b_blocknr; | 350 | __entry->b_blocknr = mpd->b_blocknr; |
351 | __entry->b_size = mpd->b_size; | 351 | __entry->b_size = mpd->b_size; |
352 | __entry->b_state = mpd->b_state; | 352 | __entry->b_state = mpd->b_state; |
353 | __entry->first_page = mpd->first_page; | 353 | __entry->first_page = mpd->first_page; |
354 | __entry->io_done = mpd->io_done; | 354 | __entry->io_done = mpd->io_done; |
355 | __entry->pages_written = mpd->pages_written; | 355 | __entry->pages_written = mpd->pages_written; |
356 | __entry->sync_mode = mpd->wbc->sync_mode; | 356 | __entry->sync_mode = mpd->wbc->sync_mode; |
357 | ), | 357 | ), |
358 | 358 | ||
359 | TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x " | 359 | TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x " |
360 | "first_page %lu io_done %d pages_written %d sync_mode %d", | 360 | "first_page %lu io_done %d pages_written %d sync_mode %d", |
361 | MAJOR(__entry->dev), MINOR(__entry->dev), | 361 | MAJOR(__entry->dev), MINOR(__entry->dev), |
362 | (unsigned long) __entry->ino, | 362 | (unsigned long) __entry->ino, |
363 | __entry->b_blocknr, __entry->b_size, | 363 | __entry->b_blocknr, __entry->b_size, |
364 | __entry->b_state, __entry->first_page, | 364 | __entry->b_state, __entry->first_page, |
365 | __entry->io_done, __entry->pages_written, | 365 | __entry->io_done, __entry->pages_written, |
366 | __entry->sync_mode | 366 | __entry->sync_mode |
367 | ) | 367 | ) |
368 | ); | 368 | ); |
369 | 369 | ||
370 | TRACE_EVENT(ext4_da_writepages_result, | 370 | TRACE_EVENT(ext4_da_writepages_result, |
371 | TP_PROTO(struct inode *inode, struct writeback_control *wbc, | 371 | TP_PROTO(struct inode *inode, struct writeback_control *wbc, |
372 | int ret, int pages_written), | 372 | int ret, int pages_written), |
373 | 373 | ||
374 | TP_ARGS(inode, wbc, ret, pages_written), | 374 | TP_ARGS(inode, wbc, ret, pages_written), |
375 | 375 | ||
376 | TP_STRUCT__entry( | 376 | TP_STRUCT__entry( |
377 | __field( dev_t, dev ) | 377 | __field( dev_t, dev ) |
378 | __field( ino_t, ino ) | 378 | __field( ino_t, ino ) |
379 | __field( int, ret ) | 379 | __field( int, ret ) |
380 | __field( int, pages_written ) | 380 | __field( int, pages_written ) |
381 | __field( long, pages_skipped ) | 381 | __field( long, pages_skipped ) |
382 | __field( int, sync_mode ) | 382 | __field( int, sync_mode ) |
383 | __field( char, more_io ) | ||
384 | __field( pgoff_t, writeback_index ) | 383 | __field( pgoff_t, writeback_index ) |
385 | ), | 384 | ), |
386 | 385 | ||
387 | TP_fast_assign( | 386 | TP_fast_assign( |
388 | __entry->dev = inode->i_sb->s_dev; | 387 | __entry->dev = inode->i_sb->s_dev; |
389 | __entry->ino = inode->i_ino; | 388 | __entry->ino = inode->i_ino; |
390 | __entry->ret = ret; | 389 | __entry->ret = ret; |
391 | __entry->pages_written = pages_written; | 390 | __entry->pages_written = pages_written; |
392 | __entry->pages_skipped = wbc->pages_skipped; | 391 | __entry->pages_skipped = wbc->pages_skipped; |
393 | __entry->sync_mode = wbc->sync_mode; | 392 | __entry->sync_mode = wbc->sync_mode; |
394 | __entry->more_io = wbc->more_io; | ||
395 | __entry->writeback_index = inode->i_mapping->writeback_index; | 393 | __entry->writeback_index = inode->i_mapping->writeback_index; |
396 | ), | 394 | ), |
397 | 395 | ||
398 | TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " | 396 | TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " |
399 | " more_io %d sync_mode %d writeback_index %lu", | 397 | "sync_mode %d writeback_index %lu", |
400 | MAJOR(__entry->dev), MINOR(__entry->dev), | 398 | MAJOR(__entry->dev), MINOR(__entry->dev), |
401 | (unsigned long) __entry->ino, __entry->ret, | 399 | (unsigned long) __entry->ino, __entry->ret, |
402 | __entry->pages_written, __entry->pages_skipped, | 400 | __entry->pages_written, __entry->pages_skipped, |
403 | __entry->more_io, __entry->sync_mode, | 401 | __entry->sync_mode, |
404 | (unsigned long) __entry->writeback_index) | 402 | (unsigned long) __entry->writeback_index) |
405 | ); | 403 | ); |
406 | 404 | ||
407 | DECLARE_EVENT_CLASS(ext4__page_op, | 405 | DECLARE_EVENT_CLASS(ext4__page_op, |
408 | TP_PROTO(struct page *page), | 406 | TP_PROTO(struct page *page), |
409 | 407 | ||
410 | TP_ARGS(page), | 408 | TP_ARGS(page), |
411 | 409 | ||
412 | TP_STRUCT__entry( | 410 | TP_STRUCT__entry( |
413 | __field( pgoff_t, index ) | 411 | __field( pgoff_t, index ) |
414 | __field( ino_t, ino ) | 412 | __field( ino_t, ino ) |
415 | __field( dev_t, dev ) | 413 | __field( dev_t, dev ) |
416 | 414 | ||
417 | ), | 415 | ), |
418 | 416 | ||
419 | TP_fast_assign( | 417 | TP_fast_assign( |
420 | __entry->index = page->index; | 418 | __entry->index = page->index; |
421 | __entry->ino = page->mapping->host->i_ino; | 419 | __entry->ino = page->mapping->host->i_ino; |
422 | __entry->dev = page->mapping->host->i_sb->s_dev; | 420 | __entry->dev = page->mapping->host->i_sb->s_dev; |
423 | ), | 421 | ), |
424 | 422 | ||
425 | TP_printk("dev %d,%d ino %lu page_index %lu", | 423 | TP_printk("dev %d,%d ino %lu page_index %lu", |
426 | MAJOR(__entry->dev), MINOR(__entry->dev), | 424 | MAJOR(__entry->dev), MINOR(__entry->dev), |
427 | (unsigned long) __entry->ino, | 425 | (unsigned long) __entry->ino, |
428 | (unsigned long) __entry->index) | 426 | (unsigned long) __entry->index) |
429 | ); | 427 | ); |
430 | 428 | ||
431 | DEFINE_EVENT(ext4__page_op, ext4_writepage, | 429 | DEFINE_EVENT(ext4__page_op, ext4_writepage, |
432 | 430 | ||
433 | TP_PROTO(struct page *page), | 431 | TP_PROTO(struct page *page), |
434 | 432 | ||
435 | TP_ARGS(page) | 433 | TP_ARGS(page) |
436 | ); | 434 | ); |
437 | 435 | ||
438 | DEFINE_EVENT(ext4__page_op, ext4_readpage, | 436 | DEFINE_EVENT(ext4__page_op, ext4_readpage, |
439 | 437 | ||
440 | TP_PROTO(struct page *page), | 438 | TP_PROTO(struct page *page), |
441 | 439 | ||
442 | TP_ARGS(page) | 440 | TP_ARGS(page) |
443 | ); | 441 | ); |
444 | 442 | ||
445 | DEFINE_EVENT(ext4__page_op, ext4_releasepage, | 443 | DEFINE_EVENT(ext4__page_op, ext4_releasepage, |
446 | 444 | ||
447 | TP_PROTO(struct page *page), | 445 | TP_PROTO(struct page *page), |
448 | 446 | ||
449 | TP_ARGS(page) | 447 | TP_ARGS(page) |
450 | ); | 448 | ); |
451 | 449 | ||
452 | TRACE_EVENT(ext4_invalidatepage, | 450 | TRACE_EVENT(ext4_invalidatepage, |
453 | TP_PROTO(struct page *page, unsigned long offset), | 451 | TP_PROTO(struct page *page, unsigned long offset), |
454 | 452 | ||
455 | TP_ARGS(page, offset), | 453 | TP_ARGS(page, offset), |
456 | 454 | ||
457 | TP_STRUCT__entry( | 455 | TP_STRUCT__entry( |
458 | __field( pgoff_t, index ) | 456 | __field( pgoff_t, index ) |
459 | __field( unsigned long, offset ) | 457 | __field( unsigned long, offset ) |
460 | __field( ino_t, ino ) | 458 | __field( ino_t, ino ) |
461 | __field( dev_t, dev ) | 459 | __field( dev_t, dev ) |
462 | 460 | ||
463 | ), | 461 | ), |
464 | 462 | ||
465 | TP_fast_assign( | 463 | TP_fast_assign( |
466 | __entry->index = page->index; | 464 | __entry->index = page->index; |
467 | __entry->offset = offset; | 465 | __entry->offset = offset; |
468 | __entry->ino = page->mapping->host->i_ino; | 466 | __entry->ino = page->mapping->host->i_ino; |
469 | __entry->dev = page->mapping->host->i_sb->s_dev; | 467 | __entry->dev = page->mapping->host->i_sb->s_dev; |
470 | ), | 468 | ), |
471 | 469 | ||
472 | TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", | 470 | TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", |
473 | MAJOR(__entry->dev), MINOR(__entry->dev), | 471 | MAJOR(__entry->dev), MINOR(__entry->dev), |
474 | (unsigned long) __entry->ino, | 472 | (unsigned long) __entry->ino, |
475 | (unsigned long) __entry->index, __entry->offset) | 473 | (unsigned long) __entry->index, __entry->offset) |
476 | ); | 474 | ); |
477 | 475 | ||
478 | TRACE_EVENT(ext4_discard_blocks, | 476 | TRACE_EVENT(ext4_discard_blocks, |
479 | TP_PROTO(struct super_block *sb, unsigned long long blk, | 477 | TP_PROTO(struct super_block *sb, unsigned long long blk, |
480 | unsigned long long count), | 478 | unsigned long long count), |
481 | 479 | ||
482 | TP_ARGS(sb, blk, count), | 480 | TP_ARGS(sb, blk, count), |
483 | 481 | ||
484 | TP_STRUCT__entry( | 482 | TP_STRUCT__entry( |
485 | __field( dev_t, dev ) | 483 | __field( dev_t, dev ) |
486 | __field( __u64, blk ) | 484 | __field( __u64, blk ) |
487 | __field( __u64, count ) | 485 | __field( __u64, count ) |
488 | 486 | ||
489 | ), | 487 | ), |
490 | 488 | ||
491 | TP_fast_assign( | 489 | TP_fast_assign( |
492 | __entry->dev = sb->s_dev; | 490 | __entry->dev = sb->s_dev; |
493 | __entry->blk = blk; | 491 | __entry->blk = blk; |
494 | __entry->count = count; | 492 | __entry->count = count; |
495 | ), | 493 | ), |
496 | 494 | ||
497 | TP_printk("dev %d,%d blk %llu count %llu", | 495 | TP_printk("dev %d,%d blk %llu count %llu", |
498 | MAJOR(__entry->dev), MINOR(__entry->dev), | 496 | MAJOR(__entry->dev), MINOR(__entry->dev), |
499 | __entry->blk, __entry->count) | 497 | __entry->blk, __entry->count) |
500 | ); | 498 | ); |
501 | 499 | ||
502 | DECLARE_EVENT_CLASS(ext4__mb_new_pa, | 500 | DECLARE_EVENT_CLASS(ext4__mb_new_pa, |
503 | TP_PROTO(struct ext4_allocation_context *ac, | 501 | TP_PROTO(struct ext4_allocation_context *ac, |
504 | struct ext4_prealloc_space *pa), | 502 | struct ext4_prealloc_space *pa), |
505 | 503 | ||
506 | TP_ARGS(ac, pa), | 504 | TP_ARGS(ac, pa), |
507 | 505 | ||
508 | TP_STRUCT__entry( | 506 | TP_STRUCT__entry( |
509 | __field( dev_t, dev ) | 507 | __field( dev_t, dev ) |
510 | __field( ino_t, ino ) | 508 | __field( ino_t, ino ) |
511 | __field( __u64, pa_pstart ) | 509 | __field( __u64, pa_pstart ) |
512 | __field( __u32, pa_len ) | 510 | __field( __u32, pa_len ) |
513 | __field( __u64, pa_lstart ) | 511 | __field( __u64, pa_lstart ) |
514 | 512 | ||
515 | ), | 513 | ), |
516 | 514 | ||
517 | TP_fast_assign( | 515 | TP_fast_assign( |
518 | __entry->dev = ac->ac_sb->s_dev; | 516 | __entry->dev = ac->ac_sb->s_dev; |
519 | __entry->ino = ac->ac_inode->i_ino; | 517 | __entry->ino = ac->ac_inode->i_ino; |
520 | __entry->pa_pstart = pa->pa_pstart; | 518 | __entry->pa_pstart = pa->pa_pstart; |
521 | __entry->pa_len = pa->pa_len; | 519 | __entry->pa_len = pa->pa_len; |
522 | __entry->pa_lstart = pa->pa_lstart; | 520 | __entry->pa_lstart = pa->pa_lstart; |
523 | ), | 521 | ), |
524 | 522 | ||
525 | TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", | 523 | TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", |
526 | MAJOR(__entry->dev), MINOR(__entry->dev), | 524 | MAJOR(__entry->dev), MINOR(__entry->dev), |
527 | (unsigned long) __entry->ino, | 525 | (unsigned long) __entry->ino, |
528 | __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) | 526 | __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) |
529 | ); | 527 | ); |
530 | 528 | ||
531 | DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, | 529 | DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, |
532 | 530 | ||
533 | TP_PROTO(struct ext4_allocation_context *ac, | 531 | TP_PROTO(struct ext4_allocation_context *ac, |
534 | struct ext4_prealloc_space *pa), | 532 | struct ext4_prealloc_space *pa), |
535 | 533 | ||
536 | TP_ARGS(ac, pa) | 534 | TP_ARGS(ac, pa) |
537 | ); | 535 | ); |
538 | 536 | ||
539 | DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, | 537 | DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, |
540 | 538 | ||
541 | TP_PROTO(struct ext4_allocation_context *ac, | 539 | TP_PROTO(struct ext4_allocation_context *ac, |
542 | struct ext4_prealloc_space *pa), | 540 | struct ext4_prealloc_space *pa), |
543 | 541 | ||
544 | TP_ARGS(ac, pa) | 542 | TP_ARGS(ac, pa) |
545 | ); | 543 | ); |
546 | 544 | ||
547 | TRACE_EVENT(ext4_mb_release_inode_pa, | 545 | TRACE_EVENT(ext4_mb_release_inode_pa, |
548 | TP_PROTO(struct ext4_prealloc_space *pa, | 546 | TP_PROTO(struct ext4_prealloc_space *pa, |
549 | unsigned long long block, unsigned int count), | 547 | unsigned long long block, unsigned int count), |
550 | 548 | ||
551 | TP_ARGS(pa, block, count), | 549 | TP_ARGS(pa, block, count), |
552 | 550 | ||
553 | TP_STRUCT__entry( | 551 | TP_STRUCT__entry( |
554 | __field( dev_t, dev ) | 552 | __field( dev_t, dev ) |
555 | __field( ino_t, ino ) | 553 | __field( ino_t, ino ) |
556 | __field( __u64, block ) | 554 | __field( __u64, block ) |
557 | __field( __u32, count ) | 555 | __field( __u32, count ) |
558 | 556 | ||
559 | ), | 557 | ), |
560 | 558 | ||
561 | TP_fast_assign( | 559 | TP_fast_assign( |
562 | __entry->dev = pa->pa_inode->i_sb->s_dev; | 560 | __entry->dev = pa->pa_inode->i_sb->s_dev; |
563 | __entry->ino = pa->pa_inode->i_ino; | 561 | __entry->ino = pa->pa_inode->i_ino; |
564 | __entry->block = block; | 562 | __entry->block = block; |
565 | __entry->count = count; | 563 | __entry->count = count; |
566 | ), | 564 | ), |
567 | 565 | ||
568 | TP_printk("dev %d,%d ino %lu block %llu count %u", | 566 | TP_printk("dev %d,%d ino %lu block %llu count %u", |
569 | MAJOR(__entry->dev), MINOR(__entry->dev), | 567 | MAJOR(__entry->dev), MINOR(__entry->dev), |
570 | (unsigned long) __entry->ino, | 568 | (unsigned long) __entry->ino, |
571 | __entry->block, __entry->count) | 569 | __entry->block, __entry->count) |
572 | ); | 570 | ); |
573 | 571 | ||
574 | TRACE_EVENT(ext4_mb_release_group_pa, | 572 | TRACE_EVENT(ext4_mb_release_group_pa, |
575 | TP_PROTO(struct ext4_prealloc_space *pa), | 573 | TP_PROTO(struct ext4_prealloc_space *pa), |
576 | 574 | ||
577 | TP_ARGS(pa), | 575 | TP_ARGS(pa), |
578 | 576 | ||
579 | TP_STRUCT__entry( | 577 | TP_STRUCT__entry( |
580 | __field( dev_t, dev ) | 578 | __field( dev_t, dev ) |
581 | __field( __u64, pa_pstart ) | 579 | __field( __u64, pa_pstart ) |
582 | __field( __u32, pa_len ) | 580 | __field( __u32, pa_len ) |
583 | 581 | ||
584 | ), | 582 | ), |
585 | 583 | ||
586 | TP_fast_assign( | 584 | TP_fast_assign( |
587 | __entry->dev = pa->pa_inode->i_sb->s_dev; | 585 | __entry->dev = pa->pa_inode->i_sb->s_dev; |
588 | __entry->pa_pstart = pa->pa_pstart; | 586 | __entry->pa_pstart = pa->pa_pstart; |
589 | __entry->pa_len = pa->pa_len; | 587 | __entry->pa_len = pa->pa_len; |
590 | ), | 588 | ), |
591 | 589 | ||
592 | TP_printk("dev %d,%d pstart %llu len %u", | 590 | TP_printk("dev %d,%d pstart %llu len %u", |
593 | MAJOR(__entry->dev), MINOR(__entry->dev), | 591 | MAJOR(__entry->dev), MINOR(__entry->dev), |
594 | __entry->pa_pstart, __entry->pa_len) | 592 | __entry->pa_pstart, __entry->pa_len) |
595 | ); | 593 | ); |
596 | 594 | ||
597 | TRACE_EVENT(ext4_discard_preallocations, | 595 | TRACE_EVENT(ext4_discard_preallocations, |
598 | TP_PROTO(struct inode *inode), | 596 | TP_PROTO(struct inode *inode), |
599 | 597 | ||
600 | TP_ARGS(inode), | 598 | TP_ARGS(inode), |
601 | 599 | ||
602 | TP_STRUCT__entry( | 600 | TP_STRUCT__entry( |
603 | __field( dev_t, dev ) | 601 | __field( dev_t, dev ) |
604 | __field( ino_t, ino ) | 602 | __field( ino_t, ino ) |
605 | 603 | ||
606 | ), | 604 | ), |
607 | 605 | ||
608 | TP_fast_assign( | 606 | TP_fast_assign( |
609 | __entry->dev = inode->i_sb->s_dev; | 607 | __entry->dev = inode->i_sb->s_dev; |
610 | __entry->ino = inode->i_ino; | 608 | __entry->ino = inode->i_ino; |
611 | ), | 609 | ), |
612 | 610 | ||
613 | TP_printk("dev %d,%d ino %lu", | 611 | TP_printk("dev %d,%d ino %lu", |
614 | MAJOR(__entry->dev), MINOR(__entry->dev), | 612 | MAJOR(__entry->dev), MINOR(__entry->dev), |
615 | (unsigned long) __entry->ino) | 613 | (unsigned long) __entry->ino) |
616 | ); | 614 | ); |
617 | 615 | ||
618 | TRACE_EVENT(ext4_mb_discard_preallocations, | 616 | TRACE_EVENT(ext4_mb_discard_preallocations, |
619 | TP_PROTO(struct super_block *sb, int needed), | 617 | TP_PROTO(struct super_block *sb, int needed), |
620 | 618 | ||
621 | TP_ARGS(sb, needed), | 619 | TP_ARGS(sb, needed), |
622 | 620 | ||
623 | TP_STRUCT__entry( | 621 | TP_STRUCT__entry( |
624 | __field( dev_t, dev ) | 622 | __field( dev_t, dev ) |
625 | __field( int, needed ) | 623 | __field( int, needed ) |
626 | 624 | ||
627 | ), | 625 | ), |
628 | 626 | ||
629 | TP_fast_assign( | 627 | TP_fast_assign( |
630 | __entry->dev = sb->s_dev; | 628 | __entry->dev = sb->s_dev; |
631 | __entry->needed = needed; | 629 | __entry->needed = needed; |
632 | ), | 630 | ), |
633 | 631 | ||
634 | TP_printk("dev %d,%d needed %d", | 632 | TP_printk("dev %d,%d needed %d", |
635 | MAJOR(__entry->dev), MINOR(__entry->dev), | 633 | MAJOR(__entry->dev), MINOR(__entry->dev), |
636 | __entry->needed) | 634 | __entry->needed) |
637 | ); | 635 | ); |
638 | 636 | ||
639 | TRACE_EVENT(ext4_request_blocks, | 637 | TRACE_EVENT(ext4_request_blocks, |
640 | TP_PROTO(struct ext4_allocation_request *ar), | 638 | TP_PROTO(struct ext4_allocation_request *ar), |
641 | 639 | ||
642 | TP_ARGS(ar), | 640 | TP_ARGS(ar), |
643 | 641 | ||
644 | TP_STRUCT__entry( | 642 | TP_STRUCT__entry( |
645 | __field( dev_t, dev ) | 643 | __field( dev_t, dev ) |
646 | __field( ino_t, ino ) | 644 | __field( ino_t, ino ) |
647 | __field( unsigned int, flags ) | 645 | __field( unsigned int, flags ) |
648 | __field( unsigned int, len ) | 646 | __field( unsigned int, len ) |
649 | __field( __u32, logical ) | 647 | __field( __u32, logical ) |
650 | __field( __u32, lleft ) | 648 | __field( __u32, lleft ) |
651 | __field( __u32, lright ) | 649 | __field( __u32, lright ) |
652 | __field( __u64, goal ) | 650 | __field( __u64, goal ) |
653 | __field( __u64, pleft ) | 651 | __field( __u64, pleft ) |
654 | __field( __u64, pright ) | 652 | __field( __u64, pright ) |
655 | ), | 653 | ), |
656 | 654 | ||
657 | TP_fast_assign( | 655 | TP_fast_assign( |
658 | __entry->dev = ar->inode->i_sb->s_dev; | 656 | __entry->dev = ar->inode->i_sb->s_dev; |
659 | __entry->ino = ar->inode->i_ino; | 657 | __entry->ino = ar->inode->i_ino; |
660 | __entry->flags = ar->flags; | 658 | __entry->flags = ar->flags; |
661 | __entry->len = ar->len; | 659 | __entry->len = ar->len; |
662 | __entry->logical = ar->logical; | 660 | __entry->logical = ar->logical; |
663 | __entry->goal = ar->goal; | 661 | __entry->goal = ar->goal; |
664 | __entry->lleft = ar->lleft; | 662 | __entry->lleft = ar->lleft; |
665 | __entry->lright = ar->lright; | 663 | __entry->lright = ar->lright; |
666 | __entry->pleft = ar->pleft; | 664 | __entry->pleft = ar->pleft; |
667 | __entry->pright = ar->pright; | 665 | __entry->pright = ar->pright; |
668 | ), | 666 | ), |
669 | 667 | ||
670 | TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu " | 668 | TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu " |
671 | "lleft %u lright %u pleft %llu pright %llu", | 669 | "lleft %u lright %u pleft %llu pright %llu", |
672 | MAJOR(__entry->dev), MINOR(__entry->dev), | 670 | MAJOR(__entry->dev), MINOR(__entry->dev), |
673 | (unsigned long) __entry->ino, __entry->flags, | 671 | (unsigned long) __entry->ino, __entry->flags, |
674 | __entry->len, __entry->logical, __entry->goal, | 672 | __entry->len, __entry->logical, __entry->goal, |
675 | __entry->lleft, __entry->lright, __entry->pleft, | 673 | __entry->lleft, __entry->lright, __entry->pleft, |
676 | __entry->pright) | 674 | __entry->pright) |
677 | ); | 675 | ); |
678 | 676 | ||
679 | TRACE_EVENT(ext4_allocate_blocks, | 677 | TRACE_EVENT(ext4_allocate_blocks, |
680 | TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), | 678 | TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), |
681 | 679 | ||
682 | TP_ARGS(ar, block), | 680 | TP_ARGS(ar, block), |
683 | 681 | ||
684 | TP_STRUCT__entry( | 682 | TP_STRUCT__entry( |
685 | __field( dev_t, dev ) | 683 | __field( dev_t, dev ) |
686 | __field( ino_t, ino ) | 684 | __field( ino_t, ino ) |
687 | __field( __u64, block ) | 685 | __field( __u64, block ) |
688 | __field( unsigned int, flags ) | 686 | __field( unsigned int, flags ) |
689 | __field( unsigned int, len ) | 687 | __field( unsigned int, len ) |
690 | __field( __u32, logical ) | 688 | __field( __u32, logical ) |
691 | __field( __u32, lleft ) | 689 | __field( __u32, lleft ) |
692 | __field( __u32, lright ) | 690 | __field( __u32, lright ) |
693 | __field( __u64, goal ) | 691 | __field( __u64, goal ) |
694 | __field( __u64, pleft ) | 692 | __field( __u64, pleft ) |
695 | __field( __u64, pright ) | 693 | __field( __u64, pright ) |
696 | ), | 694 | ), |
697 | 695 | ||
698 | TP_fast_assign( | 696 | TP_fast_assign( |
699 | __entry->dev = ar->inode->i_sb->s_dev; | 697 | __entry->dev = ar->inode->i_sb->s_dev; |
700 | __entry->ino = ar->inode->i_ino; | 698 | __entry->ino = ar->inode->i_ino; |
701 | __entry->block = block; | 699 | __entry->block = block; |
702 | __entry->flags = ar->flags; | 700 | __entry->flags = ar->flags; |
703 | __entry->len = ar->len; | 701 | __entry->len = ar->len; |
704 | __entry->logical = ar->logical; | 702 | __entry->logical = ar->logical; |
705 | __entry->goal = ar->goal; | 703 | __entry->goal = ar->goal; |
706 | __entry->lleft = ar->lleft; | 704 | __entry->lleft = ar->lleft; |
707 | __entry->lright = ar->lright; | 705 | __entry->lright = ar->lright; |
708 | __entry->pleft = ar->pleft; | 706 | __entry->pleft = ar->pleft; |
709 | __entry->pright = ar->pright; | 707 | __entry->pright = ar->pright; |
710 | ), | 708 | ), |
711 | 709 | ||
712 | TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u " | 710 | TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u " |
713 | "goal %llu lleft %u lright %u pleft %llu pright %llu", | 711 | "goal %llu lleft %u lright %u pleft %llu pright %llu", |
714 | MAJOR(__entry->dev), MINOR(__entry->dev), | 712 | MAJOR(__entry->dev), MINOR(__entry->dev), |
715 | (unsigned long) __entry->ino, __entry->flags, | 713 | (unsigned long) __entry->ino, __entry->flags, |
716 | __entry->len, __entry->block, __entry->logical, | 714 | __entry->len, __entry->block, __entry->logical, |
717 | __entry->goal, __entry->lleft, __entry->lright, | 715 | __entry->goal, __entry->lleft, __entry->lright, |
718 | __entry->pleft, __entry->pright) | 716 | __entry->pleft, __entry->pright) |
719 | ); | 717 | ); |
720 | 718 | ||
721 | TRACE_EVENT(ext4_free_blocks, | 719 | TRACE_EVENT(ext4_free_blocks, |
722 | TP_PROTO(struct inode *inode, __u64 block, unsigned long count, | 720 | TP_PROTO(struct inode *inode, __u64 block, unsigned long count, |
723 | int flags), | 721 | int flags), |
724 | 722 | ||
725 | TP_ARGS(inode, block, count, flags), | 723 | TP_ARGS(inode, block, count, flags), |
726 | 724 | ||
727 | TP_STRUCT__entry( | 725 | TP_STRUCT__entry( |
728 | __field( dev_t, dev ) | 726 | __field( dev_t, dev ) |
729 | __field( ino_t, ino ) | 727 | __field( ino_t, ino ) |
730 | __field( umode_t, mode ) | 728 | __field( umode_t, mode ) |
731 | __field( __u64, block ) | 729 | __field( __u64, block ) |
732 | __field( unsigned long, count ) | 730 | __field( unsigned long, count ) |
733 | __field( int, flags ) | 731 | __field( int, flags ) |
734 | ), | 732 | ), |
735 | 733 | ||
736 | TP_fast_assign( | 734 | TP_fast_assign( |
737 | __entry->dev = inode->i_sb->s_dev; | 735 | __entry->dev = inode->i_sb->s_dev; |
738 | __entry->ino = inode->i_ino; | 736 | __entry->ino = inode->i_ino; |
739 | __entry->mode = inode->i_mode; | 737 | __entry->mode = inode->i_mode; |
740 | __entry->block = block; | 738 | __entry->block = block; |
741 | __entry->count = count; | 739 | __entry->count = count; |
742 | __entry->flags = flags; | 740 | __entry->flags = flags; |
743 | ), | 741 | ), |
744 | 742 | ||
745 | TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", | 743 | TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", |
746 | MAJOR(__entry->dev), MINOR(__entry->dev), | 744 | MAJOR(__entry->dev), MINOR(__entry->dev), |
747 | (unsigned long) __entry->ino, | 745 | (unsigned long) __entry->ino, |
748 | __entry->mode, __entry->block, __entry->count, | 746 | __entry->mode, __entry->block, __entry->count, |
749 | __entry->flags) | 747 | __entry->flags) |
750 | ); | 748 | ); |
751 | 749 | ||
752 | TRACE_EVENT(ext4_sync_file_enter, | 750 | TRACE_EVENT(ext4_sync_file_enter, |
753 | TP_PROTO(struct file *file, int datasync), | 751 | TP_PROTO(struct file *file, int datasync), |
754 | 752 | ||
755 | TP_ARGS(file, datasync), | 753 | TP_ARGS(file, datasync), |
756 | 754 | ||
757 | TP_STRUCT__entry( | 755 | TP_STRUCT__entry( |
758 | __field( dev_t, dev ) | 756 | __field( dev_t, dev ) |
759 | __field( ino_t, ino ) | 757 | __field( ino_t, ino ) |
760 | __field( ino_t, parent ) | 758 | __field( ino_t, parent ) |
761 | __field( int, datasync ) | 759 | __field( int, datasync ) |
762 | ), | 760 | ), |
763 | 761 | ||
764 | TP_fast_assign( | 762 | TP_fast_assign( |
765 | struct dentry *dentry = file->f_path.dentry; | 763 | struct dentry *dentry = file->f_path.dentry; |
766 | 764 | ||
767 | __entry->dev = dentry->d_inode->i_sb->s_dev; | 765 | __entry->dev = dentry->d_inode->i_sb->s_dev; |
768 | __entry->ino = dentry->d_inode->i_ino; | 766 | __entry->ino = dentry->d_inode->i_ino; |
769 | __entry->datasync = datasync; | 767 | __entry->datasync = datasync; |
770 | __entry->parent = dentry->d_parent->d_inode->i_ino; | 768 | __entry->parent = dentry->d_parent->d_inode->i_ino; |
771 | ), | 769 | ), |
772 | 770 | ||
773 | TP_printk("dev %d,%d ino %lu parent %lu datasync %d", | 771 | TP_printk("dev %d,%d ino %lu parent %lu datasync %d", |
774 | MAJOR(__entry->dev), MINOR(__entry->dev), | 772 | MAJOR(__entry->dev), MINOR(__entry->dev), |
775 | (unsigned long) __entry->ino, | 773 | (unsigned long) __entry->ino, |
776 | (unsigned long) __entry->parent, __entry->datasync) | 774 | (unsigned long) __entry->parent, __entry->datasync) |
777 | ); | 775 | ); |
778 | 776 | ||
779 | TRACE_EVENT(ext4_sync_file_exit, | 777 | TRACE_EVENT(ext4_sync_file_exit, |
780 | TP_PROTO(struct inode *inode, int ret), | 778 | TP_PROTO(struct inode *inode, int ret), |
781 | 779 | ||
782 | TP_ARGS(inode, ret), | 780 | TP_ARGS(inode, ret), |
783 | 781 | ||
784 | TP_STRUCT__entry( | 782 | TP_STRUCT__entry( |
785 | __field( int, ret ) | 783 | __field( int, ret ) |
786 | __field( ino_t, ino ) | 784 | __field( ino_t, ino ) |
787 | __field( dev_t, dev ) | 785 | __field( dev_t, dev ) |
788 | ), | 786 | ), |
789 | 787 | ||
790 | TP_fast_assign( | 788 | TP_fast_assign( |
791 | __entry->ret = ret; | 789 | __entry->ret = ret; |
792 | __entry->ino = inode->i_ino; | 790 | __entry->ino = inode->i_ino; |
793 | __entry->dev = inode->i_sb->s_dev; | 791 | __entry->dev = inode->i_sb->s_dev; |
794 | ), | 792 | ), |
795 | 793 | ||
796 | TP_printk("dev %d,%d ino %lu ret %d", | 794 | TP_printk("dev %d,%d ino %lu ret %d", |
797 | MAJOR(__entry->dev), MINOR(__entry->dev), | 795 | MAJOR(__entry->dev), MINOR(__entry->dev), |
798 | (unsigned long) __entry->ino, | 796 | (unsigned long) __entry->ino, |
799 | __entry->ret) | 797 | __entry->ret) |
800 | ); | 798 | ); |
801 | 799 | ||
802 | TRACE_EVENT(ext4_sync_fs, | 800 | TRACE_EVENT(ext4_sync_fs, |
803 | TP_PROTO(struct super_block *sb, int wait), | 801 | TP_PROTO(struct super_block *sb, int wait), |
804 | 802 | ||
805 | TP_ARGS(sb, wait), | 803 | TP_ARGS(sb, wait), |
806 | 804 | ||
807 | TP_STRUCT__entry( | 805 | TP_STRUCT__entry( |
808 | __field( dev_t, dev ) | 806 | __field( dev_t, dev ) |
809 | __field( int, wait ) | 807 | __field( int, wait ) |
810 | 808 | ||
811 | ), | 809 | ), |
812 | 810 | ||
813 | TP_fast_assign( | 811 | TP_fast_assign( |
814 | __entry->dev = sb->s_dev; | 812 | __entry->dev = sb->s_dev; |
815 | __entry->wait = wait; | 813 | __entry->wait = wait; |
816 | ), | 814 | ), |
817 | 815 | ||
818 | TP_printk("dev %d,%d wait %d", | 816 | TP_printk("dev %d,%d wait %d", |
819 | MAJOR(__entry->dev), MINOR(__entry->dev), | 817 | MAJOR(__entry->dev), MINOR(__entry->dev), |
820 | __entry->wait) | 818 | __entry->wait) |
821 | ); | 819 | ); |
822 | 820 | ||
823 | TRACE_EVENT(ext4_alloc_da_blocks, | 821 | TRACE_EVENT(ext4_alloc_da_blocks, |
824 | TP_PROTO(struct inode *inode), | 822 | TP_PROTO(struct inode *inode), |
825 | 823 | ||
826 | TP_ARGS(inode), | 824 | TP_ARGS(inode), |
827 | 825 | ||
828 | TP_STRUCT__entry( | 826 | TP_STRUCT__entry( |
829 | __field( dev_t, dev ) | 827 | __field( dev_t, dev ) |
830 | __field( ino_t, ino ) | 828 | __field( ino_t, ino ) |
831 | __field( unsigned int, data_blocks ) | 829 | __field( unsigned int, data_blocks ) |
832 | __field( unsigned int, meta_blocks ) | 830 | __field( unsigned int, meta_blocks ) |
833 | ), | 831 | ), |
834 | 832 | ||
835 | TP_fast_assign( | 833 | TP_fast_assign( |
836 | __entry->dev = inode->i_sb->s_dev; | 834 | __entry->dev = inode->i_sb->s_dev; |
837 | __entry->ino = inode->i_ino; | 835 | __entry->ino = inode->i_ino; |
838 | __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; | 836 | __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; |
839 | __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; | 837 | __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; |
840 | ), | 838 | ), |
841 | 839 | ||
842 | TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u", | 840 | TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u", |
843 | MAJOR(__entry->dev), MINOR(__entry->dev), | 841 | MAJOR(__entry->dev), MINOR(__entry->dev), |
844 | (unsigned long) __entry->ino, | 842 | (unsigned long) __entry->ino, |
845 | __entry->data_blocks, __entry->meta_blocks) | 843 | __entry->data_blocks, __entry->meta_blocks) |
846 | ); | 844 | ); |
847 | 845 | ||
848 | TRACE_EVENT(ext4_mballoc_alloc, | 846 | TRACE_EVENT(ext4_mballoc_alloc, |
849 | TP_PROTO(struct ext4_allocation_context *ac), | 847 | TP_PROTO(struct ext4_allocation_context *ac), |
850 | 848 | ||
851 | TP_ARGS(ac), | 849 | TP_ARGS(ac), |
852 | 850 | ||
853 | TP_STRUCT__entry( | 851 | TP_STRUCT__entry( |
854 | __field( dev_t, dev ) | 852 | __field( dev_t, dev ) |
855 | __field( ino_t, ino ) | 853 | __field( ino_t, ino ) |
856 | __field( __u16, found ) | 854 | __field( __u16, found ) |
857 | __field( __u16, groups ) | 855 | __field( __u16, groups ) |
858 | __field( __u16, buddy ) | 856 | __field( __u16, buddy ) |
859 | __field( __u16, flags ) | 857 | __field( __u16, flags ) |
860 | __field( __u16, tail ) | 858 | __field( __u16, tail ) |
861 | __field( __u8, cr ) | 859 | __field( __u8, cr ) |
862 | __field( __u32, orig_logical ) | 860 | __field( __u32, orig_logical ) |
863 | __field( int, orig_start ) | 861 | __field( int, orig_start ) |
864 | __field( __u32, orig_group ) | 862 | __field( __u32, orig_group ) |
865 | __field( int, orig_len ) | 863 | __field( int, orig_len ) |
866 | __field( __u32, goal_logical ) | 864 | __field( __u32, goal_logical ) |
867 | __field( int, goal_start ) | 865 | __field( int, goal_start ) |
868 | __field( __u32, goal_group ) | 866 | __field( __u32, goal_group ) |
869 | __field( int, goal_len ) | 867 | __field( int, goal_len ) |
870 | __field( __u32, result_logical ) | 868 | __field( __u32, result_logical ) |
871 | __field( int, result_start ) | 869 | __field( int, result_start ) |
872 | __field( __u32, result_group ) | 870 | __field( __u32, result_group ) |
873 | __field( int, result_len ) | 871 | __field( int, result_len ) |
874 | ), | 872 | ), |
875 | 873 | ||
876 | TP_fast_assign( | 874 | TP_fast_assign( |
877 | __entry->dev = ac->ac_inode->i_sb->s_dev; | 875 | __entry->dev = ac->ac_inode->i_sb->s_dev; |
878 | __entry->ino = ac->ac_inode->i_ino; | 876 | __entry->ino = ac->ac_inode->i_ino; |
879 | __entry->found = ac->ac_found; | 877 | __entry->found = ac->ac_found; |
880 | __entry->flags = ac->ac_flags; | 878 | __entry->flags = ac->ac_flags; |
881 | __entry->groups = ac->ac_groups_scanned; | 879 | __entry->groups = ac->ac_groups_scanned; |
882 | __entry->buddy = ac->ac_buddy; | 880 | __entry->buddy = ac->ac_buddy; |
883 | __entry->tail = ac->ac_tail; | 881 | __entry->tail = ac->ac_tail; |
884 | __entry->cr = ac->ac_criteria; | 882 | __entry->cr = ac->ac_criteria; |
885 | __entry->orig_logical = ac->ac_o_ex.fe_logical; | 883 | __entry->orig_logical = ac->ac_o_ex.fe_logical; |
886 | __entry->orig_start = ac->ac_o_ex.fe_start; | 884 | __entry->orig_start = ac->ac_o_ex.fe_start; |
887 | __entry->orig_group = ac->ac_o_ex.fe_group; | 885 | __entry->orig_group = ac->ac_o_ex.fe_group; |
888 | __entry->orig_len = ac->ac_o_ex.fe_len; | 886 | __entry->orig_len = ac->ac_o_ex.fe_len; |
889 | __entry->goal_logical = ac->ac_g_ex.fe_logical; | 887 | __entry->goal_logical = ac->ac_g_ex.fe_logical; |
890 | __entry->goal_start = ac->ac_g_ex.fe_start; | 888 | __entry->goal_start = ac->ac_g_ex.fe_start; |
891 | __entry->goal_group = ac->ac_g_ex.fe_group; | 889 | __entry->goal_group = ac->ac_g_ex.fe_group; |
892 | __entry->goal_len = ac->ac_g_ex.fe_len; | 890 | __entry->goal_len = ac->ac_g_ex.fe_len; |
893 | __entry->result_logical = ac->ac_f_ex.fe_logical; | 891 | __entry->result_logical = ac->ac_f_ex.fe_logical; |
894 | __entry->result_start = ac->ac_f_ex.fe_start; | 892 | __entry->result_start = ac->ac_f_ex.fe_start; |
895 | __entry->result_group = ac->ac_f_ex.fe_group; | 893 | __entry->result_group = ac->ac_f_ex.fe_group; |
896 | __entry->result_len = ac->ac_f_ex.fe_len; | 894 | __entry->result_len = ac->ac_f_ex.fe_len; |
897 | ), | 895 | ), |
898 | 896 | ||
899 | TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " | 897 | TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " |
900 | "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " | 898 | "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " |
901 | "tail %u broken %u", | 899 | "tail %u broken %u", |
902 | MAJOR(__entry->dev), MINOR(__entry->dev), | 900 | MAJOR(__entry->dev), MINOR(__entry->dev), |
903 | (unsigned long) __entry->ino, | 901 | (unsigned long) __entry->ino, |
904 | __entry->orig_group, __entry->orig_start, | 902 | __entry->orig_group, __entry->orig_start, |
905 | __entry->orig_len, __entry->orig_logical, | 903 | __entry->orig_len, __entry->orig_logical, |
906 | __entry->goal_group, __entry->goal_start, | 904 | __entry->goal_group, __entry->goal_start, |
907 | __entry->goal_len, __entry->goal_logical, | 905 | __entry->goal_len, __entry->goal_logical, |
908 | __entry->result_group, __entry->result_start, | 906 | __entry->result_group, __entry->result_start, |
909 | __entry->result_len, __entry->result_logical, | 907 | __entry->result_len, __entry->result_logical, |
910 | __entry->found, __entry->groups, __entry->cr, | 908 | __entry->found, __entry->groups, __entry->cr, |
911 | __entry->flags, __entry->tail, | 909 | __entry->flags, __entry->tail, |
912 | __entry->buddy ? 1 << __entry->buddy : 0) | 910 | __entry->buddy ? 1 << __entry->buddy : 0) |
913 | ); | 911 | ); |
914 | 912 | ||
915 | TRACE_EVENT(ext4_mballoc_prealloc, | 913 | TRACE_EVENT(ext4_mballoc_prealloc, |
916 | TP_PROTO(struct ext4_allocation_context *ac), | 914 | TP_PROTO(struct ext4_allocation_context *ac), |
917 | 915 | ||
918 | TP_ARGS(ac), | 916 | TP_ARGS(ac), |
919 | 917 | ||
920 | TP_STRUCT__entry( | 918 | TP_STRUCT__entry( |
921 | __field( dev_t, dev ) | 919 | __field( dev_t, dev ) |
922 | __field( ino_t, ino ) | 920 | __field( ino_t, ino ) |
923 | __field( __u32, orig_logical ) | 921 | __field( __u32, orig_logical ) |
924 | __field( int, orig_start ) | 922 | __field( int, orig_start ) |
925 | __field( __u32, orig_group ) | 923 | __field( __u32, orig_group ) |
926 | __field( int, orig_len ) | 924 | __field( int, orig_len ) |
927 | __field( __u32, result_logical ) | 925 | __field( __u32, result_logical ) |
928 | __field( int, result_start ) | 926 | __field( int, result_start ) |
929 | __field( __u32, result_group ) | 927 | __field( __u32, result_group ) |
930 | __field( int, result_len ) | 928 | __field( int, result_len ) |
931 | ), | 929 | ), |
932 | 930 | ||
933 | TP_fast_assign( | 931 | TP_fast_assign( |
934 | __entry->dev = ac->ac_inode->i_sb->s_dev; | 932 | __entry->dev = ac->ac_inode->i_sb->s_dev; |
935 | __entry->ino = ac->ac_inode->i_ino; | 933 | __entry->ino = ac->ac_inode->i_ino; |
936 | __entry->orig_logical = ac->ac_o_ex.fe_logical; | 934 | __entry->orig_logical = ac->ac_o_ex.fe_logical; |
937 | __entry->orig_start = ac->ac_o_ex.fe_start; | 935 | __entry->orig_start = ac->ac_o_ex.fe_start; |
938 | __entry->orig_group = ac->ac_o_ex.fe_group; | 936 | __entry->orig_group = ac->ac_o_ex.fe_group; |
939 | __entry->orig_len = ac->ac_o_ex.fe_len; | 937 | __entry->orig_len = ac->ac_o_ex.fe_len; |
940 | __entry->result_logical = ac->ac_b_ex.fe_logical; | 938 | __entry->result_logical = ac->ac_b_ex.fe_logical; |
941 | __entry->result_start = ac->ac_b_ex.fe_start; | 939 | __entry->result_start = ac->ac_b_ex.fe_start; |
942 | __entry->result_group = ac->ac_b_ex.fe_group; | 940 | __entry->result_group = ac->ac_b_ex.fe_group; |
943 | __entry->result_len = ac->ac_b_ex.fe_len; | 941 | __entry->result_len = ac->ac_b_ex.fe_len; |
944 | ), | 942 | ), |
945 | 943 | ||
946 | TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", | 944 | TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", |
947 | MAJOR(__entry->dev), MINOR(__entry->dev), | 945 | MAJOR(__entry->dev), MINOR(__entry->dev), |
948 | (unsigned long) __entry->ino, | 946 | (unsigned long) __entry->ino, |
949 | __entry->orig_group, __entry->orig_start, | 947 | __entry->orig_group, __entry->orig_start, |
950 | __entry->orig_len, __entry->orig_logical, | 948 | __entry->orig_len, __entry->orig_logical, |
951 | __entry->result_group, __entry->result_start, | 949 | __entry->result_group, __entry->result_start, |
952 | __entry->result_len, __entry->result_logical) | 950 | __entry->result_len, __entry->result_logical) |
953 | ); | 951 | ); |
954 | 952 | ||
955 | DECLARE_EVENT_CLASS(ext4__mballoc, | 953 | DECLARE_EVENT_CLASS(ext4__mballoc, |
956 | TP_PROTO(struct super_block *sb, | 954 | TP_PROTO(struct super_block *sb, |
957 | struct inode *inode, | 955 | struct inode *inode, |
958 | ext4_group_t group, | 956 | ext4_group_t group, |
959 | ext4_grpblk_t start, | 957 | ext4_grpblk_t start, |
960 | ext4_grpblk_t len), | 958 | ext4_grpblk_t len), |
961 | 959 | ||
962 | TP_ARGS(sb, inode, group, start, len), | 960 | TP_ARGS(sb, inode, group, start, len), |
963 | 961 | ||
964 | TP_STRUCT__entry( | 962 | TP_STRUCT__entry( |
965 | __field( dev_t, dev ) | 963 | __field( dev_t, dev ) |
966 | __field( ino_t, ino ) | 964 | __field( ino_t, ino ) |
967 | __field( int, result_start ) | 965 | __field( int, result_start ) |
968 | __field( __u32, result_group ) | 966 | __field( __u32, result_group ) |
969 | __field( int, result_len ) | 967 | __field( int, result_len ) |
970 | ), | 968 | ), |
971 | 969 | ||
972 | TP_fast_assign( | 970 | TP_fast_assign( |
973 | __entry->dev = sb->s_dev; | 971 | __entry->dev = sb->s_dev; |
974 | __entry->ino = inode ? inode->i_ino : 0; | 972 | __entry->ino = inode ? inode->i_ino : 0; |
975 | __entry->result_start = start; | 973 | __entry->result_start = start; |
976 | __entry->result_group = group; | 974 | __entry->result_group = group; |
977 | __entry->result_len = len; | 975 | __entry->result_len = len; |
978 | ), | 976 | ), |
979 | 977 | ||
980 | TP_printk("dev %d,%d inode %lu extent %u/%d/%d", | 978 | TP_printk("dev %d,%d inode %lu extent %u/%d/%d", |
981 | MAJOR(__entry->dev), MINOR(__entry->dev), | 979 | MAJOR(__entry->dev), MINOR(__entry->dev), |
982 | (unsigned long) __entry->ino, | 980 | (unsigned long) __entry->ino, |
983 | __entry->result_group, __entry->result_start, | 981 | __entry->result_group, __entry->result_start, |
984 | __entry->result_len) | 982 | __entry->result_len) |
985 | ); | 983 | ); |
986 | 984 | ||
987 | DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, | 985 | DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, |
988 | 986 | ||
989 | TP_PROTO(struct super_block *sb, | 987 | TP_PROTO(struct super_block *sb, |
990 | struct inode *inode, | 988 | struct inode *inode, |
991 | ext4_group_t group, | 989 | ext4_group_t group, |
992 | ext4_grpblk_t start, | 990 | ext4_grpblk_t start, |
993 | ext4_grpblk_t len), | 991 | ext4_grpblk_t len), |
994 | 992 | ||
995 | TP_ARGS(sb, inode, group, start, len) | 993 | TP_ARGS(sb, inode, group, start, len) |
996 | ); | 994 | ); |
997 | 995 | ||
998 | DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, | 996 | DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, |
999 | 997 | ||
1000 | TP_PROTO(struct super_block *sb, | 998 | TP_PROTO(struct super_block *sb, |
1001 | struct inode *inode, | 999 | struct inode *inode, |
1002 | ext4_group_t group, | 1000 | ext4_group_t group, |
1003 | ext4_grpblk_t start, | 1001 | ext4_grpblk_t start, |
1004 | ext4_grpblk_t len), | 1002 | ext4_grpblk_t len), |
1005 | 1003 | ||
1006 | TP_ARGS(sb, inode, group, start, len) | 1004 | TP_ARGS(sb, inode, group, start, len) |
1007 | ); | 1005 | ); |
1008 | 1006 | ||
1009 | TRACE_EVENT(ext4_forget, | 1007 | TRACE_EVENT(ext4_forget, |
1010 | TP_PROTO(struct inode *inode, int is_metadata, __u64 block), | 1008 | TP_PROTO(struct inode *inode, int is_metadata, __u64 block), |
1011 | 1009 | ||
1012 | TP_ARGS(inode, is_metadata, block), | 1010 | TP_ARGS(inode, is_metadata, block), |
1013 | 1011 | ||
1014 | TP_STRUCT__entry( | 1012 | TP_STRUCT__entry( |
1015 | __field( dev_t, dev ) | 1013 | __field( dev_t, dev ) |
1016 | __field( ino_t, ino ) | 1014 | __field( ino_t, ino ) |
1017 | __field( umode_t, mode ) | 1015 | __field( umode_t, mode ) |
1018 | __field( int, is_metadata ) | 1016 | __field( int, is_metadata ) |
1019 | __field( __u64, block ) | 1017 | __field( __u64, block ) |
1020 | ), | 1018 | ), |
1021 | 1019 | ||
1022 | TP_fast_assign( | 1020 | TP_fast_assign( |
1023 | __entry->dev = inode->i_sb->s_dev; | 1021 | __entry->dev = inode->i_sb->s_dev; |
1024 | __entry->ino = inode->i_ino; | 1022 | __entry->ino = inode->i_ino; |
1025 | __entry->mode = inode->i_mode; | 1023 | __entry->mode = inode->i_mode; |
1026 | __entry->is_metadata = is_metadata; | 1024 | __entry->is_metadata = is_metadata; |
1027 | __entry->block = block; | 1025 | __entry->block = block; |
1028 | ), | 1026 | ), |
1029 | 1027 | ||
1030 | TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", | 1028 | TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", |
1031 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1029 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1032 | (unsigned long) __entry->ino, | 1030 | (unsigned long) __entry->ino, |
1033 | __entry->mode, __entry->is_metadata, __entry->block) | 1031 | __entry->mode, __entry->is_metadata, __entry->block) |
1034 | ); | 1032 | ); |
1035 | 1033 | ||
1036 | TRACE_EVENT(ext4_da_update_reserve_space, | 1034 | TRACE_EVENT(ext4_da_update_reserve_space, |
1037 | TP_PROTO(struct inode *inode, int used_blocks), | 1035 | TP_PROTO(struct inode *inode, int used_blocks), |
1038 | 1036 | ||
1039 | TP_ARGS(inode, used_blocks), | 1037 | TP_ARGS(inode, used_blocks), |
1040 | 1038 | ||
1041 | TP_STRUCT__entry( | 1039 | TP_STRUCT__entry( |
1042 | __field( dev_t, dev ) | 1040 | __field( dev_t, dev ) |
1043 | __field( ino_t, ino ) | 1041 | __field( ino_t, ino ) |
1044 | __field( umode_t, mode ) | 1042 | __field( umode_t, mode ) |
1045 | __field( __u64, i_blocks ) | 1043 | __field( __u64, i_blocks ) |
1046 | __field( int, used_blocks ) | 1044 | __field( int, used_blocks ) |
1047 | __field( int, reserved_data_blocks ) | 1045 | __field( int, reserved_data_blocks ) |
1048 | __field( int, reserved_meta_blocks ) | 1046 | __field( int, reserved_meta_blocks ) |
1049 | __field( int, allocated_meta_blocks ) | 1047 | __field( int, allocated_meta_blocks ) |
1050 | ), | 1048 | ), |
1051 | 1049 | ||
1052 | TP_fast_assign( | 1050 | TP_fast_assign( |
1053 | __entry->dev = inode->i_sb->s_dev; | 1051 | __entry->dev = inode->i_sb->s_dev; |
1054 | __entry->ino = inode->i_ino; | 1052 | __entry->ino = inode->i_ino; |
1055 | __entry->mode = inode->i_mode; | 1053 | __entry->mode = inode->i_mode; |
1056 | __entry->i_blocks = inode->i_blocks; | 1054 | __entry->i_blocks = inode->i_blocks; |
1057 | __entry->used_blocks = used_blocks; | 1055 | __entry->used_blocks = used_blocks; |
1058 | __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; | 1056 | __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; |
1059 | __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; | 1057 | __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; |
1060 | __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; | 1058 | __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; |
1061 | ), | 1059 | ), |
1062 | 1060 | ||
1063 | TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " | 1061 | TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " |
1064 | "reserved_data_blocks %d reserved_meta_blocks %d " | 1062 | "reserved_data_blocks %d reserved_meta_blocks %d " |
1065 | "allocated_meta_blocks %d", | 1063 | "allocated_meta_blocks %d", |
1066 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1064 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1067 | (unsigned long) __entry->ino, | 1065 | (unsigned long) __entry->ino, |
1068 | __entry->mode, __entry->i_blocks, | 1066 | __entry->mode, __entry->i_blocks, |
1069 | __entry->used_blocks, __entry->reserved_data_blocks, | 1067 | __entry->used_blocks, __entry->reserved_data_blocks, |
1070 | __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) | 1068 | __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) |
1071 | ); | 1069 | ); |
1072 | 1070 | ||
1073 | TRACE_EVENT(ext4_da_reserve_space, | 1071 | TRACE_EVENT(ext4_da_reserve_space, |
1074 | TP_PROTO(struct inode *inode, int md_needed), | 1072 | TP_PROTO(struct inode *inode, int md_needed), |
1075 | 1073 | ||
1076 | TP_ARGS(inode, md_needed), | 1074 | TP_ARGS(inode, md_needed), |
1077 | 1075 | ||
1078 | TP_STRUCT__entry( | 1076 | TP_STRUCT__entry( |
1079 | __field( dev_t, dev ) | 1077 | __field( dev_t, dev ) |
1080 | __field( ino_t, ino ) | 1078 | __field( ino_t, ino ) |
1081 | __field( umode_t, mode ) | 1079 | __field( umode_t, mode ) |
1082 | __field( __u64, i_blocks ) | 1080 | __field( __u64, i_blocks ) |
1083 | __field( int, md_needed ) | 1081 | __field( int, md_needed ) |
1084 | __field( int, reserved_data_blocks ) | 1082 | __field( int, reserved_data_blocks ) |
1085 | __field( int, reserved_meta_blocks ) | 1083 | __field( int, reserved_meta_blocks ) |
1086 | ), | 1084 | ), |
1087 | 1085 | ||
1088 | TP_fast_assign( | 1086 | TP_fast_assign( |
1089 | __entry->dev = inode->i_sb->s_dev; | 1087 | __entry->dev = inode->i_sb->s_dev; |
1090 | __entry->ino = inode->i_ino; | 1088 | __entry->ino = inode->i_ino; |
1091 | __entry->mode = inode->i_mode; | 1089 | __entry->mode = inode->i_mode; |
1092 | __entry->i_blocks = inode->i_blocks; | 1090 | __entry->i_blocks = inode->i_blocks; |
1093 | __entry->md_needed = md_needed; | 1091 | __entry->md_needed = md_needed; |
1094 | __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; | 1092 | __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; |
1095 | __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; | 1093 | __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; |
1096 | ), | 1094 | ), |
1097 | 1095 | ||
1098 | TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d " | 1096 | TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d " |
1099 | "reserved_data_blocks %d reserved_meta_blocks %d", | 1097 | "reserved_data_blocks %d reserved_meta_blocks %d", |
1100 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1098 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1101 | (unsigned long) __entry->ino, | 1099 | (unsigned long) __entry->ino, |
1102 | __entry->mode, __entry->i_blocks, | 1100 | __entry->mode, __entry->i_blocks, |
1103 | __entry->md_needed, __entry->reserved_data_blocks, | 1101 | __entry->md_needed, __entry->reserved_data_blocks, |
1104 | __entry->reserved_meta_blocks) | 1102 | __entry->reserved_meta_blocks) |
1105 | ); | 1103 | ); |
1106 | 1104 | ||
1107 | TRACE_EVENT(ext4_da_release_space, | 1105 | TRACE_EVENT(ext4_da_release_space, |
1108 | TP_PROTO(struct inode *inode, int freed_blocks), | 1106 | TP_PROTO(struct inode *inode, int freed_blocks), |
1109 | 1107 | ||
1110 | TP_ARGS(inode, freed_blocks), | 1108 | TP_ARGS(inode, freed_blocks), |
1111 | 1109 | ||
1112 | TP_STRUCT__entry( | 1110 | TP_STRUCT__entry( |
1113 | __field( dev_t, dev ) | 1111 | __field( dev_t, dev ) |
1114 | __field( ino_t, ino ) | 1112 | __field( ino_t, ino ) |
1115 | __field( umode_t, mode ) | 1113 | __field( umode_t, mode ) |
1116 | __field( __u64, i_blocks ) | 1114 | __field( __u64, i_blocks ) |
1117 | __field( int, freed_blocks ) | 1115 | __field( int, freed_blocks ) |
1118 | __field( int, reserved_data_blocks ) | 1116 | __field( int, reserved_data_blocks ) |
1119 | __field( int, reserved_meta_blocks ) | 1117 | __field( int, reserved_meta_blocks ) |
1120 | __field( int, allocated_meta_blocks ) | 1118 | __field( int, allocated_meta_blocks ) |
1121 | ), | 1119 | ), |
1122 | 1120 | ||
1123 | TP_fast_assign( | 1121 | TP_fast_assign( |
1124 | __entry->dev = inode->i_sb->s_dev; | 1122 | __entry->dev = inode->i_sb->s_dev; |
1125 | __entry->ino = inode->i_ino; | 1123 | __entry->ino = inode->i_ino; |
1126 | __entry->mode = inode->i_mode; | 1124 | __entry->mode = inode->i_mode; |
1127 | __entry->i_blocks = inode->i_blocks; | 1125 | __entry->i_blocks = inode->i_blocks; |
1128 | __entry->freed_blocks = freed_blocks; | 1126 | __entry->freed_blocks = freed_blocks; |
1129 | __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; | 1127 | __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; |
1130 | __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; | 1128 | __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; |
1131 | __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; | 1129 | __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; |
1132 | ), | 1130 | ), |
1133 | 1131 | ||
1134 | TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " | 1132 | TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " |
1135 | "reserved_data_blocks %d reserved_meta_blocks %d " | 1133 | "reserved_data_blocks %d reserved_meta_blocks %d " |
1136 | "allocated_meta_blocks %d", | 1134 | "allocated_meta_blocks %d", |
1137 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1135 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1138 | (unsigned long) __entry->ino, | 1136 | (unsigned long) __entry->ino, |
1139 | __entry->mode, __entry->i_blocks, | 1137 | __entry->mode, __entry->i_blocks, |
1140 | __entry->freed_blocks, __entry->reserved_data_blocks, | 1138 | __entry->freed_blocks, __entry->reserved_data_blocks, |
1141 | __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) | 1139 | __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) |
1142 | ); | 1140 | ); |
1143 | 1141 | ||
1144 | DECLARE_EVENT_CLASS(ext4__bitmap_load, | 1142 | DECLARE_EVENT_CLASS(ext4__bitmap_load, |
1145 | TP_PROTO(struct super_block *sb, unsigned long group), | 1143 | TP_PROTO(struct super_block *sb, unsigned long group), |
1146 | 1144 | ||
1147 | TP_ARGS(sb, group), | 1145 | TP_ARGS(sb, group), |
1148 | 1146 | ||
1149 | TP_STRUCT__entry( | 1147 | TP_STRUCT__entry( |
1150 | __field( dev_t, dev ) | 1148 | __field( dev_t, dev ) |
1151 | __field( __u32, group ) | 1149 | __field( __u32, group ) |
1152 | 1150 | ||
1153 | ), | 1151 | ), |
1154 | 1152 | ||
1155 | TP_fast_assign( | 1153 | TP_fast_assign( |
1156 | __entry->dev = sb->s_dev; | 1154 | __entry->dev = sb->s_dev; |
1157 | __entry->group = group; | 1155 | __entry->group = group; |
1158 | ), | 1156 | ), |
1159 | 1157 | ||
1160 | TP_printk("dev %d,%d group %u", | 1158 | TP_printk("dev %d,%d group %u", |
1161 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1159 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1162 | __entry->group) | 1160 | __entry->group) |
1163 | ); | 1161 | ); |
1164 | 1162 | ||
1165 | DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, | 1163 | DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, |
1166 | 1164 | ||
1167 | TP_PROTO(struct super_block *sb, unsigned long group), | 1165 | TP_PROTO(struct super_block *sb, unsigned long group), |
1168 | 1166 | ||
1169 | TP_ARGS(sb, group) | 1167 | TP_ARGS(sb, group) |
1170 | ); | 1168 | ); |
1171 | 1169 | ||
1172 | DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load, | 1170 | DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load, |
1173 | 1171 | ||
1174 | TP_PROTO(struct super_block *sb, unsigned long group), | 1172 | TP_PROTO(struct super_block *sb, unsigned long group), |
1175 | 1173 | ||
1176 | TP_ARGS(sb, group) | 1174 | TP_ARGS(sb, group) |
1177 | ); | 1175 | ); |
1178 | 1176 | ||
1179 | DEFINE_EVENT(ext4__bitmap_load, ext4_read_block_bitmap_load, | 1177 | DEFINE_EVENT(ext4__bitmap_load, ext4_read_block_bitmap_load, |
1180 | 1178 | ||
1181 | TP_PROTO(struct super_block *sb, unsigned long group), | 1179 | TP_PROTO(struct super_block *sb, unsigned long group), |
1182 | 1180 | ||
1183 | TP_ARGS(sb, group) | 1181 | TP_ARGS(sb, group) |
1184 | ); | 1182 | ); |
1185 | 1183 | ||
1186 | DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap, | 1184 | DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap, |
1187 | 1185 | ||
1188 | TP_PROTO(struct super_block *sb, unsigned long group), | 1186 | TP_PROTO(struct super_block *sb, unsigned long group), |
1189 | 1187 | ||
1190 | TP_ARGS(sb, group) | 1188 | TP_ARGS(sb, group) |
1191 | ); | 1189 | ); |
1192 | 1190 | ||
1193 | TRACE_EVENT(ext4_direct_IO_enter, | 1191 | TRACE_EVENT(ext4_direct_IO_enter, |
1194 | TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), | 1192 | TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), |
1195 | 1193 | ||
1196 | TP_ARGS(inode, offset, len, rw), | 1194 | TP_ARGS(inode, offset, len, rw), |
1197 | 1195 | ||
1198 | TP_STRUCT__entry( | 1196 | TP_STRUCT__entry( |
1199 | __field( ino_t, ino ) | 1197 | __field( ino_t, ino ) |
1200 | __field( dev_t, dev ) | 1198 | __field( dev_t, dev ) |
1201 | __field( loff_t, pos ) | 1199 | __field( loff_t, pos ) |
1202 | __field( unsigned long, len ) | 1200 | __field( unsigned long, len ) |
1203 | __field( int, rw ) | 1201 | __field( int, rw ) |
1204 | ), | 1202 | ), |
1205 | 1203 | ||
1206 | TP_fast_assign( | 1204 | TP_fast_assign( |
1207 | __entry->ino = inode->i_ino; | 1205 | __entry->ino = inode->i_ino; |
1208 | __entry->dev = inode->i_sb->s_dev; | 1206 | __entry->dev = inode->i_sb->s_dev; |
1209 | __entry->pos = offset; | 1207 | __entry->pos = offset; |
1210 | __entry->len = len; | 1208 | __entry->len = len; |
1211 | __entry->rw = rw; | 1209 | __entry->rw = rw; |
1212 | ), | 1210 | ), |
1213 | 1211 | ||
1214 | TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d", | 1212 | TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d", |
1215 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1213 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1216 | (unsigned long) __entry->ino, | 1214 | (unsigned long) __entry->ino, |
1217 | __entry->pos, __entry->len, __entry->rw) | 1215 | __entry->pos, __entry->len, __entry->rw) |
1218 | ); | 1216 | ); |
1219 | 1217 | ||
1220 | TRACE_EVENT(ext4_direct_IO_exit, | 1218 | TRACE_EVENT(ext4_direct_IO_exit, |
1221 | TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, | 1219 | TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, |
1222 | int rw, int ret), | 1220 | int rw, int ret), |
1223 | 1221 | ||
1224 | TP_ARGS(inode, offset, len, rw, ret), | 1222 | TP_ARGS(inode, offset, len, rw, ret), |
1225 | 1223 | ||
1226 | TP_STRUCT__entry( | 1224 | TP_STRUCT__entry( |
1227 | __field( ino_t, ino ) | 1225 | __field( ino_t, ino ) |
1228 | __field( dev_t, dev ) | 1226 | __field( dev_t, dev ) |
1229 | __field( loff_t, pos ) | 1227 | __field( loff_t, pos ) |
1230 | __field( unsigned long, len ) | 1228 | __field( unsigned long, len ) |
1231 | __field( int, rw ) | 1229 | __field( int, rw ) |
1232 | __field( int, ret ) | 1230 | __field( int, ret ) |
1233 | ), | 1231 | ), |
1234 | 1232 | ||
1235 | TP_fast_assign( | 1233 | TP_fast_assign( |
1236 | __entry->ino = inode->i_ino; | 1234 | __entry->ino = inode->i_ino; |
1237 | __entry->dev = inode->i_sb->s_dev; | 1235 | __entry->dev = inode->i_sb->s_dev; |
1238 | __entry->pos = offset; | 1236 | __entry->pos = offset; |
1239 | __entry->len = len; | 1237 | __entry->len = len; |
1240 | __entry->rw = rw; | 1238 | __entry->rw = rw; |
1241 | __entry->ret = ret; | 1239 | __entry->ret = ret; |
1242 | ), | 1240 | ), |
1243 | 1241 | ||
1244 | TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d", | 1242 | TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d", |
1245 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1243 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1246 | (unsigned long) __entry->ino, | 1244 | (unsigned long) __entry->ino, |
1247 | __entry->pos, __entry->len, | 1245 | __entry->pos, __entry->len, |
1248 | __entry->rw, __entry->ret) | 1246 | __entry->rw, __entry->ret) |
1249 | ); | 1247 | ); |
1250 | 1248 | ||
1251 | TRACE_EVENT(ext4_fallocate_enter, | 1249 | TRACE_EVENT(ext4_fallocate_enter, |
1252 | TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), | 1250 | TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), |
1253 | 1251 | ||
1254 | TP_ARGS(inode, offset, len, mode), | 1252 | TP_ARGS(inode, offset, len, mode), |
1255 | 1253 | ||
1256 | TP_STRUCT__entry( | 1254 | TP_STRUCT__entry( |
1257 | __field( ino_t, ino ) | 1255 | __field( ino_t, ino ) |
1258 | __field( dev_t, dev ) | 1256 | __field( dev_t, dev ) |
1259 | __field( loff_t, pos ) | 1257 | __field( loff_t, pos ) |
1260 | __field( loff_t, len ) | 1258 | __field( loff_t, len ) |
1261 | __field( int, mode ) | 1259 | __field( int, mode ) |
1262 | ), | 1260 | ), |
1263 | 1261 | ||
1264 | TP_fast_assign( | 1262 | TP_fast_assign( |
1265 | __entry->ino = inode->i_ino; | 1263 | __entry->ino = inode->i_ino; |
1266 | __entry->dev = inode->i_sb->s_dev; | 1264 | __entry->dev = inode->i_sb->s_dev; |
1267 | __entry->pos = offset; | 1265 | __entry->pos = offset; |
1268 | __entry->len = len; | 1266 | __entry->len = len; |
1269 | __entry->mode = mode; | 1267 | __entry->mode = mode; |
1270 | ), | 1268 | ), |
1271 | 1269 | ||
1272 | TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d", | 1270 | TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d", |
1273 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1271 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1274 | (unsigned long) __entry->ino, __entry->pos, | 1272 | (unsigned long) __entry->ino, __entry->pos, |
1275 | __entry->len, __entry->mode) | 1273 | __entry->len, __entry->mode) |
1276 | ); | 1274 | ); |
1277 | 1275 | ||
1278 | TRACE_EVENT(ext4_fallocate_exit, | 1276 | TRACE_EVENT(ext4_fallocate_exit, |
1279 | TP_PROTO(struct inode *inode, loff_t offset, | 1277 | TP_PROTO(struct inode *inode, loff_t offset, |
1280 | unsigned int max_blocks, int ret), | 1278 | unsigned int max_blocks, int ret), |
1281 | 1279 | ||
1282 | TP_ARGS(inode, offset, max_blocks, ret), | 1280 | TP_ARGS(inode, offset, max_blocks, ret), |
1283 | 1281 | ||
1284 | TP_STRUCT__entry( | 1282 | TP_STRUCT__entry( |
1285 | __field( ino_t, ino ) | 1283 | __field( ino_t, ino ) |
1286 | __field( dev_t, dev ) | 1284 | __field( dev_t, dev ) |
1287 | __field( loff_t, pos ) | 1285 | __field( loff_t, pos ) |
1288 | __field( unsigned int, blocks ) | 1286 | __field( unsigned int, blocks ) |
1289 | __field( int, ret ) | 1287 | __field( int, ret ) |
1290 | ), | 1288 | ), |
1291 | 1289 | ||
1292 | TP_fast_assign( | 1290 | TP_fast_assign( |
1293 | __entry->ino = inode->i_ino; | 1291 | __entry->ino = inode->i_ino; |
1294 | __entry->dev = inode->i_sb->s_dev; | 1292 | __entry->dev = inode->i_sb->s_dev; |
1295 | __entry->pos = offset; | 1293 | __entry->pos = offset; |
1296 | __entry->blocks = max_blocks; | 1294 | __entry->blocks = max_blocks; |
1297 | __entry->ret = ret; | 1295 | __entry->ret = ret; |
1298 | ), | 1296 | ), |
1299 | 1297 | ||
1300 | TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d", | 1298 | TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d", |
1301 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1299 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1302 | (unsigned long) __entry->ino, | 1300 | (unsigned long) __entry->ino, |
1303 | __entry->pos, __entry->blocks, | 1301 | __entry->pos, __entry->blocks, |
1304 | __entry->ret) | 1302 | __entry->ret) |
1305 | ); | 1303 | ); |
1306 | 1304 | ||
1307 | TRACE_EVENT(ext4_unlink_enter, | 1305 | TRACE_EVENT(ext4_unlink_enter, |
1308 | TP_PROTO(struct inode *parent, struct dentry *dentry), | 1306 | TP_PROTO(struct inode *parent, struct dentry *dentry), |
1309 | 1307 | ||
1310 | TP_ARGS(parent, dentry), | 1308 | TP_ARGS(parent, dentry), |
1311 | 1309 | ||
1312 | TP_STRUCT__entry( | 1310 | TP_STRUCT__entry( |
1313 | __field( ino_t, parent ) | 1311 | __field( ino_t, parent ) |
1314 | __field( ino_t, ino ) | 1312 | __field( ino_t, ino ) |
1315 | __field( loff_t, size ) | 1313 | __field( loff_t, size ) |
1316 | __field( dev_t, dev ) | 1314 | __field( dev_t, dev ) |
1317 | ), | 1315 | ), |
1318 | 1316 | ||
1319 | TP_fast_assign( | 1317 | TP_fast_assign( |
1320 | __entry->parent = parent->i_ino; | 1318 | __entry->parent = parent->i_ino; |
1321 | __entry->ino = dentry->d_inode->i_ino; | 1319 | __entry->ino = dentry->d_inode->i_ino; |
1322 | __entry->size = dentry->d_inode->i_size; | 1320 | __entry->size = dentry->d_inode->i_size; |
1323 | __entry->dev = dentry->d_inode->i_sb->s_dev; | 1321 | __entry->dev = dentry->d_inode->i_sb->s_dev; |
1324 | ), | 1322 | ), |
1325 | 1323 | ||
1326 | TP_printk("dev %d,%d ino %lu size %lld parent %lu", | 1324 | TP_printk("dev %d,%d ino %lu size %lld parent %lu", |
1327 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1325 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1328 | (unsigned long) __entry->ino, __entry->size, | 1326 | (unsigned long) __entry->ino, __entry->size, |
1329 | (unsigned long) __entry->parent) | 1327 | (unsigned long) __entry->parent) |
1330 | ); | 1328 | ); |
1331 | 1329 | ||
1332 | TRACE_EVENT(ext4_unlink_exit, | 1330 | TRACE_EVENT(ext4_unlink_exit, |
1333 | TP_PROTO(struct dentry *dentry, int ret), | 1331 | TP_PROTO(struct dentry *dentry, int ret), |
1334 | 1332 | ||
1335 | TP_ARGS(dentry, ret), | 1333 | TP_ARGS(dentry, ret), |
1336 | 1334 | ||
1337 | TP_STRUCT__entry( | 1335 | TP_STRUCT__entry( |
1338 | __field( ino_t, ino ) | 1336 | __field( ino_t, ino ) |
1339 | __field( dev_t, dev ) | 1337 | __field( dev_t, dev ) |
1340 | __field( int, ret ) | 1338 | __field( int, ret ) |
1341 | ), | 1339 | ), |
1342 | 1340 | ||
1343 | TP_fast_assign( | 1341 | TP_fast_assign( |
1344 | __entry->ino = dentry->d_inode->i_ino; | 1342 | __entry->ino = dentry->d_inode->i_ino; |
1345 | __entry->dev = dentry->d_inode->i_sb->s_dev; | 1343 | __entry->dev = dentry->d_inode->i_sb->s_dev; |
1346 | __entry->ret = ret; | 1344 | __entry->ret = ret; |
1347 | ), | 1345 | ), |
1348 | 1346 | ||
1349 | TP_printk("dev %d,%d ino %lu ret %d", | 1347 | TP_printk("dev %d,%d ino %lu ret %d", |
1350 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1348 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1351 | (unsigned long) __entry->ino, | 1349 | (unsigned long) __entry->ino, |
1352 | __entry->ret) | 1350 | __entry->ret) |
1353 | ); | 1351 | ); |
1354 | 1352 | ||
1355 | DECLARE_EVENT_CLASS(ext4__truncate, | 1353 | DECLARE_EVENT_CLASS(ext4__truncate, |
1356 | TP_PROTO(struct inode *inode), | 1354 | TP_PROTO(struct inode *inode), |
1357 | 1355 | ||
1358 | TP_ARGS(inode), | 1356 | TP_ARGS(inode), |
1359 | 1357 | ||
1360 | TP_STRUCT__entry( | 1358 | TP_STRUCT__entry( |
1361 | __field( ino_t, ino ) | 1359 | __field( ino_t, ino ) |
1362 | __field( dev_t, dev ) | 1360 | __field( dev_t, dev ) |
1363 | __field( __u64, blocks ) | 1361 | __field( __u64, blocks ) |
1364 | ), | 1362 | ), |
1365 | 1363 | ||
1366 | TP_fast_assign( | 1364 | TP_fast_assign( |
1367 | __entry->ino = inode->i_ino; | 1365 | __entry->ino = inode->i_ino; |
1368 | __entry->dev = inode->i_sb->s_dev; | 1366 | __entry->dev = inode->i_sb->s_dev; |
1369 | __entry->blocks = inode->i_blocks; | 1367 | __entry->blocks = inode->i_blocks; |
1370 | ), | 1368 | ), |
1371 | 1369 | ||
1372 | TP_printk("dev %d,%d ino %lu blocks %llu", | 1370 | TP_printk("dev %d,%d ino %lu blocks %llu", |
1373 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1371 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1374 | (unsigned long) __entry->ino, __entry->blocks) | 1372 | (unsigned long) __entry->ino, __entry->blocks) |
1375 | ); | 1373 | ); |
1376 | 1374 | ||
1377 | DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, | 1375 | DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, |
1378 | 1376 | ||
1379 | TP_PROTO(struct inode *inode), | 1377 | TP_PROTO(struct inode *inode), |
1380 | 1378 | ||
1381 | TP_ARGS(inode) | 1379 | TP_ARGS(inode) |
1382 | ); | 1380 | ); |
1383 | 1381 | ||
1384 | DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, | 1382 | DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, |
1385 | 1383 | ||
1386 | TP_PROTO(struct inode *inode), | 1384 | TP_PROTO(struct inode *inode), |
1387 | 1385 | ||
1388 | TP_ARGS(inode) | 1386 | TP_ARGS(inode) |
1389 | ); | 1387 | ); |
1390 | 1388 | ||
1391 | DECLARE_EVENT_CLASS(ext4__map_blocks_enter, | 1389 | DECLARE_EVENT_CLASS(ext4__map_blocks_enter, |
1392 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, | 1390 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, |
1393 | unsigned int len, unsigned int flags), | 1391 | unsigned int len, unsigned int flags), |
1394 | 1392 | ||
1395 | TP_ARGS(inode, lblk, len, flags), | 1393 | TP_ARGS(inode, lblk, len, flags), |
1396 | 1394 | ||
1397 | TP_STRUCT__entry( | 1395 | TP_STRUCT__entry( |
1398 | __field( ino_t, ino ) | 1396 | __field( ino_t, ino ) |
1399 | __field( dev_t, dev ) | 1397 | __field( dev_t, dev ) |
1400 | __field( ext4_lblk_t, lblk ) | 1398 | __field( ext4_lblk_t, lblk ) |
1401 | __field( unsigned int, len ) | 1399 | __field( unsigned int, len ) |
1402 | __field( unsigned int, flags ) | 1400 | __field( unsigned int, flags ) |
1403 | ), | 1401 | ), |
1404 | 1402 | ||
1405 | TP_fast_assign( | 1403 | TP_fast_assign( |
1406 | __entry->ino = inode->i_ino; | 1404 | __entry->ino = inode->i_ino; |
1407 | __entry->dev = inode->i_sb->s_dev; | 1405 | __entry->dev = inode->i_sb->s_dev; |
1408 | __entry->lblk = lblk; | 1406 | __entry->lblk = lblk; |
1409 | __entry->len = len; | 1407 | __entry->len = len; |
1410 | __entry->flags = flags; | 1408 | __entry->flags = flags; |
1411 | ), | 1409 | ), |
1412 | 1410 | ||
1413 | TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u", | 1411 | TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u", |
1414 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1412 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1415 | (unsigned long) __entry->ino, | 1413 | (unsigned long) __entry->ino, |
1416 | __entry->lblk, __entry->len, __entry->flags) | 1414 | __entry->lblk, __entry->len, __entry->flags) |
1417 | ); | 1415 | ); |
1418 | 1416 | ||
1419 | DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, | 1417 | DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, |
1420 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, | 1418 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, |
1421 | unsigned len, unsigned flags), | 1419 | unsigned len, unsigned flags), |
1422 | 1420 | ||
1423 | TP_ARGS(inode, lblk, len, flags) | 1421 | TP_ARGS(inode, lblk, len, flags) |
1424 | ); | 1422 | ); |
1425 | 1423 | ||
1426 | DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, | 1424 | DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, |
1427 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, | 1425 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, |
1428 | unsigned len, unsigned flags), | 1426 | unsigned len, unsigned flags), |
1429 | 1427 | ||
1430 | TP_ARGS(inode, lblk, len, flags) | 1428 | TP_ARGS(inode, lblk, len, flags) |
1431 | ); | 1429 | ); |
1432 | 1430 | ||
1433 | DECLARE_EVENT_CLASS(ext4__map_blocks_exit, | 1431 | DECLARE_EVENT_CLASS(ext4__map_blocks_exit, |
1434 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, | 1432 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, |
1435 | ext4_fsblk_t pblk, unsigned int len, int ret), | 1433 | ext4_fsblk_t pblk, unsigned int len, int ret), |
1436 | 1434 | ||
1437 | TP_ARGS(inode, lblk, pblk, len, ret), | 1435 | TP_ARGS(inode, lblk, pblk, len, ret), |
1438 | 1436 | ||
1439 | TP_STRUCT__entry( | 1437 | TP_STRUCT__entry( |
1440 | __field( ino_t, ino ) | 1438 | __field( ino_t, ino ) |
1441 | __field( dev_t, dev ) | 1439 | __field( dev_t, dev ) |
1442 | __field( ext4_lblk_t, lblk ) | 1440 | __field( ext4_lblk_t, lblk ) |
1443 | __field( ext4_fsblk_t, pblk ) | 1441 | __field( ext4_fsblk_t, pblk ) |
1444 | __field( unsigned int, len ) | 1442 | __field( unsigned int, len ) |
1445 | __field( int, ret ) | 1443 | __field( int, ret ) |
1446 | ), | 1444 | ), |
1447 | 1445 | ||
1448 | TP_fast_assign( | 1446 | TP_fast_assign( |
1449 | __entry->ino = inode->i_ino; | 1447 | __entry->ino = inode->i_ino; |
1450 | __entry->dev = inode->i_sb->s_dev; | 1448 | __entry->dev = inode->i_sb->s_dev; |
1451 | __entry->lblk = lblk; | 1449 | __entry->lblk = lblk; |
1452 | __entry->pblk = pblk; | 1450 | __entry->pblk = pblk; |
1453 | __entry->len = len; | 1451 | __entry->len = len; |
1454 | __entry->ret = ret; | 1452 | __entry->ret = ret; |
1455 | ), | 1453 | ), |
1456 | 1454 | ||
1457 | TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", | 1455 | TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", |
1458 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1456 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1459 | (unsigned long) __entry->ino, | 1457 | (unsigned long) __entry->ino, |
1460 | __entry->lblk, __entry->pblk, | 1458 | __entry->lblk, __entry->pblk, |
1461 | __entry->len, __entry->ret) | 1459 | __entry->len, __entry->ret) |
1462 | ); | 1460 | ); |
1463 | 1461 | ||
1464 | DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, | 1462 | DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, |
1465 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, | 1463 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, |
1466 | ext4_fsblk_t pblk, unsigned len, int ret), | 1464 | ext4_fsblk_t pblk, unsigned len, int ret), |
1467 | 1465 | ||
1468 | TP_ARGS(inode, lblk, pblk, len, ret) | 1466 | TP_ARGS(inode, lblk, pblk, len, ret) |
1469 | ); | 1467 | ); |
1470 | 1468 | ||
1471 | DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, | 1469 | DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, |
1472 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, | 1470 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, |
1473 | ext4_fsblk_t pblk, unsigned len, int ret), | 1471 | ext4_fsblk_t pblk, unsigned len, int ret), |
1474 | 1472 | ||
1475 | TP_ARGS(inode, lblk, pblk, len, ret) | 1473 | TP_ARGS(inode, lblk, pblk, len, ret) |
1476 | ); | 1474 | ); |
1477 | 1475 | ||
1478 | TRACE_EVENT(ext4_ext_load_extent, | 1476 | TRACE_EVENT(ext4_ext_load_extent, |
1479 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk), | 1477 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk), |
1480 | 1478 | ||
1481 | TP_ARGS(inode, lblk, pblk), | 1479 | TP_ARGS(inode, lblk, pblk), |
1482 | 1480 | ||
1483 | TP_STRUCT__entry( | 1481 | TP_STRUCT__entry( |
1484 | __field( ino_t, ino ) | 1482 | __field( ino_t, ino ) |
1485 | __field( dev_t, dev ) | 1483 | __field( dev_t, dev ) |
1486 | __field( ext4_lblk_t, lblk ) | 1484 | __field( ext4_lblk_t, lblk ) |
1487 | __field( ext4_fsblk_t, pblk ) | 1485 | __field( ext4_fsblk_t, pblk ) |
1488 | ), | 1486 | ), |
1489 | 1487 | ||
1490 | TP_fast_assign( | 1488 | TP_fast_assign( |
1491 | __entry->ino = inode->i_ino; | 1489 | __entry->ino = inode->i_ino; |
1492 | __entry->dev = inode->i_sb->s_dev; | 1490 | __entry->dev = inode->i_sb->s_dev; |
1493 | __entry->lblk = lblk; | 1491 | __entry->lblk = lblk; |
1494 | __entry->pblk = pblk; | 1492 | __entry->pblk = pblk; |
1495 | ), | 1493 | ), |
1496 | 1494 | ||
1497 | TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", | 1495 | TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", |
1498 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1496 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1499 | (unsigned long) __entry->ino, | 1497 | (unsigned long) __entry->ino, |
1500 | __entry->lblk, __entry->pblk) | 1498 | __entry->lblk, __entry->pblk) |
1501 | ); | 1499 | ); |
1502 | 1500 | ||
1503 | TRACE_EVENT(ext4_load_inode, | 1501 | TRACE_EVENT(ext4_load_inode, |
1504 | TP_PROTO(struct inode *inode), | 1502 | TP_PROTO(struct inode *inode), |
1505 | 1503 | ||
1506 | TP_ARGS(inode), | 1504 | TP_ARGS(inode), |
1507 | 1505 | ||
1508 | TP_STRUCT__entry( | 1506 | TP_STRUCT__entry( |
1509 | __field( ino_t, ino ) | 1507 | __field( ino_t, ino ) |
1510 | __field( dev_t, dev ) | 1508 | __field( dev_t, dev ) |
1511 | ), | 1509 | ), |
1512 | 1510 | ||
1513 | TP_fast_assign( | 1511 | TP_fast_assign( |
1514 | __entry->ino = inode->i_ino; | 1512 | __entry->ino = inode->i_ino; |
1515 | __entry->dev = inode->i_sb->s_dev; | 1513 | __entry->dev = inode->i_sb->s_dev; |
1516 | ), | 1514 | ), |
1517 | 1515 | ||
1518 | TP_printk("dev %d,%d ino %lu", | 1516 | TP_printk("dev %d,%d ino %lu", |
1519 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1517 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1520 | (unsigned long) __entry->ino) | 1518 | (unsigned long) __entry->ino) |
1521 | ); | 1519 | ); |
1522 | 1520 | ||
1523 | #endif /* _TRACE_EXT4_H */ | 1521 | #endif /* _TRACE_EXT4_H */ |
1524 | 1522 | ||
1525 | /* This part must be outside protection */ | 1523 | /* This part must be outside protection */ |
1526 | #include <trace/define_trace.h> | 1524 | #include <trace/define_trace.h> |
1527 | 1525 |
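
For reference, a minimal sketch of how these definitions are consumed (assuming the direct I/O entry point of this kernel era; the call site below is paraphrased for illustration, not quoted from this commit): each TRACE_EVENT/DEFINE_EVENT expands into a trace_<name>() static inline that the filesystem calls at the matching point.

	/*
	 * Minimal sketch, assuming a 2.6.39-era ext4_direct_IO(); only the
	 * trace_ext4_direct_IO_enter/exit() helpers are generated by the
	 * TRACE_EVENT definitions above -- the rest is illustrative.
	 */
	static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
				      const struct iovec *iov, loff_t offset,
				      unsigned long nr_segs)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;
		ssize_t ret;

		trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
		/* assumed: delegate to the indirect-block path for this sketch */
		ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
		trace_ext4_direct_IO_exit(inode, offset, iov_length(iov, nr_segs), rw, ret);
		return ret;
	}
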
include/trace/events/writeback.h
1 | #undef TRACE_SYSTEM | 1 | #undef TRACE_SYSTEM |
2 | #define TRACE_SYSTEM writeback | 2 | #define TRACE_SYSTEM writeback |
3 | 3 | ||
4 | #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) | 4 | #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) |
5 | #define _TRACE_WRITEBACK_H | 5 | #define _TRACE_WRITEBACK_H |
6 | 6 | ||
7 | #include <linux/backing-dev.h> | 7 | #include <linux/backing-dev.h> |
8 | #include <linux/device.h> | 8 | #include <linux/device.h> |
9 | #include <linux/writeback.h> | 9 | #include <linux/writeback.h> |
10 | 10 | ||
11 | #define show_inode_state(state) \ | ||
12 | __print_flags(state, "|", \ | ||
13 | {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ | ||
14 | {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ | ||
15 | {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ | ||
16 | {I_NEW, "I_NEW"}, \ | ||
17 | {I_WILL_FREE, "I_WILL_FREE"}, \ | ||
18 | {I_FREEING, "I_FREEING"}, \ | ||
19 | {I_CLEAR, "I_CLEAR"}, \ | ||
20 | {I_SYNC, "I_SYNC"}, \ | ||
21 | {I_REFERENCED, "I_REFERENCED"} \ | ||
22 | ) | ||
23 | |||
11 | struct wb_writeback_work; | 24 | struct wb_writeback_work; |
12 | 25 | ||
13 | DECLARE_EVENT_CLASS(writeback_work_class, | 26 | DECLARE_EVENT_CLASS(writeback_work_class, |
14 | TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), | 27 | TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), |
15 | TP_ARGS(bdi, work), | 28 | TP_ARGS(bdi, work), |
16 | TP_STRUCT__entry( | 29 | TP_STRUCT__entry( |
17 | __array(char, name, 32) | 30 | __array(char, name, 32) |
18 | __field(long, nr_pages) | 31 | __field(long, nr_pages) |
19 | __field(dev_t, sb_dev) | 32 | __field(dev_t, sb_dev) |
20 | __field(int, sync_mode) | 33 | __field(int, sync_mode) |
21 | __field(int, for_kupdate) | 34 | __field(int, for_kupdate) |
22 | __field(int, range_cyclic) | 35 | __field(int, range_cyclic) |
23 | __field(int, for_background) | 36 | __field(int, for_background) |
24 | ), | 37 | ), |
25 | TP_fast_assign( | 38 | TP_fast_assign( |
26 | strncpy(__entry->name, dev_name(bdi->dev), 32); | 39 | strncpy(__entry->name, dev_name(bdi->dev), 32); |
27 | __entry->nr_pages = work->nr_pages; | 40 | __entry->nr_pages = work->nr_pages; |
28 | __entry->sb_dev = work->sb ? work->sb->s_dev : 0; | 41 | __entry->sb_dev = work->sb ? work->sb->s_dev : 0; |
29 | __entry->sync_mode = work->sync_mode; | 42 | __entry->sync_mode = work->sync_mode; |
30 | __entry->for_kupdate = work->for_kupdate; | 43 | __entry->for_kupdate = work->for_kupdate; |
31 | __entry->range_cyclic = work->range_cyclic; | 44 | __entry->range_cyclic = work->range_cyclic; |
32 | __entry->for_background = work->for_background; | 45 | __entry->for_background = work->for_background; |
33 | ), | 46 | ), |
34 | TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " | 47 | TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " |
35 | "kupdate=%d range_cyclic=%d background=%d", | 48 | "kupdate=%d range_cyclic=%d background=%d", |
36 | __entry->name, | 49 | __entry->name, |
37 | MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), | 50 | MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), |
38 | __entry->nr_pages, | 51 | __entry->nr_pages, |
39 | __entry->sync_mode, | 52 | __entry->sync_mode, |
40 | __entry->for_kupdate, | 53 | __entry->for_kupdate, |
41 | __entry->range_cyclic, | 54 | __entry->range_cyclic, |
42 | __entry->for_background | 55 | __entry->for_background |
43 | ) | 56 | ) |
44 | ); | 57 | ); |
45 | #define DEFINE_WRITEBACK_WORK_EVENT(name) \ | 58 | #define DEFINE_WRITEBACK_WORK_EVENT(name) \ |
46 | DEFINE_EVENT(writeback_work_class, name, \ | 59 | DEFINE_EVENT(writeback_work_class, name, \ |
47 | TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ | 60 | TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ |
48 | TP_ARGS(bdi, work)) | 61 | TP_ARGS(bdi, work)) |
49 | DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); | 62 | DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); |
50 | DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); | 63 | DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); |
51 | DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); | 64 | DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); |
65 | DEFINE_WRITEBACK_WORK_EVENT(writeback_start); | ||
66 | DEFINE_WRITEBACK_WORK_EVENT(writeback_written); | ||
67 | DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); | ||
52 | 68 | ||
53 | TRACE_EVENT(writeback_pages_written, | 69 | TRACE_EVENT(writeback_pages_written, |
54 | TP_PROTO(long pages_written), | 70 | TP_PROTO(long pages_written), |
55 | TP_ARGS(pages_written), | 71 | TP_ARGS(pages_written), |
56 | TP_STRUCT__entry( | 72 | TP_STRUCT__entry( |
57 | __field(long, pages) | 73 | __field(long, pages) |
58 | ), | 74 | ), |
59 | TP_fast_assign( | 75 | TP_fast_assign( |
60 | __entry->pages = pages_written; | 76 | __entry->pages = pages_written; |
61 | ), | 77 | ), |
62 | TP_printk("%ld", __entry->pages) | 78 | TP_printk("%ld", __entry->pages) |
63 | ); | 79 | ); |
64 | 80 | ||
65 | DECLARE_EVENT_CLASS(writeback_class, | 81 | DECLARE_EVENT_CLASS(writeback_class, |
66 | TP_PROTO(struct backing_dev_info *bdi), | 82 | TP_PROTO(struct backing_dev_info *bdi), |
67 | TP_ARGS(bdi), | 83 | TP_ARGS(bdi), |
68 | TP_STRUCT__entry( | 84 | TP_STRUCT__entry( |
69 | __array(char, name, 32) | 85 | __array(char, name, 32) |
70 | ), | 86 | ), |
71 | TP_fast_assign( | 87 | TP_fast_assign( |
72 | strncpy(__entry->name, dev_name(bdi->dev), 32); | 88 | strncpy(__entry->name, dev_name(bdi->dev), 32); |
73 | ), | 89 | ), |
74 | TP_printk("bdi %s", | 90 | TP_printk("bdi %s", |
75 | __entry->name | 91 | __entry->name |
76 | ) | 92 | ) |
77 | ); | 93 | ); |
78 | #define DEFINE_WRITEBACK_EVENT(name) \ | 94 | #define DEFINE_WRITEBACK_EVENT(name) \ |
79 | DEFINE_EVENT(writeback_class, name, \ | 95 | DEFINE_EVENT(writeback_class, name, \ |
80 | TP_PROTO(struct backing_dev_info *bdi), \ | 96 | TP_PROTO(struct backing_dev_info *bdi), \ |
81 | TP_ARGS(bdi)) | 97 | TP_ARGS(bdi)) |
82 | 98 | ||
83 | DEFINE_WRITEBACK_EVENT(writeback_nowork); | 99 | DEFINE_WRITEBACK_EVENT(writeback_nowork); |
84 | DEFINE_WRITEBACK_EVENT(writeback_wake_background); | 100 | DEFINE_WRITEBACK_EVENT(writeback_wake_background); |
85 | DEFINE_WRITEBACK_EVENT(writeback_wake_thread); | 101 | DEFINE_WRITEBACK_EVENT(writeback_wake_thread); |
86 | DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); | 102 | DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); |
87 | DEFINE_WRITEBACK_EVENT(writeback_bdi_register); | 103 | DEFINE_WRITEBACK_EVENT(writeback_bdi_register); |
88 | DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); | 104 | DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); |
89 | DEFINE_WRITEBACK_EVENT(writeback_thread_start); | 105 | DEFINE_WRITEBACK_EVENT(writeback_thread_start); |
90 | DEFINE_WRITEBACK_EVENT(writeback_thread_stop); | 106 | DEFINE_WRITEBACK_EVENT(writeback_thread_stop); |
107 | DEFINE_WRITEBACK_EVENT(balance_dirty_start); | ||
108 | DEFINE_WRITEBACK_EVENT(balance_dirty_wait); | ||
91 | 109 | ||
110 | TRACE_EVENT(balance_dirty_written, | ||
111 | |||
112 | TP_PROTO(struct backing_dev_info *bdi, int written), | ||
113 | |||
114 | TP_ARGS(bdi, written), | ||
115 | |||
116 | TP_STRUCT__entry( | ||
117 | __array(char, name, 32) | ||
118 | __field(int, written) | ||
119 | ), | ||
120 | |||
121 | TP_fast_assign( | ||
122 | strncpy(__entry->name, dev_name(bdi->dev), 32); | ||
123 | __entry->written = written; | ||
124 | ), | ||
125 | |||
126 | TP_printk("bdi %s written %d", | ||
127 | __entry->name, | ||
128 | __entry->written | ||
129 | ) | ||
130 | ); | ||
131 | |||
92 | DECLARE_EVENT_CLASS(wbc_class, | 132 | DECLARE_EVENT_CLASS(wbc_class, |
93 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), | 133 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), |
94 | TP_ARGS(wbc, bdi), | 134 | TP_ARGS(wbc, bdi), |
95 | TP_STRUCT__entry( | 135 | TP_STRUCT__entry( |
96 | __array(char, name, 32) | 136 | __array(char, name, 32) |
97 | __field(long, nr_to_write) | 137 | __field(long, nr_to_write) |
98 | __field(long, pages_skipped) | 138 | __field(long, pages_skipped) |
99 | __field(int, sync_mode) | 139 | __field(int, sync_mode) |
100 | __field(int, for_kupdate) | 140 | __field(int, for_kupdate) |
101 | __field(int, for_background) | 141 | __field(int, for_background) |
102 | __field(int, for_reclaim) | 142 | __field(int, for_reclaim) |
103 | __field(int, range_cyclic) | 143 | __field(int, range_cyclic) |
104 | __field(int, more_io) | ||
105 | __field(unsigned long, older_than_this) | ||
106 | __field(long, range_start) | 144 | __field(long, range_start) |
107 | __field(long, range_end) | 145 | __field(long, range_end) |
108 | ), | 146 | ), |
109 | 147 | ||
110 | TP_fast_assign( | 148 | TP_fast_assign( |
111 | strncpy(__entry->name, dev_name(bdi->dev), 32); | 149 | strncpy(__entry->name, dev_name(bdi->dev), 32); |
112 | __entry->nr_to_write = wbc->nr_to_write; | 150 | __entry->nr_to_write = wbc->nr_to_write; |
113 | __entry->pages_skipped = wbc->pages_skipped; | 151 | __entry->pages_skipped = wbc->pages_skipped; |
114 | __entry->sync_mode = wbc->sync_mode; | 152 | __entry->sync_mode = wbc->sync_mode; |
115 | __entry->for_kupdate = wbc->for_kupdate; | 153 | __entry->for_kupdate = wbc->for_kupdate; |
116 | __entry->for_background = wbc->for_background; | 154 | __entry->for_background = wbc->for_background; |
117 | __entry->for_reclaim = wbc->for_reclaim; | 155 | __entry->for_reclaim = wbc->for_reclaim; |
118 | __entry->range_cyclic = wbc->range_cyclic; | 156 | __entry->range_cyclic = wbc->range_cyclic; |
119 | __entry->more_io = wbc->more_io; | ||
120 | __entry->older_than_this = wbc->older_than_this ? | ||
121 | *wbc->older_than_this : 0; | ||
122 | __entry->range_start = (long)wbc->range_start; | 157 | __entry->range_start = (long)wbc->range_start; |
123 | __entry->range_end = (long)wbc->range_end; | 158 | __entry->range_end = (long)wbc->range_end; |
124 | ), | 159 | ), |
125 | 160 | ||
126 | TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " | 161 | TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " |
127 | "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " | 162 | "bgrd=%d reclm=%d cyclic=%d " |
128 | "start=0x%lx end=0x%lx", | 163 | "start=0x%lx end=0x%lx", |
129 | __entry->name, | 164 | __entry->name, |
130 | __entry->nr_to_write, | 165 | __entry->nr_to_write, |
131 | __entry->pages_skipped, | 166 | __entry->pages_skipped, |
132 | __entry->sync_mode, | 167 | __entry->sync_mode, |
133 | __entry->for_kupdate, | 168 | __entry->for_kupdate, |
134 | __entry->for_background, | 169 | __entry->for_background, |
135 | __entry->for_reclaim, | 170 | __entry->for_reclaim, |
136 | __entry->range_cyclic, | 171 | __entry->range_cyclic, |
137 | __entry->more_io, | ||
138 | __entry->older_than_this, | ||
139 | __entry->range_start, | 172 | __entry->range_start, |
140 | __entry->range_end) | 173 | __entry->range_end) |
141 | ) | 174 | ) |
142 | 175 | ||
143 | #define DEFINE_WBC_EVENT(name) \ | 176 | #define DEFINE_WBC_EVENT(name) \ |
144 | DEFINE_EVENT(wbc_class, name, \ | 177 | DEFINE_EVENT(wbc_class, name, \ |
145 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ | 178 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ |
146 | TP_ARGS(wbc, bdi)) | 179 | TP_ARGS(wbc, bdi)) |
147 | DEFINE_WBC_EVENT(wbc_writeback_start); | ||
148 | DEFINE_WBC_EVENT(wbc_writeback_written); | ||
149 | DEFINE_WBC_EVENT(wbc_writeback_wait); | ||
150 | DEFINE_WBC_EVENT(wbc_balance_dirty_start); | ||
151 | DEFINE_WBC_EVENT(wbc_balance_dirty_written); | ||
152 | DEFINE_WBC_EVENT(wbc_balance_dirty_wait); | ||
153 | DEFINE_WBC_EVENT(wbc_writepage); | 180 | DEFINE_WBC_EVENT(wbc_writepage); |
154 | 181 | ||
182 | TRACE_EVENT(writeback_queue_io, | ||
183 | TP_PROTO(struct bdi_writeback *wb, | ||
184 | unsigned long *older_than_this, | ||
185 | int moved), | ||
186 | TP_ARGS(wb, older_than_this, moved), | ||
187 | TP_STRUCT__entry( | ||
188 | __array(char, name, 32) | ||
189 | __field(unsigned long, older) | ||
190 | __field(long, age) | ||
191 | __field(int, moved) | ||
192 | ), | ||
193 | TP_fast_assign( | ||
194 | strncpy(__entry->name, dev_name(wb->bdi->dev), 32); | ||
195 | __entry->older = older_than_this ? *older_than_this : 0; | ||
196 | __entry->age = older_than_this ? | ||
197 | (jiffies - *older_than_this) * 1000 / HZ : -1; | ||
198 | __entry->moved = moved; | ||
199 | ), | ||
200 | TP_printk("bdi %s: older=%lu age=%ld enqueue=%d", | ||
201 | __entry->name, | ||
202 | __entry->older, /* older_than_this in jiffies */ | ||
203 | __entry->age, /* older_than_this in relative milliseconds */ | ||
204 | __entry->moved) | ||
205 | ); | ||
206 | |||
207 | TRACE_EVENT(global_dirty_state, | ||
208 | |||
209 | TP_PROTO(unsigned long background_thresh, | ||
210 | unsigned long dirty_thresh | ||
211 | ), | ||
212 | |||
213 | TP_ARGS(background_thresh, | ||
214 | dirty_thresh | ||
215 | ), | ||
216 | |||
217 | TP_STRUCT__entry( | ||
218 | __field(unsigned long, nr_dirty) | ||
219 | __field(unsigned long, nr_writeback) | ||
220 | __field(unsigned long, nr_unstable) | ||
221 | __field(unsigned long, background_thresh) | ||
222 | __field(unsigned long, dirty_thresh) | ||
223 | __field(unsigned long, dirty_limit) | ||
224 | __field(unsigned long, nr_dirtied) | ||
225 | __field(unsigned long, nr_written) | ||
226 | ), | ||
227 | |||
228 | TP_fast_assign( | ||
229 | __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); | ||
230 | __entry->nr_writeback = global_page_state(NR_WRITEBACK); | ||
231 | __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); | ||
232 | __entry->nr_dirtied = global_page_state(NR_DIRTIED); | ||
233 | __entry->nr_written = global_page_state(NR_WRITTEN); | ||
234 | __entry->background_thresh = background_thresh; | ||
235 | __entry->dirty_thresh = dirty_thresh; | ||
236 | __entry->dirty_limit = global_dirty_limit; | ||
237 | ), | ||
238 | |||
239 | TP_printk("dirty=%lu writeback=%lu unstable=%lu " | ||
240 | "bg_thresh=%lu thresh=%lu limit=%lu " | ||
241 | "dirtied=%lu written=%lu", | ||
242 | __entry->nr_dirty, | ||
243 | __entry->nr_writeback, | ||
244 | __entry->nr_unstable, | ||
245 | __entry->background_thresh, | ||
246 | __entry->dirty_thresh, | ||
247 | __entry->dirty_limit, | ||
248 | __entry->nr_dirtied, | ||
249 | __entry->nr_written | ||
250 | ) | ||
251 | ); | ||
252 | |||
155 | DECLARE_EVENT_CLASS(writeback_congest_waited_template, | 253 | DECLARE_EVENT_CLASS(writeback_congest_waited_template, |
156 | 254 | ||
157 | TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), | 255 | TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), |
158 | 256 | ||
159 | TP_ARGS(usec_timeout, usec_delayed), | 257 | TP_ARGS(usec_timeout, usec_delayed), |
160 | 258 | ||
161 | TP_STRUCT__entry( | 259 | TP_STRUCT__entry( |
162 | __field( unsigned int, usec_timeout ) | 260 | __field( unsigned int, usec_timeout ) |
163 | __field( unsigned int, usec_delayed ) | 261 | __field( unsigned int, usec_delayed ) |
164 | ), | 262 | ), |
165 | 263 | ||
166 | TP_fast_assign( | 264 | TP_fast_assign( |
167 | __entry->usec_timeout = usec_timeout; | 265 | __entry->usec_timeout = usec_timeout; |
168 | __entry->usec_delayed = usec_delayed; | 266 | __entry->usec_delayed = usec_delayed; |
169 | ), | 267 | ), |
170 | 268 | ||
171 | TP_printk("usec_timeout=%u usec_delayed=%u", | 269 | TP_printk("usec_timeout=%u usec_delayed=%u", |
172 | __entry->usec_timeout, | 270 | __entry->usec_timeout, |
173 | __entry->usec_delayed) | 271 | __entry->usec_delayed) |
174 | ); | 272 | ); |
175 | 273 | ||
176 | DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, | 274 | DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, |
177 | 275 | ||
178 | TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), | 276 | TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), |
179 | 277 | ||
180 | TP_ARGS(usec_timeout, usec_delayed) | 278 | TP_ARGS(usec_timeout, usec_delayed) |
181 | ); | 279 | ); |
182 | 280 | ||
183 | DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, | 281 | DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, |
184 | 282 | ||
185 | TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), | 283 | TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), |
186 | 284 | ||
187 | TP_ARGS(usec_timeout, usec_delayed) | 285 | TP_ARGS(usec_timeout, usec_delayed) |
286 | ); | ||
287 | |||
288 | DECLARE_EVENT_CLASS(writeback_single_inode_template, | ||
289 | |||
290 | TP_PROTO(struct inode *inode, | ||
291 | struct writeback_control *wbc, | ||
292 | unsigned long nr_to_write | ||
293 | ), | ||
294 | |||
295 | TP_ARGS(inode, wbc, nr_to_write), | ||
296 | |||
297 | TP_STRUCT__entry( | ||
298 | __array(char, name, 32) | ||
299 | __field(unsigned long, ino) | ||
300 | __field(unsigned long, state) | ||
301 | __field(unsigned long, age) | ||
302 | __field(unsigned long, writeback_index) | ||
303 | __field(long, nr_to_write) | ||
304 | __field(unsigned long, wrote) | ||
305 | ), | ||
306 | |||
307 | TP_fast_assign( | ||
308 | strncpy(__entry->name, | ||
309 | dev_name(inode->i_mapping->backing_dev_info->dev), 32); | ||
310 | __entry->ino = inode->i_ino; | ||
311 | __entry->state = inode->i_state; | ||
312 | __entry->age = (jiffies - inode->dirtied_when) * | ||
313 | 1000 / HZ; | ||
314 | __entry->writeback_index = inode->i_mapping->writeback_index; | ||
315 | __entry->nr_to_write = nr_to_write; | ||
316 | __entry->wrote = nr_to_write - wbc->nr_to_write; | ||
317 | ), | ||
318 | |||
319 | TP_printk("bdi %s: ino=%lu state=%s age=%lu " | ||
320 | "index=%lu to_write=%ld wrote=%lu", | ||
321 | __entry->name, | ||
322 | __entry->ino, | ||
323 | show_inode_state(__entry->state), | ||
324 | __entry->age, | ||
325 | __entry->writeback_index, | ||
326 | __entry->nr_to_write, | ||
327 | __entry->wrote | ||
328 | ) | ||
329 | ); | ||
330 | |||
331 | DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue, | ||
332 | TP_PROTO(struct inode *inode, | ||
333 | struct writeback_control *wbc, | ||
334 | unsigned long nr_to_write), | ||
335 | TP_ARGS(inode, wbc, nr_to_write) | ||
336 | ); |
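
The new writeback_queue_io event above is fired from queue_io() in fs/fs-writeback.c with the number of inodes just moved from b_dirty to b_io; a hedged sketch of that caller follows (paraphrased from this patch series -- the helper names and locking are assumptions, not quoted from the diff):

	/*
	 * Hedged sketch of the caller in fs/fs-writeback.c; the helper names
	 * and the list_lock assertion are assumptions based on this series.
	 */
	static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
	{
		int moved;

		assert_spin_locked(&wb->list_lock);
		list_splice_init(&wb->b_more_io, &wb->b_dirty);
		moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
		trace_writeback_queue_io(wb, older_than_this, moved);
	}
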
mm/backing-dev.c
1 | 1 | ||
2 | #include <linux/wait.h> | 2 | #include <linux/wait.h> |
3 | #include <linux/backing-dev.h> | 3 | #include <linux/backing-dev.h> |
4 | #include <linux/kthread.h> | 4 | #include <linux/kthread.h> |
5 | #include <linux/freezer.h> | 5 | #include <linux/freezer.h> |
6 | #include <linux/fs.h> | 6 | #include <linux/fs.h> |
7 | #include <linux/pagemap.h> | 7 | #include <linux/pagemap.h> |
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/writeback.h> | 11 | #include <linux/writeback.h> |
12 | #include <linux/device.h> | 12 | #include <linux/device.h> |
13 | #include <trace/events/writeback.h> | 13 | #include <trace/events/writeback.h> |
14 | 14 | ||
15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); | 15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); |
16 | 16 | ||
17 | struct backing_dev_info default_backing_dev_info = { | 17 | struct backing_dev_info default_backing_dev_info = { |
18 | .name = "default", | 18 | .name = "default", |
19 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, | 19 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, |
20 | .state = 0, | 20 | .state = 0, |
21 | .capabilities = BDI_CAP_MAP_COPY, | 21 | .capabilities = BDI_CAP_MAP_COPY, |
22 | }; | 22 | }; |
23 | EXPORT_SYMBOL_GPL(default_backing_dev_info); | 23 | EXPORT_SYMBOL_GPL(default_backing_dev_info); |
24 | 24 | ||
25 | struct backing_dev_info noop_backing_dev_info = { | 25 | struct backing_dev_info noop_backing_dev_info = { |
26 | .name = "noop", | 26 | .name = "noop", |
27 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 27 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
28 | }; | 28 | }; |
29 | EXPORT_SYMBOL_GPL(noop_backing_dev_info); | 29 | EXPORT_SYMBOL_GPL(noop_backing_dev_info); |
30 | 30 | ||
31 | static struct class *bdi_class; | 31 | static struct class *bdi_class; |
32 | 32 | ||
33 | /* | 33 | /* |
34 | * bdi_lock protects updates to bdi_list and bdi_pending_list, and provides | 34 | * bdi_lock protects updates to bdi_list and bdi_pending_list, and provides |
35 | * reader-side protection for bdi_pending_list. bdi_list has RCU reader-side | 35 | * reader-side protection for bdi_pending_list. bdi_list has RCU reader-side |
36 | * locking. | 36 | * locking. |
37 | */ | 37 | */ |
38 | DEFINE_SPINLOCK(bdi_lock); | 38 | DEFINE_SPINLOCK(bdi_lock); |
39 | LIST_HEAD(bdi_list); | 39 | LIST_HEAD(bdi_list); |
40 | LIST_HEAD(bdi_pending_list); | 40 | LIST_HEAD(bdi_pending_list); |
41 | 41 | ||
42 | static struct task_struct *sync_supers_tsk; | 42 | static struct task_struct *sync_supers_tsk; |
43 | static struct timer_list sync_supers_timer; | 43 | static struct timer_list sync_supers_timer; |
44 | 44 | ||
45 | static int bdi_sync_supers(void *); | 45 | static int bdi_sync_supers(void *); |
46 | static void sync_supers_timer_fn(unsigned long); | 46 | static void sync_supers_timer_fn(unsigned long); |
47 | 47 | ||
48 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | ||
49 | { | ||
50 | if (wb1 < wb2) { | ||
51 | spin_lock(&wb1->list_lock); | ||
52 | spin_lock_nested(&wb2->list_lock, 1); | ||
53 | } else { | ||
54 | spin_lock(&wb2->list_lock); | ||
55 | spin_lock_nested(&wb1->list_lock, 1); | ||
56 | } | ||
57 | } | ||
58 | |||
48 | #ifdef CONFIG_DEBUG_FS | 59 | #ifdef CONFIG_DEBUG_FS |
49 | #include <linux/debugfs.h> | 60 | #include <linux/debugfs.h> |
50 | #include <linux/seq_file.h> | 61 | #include <linux/seq_file.h> |
51 | 62 | ||
52 | static struct dentry *bdi_debug_root; | 63 | static struct dentry *bdi_debug_root; |
53 | 64 | ||
54 | static void bdi_debug_init(void) | 65 | static void bdi_debug_init(void) |
55 | { | 66 | { |
56 | bdi_debug_root = debugfs_create_dir("bdi", NULL); | 67 | bdi_debug_root = debugfs_create_dir("bdi", NULL); |
57 | } | 68 | } |
58 | 69 | ||
59 | static int bdi_debug_stats_show(struct seq_file *m, void *v) | 70 | static int bdi_debug_stats_show(struct seq_file *m, void *v) |
60 | { | 71 | { |
61 | struct backing_dev_info *bdi = m->private; | 72 | struct backing_dev_info *bdi = m->private; |
62 | struct bdi_writeback *wb = &bdi->wb; | 73 | struct bdi_writeback *wb = &bdi->wb; |
63 | unsigned long background_thresh; | 74 | unsigned long background_thresh; |
64 | unsigned long dirty_thresh; | 75 | unsigned long dirty_thresh; |
65 | unsigned long bdi_thresh; | 76 | unsigned long bdi_thresh; |
66 | unsigned long nr_dirty, nr_io, nr_more_io; | 77 | unsigned long nr_dirty, nr_io, nr_more_io; |
67 | struct inode *inode; | 78 | struct inode *inode; |
68 | 79 | ||
69 | nr_dirty = nr_io = nr_more_io = 0; | 80 | nr_dirty = nr_io = nr_more_io = 0; |
70 | spin_lock(&inode_wb_list_lock); | 81 | spin_lock(&wb->list_lock); |
71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 82 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
72 | nr_dirty++; | 83 | nr_dirty++; |
73 | list_for_each_entry(inode, &wb->b_io, i_wb_list) | 84 | list_for_each_entry(inode, &wb->b_io, i_wb_list) |
74 | nr_io++; | 85 | nr_io++; |
75 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) | 86 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
76 | nr_more_io++; | 87 | nr_more_io++; |
77 | spin_unlock(&inode_wb_list_lock); | 88 | spin_unlock(&wb->list_lock); |
78 | 89 | ||
79 | global_dirty_limits(&background_thresh, &dirty_thresh); | 90 | global_dirty_limits(&background_thresh, &dirty_thresh); |
80 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 91 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
81 | 92 | ||
82 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 93 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
83 | seq_printf(m, | 94 | seq_printf(m, |
84 | "BdiWriteback: %8lu kB\n" | 95 | "BdiWriteback: %10lu kB\n" |
85 | "BdiReclaimable: %8lu kB\n" | 96 | "BdiReclaimable: %10lu kB\n" |
86 | "BdiDirtyThresh: %8lu kB\n" | 97 | "BdiDirtyThresh: %10lu kB\n" |
87 | "DirtyThresh: %8lu kB\n" | 98 | "DirtyThresh: %10lu kB\n" |
88 | "BackgroundThresh: %8lu kB\n" | 99 | "BackgroundThresh: %10lu kB\n" |
89 | "b_dirty: %8lu\n" | 100 | "BdiWritten: %10lu kB\n" |
90 | "b_io: %8lu\n" | 101 | "BdiWriteBandwidth: %10lu kBps\n" |
91 | "b_more_io: %8lu\n" | 102 | "b_dirty: %10lu\n" |
92 | "bdi_list: %8u\n" | 103 | "b_io: %10lu\n" |
93 | "state: %8lx\n", | 104 | "b_more_io: %10lu\n" |
105 | "bdi_list: %10u\n" | ||
106 | "state: %10lx\n", | ||
94 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), | 107 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
95 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), | 108 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), |
96 | K(bdi_thresh), K(dirty_thresh), | 109 | K(bdi_thresh), |
97 | K(background_thresh), nr_dirty, nr_io, nr_more_io, | 110 | K(dirty_thresh), |
111 | K(background_thresh), | ||
112 | (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), | ||
113 | (unsigned long) K(bdi->write_bandwidth), | ||
114 | nr_dirty, | ||
115 | nr_io, | ||
116 | nr_more_io, | ||
98 | !list_empty(&bdi->bdi_list), bdi->state); | 117 | !list_empty(&bdi->bdi_list), bdi->state); |
99 | #undef K | 118 | #undef K |
100 | 119 | ||
101 | return 0; | 120 | return 0; |
102 | } | 121 | } |
103 | 122 | ||
104 | static int bdi_debug_stats_open(struct inode *inode, struct file *file) | 123 | static int bdi_debug_stats_open(struct inode *inode, struct file *file) |
105 | { | 124 | { |
106 | return single_open(file, bdi_debug_stats_show, inode->i_private); | 125 | return single_open(file, bdi_debug_stats_show, inode->i_private); |
107 | } | 126 | } |
108 | 127 | ||
109 | static const struct file_operations bdi_debug_stats_fops = { | 128 | static const struct file_operations bdi_debug_stats_fops = { |
110 | .open = bdi_debug_stats_open, | 129 | .open = bdi_debug_stats_open, |
111 | .read = seq_read, | 130 | .read = seq_read, |
112 | .llseek = seq_lseek, | 131 | .llseek = seq_lseek, |
113 | .release = single_release, | 132 | .release = single_release, |
114 | }; | 133 | }; |
115 | 134 | ||
116 | static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) | 135 | static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) |
117 | { | 136 | { |
118 | bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); | 137 | bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); |
119 | bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir, | 138 | bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir, |
120 | bdi, &bdi_debug_stats_fops); | 139 | bdi, &bdi_debug_stats_fops); |
121 | } | 140 | } |
122 | 141 | ||
123 | static void bdi_debug_unregister(struct backing_dev_info *bdi) | 142 | static void bdi_debug_unregister(struct backing_dev_info *bdi) |
124 | { | 143 | { |
125 | debugfs_remove(bdi->debug_stats); | 144 | debugfs_remove(bdi->debug_stats); |
126 | debugfs_remove(bdi->debug_dir); | 145 | debugfs_remove(bdi->debug_dir); |
127 | } | 146 | } |
128 | #else | 147 | #else |
129 | static inline void bdi_debug_init(void) | 148 | static inline void bdi_debug_init(void) |
130 | { | 149 | { |
131 | } | 150 | } |
132 | static inline void bdi_debug_register(struct backing_dev_info *bdi, | 151 | static inline void bdi_debug_register(struct backing_dev_info *bdi, |
133 | const char *name) | 152 | const char *name) |
134 | { | 153 | { |
135 | } | 154 | } |
136 | static inline void bdi_debug_unregister(struct backing_dev_info *bdi) | 155 | static inline void bdi_debug_unregister(struct backing_dev_info *bdi) |
137 | { | 156 | { |
138 | } | 157 | } |
139 | #endif | 158 | #endif |
140 | 159 | ||
141 | static ssize_t read_ahead_kb_store(struct device *dev, | 160 | static ssize_t read_ahead_kb_store(struct device *dev, |
142 | struct device_attribute *attr, | 161 | struct device_attribute *attr, |
143 | const char *buf, size_t count) | 162 | const char *buf, size_t count) |
144 | { | 163 | { |
145 | struct backing_dev_info *bdi = dev_get_drvdata(dev); | 164 | struct backing_dev_info *bdi = dev_get_drvdata(dev); |
146 | char *end; | 165 | char *end; |
147 | unsigned long read_ahead_kb; | 166 | unsigned long read_ahead_kb; |
148 | ssize_t ret = -EINVAL; | 167 | ssize_t ret = -EINVAL; |
149 | 168 | ||
150 | read_ahead_kb = simple_strtoul(buf, &end, 10); | 169 | read_ahead_kb = simple_strtoul(buf, &end, 10); |
151 | if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { | 170 | if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { |
152 | bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); | 171 | bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); |
153 | ret = count; | 172 | ret = count; |
154 | } | 173 | } |
155 | return ret; | 174 | return ret; |
156 | } | 175 | } |
157 | 176 | ||
158 | #define K(pages) ((pages) << (PAGE_SHIFT - 10)) | 177 | #define K(pages) ((pages) << (PAGE_SHIFT - 10)) |
159 | 178 | ||
160 | #define BDI_SHOW(name, expr) \ | 179 | #define BDI_SHOW(name, expr) \ |
161 | static ssize_t name##_show(struct device *dev, \ | 180 | static ssize_t name##_show(struct device *dev, \ |
162 | struct device_attribute *attr, char *page) \ | 181 | struct device_attribute *attr, char *page) \ |
163 | { \ | 182 | { \ |
164 | struct backing_dev_info *bdi = dev_get_drvdata(dev); \ | 183 | struct backing_dev_info *bdi = dev_get_drvdata(dev); \ |
165 | \ | 184 | \ |
166 | return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ | 185 | return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ |
167 | } | 186 | } |
168 | 187 | ||
169 | BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) | 188 | BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) |
170 | 189 | ||
171 | static ssize_t min_ratio_store(struct device *dev, | 190 | static ssize_t min_ratio_store(struct device *dev, |
172 | struct device_attribute *attr, const char *buf, size_t count) | 191 | struct device_attribute *attr, const char *buf, size_t count) |
173 | { | 192 | { |
174 | struct backing_dev_info *bdi = dev_get_drvdata(dev); | 193 | struct backing_dev_info *bdi = dev_get_drvdata(dev); |
175 | char *end; | 194 | char *end; |
176 | unsigned int ratio; | 195 | unsigned int ratio; |
177 | ssize_t ret = -EINVAL; | 196 | ssize_t ret = -EINVAL; |
178 | 197 | ||
179 | ratio = simple_strtoul(buf, &end, 10); | 198 | ratio = simple_strtoul(buf, &end, 10); |
180 | if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { | 199 | if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { |
181 | ret = bdi_set_min_ratio(bdi, ratio); | 200 | ret = bdi_set_min_ratio(bdi, ratio); |
182 | if (!ret) | 201 | if (!ret) |
183 | ret = count; | 202 | ret = count; |
184 | } | 203 | } |
185 | return ret; | 204 | return ret; |
186 | } | 205 | } |
187 | BDI_SHOW(min_ratio, bdi->min_ratio) | 206 | BDI_SHOW(min_ratio, bdi->min_ratio) |
188 | 207 | ||
189 | static ssize_t max_ratio_store(struct device *dev, | 208 | static ssize_t max_ratio_store(struct device *dev, |
190 | struct device_attribute *attr, const char *buf, size_t count) | 209 | struct device_attribute *attr, const char *buf, size_t count) |
191 | { | 210 | { |
192 | struct backing_dev_info *bdi = dev_get_drvdata(dev); | 211 | struct backing_dev_info *bdi = dev_get_drvdata(dev); |
193 | char *end; | 212 | char *end; |
194 | unsigned int ratio; | 213 | unsigned int ratio; |
195 | ssize_t ret = -EINVAL; | 214 | ssize_t ret = -EINVAL; |
196 | 215 | ||
197 | ratio = simple_strtoul(buf, &end, 10); | 216 | ratio = simple_strtoul(buf, &end, 10); |
198 | if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { | 217 | if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { |
199 | ret = bdi_set_max_ratio(bdi, ratio); | 218 | ret = bdi_set_max_ratio(bdi, ratio); |
200 | if (!ret) | 219 | if (!ret) |
201 | ret = count; | 220 | ret = count; |
202 | } | 221 | } |
203 | return ret; | 222 | return ret; |
204 | } | 223 | } |
205 | BDI_SHOW(max_ratio, bdi->max_ratio) | 224 | BDI_SHOW(max_ratio, bdi->max_ratio) |
206 | 225 | ||
207 | #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) | 226 | #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) |
208 | 227 | ||
209 | static struct device_attribute bdi_dev_attrs[] = { | 228 | static struct device_attribute bdi_dev_attrs[] = { |
210 | __ATTR_RW(read_ahead_kb), | 229 | __ATTR_RW(read_ahead_kb), |
211 | __ATTR_RW(min_ratio), | 230 | __ATTR_RW(min_ratio), |
212 | __ATTR_RW(max_ratio), | 231 | __ATTR_RW(max_ratio), |
213 | __ATTR_NULL, | 232 | __ATTR_NULL, |
214 | }; | 233 | }; |
215 | 234 | ||
216 | static __init int bdi_class_init(void) | 235 | static __init int bdi_class_init(void) |
217 | { | 236 | { |
218 | bdi_class = class_create(THIS_MODULE, "bdi"); | 237 | bdi_class = class_create(THIS_MODULE, "bdi"); |
219 | if (IS_ERR(bdi_class)) | 238 | if (IS_ERR(bdi_class)) |
220 | return PTR_ERR(bdi_class); | 239 | return PTR_ERR(bdi_class); |
221 | 240 | ||
222 | bdi_class->dev_attrs = bdi_dev_attrs; | 241 | bdi_class->dev_attrs = bdi_dev_attrs; |
223 | bdi_debug_init(); | 242 | bdi_debug_init(); |
224 | return 0; | 243 | return 0; |
225 | } | 244 | } |
226 | postcore_initcall(bdi_class_init); | 245 | postcore_initcall(bdi_class_init); |
227 | 246 | ||
228 | static int __init default_bdi_init(void) | 247 | static int __init default_bdi_init(void) |
229 | { | 248 | { |
230 | int err; | 249 | int err; |
231 | 250 | ||
232 | sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); | 251 | sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); |
233 | BUG_ON(IS_ERR(sync_supers_tsk)); | 252 | BUG_ON(IS_ERR(sync_supers_tsk)); |
234 | 253 | ||
235 | setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); | 254 | setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); |
236 | bdi_arm_supers_timer(); | 255 | bdi_arm_supers_timer(); |
237 | 256 | ||
238 | err = bdi_init(&default_backing_dev_info); | 257 | err = bdi_init(&default_backing_dev_info); |
239 | if (!err) | 258 | if (!err) |
240 | bdi_register(&default_backing_dev_info, NULL, "default"); | 259 | bdi_register(&default_backing_dev_info, NULL, "default"); |
241 | err = bdi_init(&noop_backing_dev_info); | 260 | err = bdi_init(&noop_backing_dev_info); |
242 | 261 | ||
243 | return err; | 262 | return err; |
244 | } | 263 | } |
245 | subsys_initcall(default_bdi_init); | 264 | subsys_initcall(default_bdi_init); |
246 | 265 | ||
247 | int bdi_has_dirty_io(struct backing_dev_info *bdi) | 266 | int bdi_has_dirty_io(struct backing_dev_info *bdi) |
248 | { | 267 | { |
249 | return wb_has_dirty_io(&bdi->wb); | 268 | return wb_has_dirty_io(&bdi->wb); |
250 | } | 269 | } |
251 | 270 | ||
252 | static void bdi_flush_io(struct backing_dev_info *bdi) | ||
253 | { | ||
254 | struct writeback_control wbc = { | ||
255 | .sync_mode = WB_SYNC_NONE, | ||
256 | .older_than_this = NULL, | ||
257 | .range_cyclic = 1, | ||
258 | .nr_to_write = 1024, | ||
259 | }; | ||
260 | |||
261 | writeback_inodes_wb(&bdi->wb, &wbc); | ||
262 | } | ||
263 | |||
264 | /* | 271 | /* |
265 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() | 272 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() |
266 | * or we risk deadlocking on ->s_umount. The longer term solution would be | 273 | * or we risk deadlocking on ->s_umount. The longer term solution would be |
267 | * to implement sync_supers_bdi() or similar and simply do it from the | 274 | * to implement sync_supers_bdi() or similar and simply do it from the |
268 | * bdi writeback thread individually. | 275 | * bdi writeback thread individually. |
269 | */ | 276 | */ |
270 | static int bdi_sync_supers(void *unused) | 277 | static int bdi_sync_supers(void *unused) |
271 | { | 278 | { |
272 | set_user_nice(current, 0); | 279 | set_user_nice(current, 0); |
273 | 280 | ||
274 | while (!kthread_should_stop()) { | 281 | while (!kthread_should_stop()) { |
275 | set_current_state(TASK_INTERRUPTIBLE); | 282 | set_current_state(TASK_INTERRUPTIBLE); |
276 | schedule(); | 283 | schedule(); |
277 | 284 | ||
278 | /* | 285 | /* |
279 | * Do this periodically, like kupdated() did before. | 286 | * Do this periodically, like kupdated() did before. |
280 | */ | 287 | */ |
281 | sync_supers(); | 288 | sync_supers(); |
282 | } | 289 | } |
283 | 290 | ||
284 | return 0; | 291 | return 0; |
285 | } | 292 | } |
286 | 293 | ||
287 | void bdi_arm_supers_timer(void) | 294 | void bdi_arm_supers_timer(void) |
288 | { | 295 | { |
289 | unsigned long next; | 296 | unsigned long next; |
290 | 297 | ||
291 | if (!dirty_writeback_interval) | 298 | if (!dirty_writeback_interval) |
292 | return; | 299 | return; |
293 | 300 | ||
294 | next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; | 301 | next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; |
295 | mod_timer(&sync_supers_timer, round_jiffies_up(next)); | 302 | mod_timer(&sync_supers_timer, round_jiffies_up(next)); |
296 | } | 303 | } |
297 | 304 | ||
298 | static void sync_supers_timer_fn(unsigned long unused) | 305 | static void sync_supers_timer_fn(unsigned long unused) |
299 | { | 306 | { |
300 | wake_up_process(sync_supers_tsk); | 307 | wake_up_process(sync_supers_tsk); |
301 | bdi_arm_supers_timer(); | 308 | bdi_arm_supers_timer(); |
302 | } | 309 | } |
303 | 310 | ||
304 | static void wakeup_timer_fn(unsigned long data) | 311 | static void wakeup_timer_fn(unsigned long data) |
305 | { | 312 | { |
306 | struct backing_dev_info *bdi = (struct backing_dev_info *)data; | 313 | struct backing_dev_info *bdi = (struct backing_dev_info *)data; |
307 | 314 | ||
308 | spin_lock_bh(&bdi->wb_lock); | 315 | spin_lock_bh(&bdi->wb_lock); |
309 | if (bdi->wb.task) { | 316 | if (bdi->wb.task) { |
310 | trace_writeback_wake_thread(bdi); | 317 | trace_writeback_wake_thread(bdi); |
311 | wake_up_process(bdi->wb.task); | 318 | wake_up_process(bdi->wb.task); |
312 | } else { | 319 | } else { |
313 | /* | 320 | /* |
314 | * When bdi tasks are inactive for a long time, they are killed. | 321 | * When bdi tasks are inactive for a long time, they are killed. |
315 | * In this case we have to wake up the forker thread which | 322 | * In this case we have to wake up the forker thread which |
316 | * should create and run the bdi thread. | 323 | * should create and run the bdi thread. |
317 | */ | 324 | */ |
318 | trace_writeback_wake_forker_thread(bdi); | 325 | trace_writeback_wake_forker_thread(bdi); |
319 | wake_up_process(default_backing_dev_info.wb.task); | 326 | wake_up_process(default_backing_dev_info.wb.task); |
320 | } | 327 | } |
321 | spin_unlock_bh(&bdi->wb_lock); | 328 | spin_unlock_bh(&bdi->wb_lock); |
322 | } | 329 | } |
323 | 330 | ||
324 | /* | 331 | /* |
325 | * This function is used when the first inode for this bdi is marked dirty. It | 332 | * This function is used when the first inode for this bdi is marked dirty. It |
326 | * wakes-up the corresponding bdi thread which should then take care of the | 333 | * wakes-up the corresponding bdi thread which should then take care of the |
327 | * periodic background write-out of dirty inodes. Since the write-out would | 334 | * periodic background write-out of dirty inodes. Since the write-out would |
328 | * starts only 'dirty_writeback_interval' centisecs from now anyway, we just | 335 | * starts only 'dirty_writeback_interval' centisecs from now anyway, we just |
329 | * set up a timer which wakes the bdi thread up later. | 336 | * set up a timer which wakes the bdi thread up later. |
330 | * | 337 | * |
331 | * Note, we wouldn't bother setting up the timer, but this function is on the | 338 | * Note, we wouldn't bother setting up the timer, but this function is on the |
332 | * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches | 339 | * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches |
333 | * by delaying the wake-up. | 340 | * by delaying the wake-up. |
334 | */ | 341 | */ |
335 | void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) | 342 | void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) |
336 | { | 343 | { |
337 | unsigned long timeout; | 344 | unsigned long timeout; |
338 | 345 | ||
339 | timeout = msecs_to_jiffies(dirty_writeback_interval * 10); | 346 | timeout = msecs_to_jiffies(dirty_writeback_interval * 10); |
340 | mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); | 347 | mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); |
341 | } | 348 | } |
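[Editor's note] For context, the caller of this helper is the '__mark_inode_dirty()' fast path; a sketch of that call site (variable names as in fs/fs-writeback.c of this era, shown only for orientation):

	/*
	 * Only the task that dirtied an otherwise-clean bdi arms the delayed
	 * wakeup, so a clean->dirty transition costs one timer rather than
	 * an immediate context switch.
	 */
	if (wakeup_bdi)
		bdi_wakeup_thread_delayed(bdi);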
342 | 349 | ||
343 | /* | 350 | /* |
344 | * Calculate the longest interval (jiffies) bdi threads are allowed to be | 351 | * Calculate the longest interval (jiffies) bdi threads are allowed to be |
345 | * inactive. | 352 | * inactive. |
346 | */ | 353 | */ |
347 | static unsigned long bdi_longest_inactive(void) | 354 | static unsigned long bdi_longest_inactive(void) |
348 | { | 355 | { |
349 | unsigned long interval; | 356 | unsigned long interval; |
350 | 357 | ||
351 | interval = msecs_to_jiffies(dirty_writeback_interval * 10); | 358 | interval = msecs_to_jiffies(dirty_writeback_interval * 10); |
352 | return max(5UL * 60 * HZ, interval); | 359 | return max(5UL * 60 * HZ, interval); |
353 | } | 360 | } |
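[Editor's note] Worked out with the defaults, this max() is a floor rather than a scale factor: assuming HZ=1000, the default 5 s interval is 5000 jiffies, so max(5UL * 60 * HZ, interval) = 300000 jiffies and an idle flusher survives at least five minutes however short the writeback interval is tuned. A stand-alone sketch:

	#include <stdio.h>

	#define HZ 1000UL	/* assumption: 1 ms per jiffy */

	int main(void)
	{
		unsigned long interval = 5 * HZ;	/* default 5 s interval */
		unsigned long longest = 5UL * 60 * HZ > interval ?
					5UL * 60 * HZ : interval;

		printf("longest inactive: %lu jiffies (5 minutes)\n", longest);
		return 0;
	}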
354 | 361 | ||
355 | static int bdi_forker_thread(void *ptr) | 362 | static int bdi_forker_thread(void *ptr) |
356 | { | 363 | { |
357 | struct bdi_writeback *me = ptr; | 364 | struct bdi_writeback *me = ptr; |
358 | 365 | ||
359 | current->flags |= PF_SWAPWRITE; | 366 | current->flags |= PF_SWAPWRITE; |
360 | set_freezable(); | 367 | set_freezable(); |
361 | 368 | ||
362 | /* | 369 | /* |
363 | * Our parent may run at a different priority, just set us to normal | 370 | * Our parent may run at a different priority, just set us to normal |
364 | */ | 371 | */ |
365 | set_user_nice(current, 0); | 372 | set_user_nice(current, 0); |
366 | 373 | ||
367 | for (;;) { | 374 | for (;;) { |
368 | struct task_struct *task = NULL; | 375 | struct task_struct *task = NULL; |
369 | struct backing_dev_info *bdi; | 376 | struct backing_dev_info *bdi; |
370 | enum { | 377 | enum { |
371 | NO_ACTION, /* Nothing to do */ | 378 | NO_ACTION, /* Nothing to do */ |
372 | FORK_THREAD, /* Fork bdi thread */ | 379 | FORK_THREAD, /* Fork bdi thread */ |
373 | KILL_THREAD, /* Kill inactive bdi thread */ | 380 | KILL_THREAD, /* Kill inactive bdi thread */ |
374 | } action = NO_ACTION; | 381 | } action = NO_ACTION; |
375 | 382 | ||
376 | /* | 383 | /* |
377 | * Temporary measure: we want to make sure we don't see | 384 | * Temporary measure: we want to make sure we don't see |
378 | * dirty data on the default backing_dev_info | 385 | * dirty data on the default backing_dev_info |
379 | */ | 386 | */ |
380 | if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { | 387 | if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { |
381 | del_timer(&me->wakeup_timer); | 388 | del_timer(&me->wakeup_timer); |
382 | wb_do_writeback(me, 0); | 389 | wb_do_writeback(me, 0); |
383 | } | 390 | } |
384 | 391 | ||
385 | spin_lock_bh(&bdi_lock); | 392 | spin_lock_bh(&bdi_lock); |
386 | set_current_state(TASK_INTERRUPTIBLE); | 393 | set_current_state(TASK_INTERRUPTIBLE); |
387 | 394 | ||
388 | list_for_each_entry(bdi, &bdi_list, bdi_list) { | 395 | list_for_each_entry(bdi, &bdi_list, bdi_list) { |
389 | bool have_dirty_io; | 396 | bool have_dirty_io; |
390 | 397 | ||
391 | if (!bdi_cap_writeback_dirty(bdi) || | 398 | if (!bdi_cap_writeback_dirty(bdi) || |
392 | bdi_cap_flush_forker(bdi)) | 399 | bdi_cap_flush_forker(bdi)) |
393 | continue; | 400 | continue; |
394 | 401 | ||
395 | WARN(!test_bit(BDI_registered, &bdi->state), | 402 | WARN(!test_bit(BDI_registered, &bdi->state), |
396 | "bdi %p/%s is not registered!\n", bdi, bdi->name); | 403 | "bdi %p/%s is not registered!\n", bdi, bdi->name); |
397 | 404 | ||
398 | have_dirty_io = !list_empty(&bdi->work_list) || | 405 | have_dirty_io = !list_empty(&bdi->work_list) || |
399 | wb_has_dirty_io(&bdi->wb); | 406 | wb_has_dirty_io(&bdi->wb); |
400 | 407 | ||
401 | /* | 408 | /* |
402 | * If the bdi has work to do, but the thread does not | 409 | * If the bdi has work to do, but the thread does not |
403 | * exist - create it. | 410 | * exist - create it. |
404 | */ | 411 | */ |
405 | if (!bdi->wb.task && have_dirty_io) { | 412 | if (!bdi->wb.task && have_dirty_io) { |
406 | /* | 413 | /* |
407 | * Set the pending bit - if someone tries to | 414 | * Set the pending bit - if someone tries to |
408 | * unregister this bdi, it'll wait on this bit. | 415 | * unregister this bdi, it'll wait on this bit. |
409 | */ | 416 | */ |
410 | set_bit(BDI_pending, &bdi->state); | 417 | set_bit(BDI_pending, &bdi->state); |
411 | action = FORK_THREAD; | 418 | action = FORK_THREAD; |
412 | break; | 419 | break; |
413 | } | 420 | } |
414 | 421 | ||
415 | spin_lock(&bdi->wb_lock); | 422 | spin_lock(&bdi->wb_lock); |
416 | 423 | ||
417 | /* | 424 | /* |
418 | * If there is no work to do and the bdi thread was | 425 | * If there is no work to do and the bdi thread was |
419 | * inactive long enough - kill it. The wb_lock is taken | 426 | * inactive long enough - kill it. The wb_lock is taken |
420 | * to make sure no-one adds more work to this bdi and | 427 | * to make sure no-one adds more work to this bdi and |
421 | * wakes the bdi thread up. | 428 | * wakes the bdi thread up. |
422 | */ | 429 | */ |
423 | if (bdi->wb.task && !have_dirty_io && | 430 | if (bdi->wb.task && !have_dirty_io && |
424 | time_after(jiffies, bdi->wb.last_active + | 431 | time_after(jiffies, bdi->wb.last_active + |
425 | bdi_longest_inactive())) { | 432 | bdi_longest_inactive())) { |
426 | task = bdi->wb.task; | 433 | task = bdi->wb.task; |
427 | bdi->wb.task = NULL; | 434 | bdi->wb.task = NULL; |
428 | spin_unlock(&bdi->wb_lock); | 435 | spin_unlock(&bdi->wb_lock); |
429 | set_bit(BDI_pending, &bdi->state); | 436 | set_bit(BDI_pending, &bdi->state); |
430 | action = KILL_THREAD; | 437 | action = KILL_THREAD; |
431 | break; | 438 | break; |
432 | } | 439 | } |
433 | spin_unlock(&bdi->wb_lock); | 440 | spin_unlock(&bdi->wb_lock); |
434 | } | 441 | } |
435 | spin_unlock_bh(&bdi_lock); | 442 | spin_unlock_bh(&bdi_lock); |
436 | 443 | ||
437 | /* Keep working if default bdi still has things to do */ | 444 | /* Keep working if default bdi still has things to do */ |
438 | if (!list_empty(&me->bdi->work_list)) | 445 | if (!list_empty(&me->bdi->work_list)) |
439 | __set_current_state(TASK_RUNNING); | 446 | __set_current_state(TASK_RUNNING); |
440 | 447 | ||
441 | switch (action) { | 448 | switch (action) { |
442 | case FORK_THREAD: | 449 | case FORK_THREAD: |
443 | __set_current_state(TASK_RUNNING); | 450 | __set_current_state(TASK_RUNNING); |
444 | task = kthread_create(bdi_writeback_thread, &bdi->wb, | 451 | task = kthread_create(bdi_writeback_thread, &bdi->wb, |
445 | "flush-%s", dev_name(bdi->dev)); | 452 | "flush-%s", dev_name(bdi->dev)); |
446 | if (IS_ERR(task)) { | 453 | if (IS_ERR(task)) { |
447 | /* | 454 | /* |
448 | * If thread creation fails, force writeout of | 455 | * If thread creation fails, force writeout of |
449 | * the bdi from the thread. | 456 | * the bdi from the thread. Hopefully 1024 is |
457 | * large enough for efficient IO. | ||
450 | */ | 458 | */ |
451 | bdi_flush_io(bdi); | 459 | writeback_inodes_wb(&bdi->wb, 1024); |
452 | } else { | 460 | } else { |
453 | /* | 461 | /* |
454 | * The spinlock makes sure we do not lose | 462 | * The spinlock makes sure we do not lose |
455 | * wake-ups when racing with 'bdi_queue_work()'. | 463 | * wake-ups when racing with 'bdi_queue_work()'. |
456 | * And as soon as the bdi thread is visible, we | 464 | * And as soon as the bdi thread is visible, we |
457 | * can start it. | 465 | * can start it. |
458 | */ | 466 | */ |
459 | spin_lock_bh(&bdi->wb_lock); | 467 | spin_lock_bh(&bdi->wb_lock); |
460 | bdi->wb.task = task; | 468 | bdi->wb.task = task; |
461 | spin_unlock_bh(&bdi->wb_lock); | 469 | spin_unlock_bh(&bdi->wb_lock); |
462 | wake_up_process(task); | 470 | wake_up_process(task); |
463 | } | 471 | } |
464 | break; | 472 | break; |
465 | 473 | ||
466 | case KILL_THREAD: | 474 | case KILL_THREAD: |
467 | __set_current_state(TASK_RUNNING); | 475 | __set_current_state(TASK_RUNNING); |
468 | kthread_stop(task); | 476 | kthread_stop(task); |
469 | break; | 477 | break; |
470 | 478 | ||
471 | case NO_ACTION: | 479 | case NO_ACTION: |
472 | if (!wb_has_dirty_io(me) || !dirty_writeback_interval) | 480 | if (!wb_has_dirty_io(me) || !dirty_writeback_interval) |
473 | /* | 481 | /* |
474 | * There is no dirty data. The only thing we | 482 | * There is no dirty data. The only thing we |
475 | * should now care about is checking for | 483 | * should now care about is checking for |
476 | * inactive bdi threads and killing them. Thus, | 484 | * inactive bdi threads and killing them. Thus, |
477 | * let's sleep for a longer time, save energy and | 485 | * let's sleep for a longer time, save energy and |
478 | * be friendly to battery-powered devices. | 486 | * be friendly to battery-powered devices. |
479 | */ | 487 | */ |
480 | schedule_timeout(bdi_longest_inactive()); | 488 | schedule_timeout(bdi_longest_inactive()); |
481 | else | 489 | else |
482 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); | 490 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); |
483 | try_to_freeze(); | 491 | try_to_freeze(); |
484 | /* Back to the main loop */ | 492 | /* Back to the main loop */ |
485 | continue; | 493 | continue; |
486 | } | 494 | } |
487 | 495 | ||
488 | /* | 496 | /* |
489 | * Clear the pending bit and wake up anybody waiting to tear us down. | 497 | * Clear the pending bit and wake up anybody waiting to tear us down. |
490 | */ | 498 | */ |
491 | clear_bit(BDI_pending, &bdi->state); | 499 | clear_bit(BDI_pending, &bdi->state); |
492 | smp_mb__after_clear_bit(); | 500 | smp_mb__after_clear_bit(); |
493 | wake_up_bit(&bdi->state, BDI_pending); | 501 | wake_up_bit(&bdi->state, BDI_pending); |
494 | } | 502 | } |
495 | 503 | ||
496 | return 0; | 504 | return 0; |
497 | } | 505 | } |
498 | 506 | ||
499 | /* | 507 | /* |
500 | * Remove bdi from bdi_list, and ensure that it is no longer visible | 508 | * Remove bdi from bdi_list, and ensure that it is no longer visible |
501 | */ | 509 | */ |
502 | static void bdi_remove_from_list(struct backing_dev_info *bdi) | 510 | static void bdi_remove_from_list(struct backing_dev_info *bdi) |
503 | { | 511 | { |
504 | spin_lock_bh(&bdi_lock); | 512 | spin_lock_bh(&bdi_lock); |
505 | list_del_rcu(&bdi->bdi_list); | 513 | list_del_rcu(&bdi->bdi_list); |
506 | spin_unlock_bh(&bdi_lock); | 514 | spin_unlock_bh(&bdi_lock); |
507 | 515 | ||
508 | synchronize_rcu_expedited(); | 516 | synchronize_rcu_expedited(); |
509 | } | 517 | } |
510 | 518 | ||
511 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, | 519 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, |
512 | const char *fmt, ...) | 520 | const char *fmt, ...) |
513 | { | 521 | { |
514 | va_list args; | 522 | va_list args; |
515 | struct device *dev; | 523 | struct device *dev; |
516 | 524 | ||
517 | if (bdi->dev) /* The driver needs to use separate queues per device */ | 525 | if (bdi->dev) /* The driver needs to use separate queues per device */ |
518 | return 0; | 526 | return 0; |
519 | 527 | ||
520 | va_start(args, fmt); | 528 | va_start(args, fmt); |
521 | dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); | 529 | dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); |
522 | va_end(args); | 530 | va_end(args); |
523 | if (IS_ERR(dev)) | 531 | if (IS_ERR(dev)) |
524 | return PTR_ERR(dev); | 532 | return PTR_ERR(dev); |
525 | 533 | ||
526 | bdi->dev = dev; | 534 | bdi->dev = dev; |
527 | 535 | ||
528 | /* | 536 | /* |
529 | * Just start the forker thread for our default backing_dev_info, | 537 | * Just start the forker thread for our default backing_dev_info, |
530 | * and add other bdi's to the list. They will get a thread created | 538 | * and add other bdi's to the list. They will get a thread created |
531 | * on-demand when they need it. | 539 | * on-demand when they need it. |
532 | */ | 540 | */ |
533 | if (bdi_cap_flush_forker(bdi)) { | 541 | if (bdi_cap_flush_forker(bdi)) { |
534 | struct bdi_writeback *wb = &bdi->wb; | 542 | struct bdi_writeback *wb = &bdi->wb; |
535 | 543 | ||
536 | wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", | 544 | wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", |
537 | dev_name(dev)); | 545 | dev_name(dev)); |
538 | if (IS_ERR(wb->task)) | 546 | if (IS_ERR(wb->task)) |
539 | return PTR_ERR(wb->task); | 547 | return PTR_ERR(wb->task); |
540 | } | 548 | } |
541 | 549 | ||
542 | bdi_debug_register(bdi, dev_name(dev)); | 550 | bdi_debug_register(bdi, dev_name(dev)); |
543 | set_bit(BDI_registered, &bdi->state); | 551 | set_bit(BDI_registered, &bdi->state); |
544 | 552 | ||
545 | spin_lock_bh(&bdi_lock); | 553 | spin_lock_bh(&bdi_lock); |
546 | list_add_tail_rcu(&bdi->bdi_list, &bdi_list); | 554 | list_add_tail_rcu(&bdi->bdi_list, &bdi_list); |
547 | spin_unlock_bh(&bdi_lock); | 555 | spin_unlock_bh(&bdi_lock); |
548 | 556 | ||
549 | trace_writeback_bdi_register(bdi); | 557 | trace_writeback_bdi_register(bdi); |
550 | return 0; | 558 | return 0; |
551 | } | 559 | } |
552 | EXPORT_SYMBOL(bdi_register); | 560 | EXPORT_SYMBOL(bdi_register); |
553 | 561 | ||
554 | int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) | 562 | int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) |
555 | { | 563 | { |
556 | return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); | 564 | return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); |
557 | } | 565 | } |
558 | EXPORT_SYMBOL(bdi_register_dev); | 566 | EXPORT_SYMBOL(bdi_register_dev); |
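[Editor's note] A hypothetical caller sketch (field and helper names as in this era's block layer, where add_disk() does the equivalent): registering a disk's bdi by major:minor is what makes its flusher thread show up under names like 'flush-8:0':

	/* 'disk' is a struct gendisk; its queue's bdi takes the "8:0"-style name */
	err = bdi_register_dev(&disk->queue->backing_dev_info, disk_devt(disk));
	if (err)
		goto out_unregister;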
559 | 567 | ||
560 | /* | 568 | /* |
561 | * Remove bdi from the global list and shutdown any threads we have running | 569 | * Remove bdi from the global list and shutdown any threads we have running |
562 | */ | 570 | */ |
563 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) | 571 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) |
564 | { | 572 | { |
565 | if (!bdi_cap_writeback_dirty(bdi)) | 573 | if (!bdi_cap_writeback_dirty(bdi)) |
566 | return; | 574 | return; |
567 | 575 | ||
568 | /* | 576 | /* |
569 | * Make sure nobody finds us on the bdi_list anymore | 577 | * Make sure nobody finds us on the bdi_list anymore |
570 | */ | 578 | */ |
571 | bdi_remove_from_list(bdi); | 579 | bdi_remove_from_list(bdi); |
572 | 580 | ||
573 | /* | 581 | /* |
574 | * If setup is pending, wait for that to complete first | 582 | * If setup is pending, wait for that to complete first |
575 | */ | 583 | */ |
576 | wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, | 584 | wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, |
577 | TASK_UNINTERRUPTIBLE); | 585 | TASK_UNINTERRUPTIBLE); |
578 | 586 | ||
579 | /* | 587 | /* |
580 | * Finally, kill the kernel thread. We don't need to be RCU | 588 | * Finally, kill the kernel thread. We don't need to be RCU |
581 | * safe anymore, since the bdi is gone from visibility. Force | 589 | * safe anymore, since the bdi is gone from visibility. Force |
582 | * unfreeze of the thread before calling kthread_stop(), otherwise | 590 | * unfreeze of the thread before calling kthread_stop(), otherwise |
583 | * it would never exit if it is currently stuck in the refrigerator. | 591 | * it would never exit if it is currently stuck in the refrigerator. |
584 | */ | 592 | */ |
585 | if (bdi->wb.task) { | 593 | if (bdi->wb.task) { |
586 | thaw_process(bdi->wb.task); | 594 | thaw_process(bdi->wb.task); |
587 | kthread_stop(bdi->wb.task); | 595 | kthread_stop(bdi->wb.task); |
588 | } | 596 | } |
589 | } | 597 | } |
590 | 598 | ||
591 | /* | 599 | /* |
592 | * This bdi is going away now, make sure that no super_blocks point to it | 600 | * This bdi is going away now, make sure that no super_blocks point to it |
593 | */ | 601 | */ |
594 | static void bdi_prune_sb(struct backing_dev_info *bdi) | 602 | static void bdi_prune_sb(struct backing_dev_info *bdi) |
595 | { | 603 | { |
596 | struct super_block *sb; | 604 | struct super_block *sb; |
597 | 605 | ||
598 | spin_lock(&sb_lock); | 606 | spin_lock(&sb_lock); |
599 | list_for_each_entry(sb, &super_blocks, s_list) { | 607 | list_for_each_entry(sb, &super_blocks, s_list) { |
600 | if (sb->s_bdi == bdi) | 608 | if (sb->s_bdi == bdi) |
601 | sb->s_bdi = &default_backing_dev_info; | 609 | sb->s_bdi = &default_backing_dev_info; |
602 | } | 610 | } |
603 | spin_unlock(&sb_lock); | 611 | spin_unlock(&sb_lock); |
604 | } | 612 | } |
605 | 613 | ||
606 | void bdi_unregister(struct backing_dev_info *bdi) | 614 | void bdi_unregister(struct backing_dev_info *bdi) |
607 | { | 615 | { |
608 | if (bdi->dev) { | 616 | if (bdi->dev) { |
609 | bdi_set_min_ratio(bdi, 0); | 617 | bdi_set_min_ratio(bdi, 0); |
610 | trace_writeback_bdi_unregister(bdi); | 618 | trace_writeback_bdi_unregister(bdi); |
611 | bdi_prune_sb(bdi); | 619 | bdi_prune_sb(bdi); |
612 | del_timer_sync(&bdi->wb.wakeup_timer); | 620 | del_timer_sync(&bdi->wb.wakeup_timer); |
613 | 621 | ||
614 | if (!bdi_cap_flush_forker(bdi)) | 622 | if (!bdi_cap_flush_forker(bdi)) |
615 | bdi_wb_shutdown(bdi); | 623 | bdi_wb_shutdown(bdi); |
616 | bdi_debug_unregister(bdi); | 624 | bdi_debug_unregister(bdi); |
617 | device_unregister(bdi->dev); | 625 | device_unregister(bdi->dev); |
618 | bdi->dev = NULL; | 626 | bdi->dev = NULL; |
619 | } | 627 | } |
620 | } | 628 | } |
621 | EXPORT_SYMBOL(bdi_unregister); | 629 | EXPORT_SYMBOL(bdi_unregister); |
622 | 630 | ||
623 | static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | 631 | static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) |
624 | { | 632 | { |
625 | memset(wb, 0, sizeof(*wb)); | 633 | memset(wb, 0, sizeof(*wb)); |
626 | 634 | ||
627 | wb->bdi = bdi; | 635 | wb->bdi = bdi; |
628 | wb->last_old_flush = jiffies; | 636 | wb->last_old_flush = jiffies; |
629 | INIT_LIST_HEAD(&wb->b_dirty); | 637 | INIT_LIST_HEAD(&wb->b_dirty); |
630 | INIT_LIST_HEAD(&wb->b_io); | 638 | INIT_LIST_HEAD(&wb->b_io); |
631 | INIT_LIST_HEAD(&wb->b_more_io); | 639 | INIT_LIST_HEAD(&wb->b_more_io); |
640 | spin_lock_init(&wb->list_lock); | ||
632 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); | 641 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); |
633 | } | 642 | } |
634 | 643 | ||
644 | /* | ||
645 | * Initial write bandwidth: 100 MB/s | ||
646 | */ | ||
647 | #define INIT_BW (100 << (20 - PAGE_SHIFT)) | ||
648 | |||
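[Editor's note] With the common PAGE_SHIFT of 12 (4 KiB pages) this works out to 100 << 8 = 25600 pages per second; the bandwidth estimator starts from an assumed 100 MB/s device until real write completions refine it. A stand-alone check (PAGE_SHIFT here is an assumption):

	#include <stdio.h>

	#define PAGE_SHIFT 12				/* assumed: 4 KiB pages */
	#define INIT_BW (100 << (20 - PAGE_SHIFT))	/* 100 MB/s in pages/s */

	int main(void)
	{
		printf("INIT_BW = %d pages/s = %d MB/s\n",
		       INIT_BW, (INIT_BW << PAGE_SHIFT) >> 20);
		return 0;
	}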
635 | int bdi_init(struct backing_dev_info *bdi) | 649 | int bdi_init(struct backing_dev_info *bdi) |
636 | { | 650 | { |
637 | int i, err; | 651 | int i, err; |
638 | 652 | ||
639 | bdi->dev = NULL; | 653 | bdi->dev = NULL; |
640 | 654 | ||
641 | bdi->min_ratio = 0; | 655 | bdi->min_ratio = 0; |
642 | bdi->max_ratio = 100; | 656 | bdi->max_ratio = 100; |
643 | bdi->max_prop_frac = PROP_FRAC_BASE; | 657 | bdi->max_prop_frac = PROP_FRAC_BASE; |
644 | spin_lock_init(&bdi->wb_lock); | 658 | spin_lock_init(&bdi->wb_lock); |
645 | INIT_LIST_HEAD(&bdi->bdi_list); | 659 | INIT_LIST_HEAD(&bdi->bdi_list); |
646 | INIT_LIST_HEAD(&bdi->work_list); | 660 | INIT_LIST_HEAD(&bdi->work_list); |
647 | 661 | ||
648 | bdi_wb_init(&bdi->wb, bdi); | 662 | bdi_wb_init(&bdi->wb, bdi); |
649 | 663 | ||
650 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { | 664 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { |
651 | err = percpu_counter_init(&bdi->bdi_stat[i], 0); | 665 | err = percpu_counter_init(&bdi->bdi_stat[i], 0); |
652 | if (err) | 666 | if (err) |
653 | goto err; | 667 | goto err; |
654 | } | 668 | } |
655 | 669 | ||
656 | bdi->dirty_exceeded = 0; | 670 | bdi->dirty_exceeded = 0; |
671 | |||
672 | bdi->bw_time_stamp = jiffies; | ||
673 | bdi->written_stamp = 0; | ||
674 | |||
675 | bdi->write_bandwidth = INIT_BW; | ||
676 | bdi->avg_write_bandwidth = INIT_BW; | ||
677 | |||
657 | err = prop_local_init_percpu(&bdi->completions); | 678 | err = prop_local_init_percpu(&bdi->completions); |
658 | 679 | ||
659 | if (err) { | 680 | if (err) { |
660 | err: | 681 | err: |
661 | while (i--) | 682 | while (i--) |
662 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 683 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
663 | } | 684 | } |
664 | 685 | ||
665 | return err; | 686 | return err; |
666 | } | 687 | } |
667 | EXPORT_SYMBOL(bdi_init); | 688 | EXPORT_SYMBOL(bdi_init); |
668 | 689 | ||
669 | void bdi_destroy(struct backing_dev_info *bdi) | 690 | void bdi_destroy(struct backing_dev_info *bdi) |
670 | { | 691 | { |
671 | int i; | 692 | int i; |
672 | 693 | ||
673 | /* | 694 | /* |
674 | * Splice our entries to the default_backing_dev_info, if this | 695 | * Splice our entries to the default_backing_dev_info, if this |
675 | * bdi disappears | 696 | * bdi disappears |
676 | */ | 697 | */ |
677 | if (bdi_has_dirty_io(bdi)) { | 698 | if (bdi_has_dirty_io(bdi)) { |
678 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | 699 | struct bdi_writeback *dst = &default_backing_dev_info.wb; |
679 | 700 | ||
680 | spin_lock(&inode_wb_list_lock); | 701 | bdi_lock_two(&bdi->wb, dst); |
681 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | 702 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); |
682 | list_splice(&bdi->wb.b_io, &dst->b_io); | 703 | list_splice(&bdi->wb.b_io, &dst->b_io); |
683 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | 704 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); |
684 | spin_unlock(&inode_wb_list_lock); | 705 | spin_unlock(&bdi->wb.list_lock); |
706 | spin_unlock(&dst->list_lock); | ||
685 | } | 707 | } |
686 | 708 | ||
687 | bdi_unregister(bdi); | 709 | bdi_unregister(bdi); |
688 | 710 | ||
689 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 711 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
690 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 712 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
691 | 713 | ||
692 | prop_local_destroy_percpu(&bdi->completions); | 714 | prop_local_destroy_percpu(&bdi->completions); |
693 | } | 715 | } |
694 | EXPORT_SYMBOL(bdi_destroy); | 716 | EXPORT_SYMBOL(bdi_destroy); |
695 | 717 | ||
696 | /* | 718 | /* |
697 | * For use from filesystems to quickly init and register a bdi associated | 719 | * For use from filesystems to quickly init and register a bdi associated |
698 | * with dirty writeback | 720 | * with dirty writeback |
699 | */ | 721 | */ |
700 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, | 722 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, |
701 | unsigned int cap) | 723 | unsigned int cap) |
702 | { | 724 | { |
703 | char tmp[32]; | 725 | char tmp[32]; |
704 | int err; | 726 | int err; |
705 | 727 | ||
706 | bdi->name = name; | 728 | bdi->name = name; |
707 | bdi->capabilities = cap; | 729 | bdi->capabilities = cap; |
708 | err = bdi_init(bdi); | 730 | err = bdi_init(bdi); |
709 | if (err) | 731 | if (err) |
710 | return err; | 732 | return err; |
711 | 733 | ||
712 | sprintf(tmp, "%.28s%s", name, "-%d"); | 734 | sprintf(tmp, "%.28s%s", name, "-%d"); |
713 | err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); | 735 | err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); |
714 | if (err) { | 736 | if (err) { |
715 | bdi_destroy(bdi); | 737 | bdi_destroy(bdi); |
716 | return err; | 738 | return err; |
717 | } | 739 | } |
718 | 740 | ||
719 | return 0; | 741 | return 0; |
720 | } | 742 | } |
721 | EXPORT_SYMBOL(bdi_setup_and_register); | 743 | EXPORT_SYMBOL(bdi_setup_and_register); |
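[Editor's note] A hypothetical filesystem-side sketch, modelled on existing callers of this helper ('myfs' and 'sbi' are invented names): each mount gets a private bdi so its dirty pages do not funnel through default_backing_dev_info:

	/* In myfs's fill_super (sketch): */
	err = bdi_setup_and_register(&sbi->bdi, "myfs", BDI_CAP_MAP_COPY);
	if (err)
		return err;
	sb->s_bdi = &sbi->bdi;	/* writeback now targets the private bdi */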
722 | 744 | ||
723 | static wait_queue_head_t congestion_wqh[2] = { | 745 | static wait_queue_head_t congestion_wqh[2] = { |
724 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), | 746 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), |
725 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) | 747 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) |
726 | }; | 748 | }; |
727 | static atomic_t nr_bdi_congested[2]; | 749 | static atomic_t nr_bdi_congested[2]; |
728 | 750 | ||
729 | void clear_bdi_congested(struct backing_dev_info *bdi, int sync) | 751 | void clear_bdi_congested(struct backing_dev_info *bdi, int sync) |
730 | { | 752 | { |
731 | enum bdi_state bit; | 753 | enum bdi_state bit; |
732 | wait_queue_head_t *wqh = &congestion_wqh[sync]; | 754 | wait_queue_head_t *wqh = &congestion_wqh[sync]; |
733 | 755 | ||
734 | bit = sync ? BDI_sync_congested : BDI_async_congested; | 756 | bit = sync ? BDI_sync_congested : BDI_async_congested; |
735 | if (test_and_clear_bit(bit, &bdi->state)) | 757 | if (test_and_clear_bit(bit, &bdi->state)) |
736 | atomic_dec(&nr_bdi_congested[sync]); | 758 | atomic_dec(&nr_bdi_congested[sync]); |
737 | smp_mb__after_clear_bit(); | 759 | smp_mb__after_clear_bit(); |
738 | if (waitqueue_active(wqh)) | 760 | if (waitqueue_active(wqh)) |
739 | wake_up(wqh); | 761 | wake_up(wqh); |
740 | } | 762 | } |
741 | EXPORT_SYMBOL(clear_bdi_congested); | 763 | EXPORT_SYMBOL(clear_bdi_congested); |
742 | 764 | ||
743 | void set_bdi_congested(struct backing_dev_info *bdi, int sync) | 765 | void set_bdi_congested(struct backing_dev_info *bdi, int sync) |
744 | { | 766 | { |
745 | enum bdi_state bit; | 767 | enum bdi_state bit; |
746 | 768 | ||
747 | bit = sync ? BDI_sync_congested : BDI_async_congested; | 769 | bit = sync ? BDI_sync_congested : BDI_async_congested; |
748 | if (!test_and_set_bit(bit, &bdi->state)) | 770 | if (!test_and_set_bit(bit, &bdi->state)) |
749 | atomic_inc(&nr_bdi_congested[sync]); | 771 | atomic_inc(&nr_bdi_congested[sync]); |
750 | } | 772 | } |
751 | EXPORT_SYMBOL(set_bdi_congested); | 773 | EXPORT_SYMBOL(set_bdi_congested); |
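[Editor's note] The set/clear pair is how lower layers publish congestion to writers and reclaim; a sketch of the usual pattern (the request-queue variable 'q' is an assumption):

	/* Mark the async write queue congested while the device is saturated... */
	set_bdi_congested(&q->backing_dev_info, BLK_RW_ASYNC);
	/* ...and clear it on completion, waking congestion_wait() sleepers. */
	clear_bdi_congested(&q->backing_dev_info, BLK_RW_ASYNC);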
752 | 774 | ||
753 | /** | 775 | /** |
754 | * congestion_wait - wait for a backing_dev to become uncongested | 776 | * congestion_wait - wait for a backing_dev to become uncongested |
755 | * @sync: SYNC or ASYNC IO | 777 | * @sync: SYNC or ASYNC IO |
756 | * @timeout: timeout in jiffies | 778 | * @timeout: timeout in jiffies |
757 | * | 779 | * |
758 | * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit | 780 | * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit |
759 | * write congestion. If no backing_devs are congested then just wait for the | 781 | * write congestion. If no backing_devs are congested then just wait for the |
760 | * next write to be completed. | 782 | * next write to be completed. |
761 | */ | 783 | */ |
762 | long congestion_wait(int sync, long timeout) | 784 | long congestion_wait(int sync, long timeout) |
763 | { | 785 | { |
764 | long ret; | 786 | long ret; |
765 | unsigned long start = jiffies; | 787 | unsigned long start = jiffies; |
766 | DEFINE_WAIT(wait); | 788 | DEFINE_WAIT(wait); |
767 | wait_queue_head_t *wqh = &congestion_wqh[sync]; | 789 | wait_queue_head_t *wqh = &congestion_wqh[sync]; |
768 | 790 | ||
769 | prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); | 791 | prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); |
770 | ret = io_schedule_timeout(timeout); | 792 | ret = io_schedule_timeout(timeout); |
771 | finish_wait(wqh, &wait); | 793 | finish_wait(wqh, &wait); |
772 | 794 | ||
773 | trace_writeback_congestion_wait(jiffies_to_usecs(timeout), | 795 | trace_writeback_congestion_wait(jiffies_to_usecs(timeout), |
774 | jiffies_to_usecs(jiffies - start)); | 796 | jiffies_to_usecs(jiffies - start)); |
775 | 797 | ||
776 | return ret; | 798 | return ret; |
777 | } | 799 | } |
778 | EXPORT_SYMBOL(congestion_wait); | 800 | EXPORT_SYMBOL(congestion_wait); |
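[Editor's note] Typical use is a short, bounded back-off when every writable device is busy; the common call in reclaim and filesystem paths of this era looks like:

	/* Sleep ~20 ms, or until some bdi exits async congestion or a write completes */
	congestion_wait(BLK_RW_ASYNC, HZ/50);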
779 | 801 | ||
780 | /** | 802 | /** |
781 | * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes | 803 | * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes |
782 | * @zone: A zone to check if it is heavily congested | 804 | * @zone: A zone to check if it is heavily congested |
783 | * @sync: SYNC or ASYNC IO | 805 | * @sync: SYNC or ASYNC IO |
784 | * @timeout: timeout in jiffies | 806 | * @timeout: timeout in jiffies |
785 | * | 807 | * |
786 | * In the event of a congested backing_dev (any backing_dev) and the given | 808 | * In the event of a congested backing_dev (any backing_dev) and the given |
787 | * @zone has experienced recent congestion, this waits for up to @timeout | 809 | * @zone has experienced recent congestion, this waits for up to @timeout |
788 | * jiffies for either a BDI to exit congestion of the given @sync queue | 810 | * jiffies for either a BDI to exit congestion of the given @sync queue |
789 | * or a write to complete. | 811 | * or a write to complete. |
790 | * | 812 | * |
791 | * In the absence of zone congestion, cond_resched() is called to yield | 813 | * In the absence of zone congestion, cond_resched() is called to yield |
792 | * the processor if necessary but otherwise does not sleep. | 814 | * the processor if necessary but otherwise does not sleep. |
793 | * | 815 | * |
794 | * The return value is 0 if the sleep is for the full timeout. Otherwise, | 816 | * The return value is 0 if the sleep is for the full timeout. Otherwise, |
795 | * it is the number of jiffies that were still remaining when the function | 817 | * it is the number of jiffies that were still remaining when the function |
796 | * returned. return_value == timeout implies the function did not sleep. | 818 | * returned. return_value == timeout implies the function did not sleep. |
797 | */ | 819 | */ |
798 | long wait_iff_congested(struct zone *zone, int sync, long timeout) | 820 | long wait_iff_congested(struct zone *zone, int sync, long timeout) |
799 | { | 821 | { |
800 | long ret; | 822 | long ret; |
801 | unsigned long start = jiffies; | 823 | unsigned long start = jiffies; |
802 | DEFINE_WAIT(wait); | 824 | DEFINE_WAIT(wait); |
803 | wait_queue_head_t *wqh = &congestion_wqh[sync]; | 825 | wait_queue_head_t *wqh = &congestion_wqh[sync]; |
804 | 826 | ||
805 | /* | 827 | /* |
806 | * If there is no congestion, or heavy congestion is not being | 828 | * If there is no congestion, or heavy congestion is not being |
807 | * encountered in the current zone, yield if necessary instead | 829 | * encountered in the current zone, yield if necessary instead |
808 | * of sleeping on the congestion queue | 830 | * of sleeping on the congestion queue |
809 | */ | 831 | */ |
810 | if (atomic_read(&nr_bdi_congested[sync]) == 0 || | 832 | if (atomic_read(&nr_bdi_congested[sync]) == 0 || |
811 | !zone_is_reclaim_congested(zone)) { | 833 | !zone_is_reclaim_congested(zone)) { |
812 | cond_resched(); | 834 | cond_resched(); |
813 | 835 | ||
814 | /* In case we scheduled, work out time remaining */ | 836 | /* In case we scheduled, work out time remaining */ |
815 | ret = timeout - (jiffies - start); | 837 | ret = timeout - (jiffies - start); |
816 | if (ret < 0) | 838 | if (ret < 0) |
817 | ret = 0; | 839 | ret = 0; |
818 | 840 | ||
819 | goto out; | 841 | goto out; |
820 | } | 842 | } |
821 | 843 | ||
822 | /* Sleep until uncongested or a write happens */ | 844 | /* Sleep until uncongested or a write happens */ |
mm/filemap.c
1 | /* | 1 | /* |
2 | * linux/mm/filemap.c | 2 | * linux/mm/filemap.c |
3 | * | 3 | * |
4 | * Copyright (C) 1994-1999 Linus Torvalds | 4 | * Copyright (C) 1994-1999 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | /* | 7 | /* |
8 | * This file handles the generic file mmap semantics used by | 8 | * This file handles the generic file mmap semantics used by |
9 | * most "normal" filesystems (but you don't /have/ to use this: | 9 | * most "normal" filesystems (but you don't /have/ to use this: |
10 | * the NFS filesystem used to do this differently, for example) | 10 | * the NFS filesystem used to do this differently, for example) |
11 | */ | 11 | */ |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/compiler.h> | 13 | #include <linux/compiler.h> |
14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
15 | #include <linux/uaccess.h> | 15 | #include <linux/uaccess.h> |
16 | #include <linux/aio.h> | 16 | #include <linux/aio.h> |
17 | #include <linux/capability.h> | 17 | #include <linux/capability.h> |
18 | #include <linux/kernel_stat.h> | 18 | #include <linux/kernel_stat.h> |
19 | #include <linux/gfp.h> | 19 | #include <linux/gfp.h> |
20 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
21 | #include <linux/swap.h> | 21 | #include <linux/swap.h> |
22 | #include <linux/mman.h> | 22 | #include <linux/mman.h> |
23 | #include <linux/pagemap.h> | 23 | #include <linux/pagemap.h> |
24 | #include <linux/file.h> | 24 | #include <linux/file.h> |
25 | #include <linux/uio.h> | 25 | #include <linux/uio.h> |
26 | #include <linux/hash.h> | 26 | #include <linux/hash.h> |
27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/backing-dev.h> | 28 | #include <linux/backing-dev.h> |
29 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
30 | #include <linux/blkdev.h> | 30 | #include <linux/blkdev.h> |
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
32 | #include <linux/syscalls.h> | 32 | #include <linux/syscalls.h> |
33 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
35 | #include <linux/memcontrol.h> | 35 | #include <linux/memcontrol.h> |
36 | #include <linux/mm_inline.h> /* for page_is_file_cache() */ | 36 | #include <linux/mm_inline.h> /* for page_is_file_cache() */ |
37 | #include <linux/cleancache.h> | 37 | #include <linux/cleancache.h> |
38 | #include "internal.h" | 38 | #include "internal.h" |
39 | 39 | ||
40 | /* | 40 | /* |
41 | * FIXME: remove all knowledge of the buffer layer from the core VM | 41 | * FIXME: remove all knowledge of the buffer layer from the core VM |
42 | */ | 42 | */ |
43 | #include <linux/buffer_head.h> /* for try_to_free_buffers */ | 43 | #include <linux/buffer_head.h> /* for try_to_free_buffers */ |
44 | 44 | ||
45 | #include <asm/mman.h> | 45 | #include <asm/mman.h> |
46 | 46 | ||
47 | /* | 47 | /* |
48 | * Shared mappings implemented 30.11.1994. It's not fully working yet, | 48 | * Shared mappings implemented 30.11.1994. It's not fully working yet, |
49 | * though. | 49 | * though. |
50 | * | 50 | * |
51 | * Shared mappings now work. 15.8.1995 Bruno. | 51 | * Shared mappings now work. 15.8.1995 Bruno. |
52 | * | 52 | * |
53 | * finished 'unifying' the page and buffer cache and SMP-threaded the | 53 | * finished 'unifying' the page and buffer cache and SMP-threaded the |
54 | * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> | 54 | * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> |
55 | * | 55 | * |
56 | * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> | 56 | * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> |
57 | */ | 57 | */ |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * Lock ordering: | 60 | * Lock ordering: |
61 | * | 61 | * |
62 | * ->i_mmap_mutex (truncate_pagecache) | 62 | * ->i_mmap_mutex (truncate_pagecache) |
63 | * ->private_lock (__free_pte->__set_page_dirty_buffers) | 63 | * ->private_lock (__free_pte->__set_page_dirty_buffers) |
64 | * ->swap_lock (exclusive_swap_page, others) | 64 | * ->swap_lock (exclusive_swap_page, others) |
65 | * ->mapping->tree_lock | 65 | * ->mapping->tree_lock |
66 | * | 66 | * |
67 | * ->i_mutex | 67 | * ->i_mutex |
68 | * ->i_mmap_mutex (truncate->unmap_mapping_range) | 68 | * ->i_mmap_mutex (truncate->unmap_mapping_range) |
69 | * | 69 | * |
70 | * ->mmap_sem | 70 | * ->mmap_sem |
71 | * ->i_mmap_mutex | 71 | * ->i_mmap_mutex |
72 | * ->page_table_lock or pte_lock (various, mainly in memory.c) | 72 | * ->page_table_lock or pte_lock (various, mainly in memory.c) |
73 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) | 73 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) |
74 | * | 74 | * |
75 | * ->mmap_sem | 75 | * ->mmap_sem |
76 | * ->lock_page (access_process_vm) | 76 | * ->lock_page (access_process_vm) |
77 | * | 77 | * |
78 | * ->i_mutex (generic_file_buffered_write) | 78 | * ->i_mutex (generic_file_buffered_write) |
79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) | 79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) |
80 | * | 80 | * |
81 | * inode_wb_list_lock | 81 | * bdi->wb.list_lock |
82 | * sb_lock (fs/fs-writeback.c) | 82 | * sb_lock (fs/fs-writeback.c) |
83 | * ->mapping->tree_lock (__sync_single_inode) | 83 | * ->mapping->tree_lock (__sync_single_inode) |
84 | * | 84 | * |
85 | * ->i_mmap_mutex | 85 | * ->i_mmap_mutex |
86 | * ->anon_vma.lock (vma_adjust) | 86 | * ->anon_vma.lock (vma_adjust) |
87 | * | 87 | * |
88 | * ->anon_vma.lock | 88 | * ->anon_vma.lock |
89 | * ->page_table_lock or pte_lock (anon_vma_prepare and various) | 89 | * ->page_table_lock or pte_lock (anon_vma_prepare and various) |
90 | * | 90 | * |
91 | * ->page_table_lock or pte_lock | 91 | * ->page_table_lock or pte_lock |
92 | * ->swap_lock (try_to_unmap_one) | 92 | * ->swap_lock (try_to_unmap_one) |
93 | * ->private_lock (try_to_unmap_one) | 93 | * ->private_lock (try_to_unmap_one) |
94 | * ->tree_lock (try_to_unmap_one) | 94 | * ->tree_lock (try_to_unmap_one) |
95 | * ->zone.lru_lock (follow_page->mark_page_accessed) | 95 | * ->zone.lru_lock (follow_page->mark_page_accessed) |
96 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) | 96 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) |
97 | * ->private_lock (page_remove_rmap->set_page_dirty) | 97 | * ->private_lock (page_remove_rmap->set_page_dirty) |
98 | * ->tree_lock (page_remove_rmap->set_page_dirty) | 98 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
99 | * inode_wb_list_lock (page_remove_rmap->set_page_dirty) | 99 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) |
100 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) | 100 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) |
101 | * inode_wb_list_lock (zap_pte_range->set_page_dirty) | 101 | * bdi.wb->list_lock (zap_pte_range->set_page_dirty) |
102 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 102 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
104 | * | 104 | * |
105 | * (code doesn't rely on that order, so you could switch it around) | 105 | * (code doesn't rely on that order, so you could switch it around) |
106 | * ->tasklist_lock (memory_failure, collect_procs_ao) | 106 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
107 | * ->i_mmap_mutex | 107 | * ->i_mmap_mutex |
108 | */ | 108 | */ |
109 | 109 | ||
110 | /* | 110 | /* |
111 | * Delete a page from the page cache and free it. Caller has to make | 111 | * Delete a page from the page cache and free it. Caller has to make |
112 | * sure the page is locked and that nobody else uses it - or that usage | 112 | * sure the page is locked and that nobody else uses it - or that usage |
113 | * is safe. The caller must hold the mapping's tree_lock. | 113 | * is safe. The caller must hold the mapping's tree_lock. |
114 | */ | 114 | */ |
115 | void __delete_from_page_cache(struct page *page) | 115 | void __delete_from_page_cache(struct page *page) |
116 | { | 116 | { |
117 | struct address_space *mapping = page->mapping; | 117 | struct address_space *mapping = page->mapping; |
118 | 118 | ||
119 | /* | 119 | /* |
120 | * if we're uptodate, flush out into the cleancache, otherwise | 120 | * if we're uptodate, flush out into the cleancache, otherwise |
121 | * invalidate any existing cleancache entries. We can't leave | 121 | * invalidate any existing cleancache entries. We can't leave |
122 | * stale data around in the cleancache once our page is gone | 122 | * stale data around in the cleancache once our page is gone |
123 | */ | 123 | */ |
124 | if (PageUptodate(page) && PageMappedToDisk(page)) | 124 | if (PageUptodate(page) && PageMappedToDisk(page)) |
125 | cleancache_put_page(page); | 125 | cleancache_put_page(page); |
126 | else | 126 | else |
127 | cleancache_flush_page(mapping, page); | 127 | cleancache_flush_page(mapping, page); |
128 | 128 | ||
129 | radix_tree_delete(&mapping->page_tree, page->index); | 129 | radix_tree_delete(&mapping->page_tree, page->index); |
130 | page->mapping = NULL; | 130 | page->mapping = NULL; |
131 | /* Leave page->index set: truncation lookup relies upon it */ | 131 | /* Leave page->index set: truncation lookup relies upon it */ |
132 | mapping->nrpages--; | 132 | mapping->nrpages--; |
133 | __dec_zone_page_state(page, NR_FILE_PAGES); | 133 | __dec_zone_page_state(page, NR_FILE_PAGES); |
134 | if (PageSwapBacked(page)) | 134 | if (PageSwapBacked(page)) |
135 | __dec_zone_page_state(page, NR_SHMEM); | 135 | __dec_zone_page_state(page, NR_SHMEM); |
136 | BUG_ON(page_mapped(page)); | 136 | BUG_ON(page_mapped(page)); |
137 | 137 | ||
138 | /* | 138 | /* |
139 | * Some filesystems seem to re-dirty the page even after | 139 | * Some filesystems seem to re-dirty the page even after |
140 | * the VM has canceled the dirty bit (eg ext3 journaling). | 140 | * the VM has canceled the dirty bit (eg ext3 journaling). |
141 | * | 141 | * |
142 | * Fix it up by doing a final dirty accounting check after | 142 | * Fix it up by doing a final dirty accounting check after |
143 | * having removed the page entirely. | 143 | * having removed the page entirely. |
144 | */ | 144 | */ |
145 | if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { | 145 | if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { |
146 | dec_zone_page_state(page, NR_FILE_DIRTY); | 146 | dec_zone_page_state(page, NR_FILE_DIRTY); |
147 | dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 147 | dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); |
148 | } | 148 | } |
149 | } | 149 | } |
150 | 150 | ||
151 | /** | 151 | /** |
152 | * delete_from_page_cache - delete page from page cache | 152 | * delete_from_page_cache - delete page from page cache |
153 | * @page: the page which the kernel is trying to remove from page cache | 153 | * @page: the page which the kernel is trying to remove from page cache |
154 | * | 154 | * |
155 | * This must be called only on pages that have been verified to be in the page | 155 | * This must be called only on pages that have been verified to be in the page |
156 | * cache and locked. It will never put the page into the free list; the caller | 156 | * cache and locked. It will never put the page into the free list; the caller |
157 | * has a reference on the page. | 157 | * has a reference on the page. |
158 | */ | 158 | */ |
159 | void delete_from_page_cache(struct page *page) | 159 | void delete_from_page_cache(struct page *page) |
160 | { | 160 | { |
161 | struct address_space *mapping = page->mapping; | 161 | struct address_space *mapping = page->mapping; |
162 | void (*freepage)(struct page *); | 162 | void (*freepage)(struct page *); |
163 | 163 | ||
164 | BUG_ON(!PageLocked(page)); | 164 | BUG_ON(!PageLocked(page)); |
165 | 165 | ||
166 | freepage = mapping->a_ops->freepage; | 166 | freepage = mapping->a_ops->freepage; |
167 | spin_lock_irq(&mapping->tree_lock); | 167 | spin_lock_irq(&mapping->tree_lock); |
168 | __delete_from_page_cache(page); | 168 | __delete_from_page_cache(page); |
169 | spin_unlock_irq(&mapping->tree_lock); | 169 | spin_unlock_irq(&mapping->tree_lock); |
170 | mem_cgroup_uncharge_cache_page(page); | 170 | mem_cgroup_uncharge_cache_page(page); |
171 | 171 | ||
172 | if (freepage) | 172 | if (freepage) |
173 | freepage(page); | 173 | freepage(page); |
174 | page_cache_release(page); | 174 | page_cache_release(page); |
175 | } | 175 | } |
176 | EXPORT_SYMBOL(delete_from_page_cache); | 176 | EXPORT_SYMBOL(delete_from_page_cache); |
177 | 177 | ||
178 | static int sleep_on_page(void *word) | 178 | static int sleep_on_page(void *word) |
179 | { | 179 | { |
180 | io_schedule(); | 180 | io_schedule(); |
181 | return 0; | 181 | return 0; |
182 | } | 182 | } |
183 | 183 | ||
184 | static int sleep_on_page_killable(void *word) | 184 | static int sleep_on_page_killable(void *word) |
185 | { | 185 | { |
186 | sleep_on_page(word); | 186 | sleep_on_page(word); |
187 | return fatal_signal_pending(current) ? -EINTR : 0; | 187 | return fatal_signal_pending(current) ? -EINTR : 0; |
188 | } | 188 | } |
189 | 189 | ||
190 | /** | 190 | /** |
191 | * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range | 191 | * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range |
192 | * @mapping: address space structure to write | 192 | * @mapping: address space structure to write |
193 | * @start: offset in bytes where the range starts | 193 | * @start: offset in bytes where the range starts |
194 | * @end: offset in bytes where the range ends (inclusive) | 194 | * @end: offset in bytes where the range ends (inclusive) |
195 | * @sync_mode: enable synchronous operation | 195 | * @sync_mode: enable synchronous operation |
196 | * | 196 | * |
197 | * Start writeback against all of a mapping's dirty pages that lie | 197 | * Start writeback against all of a mapping's dirty pages that lie |
198 | * within the byte offsets <start, end> inclusive. | 198 | * within the byte offsets <start, end> inclusive. |
199 | * | 199 | * |
200 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as | 200 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as |
201 | * opposed to a regular memory cleansing writeback. The difference between | 201 | * opposed to a regular memory cleansing writeback. The difference between |
202 | * these two operations is that if a dirty page/buffer is encountered, it must | 202 | * these two operations is that if a dirty page/buffer is encountered, it must |
203 | * be waited upon, and not just skipped over. | 203 | * be waited upon, and not just skipped over. |
204 | */ | 204 | */ |
205 | int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | 205 | int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, |
206 | loff_t end, int sync_mode) | 206 | loff_t end, int sync_mode) |
207 | { | 207 | { |
208 | int ret; | 208 | int ret; |
209 | struct writeback_control wbc = { | 209 | struct writeback_control wbc = { |
210 | .sync_mode = sync_mode, | 210 | .sync_mode = sync_mode, |
211 | .nr_to_write = LONG_MAX, | 211 | .nr_to_write = LONG_MAX, |
212 | .range_start = start, | 212 | .range_start = start, |
213 | .range_end = end, | 213 | .range_end = end, |
214 | }; | 214 | }; |
215 | 215 | ||
216 | if (!mapping_cap_writeback_dirty(mapping)) | 216 | if (!mapping_cap_writeback_dirty(mapping)) |
217 | return 0; | 217 | return 0; |
218 | 218 | ||
219 | ret = do_writepages(mapping, &wbc); | 219 | ret = do_writepages(mapping, &wbc); |
220 | return ret; | 220 | return ret; |
221 | } | 221 | } |
222 | 222 | ||
223 | static inline int __filemap_fdatawrite(struct address_space *mapping, | 223 | static inline int __filemap_fdatawrite(struct address_space *mapping, |
224 | int sync_mode) | 224 | int sync_mode) |
225 | { | 225 | { |
226 | return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); | 226 | return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); |
227 | } | 227 | } |
228 | 228 | ||
229 | int filemap_fdatawrite(struct address_space *mapping) | 229 | int filemap_fdatawrite(struct address_space *mapping) |
230 | { | 230 | { |
231 | return __filemap_fdatawrite(mapping, WB_SYNC_ALL); | 231 | return __filemap_fdatawrite(mapping, WB_SYNC_ALL); |
232 | } | 232 | } |
233 | EXPORT_SYMBOL(filemap_fdatawrite); | 233 | EXPORT_SYMBOL(filemap_fdatawrite); |
234 | 234 | ||
235 | int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | 235 | int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, |
236 | loff_t end) | 236 | loff_t end) |
237 | { | 237 | { |
238 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); | 238 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); |
239 | } | 239 | } |
240 | EXPORT_SYMBOL(filemap_fdatawrite_range); | 240 | EXPORT_SYMBOL(filemap_fdatawrite_range); |
241 | 241 | ||
242 | /** | 242 | /** |
243 | * filemap_flush - mostly a non-blocking flush | 243 | * filemap_flush - mostly a non-blocking flush |
244 | * @mapping: target address_space | 244 | * @mapping: target address_space |
245 | * | 245 | * |
246 | * This is a mostly non-blocking flush. Not suitable for data-integrity | 246 | * This is a mostly non-blocking flush. Not suitable for data-integrity |
247 | * purposes - I/O may not be started against all dirty pages. | 247 | * purposes - I/O may not be started against all dirty pages. |
248 | */ | 248 | */ |
249 | int filemap_flush(struct address_space *mapping) | 249 | int filemap_flush(struct address_space *mapping) |
250 | { | 250 | { |
251 | return __filemap_fdatawrite(mapping, WB_SYNC_NONE); | 251 | return __filemap_fdatawrite(mapping, WB_SYNC_NONE); |
252 | } | 252 | } |
253 | EXPORT_SYMBOL(filemap_flush); | 253 | EXPORT_SYMBOL(filemap_flush); |
254 | 254 | ||
255 | /** | 255 | /** |
256 | * filemap_fdatawait_range - wait for writeback to complete | 256 | * filemap_fdatawait_range - wait for writeback to complete |
257 | * @mapping: address space structure to wait for | 257 | * @mapping: address space structure to wait for |
258 | * @start_byte: offset in bytes where the range starts | 258 | * @start_byte: offset in bytes where the range starts |
259 | * @end_byte: offset in bytes where the range ends (inclusive) | 259 | * @end_byte: offset in bytes where the range ends (inclusive) |
260 | * | 260 | * |
261 | * Walk the list of under-writeback pages of the given address space | 261 | * Walk the list of under-writeback pages of the given address space |
262 | * in the given range and wait for all of them. | 262 | * in the given range and wait for all of them. |
263 | */ | 263 | */ |
264 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, | 264 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, |
265 | loff_t end_byte) | 265 | loff_t end_byte) |
266 | { | 266 | { |
267 | pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; | 267 | pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; |
268 | pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; | 268 | pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; |
269 | struct pagevec pvec; | 269 | struct pagevec pvec; |
270 | int nr_pages; | 270 | int nr_pages; |
271 | int ret = 0; | 271 | int ret = 0; |
272 | 272 | ||
273 | if (end_byte < start_byte) | 273 | if (end_byte < start_byte) |
274 | return 0; | 274 | return 0; |
275 | 275 | ||
276 | pagevec_init(&pvec, 0); | 276 | pagevec_init(&pvec, 0); |
277 | while ((index <= end) && | 277 | while ((index <= end) && |
278 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 278 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
279 | PAGECACHE_TAG_WRITEBACK, | 279 | PAGECACHE_TAG_WRITEBACK, |
280 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { | 280 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { |
281 | unsigned i; | 281 | unsigned i; |
282 | 282 | ||
283 | for (i = 0; i < nr_pages; i++) { | 283 | for (i = 0; i < nr_pages; i++) { |
284 | struct page *page = pvec.pages[i]; | 284 | struct page *page = pvec.pages[i]; |
285 | 285 | ||
286 | /* until radix tree lookup accepts end_index */ | 286 | /* until radix tree lookup accepts end_index */ |
287 | if (page->index > end) | 287 | if (page->index > end) |
288 | continue; | 288 | continue; |
289 | 289 | ||
290 | wait_on_page_writeback(page); | 290 | wait_on_page_writeback(page); |
291 | if (TestClearPageError(page)) | 291 | if (TestClearPageError(page)) |
292 | ret = -EIO; | 292 | ret = -EIO; |
293 | } | 293 | } |
294 | pagevec_release(&pvec); | 294 | pagevec_release(&pvec); |
295 | cond_resched(); | 295 | cond_resched(); |
296 | } | 296 | } |
297 | 297 | ||
298 | /* Check for outstanding write errors */ | 298 | /* Check for outstanding write errors */ |
299 | if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | 299 | if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) |
300 | ret = -ENOSPC; | 300 | ret = -ENOSPC; |
301 | if (test_and_clear_bit(AS_EIO, &mapping->flags)) | 301 | if (test_and_clear_bit(AS_EIO, &mapping->flags)) |
302 | ret = -EIO; | 302 | ret = -EIO; |
303 | 303 | ||
304 | return ret; | 304 | return ret; |
305 | } | 305 | } |
306 | EXPORT_SYMBOL(filemap_fdatawait_range); | 306 | EXPORT_SYMBOL(filemap_fdatawait_range); |
307 | 307 | ||
308 | /** | 308 | /** |
309 | * filemap_fdatawait - wait for all under-writeback pages to complete | 309 | * filemap_fdatawait - wait for all under-writeback pages to complete |
310 | * @mapping: address space structure to wait for | 310 | * @mapping: address space structure to wait for |
311 | * | 311 | * |
312 | * Walk the list of under-writeback pages of the given address space | 312 | * Walk the list of under-writeback pages of the given address space |
313 | * and wait for all of them. | 313 | * and wait for all of them. |
314 | */ | 314 | */ |
315 | int filemap_fdatawait(struct address_space *mapping) | 315 | int filemap_fdatawait(struct address_space *mapping) |
316 | { | 316 | { |
317 | loff_t i_size = i_size_read(mapping->host); | 317 | loff_t i_size = i_size_read(mapping->host); |
318 | 318 | ||
319 | if (i_size == 0) | 319 | if (i_size == 0) |
320 | return 0; | 320 | return 0; |
321 | 321 | ||
322 | return filemap_fdatawait_range(mapping, 0, i_size - 1); | 322 | return filemap_fdatawait_range(mapping, 0, i_size - 1); |
323 | } | 323 | } |
324 | EXPORT_SYMBOL(filemap_fdatawait); | 324 | EXPORT_SYMBOL(filemap_fdatawait); |
325 | 325 | ||
326 | int filemap_write_and_wait(struct address_space *mapping) | 326 | int filemap_write_and_wait(struct address_space *mapping) |
327 | { | 327 | { |
328 | int err = 0; | 328 | int err = 0; |
329 | 329 | ||
330 | if (mapping->nrpages) { | 330 | if (mapping->nrpages) { |
331 | err = filemap_fdatawrite(mapping); | 331 | err = filemap_fdatawrite(mapping); |
332 | /* | 332 | /* |
333 | * Even if the above returned an error, the pages may be | 333 | * Even if the above returned an error, the pages may be |
334 | * written partially (e.g. -ENOSPC), so we wait for it. | 334 | * written partially (e.g. -ENOSPC), so we wait for it. |
335 | * But -EIO is a special case; it may indicate the worst | 335 | * But -EIO is a special case; it may indicate the worst |
336 | * thing (e.g. a bug) happened, so we avoid waiting for it. | 336 | * thing (e.g. a bug) happened, so we avoid waiting for it. |
337 | */ | 337 | */ |
338 | if (err != -EIO) { | 338 | if (err != -EIO) { |
339 | int err2 = filemap_fdatawait(mapping); | 339 | int err2 = filemap_fdatawait(mapping); |
340 | if (!err) | 340 | if (!err) |
341 | err = err2; | 341 | err = err2; |
342 | } | 342 | } |
343 | } | 343 | } |
344 | return err; | 344 | return err; |
345 | } | 345 | } |
346 | EXPORT_SYMBOL(filemap_write_and_wait); | 346 | EXPORT_SYMBOL(filemap_write_and_wait); |
347 | 347 | ||
348 | /** | 348 | /** |
349 | * filemap_write_and_wait_range - write out & wait on a file range | 349 | * filemap_write_and_wait_range - write out & wait on a file range |
350 | * @mapping: the address_space for the pages | 350 | * @mapping: the address_space for the pages |
351 | * @lstart: offset in bytes where the range starts | 351 | * @lstart: offset in bytes where the range starts |
352 | * @lend: offset in bytes where the range ends (inclusive) | 352 | * @lend: offset in bytes where the range ends (inclusive) |
353 | * | 353 | * |
354 | * Write out and wait upon file offsets lstart->lend, inclusive. | 354 | * Write out and wait upon file offsets lstart->lend, inclusive. |
355 | * | 355 | * |
356 | * Note that `lend' is inclusive (describes the last byte to be written) so | 356 | * Note that `lend' is inclusive (describes the last byte to be written) so |
357 | * that this function can be used to write to the very end-of-file (end = -1). | 357 | * that this function can be used to write to the very end-of-file (end = -1). |
358 | */ | 358 | */ |
359 | int filemap_write_and_wait_range(struct address_space *mapping, | 359 | int filemap_write_and_wait_range(struct address_space *mapping, |
360 | loff_t lstart, loff_t lend) | 360 | loff_t lstart, loff_t lend) |
361 | { | 361 | { |
362 | int err = 0; | 362 | int err = 0; |
363 | 363 | ||
364 | if (mapping->nrpages) { | 364 | if (mapping->nrpages) { |
365 | err = __filemap_fdatawrite_range(mapping, lstart, lend, | 365 | err = __filemap_fdatawrite_range(mapping, lstart, lend, |
366 | WB_SYNC_ALL); | 366 | WB_SYNC_ALL); |
367 | /* See comment of filemap_write_and_wait() */ | 367 | /* See comment of filemap_write_and_wait() */ |
368 | if (err != -EIO) { | 368 | if (err != -EIO) { |
369 | int err2 = filemap_fdatawait_range(mapping, | 369 | int err2 = filemap_fdatawait_range(mapping, |
370 | lstart, lend); | 370 | lstart, lend); |
371 | if (!err) | 371 | if (!err) |
372 | err = err2; | 372 | err = err2; |
373 | } | 373 | } |
374 | } | 374 | } |
375 | return err; | 375 | return err; |
376 | } | 376 | } |
377 | EXPORT_SYMBOL(filemap_write_and_wait_range); | 377 | EXPORT_SYMBOL(filemap_write_and_wait_range); |
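/*
 * [Editor's sketch, assumed caller] The inclusive @lend convention in
 * practice: syncing one 4096-byte block at byte offset 8192 passes
 * lend = 8192 + 4096 - 1, and per the kernel-doc above lend = -1 can
 * be used to reach end-of-file.
 */
static int example_sync_block(struct address_space *mapping)
{
	return filemap_write_and_wait_range(mapping, 8192, 8192 + 4096 - 1);
}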
378 | 378 | ||
379 | /** | 379 | /** |
380 | * replace_page_cache_page - replace a pagecache page with a new one | 380 | * replace_page_cache_page - replace a pagecache page with a new one |
381 | * @old: page to be replaced | 381 | * @old: page to be replaced |
382 | * @new: page to replace with | 382 | * @new: page to replace with |
383 | * @gfp_mask: allocation mode | 383 | * @gfp_mask: allocation mode |
384 | * | 384 | * |
385 | * This function replaces a page in the pagecache with a new one. On | 385 | * This function replaces a page in the pagecache with a new one. On |
386 | * success it acquires the pagecache reference for the new page and | 386 | * success it acquires the pagecache reference for the new page and |
387 | * drops it for the old page. Both the old and new pages must be | 387 | * drops it for the old page. Both the old and new pages must be |
388 | * locked. This function does not add the new page to the LRU, the | 388 | * locked. This function does not add the new page to the LRU, the |
389 | * caller must do that. | 389 | * caller must do that. |
390 | * | 390 | * |
391 | * The remove + add is atomic. The only way this function can fail is | 391 | * The remove + add is atomic. The only way this function can fail is |
392 | * memory allocation failure. | 392 | * memory allocation failure. |
393 | */ | 393 | */ |
394 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | 394 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) |
395 | { | 395 | { |
396 | int error; | 396 | int error; |
397 | struct mem_cgroup *memcg = NULL; | 397 | struct mem_cgroup *memcg = NULL; |
398 | 398 | ||
399 | VM_BUG_ON(!PageLocked(old)); | 399 | VM_BUG_ON(!PageLocked(old)); |
400 | VM_BUG_ON(!PageLocked(new)); | 400 | VM_BUG_ON(!PageLocked(new)); |
401 | VM_BUG_ON(new->mapping); | 401 | VM_BUG_ON(new->mapping); |
402 | 402 | ||
403 | /* | 403 | /* |
404 | * This is not page migration, but prepare_migration and | 404 | * This is not page migration, but prepare_migration and |
405 | * end_migration do enough work for charge replacement. | 405 | * end_migration do enough work for charge replacement. |
406 | * | 406 | * |
407 | * In the longer term we probably want a specialized function | 407 | * In the longer term we probably want a specialized function |
408 | * for moving the charge from old to new in a more efficient | 408 | * for moving the charge from old to new in a more efficient |
409 | * manner. | 409 | * manner. |
410 | */ | 410 | */ |
411 | error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); | 411 | error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); |
412 | if (error) | 412 | if (error) |
413 | return error; | 413 | return error; |
414 | 414 | ||
415 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 415 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); |
416 | if (!error) { | 416 | if (!error) { |
417 | struct address_space *mapping = old->mapping; | 417 | struct address_space *mapping = old->mapping; |
418 | void (*freepage)(struct page *); | 418 | void (*freepage)(struct page *); |
419 | 419 | ||
420 | pgoff_t offset = old->index; | 420 | pgoff_t offset = old->index; |
421 | freepage = mapping->a_ops->freepage; | 421 | freepage = mapping->a_ops->freepage; |
422 | 422 | ||
423 | page_cache_get(new); | 423 | page_cache_get(new); |
424 | new->mapping = mapping; | 424 | new->mapping = mapping; |
425 | new->index = offset; | 425 | new->index = offset; |
426 | 426 | ||
427 | spin_lock_irq(&mapping->tree_lock); | 427 | spin_lock_irq(&mapping->tree_lock); |
428 | __delete_from_page_cache(old); | 428 | __delete_from_page_cache(old); |
429 | error = radix_tree_insert(&mapping->page_tree, offset, new); | 429 | error = radix_tree_insert(&mapping->page_tree, offset, new); |
430 | BUG_ON(error); | 430 | BUG_ON(error); |
431 | mapping->nrpages++; | 431 | mapping->nrpages++; |
432 | __inc_zone_page_state(new, NR_FILE_PAGES); | 432 | __inc_zone_page_state(new, NR_FILE_PAGES); |
433 | if (PageSwapBacked(new)) | 433 | if (PageSwapBacked(new)) |
434 | __inc_zone_page_state(new, NR_SHMEM); | 434 | __inc_zone_page_state(new, NR_SHMEM); |
435 | spin_unlock_irq(&mapping->tree_lock); | 435 | spin_unlock_irq(&mapping->tree_lock); |
436 | radix_tree_preload_end(); | 436 | radix_tree_preload_end(); |
437 | if (freepage) | 437 | if (freepage) |
438 | freepage(old); | 438 | freepage(old); |
439 | page_cache_release(old); | 439 | page_cache_release(old); |
440 | mem_cgroup_end_migration(memcg, old, new, true); | 440 | mem_cgroup_end_migration(memcg, old, new, true); |
441 | } else { | 441 | } else { |
442 | mem_cgroup_end_migration(memcg, old, new, false); | 442 | mem_cgroup_end_migration(memcg, old, new, false); |
443 | } | 443 | } |
444 | 444 | ||
445 | return error; | 445 | return error; |
446 | } | 446 | } |
447 | EXPORT_SYMBOL_GPL(replace_page_cache_page); | 447 | EXPORT_SYMBOL_GPL(replace_page_cache_page); |
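/*
 * [Editor's sketch, assumed caller] Both pages must already be locked,
 * and on success the caller still owns LRU placement of @new, per the
 * kernel-doc above. lru_cache_add_file() assumes a file-backed page.
 */
static int example_swap_cache_page(struct page *old, struct page *new)
{
	int err = replace_page_cache_page(old, new, GFP_KERNEL);

	if (!err)
		lru_cache_add_file(new);	/* caller-side LRU insertion */
	return err;
}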
448 | 448 | ||
449 | /** | 449 | /** |
450 | * add_to_page_cache_locked - add a locked page to the pagecache | 450 | * add_to_page_cache_locked - add a locked page to the pagecache |
451 | * @page: page to add | 451 | * @page: page to add |
452 | * @mapping: the page's address_space | 452 | * @mapping: the page's address_space |
453 | * @offset: page index | 453 | * @offset: page index |
454 | * @gfp_mask: page allocation mode | 454 | * @gfp_mask: page allocation mode |
455 | * | 455 | * |
456 | * This function is used to add a page to the pagecache. It must be locked. | 456 | * This function is used to add a page to the pagecache. It must be locked. |
457 | * This function does not add the page to the LRU. The caller must do that. | 457 | * This function does not add the page to the LRU. The caller must do that. |
458 | */ | 458 | */ |
459 | int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | 459 | int add_to_page_cache_locked(struct page *page, struct address_space *mapping, |
460 | pgoff_t offset, gfp_t gfp_mask) | 460 | pgoff_t offset, gfp_t gfp_mask) |
461 | { | 461 | { |
462 | int error; | 462 | int error; |
463 | 463 | ||
464 | VM_BUG_ON(!PageLocked(page)); | 464 | VM_BUG_ON(!PageLocked(page)); |
465 | 465 | ||
466 | error = mem_cgroup_cache_charge(page, current->mm, | 466 | error = mem_cgroup_cache_charge(page, current->mm, |
467 | gfp_mask & GFP_RECLAIM_MASK); | 467 | gfp_mask & GFP_RECLAIM_MASK); |
468 | if (error) | 468 | if (error) |
469 | goto out; | 469 | goto out; |
470 | 470 | ||
471 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 471 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); |
472 | if (error == 0) { | 472 | if (error == 0) { |
473 | page_cache_get(page); | 473 | page_cache_get(page); |
474 | page->mapping = mapping; | 474 | page->mapping = mapping; |
475 | page->index = offset; | 475 | page->index = offset; |
476 | 476 | ||
477 | spin_lock_irq(&mapping->tree_lock); | 477 | spin_lock_irq(&mapping->tree_lock); |
478 | error = radix_tree_insert(&mapping->page_tree, offset, page); | 478 | error = radix_tree_insert(&mapping->page_tree, offset, page); |
479 | if (likely(!error)) { | 479 | if (likely(!error)) { |
480 | mapping->nrpages++; | 480 | mapping->nrpages++; |
481 | __inc_zone_page_state(page, NR_FILE_PAGES); | 481 | __inc_zone_page_state(page, NR_FILE_PAGES); |
482 | if (PageSwapBacked(page)) | 482 | if (PageSwapBacked(page)) |
483 | __inc_zone_page_state(page, NR_SHMEM); | 483 | __inc_zone_page_state(page, NR_SHMEM); |
484 | spin_unlock_irq(&mapping->tree_lock); | 484 | spin_unlock_irq(&mapping->tree_lock); |
485 | } else { | 485 | } else { |
486 | page->mapping = NULL; | 486 | page->mapping = NULL; |
487 | /* Leave page->index set: truncation relies upon it */ | 487 | /* Leave page->index set: truncation relies upon it */ |
488 | spin_unlock_irq(&mapping->tree_lock); | 488 | spin_unlock_irq(&mapping->tree_lock); |
489 | mem_cgroup_uncharge_cache_page(page); | 489 | mem_cgroup_uncharge_cache_page(page); |
490 | page_cache_release(page); | 490 | page_cache_release(page); |
491 | } | 491 | } |
492 | radix_tree_preload_end(); | 492 | radix_tree_preload_end(); |
493 | } else | 493 | } else |
494 | mem_cgroup_uncharge_cache_page(page); | 494 | mem_cgroup_uncharge_cache_page(page); |
495 | out: | 495 | out: |
496 | return error; | 496 | return error; |
497 | } | 497 | } |
498 | EXPORT_SYMBOL(add_to_page_cache_locked); | 498 | EXPORT_SYMBOL(add_to_page_cache_locked); |
499 | 499 | ||
500 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | 500 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, |
501 | pgoff_t offset, gfp_t gfp_mask) | 501 | pgoff_t offset, gfp_t gfp_mask) |
502 | { | 502 | { |
503 | int ret; | 503 | int ret; |
504 | 504 | ||
505 | /* | 505 | /* |
506 | * Splice_read and readahead add shmem/tmpfs pages into the page cache | 506 | * Splice_read and readahead add shmem/tmpfs pages into the page cache |
507 | * before shmem_readpage has a chance to mark them as SwapBacked: they | 507 | * before shmem_readpage has a chance to mark them as SwapBacked: they |
508 | * need to go on the anon lru below, and mem_cgroup_cache_charge | 508 | * need to go on the anon lru below, and mem_cgroup_cache_charge |
509 | * (called in add_to_page_cache) needs to know where they're going too. | 509 | * (called in add_to_page_cache) needs to know where they're going too. |
510 | */ | 510 | */ |
511 | if (mapping_cap_swap_backed(mapping)) | 511 | if (mapping_cap_swap_backed(mapping)) |
512 | SetPageSwapBacked(page); | 512 | SetPageSwapBacked(page); |
513 | 513 | ||
514 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); | 514 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); |
515 | if (ret == 0) { | 515 | if (ret == 0) { |
516 | if (page_is_file_cache(page)) | 516 | if (page_is_file_cache(page)) |
517 | lru_cache_add_file(page); | 517 | lru_cache_add_file(page); |
518 | else | 518 | else |
519 | lru_cache_add_anon(page); | 519 | lru_cache_add_anon(page); |
520 | } | 520 | } |
521 | return ret; | 521 | return ret; |
522 | } | 522 | } |
523 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); | 523 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); |
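/*
 * [Editor's sketch, assumed helper] The classic populate-and-read
 * pattern built on add_to_page_cache_lru(): allocate, insert into the
 * pagecache and LRU, then kick off ->readpage(), which unlocks the
 * page when the read completes.
 */
static struct page *example_populate_page(struct address_space *mapping,
					  pgoff_t index)
{
	struct page *page = page_cache_alloc_cold(mapping);

	if (!page)
		return NULL;
	if (add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
		/* Lost a race (-EEXIST) or failed; drop our reference. */
		page_cache_release(page);
		return NULL;
	}
	mapping->a_ops->readpage(NULL, page);	/* page is locked here */
	return page;
}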
524 | 524 | ||
525 | #ifdef CONFIG_NUMA | 525 | #ifdef CONFIG_NUMA |
526 | struct page *__page_cache_alloc(gfp_t gfp) | 526 | struct page *__page_cache_alloc(gfp_t gfp) |
527 | { | 527 | { |
528 | int n; | 528 | int n; |
529 | struct page *page; | 529 | struct page *page; |
530 | 530 | ||
531 | if (cpuset_do_page_mem_spread()) { | 531 | if (cpuset_do_page_mem_spread()) { |
532 | get_mems_allowed(); | 532 | get_mems_allowed(); |
533 | n = cpuset_mem_spread_node(); | 533 | n = cpuset_mem_spread_node(); |
534 | page = alloc_pages_exact_node(n, gfp, 0); | 534 | page = alloc_pages_exact_node(n, gfp, 0); |
535 | put_mems_allowed(); | 535 | put_mems_allowed(); |
536 | return page; | 536 | return page; |
537 | } | 537 | } |
538 | return alloc_pages(gfp, 0); | 538 | return alloc_pages(gfp, 0); |
539 | } | 539 | } |
540 | EXPORT_SYMBOL(__page_cache_alloc); | 540 | EXPORT_SYMBOL(__page_cache_alloc); |
541 | #endif | 541 | #endif |
542 | 542 | ||
543 | /* | 543 | /* |
544 | * In order to wait for pages to become available there must be | 544 | * In order to wait for pages to become available there must be |
545 | * waitqueues associated with pages. Rather than one waitqueue per | 545 | * waitqueues associated with pages. Rather than one waitqueue per |
546 | * page, we use a hash table of waitqueues: all waiters whose pages | 546 | * page, we use a hash table of waitqueues: all waiters whose pages |
547 | * hash to the same bucket share one queue, all of them are woken | 547 | * hash to the same bucket share one queue, all of them are woken |
548 | * when any of those pages becomes available, and each woken context | 548 | * when any of those pages becomes available, and each woken context |
549 | * re-checks that its own page actually became available. This saves | 549 | * re-checks that its own page actually became available. This saves |
550 | * space at the cost of "thundering herd" wakeups during rare hash | 550 | * space at the cost of "thundering herd" wakeups during rare hash |
551 | * collisions. | 551 | * collisions. |
552 | */ | 552 | */ |
553 | static wait_queue_head_t *page_waitqueue(struct page *page) | 553 | static wait_queue_head_t *page_waitqueue(struct page *page) |
554 | { | 554 | { |
555 | const struct zone *zone = page_zone(page); | 555 | const struct zone *zone = page_zone(page); |
556 | 556 | ||
557 | return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; | 557 | return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; |
558 | } | 558 | } |
559 | 559 | ||
560 | static inline void wake_up_page(struct page *page, int bit) | 560 | static inline void wake_up_page(struct page *page, int bit) |
561 | { | 561 | { |
562 | __wake_up_bit(page_waitqueue(page), &page->flags, bit); | 562 | __wake_up_bit(page_waitqueue(page), &page->flags, bit); |
563 | } | 563 | } |
564 | 564 | ||
565 | void wait_on_page_bit(struct page *page, int bit_nr) | 565 | void wait_on_page_bit(struct page *page, int bit_nr) |
566 | { | 566 | { |
567 | DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); | 567 | DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); |
568 | 568 | ||
569 | if (test_bit(bit_nr, &page->flags)) | 569 | if (test_bit(bit_nr, &page->flags)) |
570 | __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, | 570 | __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, |
571 | TASK_UNINTERRUPTIBLE); | 571 | TASK_UNINTERRUPTIBLE); |
572 | } | 572 | } |
573 | EXPORT_SYMBOL(wait_on_page_bit); | 573 | EXPORT_SYMBOL(wait_on_page_bit); |
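/*
 * [Editor's note] wait_on_page_locked() and wait_on_page_writeback() in
 * include/linux/pagemap.h are thin wrappers over this primitive; the
 * writeback one is essentially:
 */
static inline void example_wait_on_page_writeback(struct page *page)
{
	if (PageWriteback(page))
		wait_on_page_bit(page, PG_writeback);
}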
574 | 574 | ||
575 | int wait_on_page_bit_killable(struct page *page, int bit_nr) | 575 | int wait_on_page_bit_killable(struct page *page, int bit_nr) |
576 | { | 576 | { |
577 | DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); | 577 | DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); |
578 | 578 | ||
579 | if (!test_bit(bit_nr, &page->flags)) | 579 | if (!test_bit(bit_nr, &page->flags)) |
580 | return 0; | 580 | return 0; |
581 | 581 | ||
582 | return __wait_on_bit(page_waitqueue(page), &wait, | 582 | return __wait_on_bit(page_waitqueue(page), &wait, |
583 | sleep_on_page_killable, TASK_KILLABLE); | 583 | sleep_on_page_killable, TASK_KILLABLE); |
584 | } | 584 | } |
585 | 585 | ||
586 | /** | 586 | /** |
587 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue | 587 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue |
588 | * @page: Page defining the wait queue of interest | 588 | * @page: Page defining the wait queue of interest |
589 | * @waiter: Waiter to add to the queue | 589 | * @waiter: Waiter to add to the queue |
590 | * | 590 | * |
591 | * Add an arbitrary @waiter to the wait queue for the nominated @page. | 591 | * Add an arbitrary @waiter to the wait queue for the nominated @page. |
592 | */ | 592 | */ |
593 | void add_page_wait_queue(struct page *page, wait_queue_t *waiter) | 593 | void add_page_wait_queue(struct page *page, wait_queue_t *waiter) |
594 | { | 594 | { |
595 | wait_queue_head_t *q = page_waitqueue(page); | 595 | wait_queue_head_t *q = page_waitqueue(page); |
596 | unsigned long flags; | 596 | unsigned long flags; |
597 | 597 | ||
598 | spin_lock_irqsave(&q->lock, flags); | 598 | spin_lock_irqsave(&q->lock, flags); |
599 | __add_wait_queue(q, waiter); | 599 | __add_wait_queue(q, waiter); |
600 | spin_unlock_irqrestore(&q->lock, flags); | 600 | spin_unlock_irqrestore(&q->lock, flags); |
601 | } | 601 | } |
602 | EXPORT_SYMBOL_GPL(add_page_wait_queue); | 602 | EXPORT_SYMBOL_GPL(add_page_wait_queue); |
603 | 603 | ||
604 | /** | 604 | /** |
605 | * unlock_page - unlock a locked page | 605 | * unlock_page - unlock a locked page |
606 | * @page: the page | 606 | * @page: the page |
607 | * | 607 | * |
608 | * Unlocks the page and wakes up sleepers in wait_on_page_locked(). | 608 | * Unlocks the page and wakes up sleepers in wait_on_page_locked(). |
609 | * Also wakes sleepers in wait_on_page_writeback() because the wakeup | 609 | * Also wakes sleepers in wait_on_page_writeback() because the wakeup |
610 | * mechanism between PageLocked pages and PageWriteback pages is shared. | 610 | * mechanism between PageLocked pages and PageWriteback pages is shared. |
611 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. | 611 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. |
612 | * | 612 | * |
613 | * The mb is necessary to enforce ordering between the clear_bit and the read | 613 | * The mb is necessary to enforce ordering between the clear_bit and the read |
614 | * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). | 614 | * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). |
615 | */ | 615 | */ |
616 | void unlock_page(struct page *page) | 616 | void unlock_page(struct page *page) |
617 | { | 617 | { |
618 | VM_BUG_ON(!PageLocked(page)); | 618 | VM_BUG_ON(!PageLocked(page)); |
619 | clear_bit_unlock(PG_locked, &page->flags); | 619 | clear_bit_unlock(PG_locked, &page->flags); |
620 | smp_mb__after_clear_bit(); | 620 | smp_mb__after_clear_bit(); |
621 | wake_up_page(page, PG_locked); | 621 | wake_up_page(page, PG_locked); |
622 | } | 622 | } |
623 | EXPORT_SYMBOL(unlock_page); | 623 | EXPORT_SYMBOL(unlock_page); |
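/*
 * [Editor's sketch, assumed caller] The usual lock/recheck discipline
 * around lock_page()/unlock_page(): the page may be truncated while we
 * sleep for the lock, so callers recheck page->mapping afterwards (see
 * find_lock_page() below for the in-tree version of this pattern).
 */
static int example_lock_and_check(struct page *page,
				  struct address_space *mapping)
{
	lock_page(page);
	if (page->mapping != mapping) {	/* truncated while we slept */
		unlock_page(page);
		return -EAGAIN;
	}
	/* ... operate on the locked, still-attached page ... */
	unlock_page(page);
	return 0;
}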
624 | 624 | ||
625 | /** | 625 | /** |
626 | * end_page_writeback - end writeback against a page | 626 | * end_page_writeback - end writeback against a page |
627 | * @page: the page | 627 | * @page: the page |
628 | */ | 628 | */ |
629 | void end_page_writeback(struct page *page) | 629 | void end_page_writeback(struct page *page) |
630 | { | 630 | { |
631 | if (TestClearPageReclaim(page)) | 631 | if (TestClearPageReclaim(page)) |
632 | rotate_reclaimable_page(page); | 632 | rotate_reclaimable_page(page); |
633 | 633 | ||
634 | if (!test_clear_page_writeback(page)) | 634 | if (!test_clear_page_writeback(page)) |
635 | BUG(); | 635 | BUG(); |
636 | 636 | ||
637 | smp_mb__after_clear_bit(); | 637 | smp_mb__after_clear_bit(); |
638 | wake_up_page(page, PG_writeback); | 638 | wake_up_page(page, PG_writeback); |
639 | } | 639 | } |
640 | EXPORT_SYMBOL(end_page_writeback); | 640 | EXPORT_SYMBOL(end_page_writeback); |
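/*
 * [Editor's sketch, assumed completion handler] The usual I/O pairing:
 * the submitter sets the writeback bit before issuing the write, and
 * the completion path flags any error, then calls end_page_writeback()
 * to wake waiters.
 */
static void example_write_endio(struct page *page, int err)
{
	if (err)
		SetPageError(page);
	end_page_writeback(page);
}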
641 | 641 | ||
642 | /** | 642 | /** |
643 | * __lock_page - get a lock on the page, assuming we need to sleep to get it | 643 | * __lock_page - get a lock on the page, assuming we need to sleep to get it |
644 | * @page: the page to lock | 644 | * @page: the page to lock |
645 | */ | 645 | */ |
646 | void __lock_page(struct page *page) | 646 | void __lock_page(struct page *page) |
647 | { | 647 | { |
648 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); | 648 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); |
649 | 649 | ||
650 | __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, | 650 | __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, |
651 | TASK_UNINTERRUPTIBLE); | 651 | TASK_UNINTERRUPTIBLE); |
652 | } | 652 | } |
653 | EXPORT_SYMBOL(__lock_page); | 653 | EXPORT_SYMBOL(__lock_page); |
654 | 654 | ||
655 | int __lock_page_killable(struct page *page) | 655 | int __lock_page_killable(struct page *page) |
656 | { | 656 | { |
657 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); | 657 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); |
658 | 658 | ||
659 | return __wait_on_bit_lock(page_waitqueue(page), &wait, | 659 | return __wait_on_bit_lock(page_waitqueue(page), &wait, |
660 | sleep_on_page_killable, TASK_KILLABLE); | 660 | sleep_on_page_killable, TASK_KILLABLE); |
661 | } | 661 | } |
662 | EXPORT_SYMBOL_GPL(__lock_page_killable); | 662 | EXPORT_SYMBOL_GPL(__lock_page_killable); |
663 | 663 | ||
664 | int __lock_page_or_retry(struct page *page, struct mm_struct *mm, | 664 | int __lock_page_or_retry(struct page *page, struct mm_struct *mm, |
665 | unsigned int flags) | 665 | unsigned int flags) |
666 | { | 666 | { |
667 | if (flags & FAULT_FLAG_ALLOW_RETRY) { | 667 | if (flags & FAULT_FLAG_ALLOW_RETRY) { |
668 | /* | 668 | /* |
669 | * CAUTION! In this case, mmap_sem is not released | 669 | * CAUTION! In this case, mmap_sem is not released |
670 | * even though we return 0. | 670 | * even though we return 0. |
671 | */ | 671 | */ |
672 | if (flags & FAULT_FLAG_RETRY_NOWAIT) | 672 | if (flags & FAULT_FLAG_RETRY_NOWAIT) |
673 | return 0; | 673 | return 0; |
674 | 674 | ||
675 | up_read(&mm->mmap_sem); | 675 | up_read(&mm->mmap_sem); |
676 | if (flags & FAULT_FLAG_KILLABLE) | 676 | if (flags & FAULT_FLAG_KILLABLE) |
677 | wait_on_page_locked_killable(page); | 677 | wait_on_page_locked_killable(page); |
678 | else | 678 | else |
679 | wait_on_page_locked(page); | 679 | wait_on_page_locked(page); |
680 | return 0; | 680 | return 0; |
681 | } else { | 681 | } else { |
682 | if (flags & FAULT_FLAG_KILLABLE) { | 682 | if (flags & FAULT_FLAG_KILLABLE) { |
683 | int ret; | 683 | int ret; |
684 | 684 | ||
685 | ret = __lock_page_killable(page); | 685 | ret = __lock_page_killable(page); |
686 | if (ret) { | 686 | if (ret) { |
687 | up_read(&mm->mmap_sem); | 687 | up_read(&mm->mmap_sem); |
688 | return 0; | 688 | return 0; |
689 | } | 689 | } |
690 | } else | 690 | } else |
691 | __lock_page(page); | 691 | __lock_page(page); |
692 | return 1; | 692 | return 1; |
693 | } | 693 | } |
694 | } | 694 | } |
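/*
 * [Editor's sketch, names assumed] How a fault path consumes the return
 * value above, via the lock_page_or_retry() wrapper: 0 means the lock
 * was not taken and (unless FAULT_FLAG_RETRY_NOWAIT) mmap_sem was
 * dropped, so the fault must be retried; 1 means the page is locked.
 */
static int example_fault_lock(struct page *page, struct mm_struct *mm,
			      unsigned int flags)
{
	if (!lock_page_or_retry(page, mm, flags))
		return VM_FAULT_RETRY;	/* caller restarts the fault */
	/* ... handle the fault with the page locked ... */
	unlock_page(page);
	return 0;
}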
695 | 695 | ||
696 | /** | 696 | /** |
697 | * find_get_page - find and get a page reference | 697 | * find_get_page - find and get a page reference |
698 | * @mapping: the address_space to search | 698 | * @mapping: the address_space to search |
699 | * @offset: the page index | 699 | * @offset: the page index |
700 | * | 700 | * |
701 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 701 | * Is there a pagecache struct page at the given (mapping, offset) tuple? |
702 | * If yes, increment its refcount and return it; if no, return NULL. | 702 | * If yes, increment its refcount and return it; if no, return NULL. |
703 | */ | 703 | */ |
704 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset) | 704 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset) |
705 | { | 705 | { |
706 | void **pagep; | 706 | void **pagep; |
707 | struct page *page; | 707 | struct page *page; |
708 | 708 | ||
709 | rcu_read_lock(); | 709 | rcu_read_lock(); |
710 | repeat: | 710 | repeat: |
711 | page = NULL; | 711 | page = NULL; |
712 | pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); | 712 | pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); |
713 | if (pagep) { | 713 | if (pagep) { |
714 | page = radix_tree_deref_slot(pagep); | 714 | page = radix_tree_deref_slot(pagep); |
715 | if (unlikely(!page)) | 715 | if (unlikely(!page)) |
716 | goto out; | 716 | goto out; |
717 | if (radix_tree_deref_retry(page)) | 717 | if (radix_tree_deref_retry(page)) |
718 | goto repeat; | 718 | goto repeat; |
719 | 719 | ||
720 | if (!page_cache_get_speculative(page)) | 720 | if (!page_cache_get_speculative(page)) |
721 | goto repeat; | 721 | goto repeat; |
722 | 722 | ||
723 | /* | 723 | /* |
724 | * Has the page moved? | 724 | * Has the page moved? |
725 | * This is part of the lockless pagecache protocol. See | 725 | * This is part of the lockless pagecache protocol. See |
726 | * include/linux/pagemap.h for details. | 726 | * include/linux/pagemap.h for details. |
727 | */ | 727 | */ |
728 | if (unlikely(page != *pagep)) { | 728 | if (unlikely(page != *pagep)) { |
729 | page_cache_release(page); | 729 | page_cache_release(page); |
730 | goto repeat; | 730 | goto repeat; |
731 | } | 731 | } |
732 | } | 732 | } |
733 | out: | 733 | out: |
734 | rcu_read_unlock(); | 734 | rcu_read_unlock(); |
735 | 735 | ||
736 | return page; | 736 | return page; |
737 | } | 737 | } |
738 | EXPORT_SYMBOL(find_get_page); | 738 | EXPORT_SYMBOL(find_get_page); |
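/*
 * [Editor's sketch, assumed caller] Minimal use of the lockless lookup:
 * the reference find_get_page() takes must be dropped with
 * page_cache_release() once the caller is done with the page.
 */
static int example_page_is_cached(struct address_space *mapping,
				  pgoff_t index)
{
	struct page *page = find_get_page(mapping, index);

	if (!page)
		return 0;
	page_cache_release(page);
	return 1;
}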
739 | 739 | ||
740 | /** | 740 | /** |
741 | * find_lock_page - locate, pin and lock a pagecache page | 741 | * find_lock_page - locate, pin and lock a pagecache page |
742 | * @mapping: the address_space to search | 742 | * @mapping: the address_space to search |
743 | * @offset: the page index | 743 | * @offset: the page index |
744 | * | 744 | * |
745 | * Locates the desired pagecache page, locks it, increments its reference | 745 | * Locates the desired pagecache page, locks it, increments its reference |
746 | * count and returns its address. | 746 | * count and returns its address. |
747 | * | 747 | * |
748 | * Returns %NULL if the page was not present. find_lock_page() may sleep. | 748 | * Returns %NULL if the page was not present. find_lock_page() may sleep. |
749 | */ | 749 | */ |
750 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) | 750 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) |
751 | { | 751 | { |
752 | struct page *page; | 752 | struct page *page; |
753 | 753 | ||
754 | repeat: | 754 | repeat: |
755 | page = find_get_page(mapping, offset); | 755 | page = find_get_page(mapping, offset); |
756 | if (page) { | 756 | if (page) { |
757 | lock_page(page); | 757 | lock_page(page); |
758 | /* Has the page been truncated? */ | 758 | /* Has the page been truncated? */ |
759 | if (unlikely(page->mapping != mapping)) { | 759 | if (unlikely(page->mapping != mapping)) { |
760 | unlock_page(page); | 760 | unlock_page(page); |
761 | page_cache_release(page); | 761 | page_cache_release(page); |
762 | goto repeat; | 762 | goto repeat; |
763 | } | 763 | } |
764 | VM_BUG_ON(page->index != offset); | 764 | VM_BUG_ON(page->index != offset); |
765 | } | 765 | } |
766 | return page; | 766 | return page; |
767 | } | 767 | } |
768 | EXPORT_SYMBOL(find_lock_page); | 768 | EXPORT_SYMBOL(find_lock_page); |
769 | 769 | ||
770 | /** | 770 | /** |
771 | * find_or_create_page - locate or add a pagecache page | 771 | * find_or_create_page - locate or add a pagecache page |
772 | * @mapping: the page's address_space | 772 | * @mapping: the page's address_space |
773 | * @index: the page's index into the mapping | 773 | * @index: the page's index into the mapping |
774 | * @gfp_mask: page allocation mode | 774 | * @gfp_mask: page allocation mode |
775 | * | 775 | * |
776 | * Locates a page in the pagecache. If the page is not present, a new page | 776 | * Locates a page in the pagecache. If the page is not present, a new page |
777 | * is allocated using @gfp_mask and is added to the pagecache and to the VM's | 777 | * is allocated using @gfp_mask and is added to the pagecache and to the VM's |
778 | * LRU list. The returned page is locked and has its reference count | 778 | * LRU list. The returned page is locked and has its reference count |
779 | * incremented. | 779 | * incremented. |
780 | * | 780 | * |
781 | * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic | 781 | * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic |
782 | * allocation! | 782 | * allocation! |
783 | * | 783 | * |
784 | * find_or_create_page() returns the desired page's address, or %NULL on | 784 | * find_or_create_page() returns the desired page's address, or %NULL on |
785 | * memory exhaustion. | 785 | * memory exhaustion. |
786 | */ | 786 | */ |
787 | struct page *find_or_create_page(struct address_space *mapping, | 787 | struct page *find_or_create_page(struct address_space *mapping, |
788 | pgoff_t index, gfp_t gfp_mask) | 788 | pgoff_t index, gfp_t gfp_mask) |
789 | { | 789 | { |
790 | struct page *page; | 790 | struct page *page; |
791 | int err; | 791 | int err; |
792 | repeat: | 792 | repeat: |
793 | page = find_lock_page(mapping, index); | 793 | page = find_lock_page(mapping, index); |
794 | if (!page) { | 794 | if (!page) { |
795 | page = __page_cache_alloc(gfp_mask); | 795 | page = __page_cache_alloc(gfp_mask); |
796 | if (!page) | 796 | if (!page) |
797 | return NULL; | 797 | return NULL; |
798 | /* | 798 | /* |
799 | * We want a regular kernel memory (not highmem or DMA etc) | 799 | * We want a regular kernel memory (not highmem or DMA etc) |
800 | * allocation for the radix tree nodes, but we need to honour | 800 | * allocation for the radix tree nodes, but we need to honour |
801 | * the context-specific requirements the caller has asked for. | 801 | * the context-specific requirements the caller has asked for. |
802 | * GFP_RECLAIM_MASK collects those requirements. | 802 | * GFP_RECLAIM_MASK collects those requirements. |
803 | */ | 803 | */ |
804 | err = add_to_page_cache_lru(page, mapping, index, | 804 | err = add_to_page_cache_lru(page, mapping, index, |
805 | (gfp_mask & GFP_RECLAIM_MASK)); | 805 | (gfp_mask & GFP_RECLAIM_MASK)); |
806 | if (unlikely(err)) { | 806 | if (unlikely(err)) { |
807 | page_cache_release(page); | 807 | page_cache_release(page); |
808 | page = NULL; | 808 | page = NULL; |
809 | if (err == -EEXIST) | 809 | if (err == -EEXIST) |
810 | goto repeat; | 810 | goto repeat; |
811 | } | 811 | } |
812 | } | 812 | } |
813 | return page; | 813 | return page; |
814 | } | 814 | } |
815 | EXPORT_SYMBOL(find_or_create_page); | 815 | EXPORT_SYMBOL(find_or_create_page); |
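/*
 * [Editor's sketch, assumed caller] Using mapping_gfp_mask() preserves
 * the mapping's own allocation constraints; the page comes back locked
 * with an elevated refcount, so both must be released.
 */
static int example_touch_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_or_create_page(mapping, index,
						mapping_gfp_mask(mapping));
	if (!page)
		return -ENOMEM;
	/* ... initialise or read the locked page here ... */
	unlock_page(page);
	page_cache_release(page);
	return 0;
}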
816 | 816 | ||
817 | /** | 817 | /** |
818 | * find_get_pages - gang pagecache lookup | 818 | * find_get_pages - gang pagecache lookup |
819 | * @mapping: The address_space to search | 819 | * @mapping: The address_space to search |
820 | * @start: The starting page index | 820 | * @start: The starting page index |
821 | * @nr_pages: The maximum number of pages | 821 | * @nr_pages: The maximum number of pages |
822 | * @pages: Where the resulting pages are placed | 822 | * @pages: Where the resulting pages are placed |
823 | * | 823 | * |
824 | * find_get_pages() will search for and return a group of up to | 824 | * find_get_pages() will search for and return a group of up to |
825 | * @nr_pages pages in the mapping. The pages are placed at @pages. | 825 | * @nr_pages pages in the mapping. The pages are placed at @pages. |
826 | * find_get_pages() takes a reference against the returned pages. | 826 | * find_get_pages() takes a reference against the returned pages. |
827 | * | 827 | * |
828 | * The search returns a group of mapping-contiguous pages with ascending | 828 | * The search returns a group of mapping-contiguous pages with ascending |
829 | * indexes. There may be holes in the indices due to not-present pages. | 829 | * indexes. There may be holes in the indices due to not-present pages. |
830 | * | 830 | * |
831 | * find_get_pages() returns the number of pages which were found. | 831 | * find_get_pages() returns the number of pages which were found. |
832 | */ | 832 | */ |
833 | unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | 833 | unsigned find_get_pages(struct address_space *mapping, pgoff_t start, |
834 | unsigned int nr_pages, struct page **pages) | 834 | unsigned int nr_pages, struct page **pages) |
835 | { | 835 | { |
836 | unsigned int i; | 836 | unsigned int i; |
837 | unsigned int ret; | 837 | unsigned int ret; |
838 | unsigned int nr_found; | 838 | unsigned int nr_found; |
839 | 839 | ||
840 | rcu_read_lock(); | 840 | rcu_read_lock(); |
841 | restart: | 841 | restart: |
842 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 842 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
843 | (void ***)pages, start, nr_pages); | 843 | (void ***)pages, start, nr_pages); |
844 | ret = 0; | 844 | ret = 0; |
845 | for (i = 0; i < nr_found; i++) { | 845 | for (i = 0; i < nr_found; i++) { |
846 | struct page *page; | 846 | struct page *page; |
847 | repeat: | 847 | repeat: |
848 | page = radix_tree_deref_slot((void **)pages[i]); | 848 | page = radix_tree_deref_slot((void **)pages[i]); |
849 | if (unlikely(!page)) | 849 | if (unlikely(!page)) |
850 | continue; | 850 | continue; |
851 | 851 | ||
852 | /* | 852 | /* |
853 | * This can only trigger when the entry at index 0 moves out | 853 | * This can only trigger when the entry at index 0 moves out |
854 | * of or back to the root: none yet gotten, safe to restart. | 854 | * of or back to the root: none yet gotten, safe to restart. |
855 | */ | 855 | */ |
856 | if (radix_tree_deref_retry(page)) { | 856 | if (radix_tree_deref_retry(page)) { |
857 | WARN_ON(start | i); | 857 | WARN_ON(start | i); |
858 | goto restart; | 858 | goto restart; |
859 | } | 859 | } |
860 | 860 | ||
861 | if (!page_cache_get_speculative(page)) | 861 | if (!page_cache_get_speculative(page)) |
862 | goto repeat; | 862 | goto repeat; |
863 | 863 | ||
864 | /* Has the page moved? */ | 864 | /* Has the page moved? */ |
865 | if (unlikely(page != *((void **)pages[i]))) { | 865 | if (unlikely(page != *((void **)pages[i]))) { |
866 | page_cache_release(page); | 866 | page_cache_release(page); |
867 | goto repeat; | 867 | goto repeat; |
868 | } | 868 | } |
869 | 869 | ||
870 | pages[ret] = page; | 870 | pages[ret] = page; |
871 | ret++; | 871 | ret++; |
872 | } | 872 | } |
873 | 873 | ||
874 | /* | 874 | /* |
875 | * If all entries were removed before we could secure them, | 875 | * If all entries were removed before we could secure them, |
876 | * try again, because callers stop trying once 0 is returned. | 876 | * try again, because callers stop trying once 0 is returned. |
877 | */ | 877 | */ |
878 | if (unlikely(!ret && nr_found)) | 878 | if (unlikely(!ret && nr_found)) |
879 | goto restart; | 879 | goto restart; |
880 | rcu_read_unlock(); | 880 | rcu_read_unlock(); |
881 | return ret; | 881 | return ret; |
882 | } | 882 | } |
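/*
 * [Editor's sketch, assumed caller] Most users reach find_get_pages()
 * through the pagevec helpers; pagevec_lookup() wraps it and stores up
 * to PAGEVEC_SIZE referenced pages per batch.
 */
static void example_walk_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t index = 0;
	unsigned i, nr;

	pagevec_init(&pvec, 0);
	while ((nr = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE))) {
		for (i = 0; i < nr; i++) {
			struct page *page = pvec.pages[i];

			index = page->index + 1;	/* skip over holes */
			/* ... inspect the referenced page ... */
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}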
883 | 883 | ||
884 | /** | 884 | /** |
885 | * find_get_pages_contig - gang contiguous pagecache lookup | 885 | * find_get_pages_contig - gang contiguous pagecache lookup |
886 | * @mapping: The address_space to search | 886 | * @mapping: The address_space to search |
887 | * @index: The starting page index | 887 | * @index: The starting page index |
888 | * @nr_pages: The maximum number of pages | 888 | * @nr_pages: The maximum number of pages |
889 | * @pages: Where the resulting pages are placed | 889 | * @pages: Where the resulting pages are placed |
890 | * | 890 | * |
891 | * find_get_pages_contig() works exactly like find_get_pages(), except | 891 | * find_get_pages_contig() works exactly like find_get_pages(), except |
892 | * that the returned pages are guaranteed to be contiguous. | 892 | * that the returned pages are guaranteed to be contiguous. |
893 | * | 893 | * |
894 | * find_get_pages_contig() returns the number of pages which were found. | 894 | * find_get_pages_contig() returns the number of pages which were found. |
895 | */ | 895 | */ |
896 | unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | 896 | unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, |
897 | unsigned int nr_pages, struct page **pages) | 897 | unsigned int nr_pages, struct page **pages) |
898 | { | 898 | { |
899 | unsigned int i; | 899 | unsigned int i; |
900 | unsigned int ret; | 900 | unsigned int ret; |
901 | unsigned int nr_found; | 901 | unsigned int nr_found; |
902 | 902 | ||
903 | rcu_read_lock(); | 903 | rcu_read_lock(); |
904 | restart: | 904 | restart: |
905 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 905 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
906 | (void ***)pages, index, nr_pages); | 906 | (void ***)pages, index, nr_pages); |
907 | ret = 0; | 907 | ret = 0; |
908 | for (i = 0; i < nr_found; i++) { | 908 | for (i = 0; i < nr_found; i++) { |
909 | struct page *page; | 909 | struct page *page; |
910 | repeat: | 910 | repeat: |
911 | page = radix_tree_deref_slot((void **)pages[i]); | 911 | page = radix_tree_deref_slot((void **)pages[i]); |
912 | if (unlikely(!page)) | 912 | if (unlikely(!page)) |
913 | continue; | 913 | continue; |
914 | 914 | ||
915 | /* | 915 | /* |
916 | * This can only trigger when the entry at index 0 moves out | 916 | * This can only trigger when the entry at index 0 moves out |
917 | * of or back to the root: none yet gotten, safe to restart. | 917 | * of or back to the root: none yet gotten, safe to restart. |
918 | */ | 918 | */ |
919 | if (radix_tree_deref_retry(page)) | 919 | if (radix_tree_deref_retry(page)) |
920 | goto restart; | 920 | goto restart; |
921 | 921 | ||
922 | if (!page_cache_get_speculative(page)) | 922 | if (!page_cache_get_speculative(page)) |
923 | goto repeat; | 923 | goto repeat; |
924 | 924 | ||
925 | /* Has the page moved? */ | 925 | /* Has the page moved? */ |
926 | if (unlikely(page != *((void **)pages[i]))) { | 926 | if (unlikely(page != *((void **)pages[i]))) { |
927 | page_cache_release(page); | 927 | page_cache_release(page); |
928 | goto repeat; | 928 | goto repeat; |
929 | } | 929 | } |
930 | 930 | ||
931 | /* | 931 | /* |
932 | * Must check mapping and index after taking the ref; | 932 | * Must check mapping and index after taking the ref; |
933 | * otherwise we can get both false positives and false | 933 | * otherwise we can get both false positives and false |
934 | * negatives, which is just confusing to the caller. | 934 | * negatives, which is just confusing to the caller. |
935 | */ | 935 | */ |
936 | if (page->mapping == NULL || page->index != index) { | 936 | if (page->mapping == NULL || page->index != index) { |
937 | page_cache_release(page); | 937 | page_cache_release(page); |
938 | break; | 938 | break; |
939 | } | 939 | } |
940 | 940 | ||
941 | pages[ret] = page; | 941 | pages[ret] = page; |
942 | ret++; | 942 | ret++; |
943 | index++; | 943 | index++; |
944 | } | 944 | } |
945 | rcu_read_unlock(); | 945 | rcu_read_unlock(); |
946 | return ret; | 946 | return ret; |
947 | } | 947 | } |
948 | EXPORT_SYMBOL(find_get_pages_contig); | 948 | EXPORT_SYMBOL(find_get_pages_contig); |
949 | 949 | ||
950 | /** | 950 | /** |
951 | * find_get_pages_tag - find and return pages that match @tag | 951 | * find_get_pages_tag - find and return pages that match @tag |
952 | * @mapping: the address_space to search | 952 | * @mapping: the address_space to search |
953 | * @index: the starting page index | 953 | * @index: the starting page index |
954 | * @tag: the tag index | 954 | * @tag: the tag index |
955 | * @nr_pages: the maximum number of pages | 955 | * @nr_pages: the maximum number of pages |
956 | * @pages: where the resulting pages are placed | 956 | * @pages: where the resulting pages are placed |
957 | * | 957 | * |
958 | * Like find_get_pages, except we only return pages which are tagged with | 958 | * Like find_get_pages, except we only return pages which are tagged with |
959 | * @tag. We update @index to index the next page for the traversal. | 959 | * @tag. We update @index to index the next page for the traversal. |
960 | */ | 960 | */ |
961 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | 961 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, |
962 | int tag, unsigned int nr_pages, struct page **pages) | 962 | int tag, unsigned int nr_pages, struct page **pages) |
963 | { | 963 | { |
964 | unsigned int i; | 964 | unsigned int i; |
965 | unsigned int ret; | 965 | unsigned int ret; |
966 | unsigned int nr_found; | 966 | unsigned int nr_found; |
967 | 967 | ||
968 | rcu_read_lock(); | 968 | rcu_read_lock(); |
969 | restart: | 969 | restart: |
970 | nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, | 970 | nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, |
971 | (void ***)pages, *index, nr_pages, tag); | 971 | (void ***)pages, *index, nr_pages, tag); |
972 | ret = 0; | 972 | ret = 0; |
973 | for (i = 0; i < nr_found; i++) { | 973 | for (i = 0; i < nr_found; i++) { |
974 | struct page *page; | 974 | struct page *page; |
975 | repeat: | 975 | repeat: |
976 | page = radix_tree_deref_slot((void **)pages[i]); | 976 | page = radix_tree_deref_slot((void **)pages[i]); |
977 | if (unlikely(!page)) | 977 | if (unlikely(!page)) |
978 | continue; | 978 | continue; |
979 | 979 | ||
980 | /* | 980 | /* |
981 | * This can only trigger when the entry at index 0 moves out | 981 | * This can only trigger when the entry at index 0 moves out |
982 | * of or back to the root: none yet gotten, safe to restart. | 982 | * of or back to the root: none yet gotten, safe to restart. |
983 | */ | 983 | */ |
984 | if (radix_tree_deref_retry(page)) | 984 | if (radix_tree_deref_retry(page)) |
985 | goto restart; | 985 | goto restart; |
986 | 986 | ||
987 | if (!page_cache_get_speculative(page)) | 987 | if (!page_cache_get_speculative(page)) |
988 | goto repeat; | 988 | goto repeat; |
989 | 989 | ||
990 | /* Has the page moved? */ | 990 | /* Has the page moved? */ |
991 | if (unlikely(page != *((void **)pages[i]))) { | 991 | if (unlikely(page != *((void **)pages[i]))) { |
992 | page_cache_release(page); | 992 | page_cache_release(page); |
993 | goto repeat; | 993 | goto repeat; |
994 | } | 994 | } |
995 | 995 | ||
996 | pages[ret] = page; | 996 | pages[ret] = page; |
997 | ret++; | 997 | ret++; |
998 | } | 998 | } |
999 | 999 | ||
1000 | /* | 1000 | /* |
1001 | * If all entries were removed before we could secure them, | 1001 | * If all entries were removed before we could secure them, |
1002 | * try again, because callers stop trying once 0 is returned. | 1002 | * try again, because callers stop trying once 0 is returned. |
1003 | */ | 1003 | */ |
1004 | if (unlikely(!ret && nr_found)) | 1004 | if (unlikely(!ret && nr_found)) |
1005 | goto restart; | 1005 | goto restart; |
1006 | rcu_read_unlock(); | 1006 | rcu_read_unlock(); |
1007 | 1007 | ||
1008 | if (ret) | 1008 | if (ret) |
1009 | *index = pages[ret - 1]->index + 1; | 1009 | *index = pages[ret - 1]->index + 1; |
1010 | 1010 | ||
1011 | return ret; | 1011 | return ret; |
1012 | } | 1012 | } |
1013 | EXPORT_SYMBOL(find_get_pages_tag); | 1013 | EXPORT_SYMBOL(find_get_pages_tag); |
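/*
 * [Editor's sketch, assumed caller] The canonical writeback-style walk
 * over dirty pages; unlike find_get_pages(), @index is advanced by the
 * function itself, so the loop needs no manual bookkeeping.
 */
static void example_walk_dirty_pages(struct address_space *mapping)
{
	struct page *pages[16];
	pgoff_t index = 0;
	unsigned i, nr;

	while ((nr = find_get_pages_tag(mapping, &index,
					PAGECACHE_TAG_DIRTY, 16, pages))) {
		for (i = 0; i < nr; i++) {
			/* ... write out pages[i] ... */
			page_cache_release(pages[i]);
		}
		cond_resched();
	}
}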
1014 | 1014 | ||
1015 | /** | 1015 | /** |
1016 | * grab_cache_page_nowait - returns locked page at given index in given cache | 1016 | * grab_cache_page_nowait - returns locked page at given index in given cache |
1017 | * @mapping: target address_space | 1017 | * @mapping: target address_space |
1018 | * @index: the page index | 1018 | * @index: the page index |
1019 | * | 1019 | * |
1020 | * Same as grab_cache_page(), but do not wait if the page is unavailable. | 1020 | * Same as grab_cache_page(), but do not wait if the page is unavailable. |
1021 | * This is intended for speculative data generators, where the data can | 1021 | * This is intended for speculative data generators, where the data can |
1022 | * be regenerated if the page couldn't be grabbed. This routine should | 1022 | * be regenerated if the page couldn't be grabbed. This routine should |
1023 | * be safe to call while holding the lock for another page. | 1023 | * be safe to call while holding the lock for another page. |
1024 | * | 1024 | * |
1025 | * Clear __GFP_FS when allocating the page to avoid recursion into the fs | 1025 | * Clear __GFP_FS when allocating the page to avoid recursion into the fs |
1026 | * and deadlock against the caller's locked page. | 1026 | * and deadlock against the caller's locked page. |
1027 | */ | 1027 | */ |
1028 | struct page * | 1028 | struct page * |
1029 | grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | 1029 | grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) |
1030 | { | 1030 | { |
1031 | struct page *page = find_get_page(mapping, index); | 1031 | struct page *page = find_get_page(mapping, index); |
1032 | 1032 | ||
1033 | if (page) { | 1033 | if (page) { |
1034 | if (trylock_page(page)) | 1034 | if (trylock_page(page)) |
1035 | return page; | 1035 | return page; |
1036 | page_cache_release(page); | 1036 | page_cache_release(page); |
1037 | return NULL; | 1037 | return NULL; |
1038 | } | 1038 | } |
1039 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); | 1039 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); |
1040 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { | 1040 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { |
1041 | page_cache_release(page); | 1041 | page_cache_release(page); |
1042 | page = NULL; | 1042 | page = NULL; |
1043 | } | 1043 | } |
1044 | return page; | 1044 | return page; |
1045 | } | 1045 | } |
1046 | EXPORT_SYMBOL(grab_cache_page_nowait); | 1046 | EXPORT_SYMBOL(grab_cache_page_nowait); |
1047 | 1047 | ||
1048 | /* | 1048 | /* |
1049 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail | 1049 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail |
1050 | * a _large_ part of the i/o request. Imagine the worst scenario: | 1050 | * a _large_ part of the i/o request. Imagine the worst scenario: |
1051 | * | 1051 | * |
1052 | * ---R__________________________________________B__________ | 1052 | * ---R__________________________________________B__________ |
1053 | * ^ reading here ^ bad block (assume 4k) | 1053 | * ^ reading here ^ bad block (assume 4k) |
1054 | * | 1054 | * |
1055 | * read(R) => miss => readahead(R...B) => media error => frustrating retries | 1055 | * read(R) => miss => readahead(R...B) => media error => frustrating retries |
1056 | * => failing the whole request => read(R) => read(R+1) => | 1056 | * => failing the whole request => read(R) => read(R+1) => |
1057 | * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => | 1057 | * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => |
1058 | * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => | 1058 | * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => |
1059 | * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... | 1059 | * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... |
1060 | * | 1060 | * |
1061 | * It is going insane. Fix it by quickly scaling down the readahead size. | 1061 | * It is going insane. Fix it by quickly scaling down the readahead size. |
1062 | */ | 1062 | */ |
1063 | static void shrink_readahead_size_eio(struct file *filp, | 1063 | static void shrink_readahead_size_eio(struct file *filp, |
1064 | struct file_ra_state *ra) | 1064 | struct file_ra_state *ra) |
1065 | { | 1065 | { |
1066 | ra->ra_pages /= 4; | 1066 | ra->ra_pages /= 4; |
1067 | } | 1067 | } |
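/*
 * [Editor's note, illustrative arithmetic] Assuming the common 128kB
 * readahead default (ra_pages = 32 pages with 4k pages), successive
 * media errors shrink readahead as 32 -> 8 -> 2 -> 0, so after a few
 * failures the read path stops issuing large speculative I/O at all.
 */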
1068 | 1068 | ||
1069 | /** | 1069 | /** |
1070 | * do_generic_file_read - generic file read routine | 1070 | * do_generic_file_read - generic file read routine |
1071 | * @filp: the file to read | 1071 | * @filp: the file to read |
1072 | * @ppos: current file position | 1072 | * @ppos: current file position |
1073 | * @desc: read_descriptor | 1073 | * @desc: read_descriptor |
1074 | * @actor: read method | 1074 | * @actor: read method |
1075 | * | 1075 | * |
1076 | * This is a generic file read routine, and uses the | 1076 | * This is a generic file read routine, and uses the |
1077 | * mapping->a_ops->readpage() function for the actual low-level stuff. | 1077 | * mapping->a_ops->readpage() function for the actual low-level stuff. |
1078 | * | 1078 | * |
1079 | * This is really ugly. But the gotos actually try to clarify some | 1079 | * This is really ugly. But the gotos actually try to clarify some |
1080 | * of the logic when it comes to error handling etc. | 1080 | * of the logic when it comes to error handling etc. |
1081 | */ | 1081 | */ |
1082 | static void do_generic_file_read(struct file *filp, loff_t *ppos, | 1082 | static void do_generic_file_read(struct file *filp, loff_t *ppos, |
1083 | read_descriptor_t *desc, read_actor_t actor) | 1083 | read_descriptor_t *desc, read_actor_t actor) |
1084 | { | 1084 | { |
1085 | struct address_space *mapping = filp->f_mapping; | 1085 | struct address_space *mapping = filp->f_mapping; |
1086 | struct inode *inode = mapping->host; | 1086 | struct inode *inode = mapping->host; |
1087 | struct file_ra_state *ra = &filp->f_ra; | 1087 | struct file_ra_state *ra = &filp->f_ra; |
1088 | pgoff_t index; | 1088 | pgoff_t index; |
1089 | pgoff_t last_index; | 1089 | pgoff_t last_index; |
1090 | pgoff_t prev_index; | 1090 | pgoff_t prev_index; |
1091 | unsigned long offset; /* offset into pagecache page */ | 1091 | unsigned long offset; /* offset into pagecache page */ |
1092 | unsigned int prev_offset; | 1092 | unsigned int prev_offset; |
1093 | int error; | 1093 | int error; |
1094 | 1094 | ||
1095 | index = *ppos >> PAGE_CACHE_SHIFT; | 1095 | index = *ppos >> PAGE_CACHE_SHIFT; |
1096 | prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; | 1096 | prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; |
1097 | prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); | 1097 | prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); |
1098 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 1098 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
1099 | offset = *ppos & ~PAGE_CACHE_MASK; | 1099 | offset = *ppos & ~PAGE_CACHE_MASK; |
1100 | 1100 | ||
1101 | for (;;) { | 1101 | for (;;) { |
1102 | struct page *page; | 1102 | struct page *page; |
1103 | pgoff_t end_index; | 1103 | pgoff_t end_index; |
1104 | loff_t isize; | 1104 | loff_t isize; |
1105 | unsigned long nr, ret; | 1105 | unsigned long nr, ret; |
1106 | 1106 | ||
1107 | cond_resched(); | 1107 | cond_resched(); |
1108 | find_page: | 1108 | find_page: |
1109 | page = find_get_page(mapping, index); | 1109 | page = find_get_page(mapping, index); |
1110 | if (!page) { | 1110 | if (!page) { |
1111 | page_cache_sync_readahead(mapping, | 1111 | page_cache_sync_readahead(mapping, |
1112 | ra, filp, | 1112 | ra, filp, |
1113 | index, last_index - index); | 1113 | index, last_index - index); |
1114 | page = find_get_page(mapping, index); | 1114 | page = find_get_page(mapping, index); |
1115 | if (unlikely(page == NULL)) | 1115 | if (unlikely(page == NULL)) |
1116 | goto no_cached_page; | 1116 | goto no_cached_page; |
1117 | } | 1117 | } |
1118 | if (PageReadahead(page)) { | 1118 | if (PageReadahead(page)) { |
1119 | page_cache_async_readahead(mapping, | 1119 | page_cache_async_readahead(mapping, |
1120 | ra, filp, page, | 1120 | ra, filp, page, |
1121 | index, last_index - index); | 1121 | index, last_index - index); |
1122 | } | 1122 | } |
1123 | if (!PageUptodate(page)) { | 1123 | if (!PageUptodate(page)) { |
1124 | if (inode->i_blkbits == PAGE_CACHE_SHIFT || | 1124 | if (inode->i_blkbits == PAGE_CACHE_SHIFT || |
1125 | !mapping->a_ops->is_partially_uptodate) | 1125 | !mapping->a_ops->is_partially_uptodate) |
1126 | goto page_not_up_to_date; | 1126 | goto page_not_up_to_date; |
1127 | if (!trylock_page(page)) | 1127 | if (!trylock_page(page)) |
1128 | goto page_not_up_to_date; | 1128 | goto page_not_up_to_date; |
1129 | /* Did it get truncated before we got the lock? */ | 1129 | /* Did it get truncated before we got the lock? */ |
1130 | if (!page->mapping) | 1130 | if (!page->mapping) |
1131 | goto page_not_up_to_date_locked; | 1131 | goto page_not_up_to_date_locked; |
1132 | if (!mapping->a_ops->is_partially_uptodate(page, | 1132 | if (!mapping->a_ops->is_partially_uptodate(page, |
1133 | desc, offset)) | 1133 | desc, offset)) |
1134 | goto page_not_up_to_date_locked; | 1134 | goto page_not_up_to_date_locked; |
1135 | unlock_page(page); | 1135 | unlock_page(page); |
1136 | } | 1136 | } |
1137 | page_ok: | 1137 | page_ok: |
1138 | /* | 1138 | /* |
1139 | * i_size must be checked after we know the page is Uptodate. | 1139 | * i_size must be checked after we know the page is Uptodate. |
1140 | * | 1140 | * |
1141 | * Checking i_size after the Uptodate check allows us to calculate | 1141 | * Checking i_size after the Uptodate check allows us to calculate |
1142 | * the correct value for "nr", which means the zero-filled | 1142 | * the correct value for "nr", which means the zero-filled |
1143 | * part of the page is not copied back to userspace (unless | 1143 | * part of the page is not copied back to userspace (unless |
1144 | * another truncate extends the file - this is desired though). | 1144 | * another truncate extends the file - this is desired though). |
1145 | */ | 1145 | */ |
1146 | 1146 | ||
1147 | isize = i_size_read(inode); | 1147 | isize = i_size_read(inode); |
1148 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | 1148 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; |
1149 | if (unlikely(!isize || index > end_index)) { | 1149 | if (unlikely(!isize || index > end_index)) { |
1150 | page_cache_release(page); | 1150 | page_cache_release(page); |
1151 | goto out; | 1151 | goto out; |
1152 | } | 1152 | } |
1153 | 1153 | ||
1154 | /* nr is the maximum number of bytes to copy from this page */ | 1154 | /* nr is the maximum number of bytes to copy from this page */ |
1155 | nr = PAGE_CACHE_SIZE; | 1155 | nr = PAGE_CACHE_SIZE; |
1156 | if (index == end_index) { | 1156 | if (index == end_index) { |
1157 | nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | 1157 | nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; |
1158 | if (nr <= offset) { | 1158 | if (nr <= offset) { |
1159 | page_cache_release(page); | 1159 | page_cache_release(page); |
1160 | goto out; | 1160 | goto out; |
1161 | } | 1161 | } |
1162 | } | 1162 | } |
1163 | nr = nr - offset; | 1163 | nr = nr - offset; |
1164 | 1164 | ||
1165 | /* If users can be writing to this page using arbitrary | 1165 | /* If users can be writing to this page using arbitrary |
1166 | * virtual addresses, take care about potential aliasing | 1166 | * virtual addresses, take care about potential aliasing |
1167 | * before reading the page on the kernel side. | 1167 | * before reading the page on the kernel side. |
1168 | */ | 1168 | */ |
1169 | if (mapping_writably_mapped(mapping)) | 1169 | if (mapping_writably_mapped(mapping)) |
1170 | flush_dcache_page(page); | 1170 | flush_dcache_page(page); |
1171 | 1171 | ||
1172 | /* | 1172 | /* |
1173 | * When a sequential read accesses a page several times, | 1173 | * When a sequential read accesses a page several times, |
1174 | * only mark it as accessed the first time. | 1174 | * only mark it as accessed the first time. |
1175 | */ | 1175 | */ |
1176 | if (prev_index != index || offset != prev_offset) | 1176 | if (prev_index != index || offset != prev_offset) |
1177 | mark_page_accessed(page); | 1177 | mark_page_accessed(page); |
1178 | prev_index = index; | 1178 | prev_index = index; |
1179 | 1179 | ||
1180 | /* | 1180 | /* |
1181 | * Ok, we have the page, and it's up-to-date, so | 1181 | * Ok, we have the page, and it's up-to-date, so |
1182 | * now we can copy it to user space... | 1182 | * now we can copy it to user space... |
1183 | * | 1183 | * |
1184 | * The actor routine returns how many bytes were actually used. | 1184 | * The actor routine returns how many bytes were actually used. |
1185 | * NOTE! This may not be the same as how much of a user buffer | 1185 | * NOTE! This may not be the same as how much of a user buffer |
1186 | * we filled up (we may be padding etc), so we can only update | 1186 | * we filled up (we may be padding etc), so we can only update |
1187 | * "pos" here (the actor routine has to update the user buffer | 1187 | * "pos" here (the actor routine has to update the user buffer |
1188 | * pointers and the remaining count). | 1188 | * pointers and the remaining count). |
1189 | */ | 1189 | */ |
1190 | ret = actor(desc, page, offset, nr); | 1190 | ret = actor(desc, page, offset, nr); |
1191 | offset += ret; | 1191 | offset += ret; |
1192 | index += offset >> PAGE_CACHE_SHIFT; | 1192 | index += offset >> PAGE_CACHE_SHIFT; |
1193 | offset &= ~PAGE_CACHE_MASK; | 1193 | offset &= ~PAGE_CACHE_MASK; |
1194 | prev_offset = offset; | 1194 | prev_offset = offset; |
1195 | 1195 | ||
1196 | page_cache_release(page); | 1196 | page_cache_release(page); |
1197 | if (ret == nr && desc->count) | 1197 | if (ret == nr && desc->count) |
1198 | continue; | 1198 | continue; |
1199 | goto out; | 1199 | goto out; |
1200 | 1200 | ||
1201 | page_not_up_to_date: | 1201 | page_not_up_to_date: |
1202 | /* Get exclusive access to the page ... */ | 1202 | /* Get exclusive access to the page ... */ |
1203 | error = lock_page_killable(page); | 1203 | error = lock_page_killable(page); |
1204 | if (unlikely(error)) | 1204 | if (unlikely(error)) |
1205 | goto readpage_error; | 1205 | goto readpage_error; |
1206 | 1206 | ||
1207 | page_not_up_to_date_locked: | 1207 | page_not_up_to_date_locked: |
1208 | /* Did it get truncated before we got the lock? */ | 1208 | /* Did it get truncated before we got the lock? */ |
1209 | if (!page->mapping) { | 1209 | if (!page->mapping) { |
1210 | unlock_page(page); | 1210 | unlock_page(page); |
1211 | page_cache_release(page); | 1211 | page_cache_release(page); |
1212 | continue; | 1212 | continue; |
1213 | } | 1213 | } |
1214 | 1214 | ||
1215 | /* Did somebody else fill it already? */ | 1215 | /* Did somebody else fill it already? */ |
1216 | if (PageUptodate(page)) { | 1216 | if (PageUptodate(page)) { |
1217 | unlock_page(page); | 1217 | unlock_page(page); |
1218 | goto page_ok; | 1218 | goto page_ok; |
1219 | } | 1219 | } |
1220 | 1220 | ||
1221 | readpage: | 1221 | readpage: |
1222 | /* | 1222 | /* |
1223 | * A previous I/O error may have been due to temporary | 1223 | * A previous I/O error may have been due to temporary |
1224 | * failures, e.g. multipath errors. | 1224 | * failures, e.g. multipath errors. |
1225 | * PG_error will be set again if readpage fails. | 1225 | * PG_error will be set again if readpage fails. |
1226 | */ | 1226 | */ |
1227 | ClearPageError(page); | 1227 | ClearPageError(page); |
1228 | /* Start the actual read. The read will unlock the page. */ | 1228 | /* Start the actual read. The read will unlock the page. */ |
1229 | error = mapping->a_ops->readpage(filp, page); | 1229 | error = mapping->a_ops->readpage(filp, page); |
1230 | 1230 | ||
1231 | if (unlikely(error)) { | 1231 | if (unlikely(error)) { |
1232 | if (error == AOP_TRUNCATED_PAGE) { | 1232 | if (error == AOP_TRUNCATED_PAGE) { |
1233 | page_cache_release(page); | 1233 | page_cache_release(page); |
1234 | goto find_page; | 1234 | goto find_page; |
1235 | } | 1235 | } |
1236 | goto readpage_error; | 1236 | goto readpage_error; |
1237 | } | 1237 | } |
1238 | 1238 | ||
1239 | if (!PageUptodate(page)) { | 1239 | if (!PageUptodate(page)) { |
1240 | error = lock_page_killable(page); | 1240 | error = lock_page_killable(page); |
1241 | if (unlikely(error)) | 1241 | if (unlikely(error)) |
1242 | goto readpage_error; | 1242 | goto readpage_error; |
1243 | if (!PageUptodate(page)) { | 1243 | if (!PageUptodate(page)) { |
1244 | if (page->mapping == NULL) { | 1244 | if (page->mapping == NULL) { |
1245 | /* | 1245 | /* |
1246 | * invalidate_mapping_pages got it | 1246 | * invalidate_mapping_pages got it |
1247 | */ | 1247 | */ |
1248 | unlock_page(page); | 1248 | unlock_page(page); |
1249 | page_cache_release(page); | 1249 | page_cache_release(page); |
1250 | goto find_page; | 1250 | goto find_page; |
1251 | } | 1251 | } |
1252 | unlock_page(page); | 1252 | unlock_page(page); |
1253 | shrink_readahead_size_eio(filp, ra); | 1253 | shrink_readahead_size_eio(filp, ra); |
1254 | error = -EIO; | 1254 | error = -EIO; |
1255 | goto readpage_error; | 1255 | goto readpage_error; |
1256 | } | 1256 | } |
1257 | unlock_page(page); | 1257 | unlock_page(page); |
1258 | } | 1258 | } |
1259 | 1259 | ||
1260 | goto page_ok; | 1260 | goto page_ok; |
1261 | 1261 | ||
1262 | readpage_error: | 1262 | readpage_error: |
1263 | /* UHHUH! A synchronous read error occurred. Report it */ | 1263 | /* UHHUH! A synchronous read error occurred. Report it */ |
1264 | desc->error = error; | 1264 | desc->error = error; |
1265 | page_cache_release(page); | 1265 | page_cache_release(page); |
1266 | goto out; | 1266 | goto out; |
1267 | 1267 | ||
1268 | no_cached_page: | 1268 | no_cached_page: |
1269 | /* | 1269 | /* |
1270 | * Ok, it wasn't cached, so we need to create a new | 1270 | * Ok, it wasn't cached, so we need to create a new |
1271 | * page. | 1271 | * page. |
1272 | */ | 1272 | */ |
1273 | page = page_cache_alloc_cold(mapping); | 1273 | page = page_cache_alloc_cold(mapping); |
1274 | if (!page) { | 1274 | if (!page) { |
1275 | desc->error = -ENOMEM; | 1275 | desc->error = -ENOMEM; |
1276 | goto out; | 1276 | goto out; |
1277 | } | 1277 | } |
1278 | error = add_to_page_cache_lru(page, mapping, | 1278 | error = add_to_page_cache_lru(page, mapping, |
1279 | index, GFP_KERNEL); | 1279 | index, GFP_KERNEL); |
1280 | if (error) { | 1280 | if (error) { |
1281 | page_cache_release(page); | 1281 | page_cache_release(page); |
1282 | if (error == -EEXIST) | 1282 | if (error == -EEXIST) |
1283 | goto find_page; | 1283 | goto find_page; |
1284 | desc->error = error; | 1284 | desc->error = error; |
1285 | goto out; | 1285 | goto out; |
1286 | } | 1286 | } |
1287 | goto readpage; | 1287 | goto readpage; |
1288 | } | 1288 | } |
1289 | 1289 | ||
1290 | out: | 1290 | out: |
1291 | ra->prev_pos = prev_index; | 1291 | ra->prev_pos = prev_index; |
1292 | ra->prev_pos <<= PAGE_CACHE_SHIFT; | 1292 | ra->prev_pos <<= PAGE_CACHE_SHIFT; |
1293 | ra->prev_pos |= prev_offset; | 1293 | ra->prev_pos |= prev_offset; |
1294 | 1294 | ||
1295 | *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; | 1295 | *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; |
1296 | file_accessed(filp); | 1296 | file_accessed(filp); |
1297 | } | 1297 | } |
1298 | 1298 | ||
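Every buffered read() funnels through the loop above. A minimal userspace driver for it is just a sequential read in page-sized chunks; a sketch, assuming a 4 KiB page size:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            char buf[4096];         /* one PAGE_CACHE_SIZE chunk per call */
            long chunks = 0;
            ssize_t n;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0)
                    return 1;
            /* each read() runs the find_page/page_ok loop above */
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    chunks++;
            printf("read %ld chunks\n", chunks);
            close(fd);
            return 0;
    }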
1299 | int file_read_actor(read_descriptor_t *desc, struct page *page, | 1299 | int file_read_actor(read_descriptor_t *desc, struct page *page, |
1300 | unsigned long offset, unsigned long size) | 1300 | unsigned long offset, unsigned long size) |
1301 | { | 1301 | { |
1302 | char *kaddr; | 1302 | char *kaddr; |
1303 | unsigned long left, count = desc->count; | 1303 | unsigned long left, count = desc->count; |
1304 | 1304 | ||
1305 | if (size > count) | 1305 | if (size > count) |
1306 | size = count; | 1306 | size = count; |
1307 | 1307 | ||
1308 | /* | 1308 | /* |
1309 | * Faults on the destination of a read are common, so do it before | 1309 | * Faults on the destination of a read are common, so do it before |
1310 | * taking the kmap. | 1310 | * taking the kmap. |
1311 | */ | 1311 | */ |
1312 | if (!fault_in_pages_writeable(desc->arg.buf, size)) { | 1312 | if (!fault_in_pages_writeable(desc->arg.buf, size)) { |
1313 | kaddr = kmap_atomic(page, KM_USER0); | 1313 | kaddr = kmap_atomic(page, KM_USER0); |
1314 | left = __copy_to_user_inatomic(desc->arg.buf, | 1314 | left = __copy_to_user_inatomic(desc->arg.buf, |
1315 | kaddr + offset, size); | 1315 | kaddr + offset, size); |
1316 | kunmap_atomic(kaddr, KM_USER0); | 1316 | kunmap_atomic(kaddr, KM_USER0); |
1317 | if (left == 0) | 1317 | if (left == 0) |
1318 | goto success; | 1318 | goto success; |
1319 | } | 1319 | } |
1320 | 1320 | ||
1321 | /* Do it the slow way */ | 1321 | /* Do it the slow way */ |
1322 | kaddr = kmap(page); | 1322 | kaddr = kmap(page); |
1323 | left = __copy_to_user(desc->arg.buf, kaddr + offset, size); | 1323 | left = __copy_to_user(desc->arg.buf, kaddr + offset, size); |
1324 | kunmap(page); | 1324 | kunmap(page); |
1325 | 1325 | ||
1326 | if (left) { | 1326 | if (left) { |
1327 | size -= left; | 1327 | size -= left; |
1328 | desc->error = -EFAULT; | 1328 | desc->error = -EFAULT; |
1329 | } | 1329 | } |
1330 | success: | 1330 | success: |
1331 | desc->count = count - size; | 1331 | desc->count = count - size; |
1332 | desc->written += size; | 1332 | desc->written += size; |
1333 | desc->arg.buf += size; | 1333 | desc->arg.buf += size; |
1334 | return size; | 1334 | return size; |
1335 | } | 1335 | } |
1336 | 1336 | ||
1337 | /** | 1337 | /** |
1338 | * generic_segment_checks - performs necessary checks before doing a write | 1338 | * generic_segment_checks - performs necessary checks before doing a write |
1339 | * @iov: io vector request | 1339 | * @iov: io vector request |
1340 | * @nr_segs: number of segments in the iovec | 1340 | * @nr_segs: number of segments in the iovec |
1341 | * @count: number of bytes to write | 1341 | * @count: number of bytes to write |
1342 | * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE | 1342 | * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE |
1343 | * | 1343 | * |
1344 | * Adjust number of segments and amount of bytes to write (nr_segs should be | 1344 | * Adjust number of segments and amount of bytes to write (nr_segs should be |
1345 | * properly initialized first). Returns appropriate error code that caller | 1345 | * properly initialized first). Returns appropriate error code that caller |
1346 | * should return or zero in case that write should be allowed. | 1346 | * should return or zero in case that write should be allowed. |
1347 | */ | 1347 | */ |
1348 | int generic_segment_checks(const struct iovec *iov, | 1348 | int generic_segment_checks(const struct iovec *iov, |
1349 | unsigned long *nr_segs, size_t *count, int access_flags) | 1349 | unsigned long *nr_segs, size_t *count, int access_flags) |
1350 | { | 1350 | { |
1351 | unsigned long seg; | 1351 | unsigned long seg; |
1352 | size_t cnt = 0; | 1352 | size_t cnt = 0; |
1353 | for (seg = 0; seg < *nr_segs; seg++) { | 1353 | for (seg = 0; seg < *nr_segs; seg++) { |
1354 | const struct iovec *iv = &iov[seg]; | 1354 | const struct iovec *iv = &iov[seg]; |
1355 | 1355 | ||
1356 | /* | 1356 | /* |
1357 | * If any segment has a negative length, or the cumulative | 1357 | * If any segment has a negative length, or the cumulative |
1358 | * length ever wraps negative then return -EINVAL. | 1358 | * length ever wraps negative then return -EINVAL. |
1359 | */ | 1359 | */ |
1360 | cnt += iv->iov_len; | 1360 | cnt += iv->iov_len; |
1361 | if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) | 1361 | if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) |
1362 | return -EINVAL; | 1362 | return -EINVAL; |
1363 | if (access_ok(access_flags, iv->iov_base, iv->iov_len)) | 1363 | if (access_ok(access_flags, iv->iov_base, iv->iov_len)) |
1364 | continue; | 1364 | continue; |
1365 | if (seg == 0) | 1365 | if (seg == 0) |
1366 | return -EFAULT; | 1366 | return -EFAULT; |
1367 | *nr_segs = seg; | 1367 | *nr_segs = seg; |
1368 | cnt -= iv->iov_len; /* This segment is no good */ | 1368 | cnt -= iv->iov_len; /* This segment is no good */ |
1369 | break; | 1369 | break; |
1370 | } | 1370 | } |
1371 | *count = cnt; | 1371 | *count = cnt; |
1372 | return 0; | 1372 | return 0; |
1373 | } | 1373 | } |
1374 | EXPORT_SYMBOL(generic_segment_checks); | 1374 | EXPORT_SYMBOL(generic_segment_checks); |
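The overflow test above is compact: a single iov_len with its sign bit set makes the cast negative immediately, and because two values below SSIZE_MAX cannot wrap a 64-bit sum in one addition, any cumulative overflow must first surface as a set sign bit in cnt. OR-ing the two and casting to ssize_t therefore catches both cases at once. A self-contained userspace sketch of the same check (segment_lengths_ok is a hypothetical stand-in):

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/uio.h>

    static int segment_lengths_ok(const struct iovec *iov,
                                  unsigned long nr_segs, size_t *total)
    {
            size_t cnt = 0;
            unsigned long seg;

            for (seg = 0; seg < nr_segs; seg++) {
                    cnt += iov[seg].iov_len;
                    /*
                     * A single huge iov_len, or a running total past
                     * SSIZE_MAX, makes the cast negative.
                     */
                    if ((ssize_t)(cnt | iov[seg].iov_len) < 0)
                            return -1;
            }
            *total = cnt;
            return 0;
    }

    int main(void)
    {
            char buf[8];
            struct iovec iov[2] = {
                    { .iov_base = buf, .iov_len = sizeof(buf) },
                    { .iov_base = buf, .iov_len = (size_t)-1 }, /* bogus */
            };
            size_t total;

            printf("%d\n", segment_lengths_ok(iov, 2, &total)); /* -1 */
            return 0;
    }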
1375 | 1375 | ||
1376 | /** | 1376 | /** |
1377 | * generic_file_aio_read - generic filesystem read routine | 1377 | * generic_file_aio_read - generic filesystem read routine |
1378 | * @iocb: kernel I/O control block | 1378 | * @iocb: kernel I/O control block |
1379 | * @iov: io vector request | 1379 | * @iov: io vector request |
1380 | * @nr_segs: number of segments in the iovec | 1380 | * @nr_segs: number of segments in the iovec |
1381 | * @pos: current file position | 1381 | * @pos: current file position |
1382 | * | 1382 | * |
1383 | * This is the "read()" routine for all filesystems | 1383 | * This is the "read()" routine for all filesystems |
1384 | * that can use the page cache directly. | 1384 | * that can use the page cache directly. |
1385 | */ | 1385 | */ |
1386 | ssize_t | 1386 | ssize_t |
1387 | generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | 1387 | generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, |
1388 | unsigned long nr_segs, loff_t pos) | 1388 | unsigned long nr_segs, loff_t pos) |
1389 | { | 1389 | { |
1390 | struct file *filp = iocb->ki_filp; | 1390 | struct file *filp = iocb->ki_filp; |
1391 | ssize_t retval; | 1391 | ssize_t retval; |
1392 | unsigned long seg = 0; | 1392 | unsigned long seg = 0; |
1393 | size_t count; | 1393 | size_t count; |
1394 | loff_t *ppos = &iocb->ki_pos; | 1394 | loff_t *ppos = &iocb->ki_pos; |
1395 | struct blk_plug plug; | 1395 | struct blk_plug plug; |
1396 | 1396 | ||
1397 | count = 0; | 1397 | count = 0; |
1398 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); | 1398 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); |
1399 | if (retval) | 1399 | if (retval) |
1400 | return retval; | 1400 | return retval; |
1401 | 1401 | ||
1402 | blk_start_plug(&plug); | 1402 | blk_start_plug(&plug); |
1403 | 1403 | ||
1404 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 1404 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
1405 | if (filp->f_flags & O_DIRECT) { | 1405 | if (filp->f_flags & O_DIRECT) { |
1406 | loff_t size; | 1406 | loff_t size; |
1407 | struct address_space *mapping; | 1407 | struct address_space *mapping; |
1408 | struct inode *inode; | 1408 | struct inode *inode; |
1409 | 1409 | ||
1410 | mapping = filp->f_mapping; | 1410 | mapping = filp->f_mapping; |
1411 | inode = mapping->host; | 1411 | inode = mapping->host; |
1412 | if (!count) | 1412 | if (!count) |
1413 | goto out; /* skip atime */ | 1413 | goto out; /* skip atime */ |
1414 | size = i_size_read(inode); | 1414 | size = i_size_read(inode); |
1415 | if (pos < size) { | 1415 | if (pos < size) { |
1416 | retval = filemap_write_and_wait_range(mapping, pos, | 1416 | retval = filemap_write_and_wait_range(mapping, pos, |
1417 | pos + iov_length(iov, nr_segs) - 1); | 1417 | pos + iov_length(iov, nr_segs) - 1); |
1418 | if (!retval) { | 1418 | if (!retval) { |
1419 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1419 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1420 | iov, pos, nr_segs); | 1420 | iov, pos, nr_segs); |
1421 | } | 1421 | } |
1422 | if (retval > 0) { | 1422 | if (retval > 0) { |
1423 | *ppos = pos + retval; | 1423 | *ppos = pos + retval; |
1424 | count -= retval; | 1424 | count -= retval; |
1425 | } | 1425 | } |
1426 | 1426 | ||
1427 | /* | 1427 | /* |
1428 | * Btrfs can have a short DIO read if we encounter | 1428 | * Btrfs can have a short DIO read if we encounter |
1429 | * compressed extents, so if there was an error, or if | 1429 | * compressed extents, so if there was an error, or if |
1430 | * we've already read everything we wanted to, or if | 1430 | * we've already read everything we wanted to, or if |
1431 | * there was a short read because we hit EOF, go ahead | 1431 | * there was a short read because we hit EOF, go ahead |
1432 | * and return. Otherwise fallthrough to buffered io for | 1432 | * and return. Otherwise fallthrough to buffered io for |
1433 | * the rest of the read. | 1433 | * the rest of the read. |
1434 | */ | 1434 | */ |
1435 | if (retval < 0 || !count || *ppos >= size) { | 1435 | if (retval < 0 || !count || *ppos >= size) { |
1436 | file_accessed(filp); | 1436 | file_accessed(filp); |
1437 | goto out; | 1437 | goto out; |
1438 | } | 1438 | } |
1439 | } | 1439 | } |
1440 | } | 1440 | } |
1441 | 1441 | ||
1442 | count = retval; | 1442 | count = retval; |
1443 | for (seg = 0; seg < nr_segs; seg++) { | 1443 | for (seg = 0; seg < nr_segs; seg++) { |
1444 | read_descriptor_t desc; | 1444 | read_descriptor_t desc; |
1445 | loff_t offset = 0; | 1445 | loff_t offset = 0; |
1446 | 1446 | ||
1447 | /* | 1447 | /* |
1448 | * If we did a short DIO read we need to skip the section of the | 1448 | * If we did a short DIO read we need to skip the section of the |
1449 | * iov that we've already read data into. | 1449 | * iov that we've already read data into. |
1450 | */ | 1450 | */ |
1451 | if (count) { | 1451 | if (count) { |
1452 | if (count > iov[seg].iov_len) { | 1452 | if (count > iov[seg].iov_len) { |
1453 | count -= iov[seg].iov_len; | 1453 | count -= iov[seg].iov_len; |
1454 | continue; | 1454 | continue; |
1455 | } | 1455 | } |
1456 | offset = count; | 1456 | offset = count; |
1457 | count = 0; | 1457 | count = 0; |
1458 | } | 1458 | } |
1459 | 1459 | ||
1460 | desc.written = 0; | 1460 | desc.written = 0; |
1461 | desc.arg.buf = iov[seg].iov_base + offset; | 1461 | desc.arg.buf = iov[seg].iov_base + offset; |
1462 | desc.count = iov[seg].iov_len - offset; | 1462 | desc.count = iov[seg].iov_len - offset; |
1463 | if (desc.count == 0) | 1463 | if (desc.count == 0) |
1464 | continue; | 1464 | continue; |
1465 | desc.error = 0; | 1465 | desc.error = 0; |
1466 | do_generic_file_read(filp, ppos, &desc, file_read_actor); | 1466 | do_generic_file_read(filp, ppos, &desc, file_read_actor); |
1467 | retval += desc.written; | 1467 | retval += desc.written; |
1468 | if (desc.error) { | 1468 | if (desc.error) { |
1469 | retval = retval ?: desc.error; | 1469 | retval = retval ?: desc.error; |
1470 | break; | 1470 | break; |
1471 | } | 1471 | } |
1472 | if (desc.count > 0) | 1472 | if (desc.count > 0) |
1473 | break; | 1473 | break; |
1474 | } | 1474 | } |
1475 | out: | 1475 | out: |
1476 | blk_finish_plug(&plug); | 1476 | blk_finish_plug(&plug); |
1477 | return retval; | 1477 | return retval; |
1478 | } | 1478 | } |
1479 | EXPORT_SYMBOL(generic_file_aio_read); | 1479 | EXPORT_SYMBOL(generic_file_aio_read); |
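From userspace, the direct-to-BIO branch above is selected purely by opening with O_DIRECT: the kernel first flushes and waits on any dirty page cache over the range, then hands the iovec to ->direct_IO. A sketch; O_DIRECT alignment rules vary by filesystem and device, so a 4096-byte alignment and transfer size are assumed here as a common safe choice:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            void *buf;
            ssize_t n;
            int fd;

            if (argc < 2)
                    return 1;
            /* O_DIRECT wants an aligned buffer; 4096 covers most devices */
            if (posix_memalign(&buf, 4096, 4096))
                    return 1;
            fd = open(argv[1], O_RDONLY | O_DIRECT);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            n = pread(fd, buf, 4096, 0); /* served by ->direct_IO(READ, ...) */
            printf("read %zd bytes, page cache bypassed\n", n);
            close(fd);
            free(buf);
            return 0;
    }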
1480 | 1480 | ||
1481 | static ssize_t | 1481 | static ssize_t |
1482 | do_readahead(struct address_space *mapping, struct file *filp, | 1482 | do_readahead(struct address_space *mapping, struct file *filp, |
1483 | pgoff_t index, unsigned long nr) | 1483 | pgoff_t index, unsigned long nr) |
1484 | { | 1484 | { |
1485 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 1485 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) |
1486 | return -EINVAL; | 1486 | return -EINVAL; |
1487 | 1487 | ||
1488 | force_page_cache_readahead(mapping, filp, index, nr); | 1488 | force_page_cache_readahead(mapping, filp, index, nr); |
1489 | return 0; | 1489 | return 0; |
1490 | } | 1490 | } |
1491 | 1491 | ||
1492 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) | 1492 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) |
1493 | { | 1493 | { |
1494 | ssize_t ret; | 1494 | ssize_t ret; |
1495 | struct file *file; | 1495 | struct file *file; |
1496 | 1496 | ||
1497 | ret = -EBADF; | 1497 | ret = -EBADF; |
1498 | file = fget(fd); | 1498 | file = fget(fd); |
1499 | if (file) { | 1499 | if (file) { |
1500 | if (file->f_mode & FMODE_READ) { | 1500 | if (file->f_mode & FMODE_READ) { |
1501 | struct address_space *mapping = file->f_mapping; | 1501 | struct address_space *mapping = file->f_mapping; |
1502 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; | 1502 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; |
1503 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | 1503 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; |
1504 | unsigned long len = end - start + 1; | 1504 | unsigned long len = end - start + 1; |
1505 | ret = do_readahead(mapping, file, start, len); | 1505 | ret = do_readahead(mapping, file, start, len); |
1506 | } | 1506 | } |
1507 | fput(file); | 1507 | fput(file); |
1508 | } | 1508 | } |
1509 | return ret; | 1509 | return ret; |
1510 | } | 1510 | } |
1511 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | 1511 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS |
1512 | asmlinkage long SyS_readahead(long fd, loff_t offset, long count) | 1512 | asmlinkage long SyS_readahead(long fd, loff_t offset, long count) |
1513 | { | 1513 | { |
1514 | return SYSC_readahead((int) fd, offset, (size_t) count); | 1514 | return SYSC_readahead((int) fd, offset, (size_t) count); |
1515 | } | 1515 | } |
1516 | SYSCALL_ALIAS(sys_readahead, SyS_readahead); | 1516 | SYSCALL_ALIAS(sys_readahead, SyS_readahead); |
1517 | #endif | 1517 | #endif |
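The syscall defined above is exposed by glibc as readahead(2). It populates the page cache for the page range covering [offset, offset + count) and returns without waiting for the data. A sketch that prefetches the first 1 MiB of a file:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* warm the page cache for the first 1 MiB before reading it */
            if (readahead(fd, 0, 1 << 20) < 0)
                    perror("readahead");
            close(fd);
            return 0;
    }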
1518 | 1518 | ||
1519 | #ifdef CONFIG_MMU | 1519 | #ifdef CONFIG_MMU |
1520 | /** | 1520 | /** |
1521 | * page_cache_read - adds requested page to the page cache if not already there | 1521 | * page_cache_read - adds requested page to the page cache if not already there |
1522 | * @file: file to read | 1522 | * @file: file to read |
1523 | * @offset: page index | 1523 | * @offset: page index |
1524 | * | 1524 | * |
1525 | * This adds the requested page to the page cache if it isn't already there, | 1525 | * This adds the requested page to the page cache if it isn't already there, |
1526 | * and schedules an I/O to read in its contents from disk. | 1526 | * and schedules an I/O to read in its contents from disk. |
1527 | */ | 1527 | */ |
1528 | static int page_cache_read(struct file *file, pgoff_t offset) | 1528 | static int page_cache_read(struct file *file, pgoff_t offset) |
1529 | { | 1529 | { |
1530 | struct address_space *mapping = file->f_mapping; | 1530 | struct address_space *mapping = file->f_mapping; |
1531 | struct page *page; | 1531 | struct page *page; |
1532 | int ret; | 1532 | int ret; |
1533 | 1533 | ||
1534 | do { | 1534 | do { |
1535 | page = page_cache_alloc_cold(mapping); | 1535 | page = page_cache_alloc_cold(mapping); |
1536 | if (!page) | 1536 | if (!page) |
1537 | return -ENOMEM; | 1537 | return -ENOMEM; |
1538 | 1538 | ||
1539 | ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); | 1539 | ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); |
1540 | if (ret == 0) | 1540 | if (ret == 0) |
1541 | ret = mapping->a_ops->readpage(file, page); | 1541 | ret = mapping->a_ops->readpage(file, page); |
1542 | else if (ret == -EEXIST) | 1542 | else if (ret == -EEXIST) |
1543 | ret = 0; /* losing race to add is OK */ | 1543 | ret = 0; /* losing race to add is OK */ |
1544 | 1544 | ||
1545 | page_cache_release(page); | 1545 | page_cache_release(page); |
1546 | 1546 | ||
1547 | } while (ret == AOP_TRUNCATED_PAGE); | 1547 | } while (ret == AOP_TRUNCATED_PAGE); |
1548 | 1548 | ||
1549 | return ret; | 1549 | return ret; |
1550 | } | 1550 | } |
1551 | 1551 | ||
1552 | #define MMAP_LOTSAMISS (100) | 1552 | #define MMAP_LOTSAMISS (100) |
1553 | 1553 | ||
1554 | /* | 1554 | /* |
1555 | * Synchronous readahead happens when we don't even find | 1555 | * Synchronous readahead happens when we don't even find |
1556 | * a page in the page cache at all. | 1556 | * a page in the page cache at all. |
1557 | */ | 1557 | */ |
1558 | static void do_sync_mmap_readahead(struct vm_area_struct *vma, | 1558 | static void do_sync_mmap_readahead(struct vm_area_struct *vma, |
1559 | struct file_ra_state *ra, | 1559 | struct file_ra_state *ra, |
1560 | struct file *file, | 1560 | struct file *file, |
1561 | pgoff_t offset) | 1561 | pgoff_t offset) |
1562 | { | 1562 | { |
1563 | unsigned long ra_pages; | 1563 | unsigned long ra_pages; |
1564 | struct address_space *mapping = file->f_mapping; | 1564 | struct address_space *mapping = file->f_mapping; |
1565 | 1565 | ||
1566 | /* If we don't want any read-ahead, don't bother */ | 1566 | /* If we don't want any read-ahead, don't bother */ |
1567 | if (VM_RandomReadHint(vma)) | 1567 | if (VM_RandomReadHint(vma)) |
1568 | return; | 1568 | return; |
1569 | if (!ra->ra_pages) | 1569 | if (!ra->ra_pages) |
1570 | return; | 1570 | return; |
1571 | 1571 | ||
1572 | if (VM_SequentialReadHint(vma)) { | 1572 | if (VM_SequentialReadHint(vma)) { |
1573 | page_cache_sync_readahead(mapping, ra, file, offset, | 1573 | page_cache_sync_readahead(mapping, ra, file, offset, |
1574 | ra->ra_pages); | 1574 | ra->ra_pages); |
1575 | return; | 1575 | return; |
1576 | } | 1576 | } |
1577 | 1577 | ||
1578 | /* Avoid banging the cache line if not needed */ | 1578 | /* Avoid banging the cache line if not needed */ |
1579 | if (ra->mmap_miss < MMAP_LOTSAMISS * 10) | 1579 | if (ra->mmap_miss < MMAP_LOTSAMISS * 10) |
1580 | ra->mmap_miss++; | 1580 | ra->mmap_miss++; |
1581 | 1581 | ||
1582 | /* | 1582 | /* |
1583 | * Do we miss much more than hit in this file? If so, | 1583 | * Do we miss much more than hit in this file? If so, |
1584 | * stop bothering with read-ahead. It will only hurt. | 1584 | * stop bothering with read-ahead. It will only hurt. |
1585 | */ | 1585 | */ |
1586 | if (ra->mmap_miss > MMAP_LOTSAMISS) | 1586 | if (ra->mmap_miss > MMAP_LOTSAMISS) |
1587 | return; | 1587 | return; |
1588 | 1588 | ||
1589 | /* | 1589 | /* |
1590 | * mmap read-around | 1590 | * mmap read-around |
1591 | */ | 1591 | */ |
1592 | ra_pages = max_sane_readahead(ra->ra_pages); | 1592 | ra_pages = max_sane_readahead(ra->ra_pages); |
1593 | ra->start = max_t(long, 0, offset - ra_pages / 2); | 1593 | ra->start = max_t(long, 0, offset - ra_pages / 2); |
1594 | ra->size = ra_pages; | 1594 | ra->size = ra_pages; |
1595 | ra->async_size = ra_pages / 4; | 1595 | ra->async_size = ra_pages / 4; |
1596 | ra_submit(ra, mapping, file); | 1596 | ra_submit(ra, mapping, file); |
1597 | } | 1597 | } |
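The VM_SequentialReadHint() and VM_RandomReadHint() tests above are driven from userspace via madvise(2): MADV_SEQUENTIAL buys the full-window synchronous readahead taken in the first branch, while MADV_RANDOM suppresses readahead altogether. A sketch, assuming argv[1] names a non-empty regular file:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            struct stat st;
            long sum = 0;
            char *map;
            off_t i;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
                    return 1;
            map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
            if (map == MAP_FAILED)
                    return 1;

            /* sets the hint tested by VM_SequentialReadHint() above */
            madvise(map, st.st_size, MADV_SEQUENTIAL);

            for (i = 0; i < st.st_size; i++) /* faults stream via readahead */
                    sum += map[i];
            printf("checksum %ld\n", sum);
            munmap(map, st.st_size);
            close(fd);
            return 0;
    }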
1598 | 1598 | ||
1599 | /* | 1599 | /* |
1600 | * Asynchronous readahead happens when we find the page and PG_readahead, | 1600 | * Asynchronous readahead happens when we find the page and PG_readahead, |
1601 | * so we want to possibly extend the readahead further. | 1601 | * so we want to possibly extend the readahead further. |
1602 | */ | 1602 | */ |
1603 | static void do_async_mmap_readahead(struct vm_area_struct *vma, | 1603 | static void do_async_mmap_readahead(struct vm_area_struct *vma, |
1604 | struct file_ra_state *ra, | 1604 | struct file_ra_state *ra, |
1605 | struct file *file, | 1605 | struct file *file, |
1606 | struct page *page, | 1606 | struct page *page, |
1607 | pgoff_t offset) | 1607 | pgoff_t offset) |
1608 | { | 1608 | { |
1609 | struct address_space *mapping = file->f_mapping; | 1609 | struct address_space *mapping = file->f_mapping; |
1610 | 1610 | ||
1611 | /* If we don't want any read-ahead, don't bother */ | 1611 | /* If we don't want any read-ahead, don't bother */ |
1612 | if (VM_RandomReadHint(vma)) | 1612 | if (VM_RandomReadHint(vma)) |
1613 | return; | 1613 | return; |
1614 | if (ra->mmap_miss > 0) | 1614 | if (ra->mmap_miss > 0) |
1615 | ra->mmap_miss--; | 1615 | ra->mmap_miss--; |
1616 | if (PageReadahead(page)) | 1616 | if (PageReadahead(page)) |
1617 | page_cache_async_readahead(mapping, ra, file, | 1617 | page_cache_async_readahead(mapping, ra, file, |
1618 | page, offset, ra->ra_pages); | 1618 | page, offset, ra->ra_pages); |
1619 | } | 1619 | } |
1620 | 1620 | ||
1621 | /** | 1621 | /** |
1622 | * filemap_fault - read in file data for page fault handling | 1622 | * filemap_fault - read in file data for page fault handling |
1623 | * @vma: vma in which the fault was taken | 1623 | * @vma: vma in which the fault was taken |
1624 | * @vmf: struct vm_fault containing details of the fault | 1624 | * @vmf: struct vm_fault containing details of the fault |
1625 | * | 1625 | * |
1626 | * filemap_fault() is invoked via the vma operations vector for a | 1626 | * filemap_fault() is invoked via the vma operations vector for a |
1627 | * mapped memory region to read in file data during a page fault. | 1627 | * mapped memory region to read in file data during a page fault. |
1628 | * | 1628 | * |
1629 | * The gotos are kind of ugly, but this streamlines the normal case of having | 1629 | * The gotos are kind of ugly, but this streamlines the normal case of having |
1630 | * it in the page cache, and handles the special cases reasonably without | 1630 | * it in the page cache, and handles the special cases reasonably without |
1631 | * having a lot of duplicated code. | 1631 | * having a lot of duplicated code. |
1632 | */ | 1632 | */ |
1633 | int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 1633 | int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
1634 | { | 1634 | { |
1635 | int error; | 1635 | int error; |
1636 | struct file *file = vma->vm_file; | 1636 | struct file *file = vma->vm_file; |
1637 | struct address_space *mapping = file->f_mapping; | 1637 | struct address_space *mapping = file->f_mapping; |
1638 | struct file_ra_state *ra = &file->f_ra; | 1638 | struct file_ra_state *ra = &file->f_ra; |
1639 | struct inode *inode = mapping->host; | 1639 | struct inode *inode = mapping->host; |
1640 | pgoff_t offset = vmf->pgoff; | 1640 | pgoff_t offset = vmf->pgoff; |
1641 | struct page *page; | 1641 | struct page *page; |
1642 | pgoff_t size; | 1642 | pgoff_t size; |
1643 | int ret = 0; | 1643 | int ret = 0; |
1644 | 1644 | ||
1645 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1645 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
1646 | if (offset >= size) | 1646 | if (offset >= size) |
1647 | return VM_FAULT_SIGBUS; | 1647 | return VM_FAULT_SIGBUS; |
1648 | 1648 | ||
1649 | /* | 1649 | /* |
1650 | * Do we have something in the page cache already? | 1650 | * Do we have something in the page cache already? |
1651 | */ | 1651 | */ |
1652 | page = find_get_page(mapping, offset); | 1652 | page = find_get_page(mapping, offset); |
1653 | if (likely(page)) { | 1653 | if (likely(page)) { |
1654 | /* | 1654 | /* |
1655 | * We found the page, so try async readahead before | 1655 | * We found the page, so try async readahead before |
1656 | * waiting for the lock. | 1656 | * waiting for the lock. |
1657 | */ | 1657 | */ |
1658 | do_async_mmap_readahead(vma, ra, file, page, offset); | 1658 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1659 | } else { | 1659 | } else { |
1660 | /* No page in the page cache at all */ | 1660 | /* No page in the page cache at all */ |
1661 | do_sync_mmap_readahead(vma, ra, file, offset); | 1661 | do_sync_mmap_readahead(vma, ra, file, offset); |
1662 | count_vm_event(PGMAJFAULT); | 1662 | count_vm_event(PGMAJFAULT); |
1663 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1663 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
1664 | ret = VM_FAULT_MAJOR; | 1664 | ret = VM_FAULT_MAJOR; |
1665 | retry_find: | 1665 | retry_find: |
1666 | page = find_get_page(mapping, offset); | 1666 | page = find_get_page(mapping, offset); |
1667 | if (!page) | 1667 | if (!page) |
1668 | goto no_cached_page; | 1668 | goto no_cached_page; |
1669 | } | 1669 | } |
1670 | 1670 | ||
1671 | if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { | 1671 | if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { |
1672 | page_cache_release(page); | 1672 | page_cache_release(page); |
1673 | return ret | VM_FAULT_RETRY; | 1673 | return ret | VM_FAULT_RETRY; |
1674 | } | 1674 | } |
1675 | 1675 | ||
1676 | /* Did it get truncated? */ | 1676 | /* Did it get truncated? */ |
1677 | if (unlikely(page->mapping != mapping)) { | 1677 | if (unlikely(page->mapping != mapping)) { |
1678 | unlock_page(page); | 1678 | unlock_page(page); |
1679 | put_page(page); | 1679 | put_page(page); |
1680 | goto retry_find; | 1680 | goto retry_find; |
1681 | } | 1681 | } |
1682 | VM_BUG_ON(page->index != offset); | 1682 | VM_BUG_ON(page->index != offset); |
1683 | 1683 | ||
1684 | /* | 1684 | /* |
1685 | * We have a locked page in the page cache, now we need to check | 1685 | * We have a locked page in the page cache, now we need to check |
1686 | * that it's up-to-date. If not, it is going to be due to an error. | 1686 | * that it's up-to-date. If not, it is going to be due to an error. |
1687 | */ | 1687 | */ |
1688 | if (unlikely(!PageUptodate(page))) | 1688 | if (unlikely(!PageUptodate(page))) |
1689 | goto page_not_uptodate; | 1689 | goto page_not_uptodate; |
1690 | 1690 | ||
1691 | /* | 1691 | /* |
1692 | * Found the page and have a reference on it. | 1692 | * Found the page and have a reference on it. |
1693 | * We must recheck i_size under page lock. | 1693 | * We must recheck i_size under page lock. |
1694 | */ | 1694 | */ |
1695 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1695 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
1696 | if (unlikely(offset >= size)) { | 1696 | if (unlikely(offset >= size)) { |
1697 | unlock_page(page); | 1697 | unlock_page(page); |
1698 | page_cache_release(page); | 1698 | page_cache_release(page); |
1699 | return VM_FAULT_SIGBUS; | 1699 | return VM_FAULT_SIGBUS; |
1700 | } | 1700 | } |
1701 | 1701 | ||
1702 | vmf->page = page; | 1702 | vmf->page = page; |
1703 | return ret | VM_FAULT_LOCKED; | 1703 | return ret | VM_FAULT_LOCKED; |
1704 | 1704 | ||
1705 | no_cached_page: | 1705 | no_cached_page: |
1706 | /* | 1706 | /* |
1707 | * We're only likely to ever get here if MADV_RANDOM is in | 1707 | * We're only likely to ever get here if MADV_RANDOM is in |
1708 | * effect. | 1708 | * effect. |
1709 | */ | 1709 | */ |
1710 | error = page_cache_read(file, offset); | 1710 | error = page_cache_read(file, offset); |
1711 | 1711 | ||
1712 | /* | 1712 | /* |
1713 | * The page we want has now been added to the page cache. | 1713 | * The page we want has now been added to the page cache. |
1714 | * In the unlikely event that someone removed it in the | 1714 | * In the unlikely event that someone removed it in the |
1715 | * meantime, we'll just come back here and read it again. | 1715 | * meantime, we'll just come back here and read it again. |
1716 | */ | 1716 | */ |
1717 | if (error >= 0) | 1717 | if (error >= 0) |
1718 | goto retry_find; | 1718 | goto retry_find; |
1719 | 1719 | ||
1720 | /* | 1720 | /* |
1721 | * An error return from page_cache_read can result if the | 1721 | * An error return from page_cache_read can result if the |
1722 | * system is low on memory, or a problem occurs while trying | 1722 | * system is low on memory, or a problem occurs while trying |
1723 | * to schedule I/O. | 1723 | * to schedule I/O. |
1724 | */ | 1724 | */ |
1725 | if (error == -ENOMEM) | 1725 | if (error == -ENOMEM) |
1726 | return VM_FAULT_OOM; | 1726 | return VM_FAULT_OOM; |
1727 | return VM_FAULT_SIGBUS; | 1727 | return VM_FAULT_SIGBUS; |
1728 | 1728 | ||
1729 | page_not_uptodate: | 1729 | page_not_uptodate: |
1730 | /* | 1730 | /* |
1731 | * Umm, take care of errors if the page isn't up-to-date. | 1731 | * Umm, take care of errors if the page isn't up-to-date. |
1732 | * Try to re-read it _once_. We do this synchronously, | 1732 | * Try to re-read it _once_. We do this synchronously, |
1733 | * because there really aren't any performance issues here | 1733 | * because there really aren't any performance issues here |
1734 | * and we need to check for errors. | 1734 | * and we need to check for errors. |
1735 | */ | 1735 | */ |
1736 | ClearPageError(page); | 1736 | ClearPageError(page); |
1737 | error = mapping->a_ops->readpage(file, page); | 1737 | error = mapping->a_ops->readpage(file, page); |
1738 | if (!error) { | 1738 | if (!error) { |
1739 | wait_on_page_locked(page); | 1739 | wait_on_page_locked(page); |
1740 | if (!PageUptodate(page)) | 1740 | if (!PageUptodate(page)) |
1741 | error = -EIO; | 1741 | error = -EIO; |
1742 | } | 1742 | } |
1743 | page_cache_release(page); | 1743 | page_cache_release(page); |
1744 | 1744 | ||
1745 | if (!error || error == AOP_TRUNCATED_PAGE) | 1745 | if (!error || error == AOP_TRUNCATED_PAGE) |
1746 | goto retry_find; | 1746 | goto retry_find; |
1747 | 1747 | ||
1748 | /* Things didn't work out. Return VM_FAULT_SIGBUS to tell the mm layer so. */ | 1748 | /* Things didn't work out. Return VM_FAULT_SIGBUS to tell the mm layer so. */ |
1749 | shrink_readahead_size_eio(file, ra); | 1749 | shrink_readahead_size_eio(file, ra); |
1750 | return VM_FAULT_SIGBUS; | 1750 | return VM_FAULT_SIGBUS; |
1751 | } | 1751 | } |
1752 | EXPORT_SYMBOL(filemap_fault); | 1752 | EXPORT_SYMBOL(filemap_fault); |
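Both `offset >= size` checks above surface in userspace as SIGBUS: a mapping may extend past EOF, but touching a page that lies wholly beyond i_size makes filemap_fault() return VM_FAULT_SIGBUS. A sketch that provokes it with an empty temporary file:

    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void on_sigbus(int sig)
    {
            (void)sig;
            write(STDOUT_FILENO, "SIGBUS past EOF\n", 16);
            _exit(0);
    }

    int main(void)
    {
            char path[] = "/tmp/faultXXXXXX";
            int fd = mkstemp(path);      /* empty file: i_size == 0 */
            char *map;

            if (fd < 0)
                    return 1;
            unlink(path);
            signal(SIGBUS, on_sigbus);

            map = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
            if (map == MAP_FAILED)
                    return 1;
            /*
             * offset 0 is already >= the (zero) file size, so the fault
             * handler returns VM_FAULT_SIGBUS and this load raises SIGBUS.
             */
            printf("%d\n", map[0]);
            return 1;
    }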
1753 | 1753 | ||
1754 | const struct vm_operations_struct generic_file_vm_ops = { | 1754 | const struct vm_operations_struct generic_file_vm_ops = { |
1755 | .fault = filemap_fault, | 1755 | .fault = filemap_fault, |
1756 | }; | 1756 | }; |
1757 | 1757 | ||
1758 | /* This is used for a general mmap of a disk file */ | 1758 | /* This is used for a general mmap of a disk file */ |
1759 | 1759 | ||
1760 | int generic_file_mmap(struct file * file, struct vm_area_struct * vma) | 1760 | int generic_file_mmap(struct file * file, struct vm_area_struct * vma) |
1761 | { | 1761 | { |
1762 | struct address_space *mapping = file->f_mapping; | 1762 | struct address_space *mapping = file->f_mapping; |
1763 | 1763 | ||
1764 | if (!mapping->a_ops->readpage) | 1764 | if (!mapping->a_ops->readpage) |
1765 | return -ENOEXEC; | 1765 | return -ENOEXEC; |
1766 | file_accessed(file); | 1766 | file_accessed(file); |
1767 | vma->vm_ops = &generic_file_vm_ops; | 1767 | vma->vm_ops = &generic_file_vm_ops; |
1768 | vma->vm_flags |= VM_CAN_NONLINEAR; | 1768 | vma->vm_flags |= VM_CAN_NONLINEAR; |
1769 | return 0; | 1769 | return 0; |
1770 | } | 1770 | } |
1771 | 1771 | ||
1772 | /* | 1772 | /* |
1773 | * This is for filesystems which do not implement ->writepage. | 1773 | * This is for filesystems which do not implement ->writepage. |
1774 | */ | 1774 | */ |
1775 | int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) | 1775 | int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) |
1776 | { | 1776 | { |
1777 | if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) | 1777 | if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) |
1778 | return -EINVAL; | 1778 | return -EINVAL; |
1779 | return generic_file_mmap(file, vma); | 1779 | return generic_file_mmap(file, vma); |
1780 | } | 1780 | } |
1781 | #else | 1781 | #else |
1782 | int generic_file_mmap(struct file * file, struct vm_area_struct * vma) | 1782 | int generic_file_mmap(struct file * file, struct vm_area_struct * vma) |
1783 | { | 1783 | { |
1784 | return -ENOSYS; | 1784 | return -ENOSYS; |
1785 | } | 1785 | } |
1786 | int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) | 1786 | int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) |
1787 | { | 1787 | { |
1788 | return -ENOSYS; | 1788 | return -ENOSYS; |
1789 | } | 1789 | } |
1790 | #endif /* CONFIG_MMU */ | 1790 | #endif /* CONFIG_MMU */ |
1791 | 1791 | ||
1792 | EXPORT_SYMBOL(generic_file_mmap); | 1792 | EXPORT_SYMBOL(generic_file_mmap); |
1793 | EXPORT_SYMBOL(generic_file_readonly_mmap); | 1793 | EXPORT_SYMBOL(generic_file_readonly_mmap); |
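The VM_SHARED/VM_MAYWRITE test in generic_file_readonly_mmap() is visible from userspace as a failed mmap(2): on a filesystem whose ->mmap is that helper (an assumption; on ordinary disk filesystems the call succeeds), a writable shared mapping is refused with EINVAL while private and read-only shared mappings still work. A sketch:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            void *p;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDWR);
            if (fd < 0)
                    return 1;
            /* VM_SHARED && VM_MAYWRITE: refused when ->writepage is absent */
            p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED)
                    perror("mmap"); /* EINVAL on such filesystems */
            close(fd);
            return 0;
    }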
1794 | 1794 | ||
1795 | static struct page *__read_cache_page(struct address_space *mapping, | 1795 | static struct page *__read_cache_page(struct address_space *mapping, |
1796 | pgoff_t index, | 1796 | pgoff_t index, |
1797 | int (*filler)(void *, struct page *), | 1797 | int (*filler)(void *, struct page *), |
1798 | void *data, | 1798 | void *data, |
1799 | gfp_t gfp) | 1799 | gfp_t gfp) |
1800 | { | 1800 | { |
1801 | struct page *page; | 1801 | struct page *page; |
1802 | int err; | 1802 | int err; |
1803 | repeat: | 1803 | repeat: |
1804 | page = find_get_page(mapping, index); | 1804 | page = find_get_page(mapping, index); |
1805 | if (!page) { | 1805 | if (!page) { |
1806 | page = __page_cache_alloc(gfp | __GFP_COLD); | 1806 | page = __page_cache_alloc(gfp | __GFP_COLD); |
1807 | if (!page) | 1807 | if (!page) |
1808 | return ERR_PTR(-ENOMEM); | 1808 | return ERR_PTR(-ENOMEM); |
1809 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); | 1809 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
1810 | if (unlikely(err)) { | 1810 | if (unlikely(err)) { |
1811 | page_cache_release(page); | 1811 | page_cache_release(page); |
1812 | if (err == -EEXIST) | 1812 | if (err == -EEXIST) |
1813 | goto repeat; | 1813 | goto repeat; |
1814 | /* Presumably ENOMEM for radix tree node */ | 1814 | /* Presumably ENOMEM for radix tree node */ |
1815 | return ERR_PTR(err); | 1815 | return ERR_PTR(err); |
1816 | } | 1816 | } |
1817 | err = filler(data, page); | 1817 | err = filler(data, page); |
1818 | if (err < 0) { | 1818 | if (err < 0) { |
1819 | page_cache_release(page); | 1819 | page_cache_release(page); |
1820 | page = ERR_PTR(err); | 1820 | page = ERR_PTR(err); |
1821 | } | 1821 | } |
1822 | } | 1822 | } |
1823 | return page; | 1823 | return page; |
1824 | } | 1824 | } |
1825 | 1825 | ||
1826 | static struct page *do_read_cache_page(struct address_space *mapping, | 1826 | static struct page *do_read_cache_page(struct address_space *mapping, |
1827 | pgoff_t index, | 1827 | pgoff_t index, |
1828 | int (*filler)(void *, struct page *), | 1828 | int (*filler)(void *, struct page *), |
1829 | void *data, | 1829 | void *data, |
1830 | gfp_t gfp) | 1830 | gfp_t gfp) |
1831 | 1831 | ||
1832 | { | 1832 | { |
1833 | struct page *page; | 1833 | struct page *page; |
1834 | int err; | 1834 | int err; |
1835 | 1835 | ||
1836 | retry: | 1836 | retry: |
1837 | page = __read_cache_page(mapping, index, filler, data, gfp); | 1837 | page = __read_cache_page(mapping, index, filler, data, gfp); |
1838 | if (IS_ERR(page)) | 1838 | if (IS_ERR(page)) |
1839 | return page; | 1839 | return page; |
1840 | if (PageUptodate(page)) | 1840 | if (PageUptodate(page)) |
1841 | goto out; | 1841 | goto out; |
1842 | 1842 | ||
1843 | lock_page(page); | 1843 | lock_page(page); |
1844 | if (!page->mapping) { | 1844 | if (!page->mapping) { |
1845 | unlock_page(page); | 1845 | unlock_page(page); |
1846 | page_cache_release(page); | 1846 | page_cache_release(page); |
1847 | goto retry; | 1847 | goto retry; |
1848 | } | 1848 | } |
1849 | if (PageUptodate(page)) { | 1849 | if (PageUptodate(page)) { |
1850 | unlock_page(page); | 1850 | unlock_page(page); |
1851 | goto out; | 1851 | goto out; |
1852 | } | 1852 | } |
1853 | err = filler(data, page); | 1853 | err = filler(data, page); |
1854 | if (err < 0) { | 1854 | if (err < 0) { |
1855 | page_cache_release(page); | 1855 | page_cache_release(page); |
1856 | return ERR_PTR(err); | 1856 | return ERR_PTR(err); |
1857 | } | 1857 | } |
1858 | out: | 1858 | out: |
1859 | mark_page_accessed(page); | 1859 | mark_page_accessed(page); |
1860 | return page; | 1860 | return page; |
1861 | } | 1861 | } |
1862 | 1862 | ||
1863 | /** | 1863 | /** |
1864 | * read_cache_page_async - read into page cache, fill it if needed | 1864 | * read_cache_page_async - read into page cache, fill it if needed |
1865 | * @mapping: the page's address_space | 1865 | * @mapping: the page's address_space |
1866 | * @index: the page index | 1866 | * @index: the page index |
1867 | * @filler: function to perform the read | 1867 | * @filler: function to perform the read |
1868 | * @data: first arg to filler(data, page) function, often left as NULL | 1868 | * @data: first arg to filler(data, page) function, often left as NULL |
1869 | * | 1869 | * |
1870 | * Same as read_cache_page, but don't wait for page to become unlocked | 1870 | * Same as read_cache_page, but don't wait for page to become unlocked |
1871 | * after submitting it to the filler. | 1871 | * after submitting it to the filler. |
1872 | * | 1872 | * |
1873 | * Read into the page cache. If a page already exists, and PageUptodate() is | 1873 | * Read into the page cache. If a page already exists, and PageUptodate() is |
1874 | * not set, try to fill the page but don't wait for it to become unlocked. | 1874 | * not set, try to fill the page but don't wait for it to become unlocked. |
1875 | * | 1875 | * |
1876 | * If the page does not get brought uptodate, return -EIO. | 1876 | * If the page does not get brought uptodate, return -EIO. |
1877 | */ | 1877 | */ |
1878 | struct page *read_cache_page_async(struct address_space *mapping, | 1878 | struct page *read_cache_page_async(struct address_space *mapping, |
1879 | pgoff_t index, | 1879 | pgoff_t index, |
1880 | int (*filler)(void *, struct page *), | 1880 | int (*filler)(void *, struct page *), |
1881 | void *data) | 1881 | void *data) |
1882 | { | 1882 | { |
1883 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); | 1883 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); |
1884 | } | 1884 | } |
1885 | EXPORT_SYMBOL(read_cache_page_async); | 1885 | EXPORT_SYMBOL(read_cache_page_async); |
1886 | 1886 | ||
1887 | static struct page *wait_on_page_read(struct page *page) | 1887 | static struct page *wait_on_page_read(struct page *page) |
1888 | { | 1888 | { |
1889 | if (!IS_ERR(page)) { | 1889 | if (!IS_ERR(page)) { |
1890 | wait_on_page_locked(page); | 1890 | wait_on_page_locked(page); |
1891 | if (!PageUptodate(page)) { | 1891 | if (!PageUptodate(page)) { |
1892 | page_cache_release(page); | 1892 | page_cache_release(page); |
1893 | page = ERR_PTR(-EIO); | 1893 | page = ERR_PTR(-EIO); |
1894 | } | 1894 | } |
1895 | } | 1895 | } |
1896 | return page; | 1896 | return page; |
1897 | } | 1897 | } |
1898 | 1898 | ||
1899 | /** | 1899 | /** |
1900 | * read_cache_page_gfp - read into page cache, using specified page allocation flags. | 1900 | * read_cache_page_gfp - read into page cache, using specified page allocation flags. |
1901 | * @mapping: the page's address_space | 1901 | * @mapping: the page's address_space |
1902 | * @index: the page index | 1902 | * @index: the page index |
1903 | * @gfp: the page allocator flags to use if allocating | 1903 | * @gfp: the page allocator flags to use if allocating |
1904 | * | 1904 | * |
1905 | * This is the same as "read_mapping_page(mapping, index, NULL)", but with | 1905 | * This is the same as "read_mapping_page(mapping, index, NULL)", but with |
1906 | * any new page allocations done using the specified allocation flags. Note | 1906 | * any new page allocations done using the specified allocation flags. Note |
1907 | * that the Radix tree operations will still use GFP_KERNEL, so you can't | 1907 | * that the Radix tree operations will still use GFP_KERNEL, so you can't |
1908 | * expect to do this atomically or anything like that - but you can pass in | 1908 | * expect to do this atomically or anything like that - but you can pass in |
1909 | * other page requirements. | 1909 | * other page requirements. |
1910 | * | 1910 | * |
1911 | * If the page does not get brought uptodate, return -EIO. | 1911 | * If the page does not get brought uptodate, return -EIO. |
1912 | */ | 1912 | */ |
1913 | struct page *read_cache_page_gfp(struct address_space *mapping, | 1913 | struct page *read_cache_page_gfp(struct address_space *mapping, |
1914 | pgoff_t index, | 1914 | pgoff_t index, |
1915 | gfp_t gfp) | 1915 | gfp_t gfp) |
1916 | { | 1916 | { |
1917 | filler_t *filler = (filler_t *)mapping->a_ops->readpage; | 1917 | filler_t *filler = (filler_t *)mapping->a_ops->readpage; |
1918 | 1918 | ||
1919 | return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); | 1919 | return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); |
1920 | } | 1920 | } |
1921 | EXPORT_SYMBOL(read_cache_page_gfp); | 1921 | EXPORT_SYMBOL(read_cache_page_gfp); |
1922 | 1922 | ||
1923 | /** | 1923 | /** |
1924 | * read_cache_page - read into page cache, fill it if needed | 1924 | * read_cache_page - read into page cache, fill it if needed |
1925 | * @mapping: the page's address_space | 1925 | * @mapping: the page's address_space |
1926 | * @index: the page index | 1926 | * @index: the page index |
1927 | * @filler: function to perform the read | 1927 | * @filler: function to perform the read |
1928 | * @data: first arg to filler(data, page) function, often left as NULL | 1928 | * @data: first arg to filler(data, page) function, often left as NULL |
1929 | * | 1929 | * |
1930 | * Read into the page cache. If a page already exists, and PageUptodate() is | 1930 | * Read into the page cache. If a page already exists, and PageUptodate() is |
1931 | * not set, try to fill the page then wait for it to become unlocked. | 1931 | * not set, try to fill the page then wait for it to become unlocked. |
1932 | * | 1932 | * |
1933 | * If the page does not get brought uptodate, return -EIO. | 1933 | * If the page does not get brought uptodate, return -EIO. |
1934 | */ | 1934 | */ |
1935 | struct page *read_cache_page(struct address_space *mapping, | 1935 | struct page *read_cache_page(struct address_space *mapping, |
1936 | pgoff_t index, | 1936 | pgoff_t index, |
1937 | int (*filler)(void *, struct page *), | 1937 | int (*filler)(void *, struct page *), |
1938 | void *data) | 1938 | void *data) |
1939 | { | 1939 | { |
1940 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); | 1940 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); |
1941 | } | 1941 | } |
1942 | EXPORT_SYMBOL(read_cache_page); | 1942 | EXPORT_SYMBOL(read_cache_page); |
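In-kernel callers rarely hand read_cache_page() a filler directly; the usual entry point is read_mapping_page() from <linux/pagemap.h>, which plugs in the mapping's own ->readpage via the same cast used by read_cache_page_gfp() above. A sketch of the typical pattern (example_read_index is a hypothetical helper, not part of this file):

    #include <linux/err.h>
    #include <linux/fs.h>
    #include <linux/pagemap.h>

    /* hypothetical helper: fetch one uptodate page of an inode's data */
    static int example_read_index(struct inode *inode, pgoff_t index)
    {
            struct page *page;

            page = read_mapping_page(inode->i_mapping, index, NULL);
            if (IS_ERR(page))
                    return PTR_ERR(page);

            /* page is uptodate and referenced here, but not locked */

            page_cache_release(page);
            return 0;
    }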
1943 | 1943 | ||
1944 | /* | 1944 | /* |
1945 | * The logic we want is | 1945 | * The logic we want is |
1946 | * | 1946 | * |
1947 | * if suid or (sgid and xgrp) | 1947 | * if suid or (sgid and xgrp) |
1948 | * remove privs | 1948 | * remove privs |
1949 | */ | 1949 | */ |
1950 | int should_remove_suid(struct dentry *dentry) | 1950 | int should_remove_suid(struct dentry *dentry) |
1951 | { | 1951 | { |
1952 | mode_t mode = dentry->d_inode->i_mode; | 1952 | mode_t mode = dentry->d_inode->i_mode; |
1953 | int kill = 0; | 1953 | int kill = 0; |
1954 | 1954 | ||
1955 | /* suid always must be killed */ | 1955 | /* suid always must be killed */ |
1956 | if (unlikely(mode & S_ISUID)) | 1956 | if (unlikely(mode & S_ISUID)) |
1957 | kill = ATTR_KILL_SUID; | 1957 | kill = ATTR_KILL_SUID; |
1958 | 1958 | ||
1959 | /* | 1959 | /* |
1960 | * sgid without any exec bits is just a mandatory locking mark; leave | 1960 | * sgid without any exec bits is just a mandatory locking mark; leave |
1961 | * it alone. If some exec bits are set, it's a real sgid; kill it. | 1961 | * it alone. If some exec bits are set, it's a real sgid; kill it. |
1962 | */ | 1962 | */ |
1963 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) | 1963 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) |
1964 | kill |= ATTR_KILL_SGID; | 1964 | kill |= ATTR_KILL_SGID; |
1965 | 1965 | ||
1966 | if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) | 1966 | if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) |
1967 | return kill; | 1967 | return kill; |
1968 | 1968 | ||
1969 | return 0; | 1969 | return 0; |
1970 | } | 1970 | } |
1971 | EXPORT_SYMBOL(should_remove_suid); | 1971 | EXPORT_SYMBOL(should_remove_suid); |
1972 | 1972 | ||
1973 | static int __remove_suid(struct dentry *dentry, int kill) | 1973 | static int __remove_suid(struct dentry *dentry, int kill) |
1974 | { | 1974 | { |
1975 | struct iattr newattrs; | 1975 | struct iattr newattrs; |
1976 | 1976 | ||
1977 | newattrs.ia_valid = ATTR_FORCE | kill; | 1977 | newattrs.ia_valid = ATTR_FORCE | kill; |
1978 | return notify_change(dentry, &newattrs); | 1978 | return notify_change(dentry, &newattrs); |
1979 | } | 1979 | } |
1980 | 1980 | ||
1981 | int file_remove_suid(struct file *file) | 1981 | int file_remove_suid(struct file *file) |
1982 | { | 1982 | { |
1983 | struct dentry *dentry = file->f_path.dentry; | 1983 | struct dentry *dentry = file->f_path.dentry; |
1984 | struct inode *inode = dentry->d_inode; | 1984 | struct inode *inode = dentry->d_inode; |
1985 | int killsuid; | 1985 | int killsuid; |
1986 | int killpriv; | 1986 | int killpriv; |
1987 | int error = 0; | 1987 | int error = 0; |
1988 | 1988 | ||
1989 | /* Fast path for nothing security related */ | 1989 | /* Fast path for nothing security related */ |
1990 | if (IS_NOSEC(inode)) | 1990 | if (IS_NOSEC(inode)) |
1991 | return 0; | 1991 | return 0; |
1992 | 1992 | ||
1993 | killsuid = should_remove_suid(dentry); | 1993 | killsuid = should_remove_suid(dentry); |
1994 | killpriv = security_inode_need_killpriv(dentry); | 1994 | killpriv = security_inode_need_killpriv(dentry); |
1995 | 1995 | ||
1996 | if (killpriv < 0) | 1996 | if (killpriv < 0) |
1997 | return killpriv; | 1997 | return killpriv; |
1998 | if (killpriv) | 1998 | if (killpriv) |
1999 | error = security_inode_killpriv(dentry); | 1999 | error = security_inode_killpriv(dentry); |
2000 | if (!error && killsuid) | 2000 | if (!error && killsuid) |
2001 | error = __remove_suid(dentry, killsuid); | 2001 | error = __remove_suid(dentry, killsuid); |
2002 | if (!error && (inode->i_sb->s_flags & MS_NOSEC)) | 2002 | if (!error && (inode->i_sb->s_flags & MS_NOSEC)) |
2003 | inode->i_flags |= S_NOSEC; | 2003 | inode->i_flags |= S_NOSEC; |
2004 | 2004 | ||
2005 | return error; | 2005 | return error; |
2006 | } | 2006 | } |
2007 | EXPORT_SYMBOL(file_remove_suid); | 2007 | EXPORT_SYMBOL(file_remove_suid); |
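The suid stripping above can be observed from userspace: when a writer without CAP_FSETID writes to a setuid regular file, the kernel's write path calls file_remove_suid() and clears the bit. A sketch; run it as an unprivileged user (as root, the capable(CAP_FSETID) check keeps the bit):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
            char path[] = "/tmp/suidXXXXXX";
            struct stat st;
            int fd = mkstemp(path);

            if (fd < 0)
                    return 1;
            fchmod(fd, 04755);      /* S_ISUID | 0755; we own the file */
            write(fd, "x", 1);      /* write path runs file_remove_suid() */
            fstat(fd, &st);
            printf("suid still set: %s\n",
                   (st.st_mode & S_ISUID) ? "yes" : "no");
            unlink(path);
            close(fd);
            return 0;
    }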
2008 | 2008 | ||
2009 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, | 2009 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
2010 | const struct iovec *iov, size_t base, size_t bytes) | 2010 | const struct iovec *iov, size_t base, size_t bytes) |
2011 | { | 2011 | { |
2012 | size_t copied = 0, left = 0; | 2012 | size_t copied = 0, left = 0; |
2013 | 2013 | ||
2014 | while (bytes) { | 2014 | while (bytes) { |
2015 | char __user *buf = iov->iov_base + base; | 2015 | char __user *buf = iov->iov_base + base; |
2016 | int copy = min(bytes, iov->iov_len - base); | 2016 | int copy = min(bytes, iov->iov_len - base); |
2017 | 2017 | ||
2018 | base = 0; | 2018 | base = 0; |
2019 | left = __copy_from_user_inatomic(vaddr, buf, copy); | 2019 | left = __copy_from_user_inatomic(vaddr, buf, copy); |
2020 | copied += copy; | 2020 | copied += copy; |
2021 | bytes -= copy; | 2021 | bytes -= copy; |
2022 | vaddr += copy; | 2022 | vaddr += copy; |
2023 | iov++; | 2023 | iov++; |
2024 | 2024 | ||
2025 | if (unlikely(left)) | 2025 | if (unlikely(left)) |
2026 | break; | 2026 | break; |
2027 | } | 2027 | } |
2028 | return copied - left; | 2028 | return copied - left; |
2029 | } | 2029 | } |
2030 | 2030 | ||
2031 | /* | 2031 | /* |
2032 | * Copy as much as we can into the page and return the number of bytes which | 2032 | * Copy as much as we can into the page and return the number of bytes which |
2033 | * were successfully copied. If a fault is encountered, return the number of | 2033 | * were successfully copied. If a fault is encountered, return the number of |
2034 | * bytes that were copied before the fault. | 2034 | * bytes that were copied before the fault. |
2035 | */ | 2035 | */ |
2036 | size_t iov_iter_copy_from_user_atomic(struct page *page, | 2036 | size_t iov_iter_copy_from_user_atomic(struct page *page, |
2037 | struct iov_iter *i, unsigned long offset, size_t bytes) | 2037 | struct iov_iter *i, unsigned long offset, size_t bytes) |
2038 | { | 2038 | { |
2039 | char *kaddr; | 2039 | char *kaddr; |
2040 | size_t copied; | 2040 | size_t copied; |
2041 | 2041 | ||
2042 | BUG_ON(!in_atomic()); | 2042 | BUG_ON(!in_atomic()); |
2043 | kaddr = kmap_atomic(page, KM_USER0); | 2043 | kaddr = kmap_atomic(page, KM_USER0); |
2044 | if (likely(i->nr_segs == 1)) { | 2044 | if (likely(i->nr_segs == 1)) { |
2045 | int left; | 2045 | int left; |
2046 | char __user *buf = i->iov->iov_base + i->iov_offset; | 2046 | char __user *buf = i->iov->iov_base + i->iov_offset; |
2047 | left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); | 2047 | left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); |
2048 | copied = bytes - left; | 2048 | copied = bytes - left; |
2049 | } else { | 2049 | } else { |
2050 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | 2050 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, |
2051 | i->iov, i->iov_offset, bytes); | 2051 | i->iov, i->iov_offset, bytes); |
2052 | } | 2052 | } |
2053 | kunmap_atomic(kaddr, KM_USER0); | 2053 | kunmap_atomic(kaddr, KM_USER0); |
2054 | 2054 | ||
2055 | return copied; | 2055 | return copied; |
2056 | } | 2056 | } |
2057 | EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); | 2057 | EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); |
2058 | 2058 | ||
2059 | /* | 2059 | /* |
2060 | * This has the same side effects and return value as | 2060 | * This has the same side effects and return value as |
2061 | * iov_iter_copy_from_user_atomic(). | 2061 | * iov_iter_copy_from_user_atomic(). |
2062 | * The difference is that it attempts to resolve faults. | 2062 | * The difference is that it attempts to resolve faults. |
2063 | * Page must not be locked. | 2063 | * Page must not be locked. |
2064 | */ | 2064 | */ |
2065 | size_t iov_iter_copy_from_user(struct page *page, | 2065 | size_t iov_iter_copy_from_user(struct page *page, |
2066 | struct iov_iter *i, unsigned long offset, size_t bytes) | 2066 | struct iov_iter *i, unsigned long offset, size_t bytes) |
2067 | { | 2067 | { |
2068 | char *kaddr; | 2068 | char *kaddr; |
2069 | size_t copied; | 2069 | size_t copied; |
2070 | 2070 | ||
2071 | kaddr = kmap(page); | 2071 | kaddr = kmap(page); |
2072 | if (likely(i->nr_segs == 1)) { | 2072 | if (likely(i->nr_segs == 1)) { |
2073 | int left; | 2073 | int left; |
2074 | char __user *buf = i->iov->iov_base + i->iov_offset; | 2074 | char __user *buf = i->iov->iov_base + i->iov_offset; |
2075 | left = __copy_from_user(kaddr + offset, buf, bytes); | 2075 | left = __copy_from_user(kaddr + offset, buf, bytes); |
2076 | copied = bytes - left; | 2076 | copied = bytes - left; |
2077 | } else { | 2077 | } else { |
2078 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | 2078 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, |
2079 | i->iov, i->iov_offset, bytes); | 2079 | i->iov, i->iov_offset, bytes); |
2080 | } | 2080 | } |
2081 | kunmap(page); | 2081 | kunmap(page); |
2082 | return copied; | 2082 | return copied; |
2083 | } | 2083 | } |
2084 | EXPORT_SYMBOL(iov_iter_copy_from_user); | 2084 | EXPORT_SYMBOL(iov_iter_copy_from_user); |
2085 | 2085 | ||
2086 | void iov_iter_advance(struct iov_iter *i, size_t bytes) | 2086 | void iov_iter_advance(struct iov_iter *i, size_t bytes) |
2087 | { | 2087 | { |
2088 | BUG_ON(i->count < bytes); | 2088 | BUG_ON(i->count < bytes); |
2089 | 2089 | ||
2090 | if (likely(i->nr_segs == 1)) { | 2090 | if (likely(i->nr_segs == 1)) { |
2091 | i->iov_offset += bytes; | 2091 | i->iov_offset += bytes; |
2092 | i->count -= bytes; | 2092 | i->count -= bytes; |
2093 | } else { | 2093 | } else { |
2094 | const struct iovec *iov = i->iov; | 2094 | const struct iovec *iov = i->iov; |
2095 | size_t base = i->iov_offset; | 2095 | size_t base = i->iov_offset; |
2096 | 2096 | ||
2097 | /* | 2097 | /* |
2098 | * The !iov->iov_len check ensures we skip over unlikely | 2098 | * The !iov->iov_len check ensures we skip over unlikely |
2099 | * zero-length segments (without overrunning the iovec). | 2099 | * zero-length segments (without overrunning the iovec). |
2100 | */ | 2100 | */ |
2101 | while (bytes || unlikely(i->count && !iov->iov_len)) { | 2101 | while (bytes || unlikely(i->count && !iov->iov_len)) { |
2102 | int copy; | 2102 | int copy; |
2103 | 2103 | ||
2104 | copy = min(bytes, iov->iov_len - base); | 2104 | copy = min(bytes, iov->iov_len - base); |
2105 | BUG_ON(!i->count || i->count < copy); | 2105 | BUG_ON(!i->count || i->count < copy); |
2106 | i->count -= copy; | 2106 | i->count -= copy; |
2107 | bytes -= copy; | 2107 | bytes -= copy; |
2108 | base += copy; | 2108 | base += copy; |
2109 | if (iov->iov_len == base) { | 2109 | if (iov->iov_len == base) { |
2110 | iov++; | 2110 | iov++; |
2111 | base = 0; | 2111 | base = 0; |
2112 | } | 2112 | } |
2113 | } | 2113 | } |
2114 | i->iov = iov; | 2114 | i->iov = iov; |
2115 | i->iov_offset = base; | 2115 | i->iov_offset = base; |
2116 | } | 2116 | } |
2117 | } | 2117 | } |
2118 | EXPORT_SYMBOL(iov_iter_advance); | 2118 | EXPORT_SYMBOL(iov_iter_advance); |
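
To see the slow path above do its segment walk, here is a small userspace replica of the multi-segment branch: advancing 4 bytes through iovecs of lengths {3, 0, 5} steps over the zero-length segment (that is what the `!iov->iov_len` test buys) and leaves the iterator at segment 2, offset 1.

    #include <assert.h>
    #include <stdio.h>
    #include <sys/uio.h>

    /* Userspace replica of the multi-segment branch of iov_iter_advance(). */
    int main(void)
    {
        char p[3], q[1], r[5];
        struct iovec v[3] = { { p, 3 }, { q, 0 }, { r, 5 } };
        const struct iovec *iov = v;
        size_t count = 8, bytes = 4, base = 0;

        while (bytes || (count && !iov->iov_len)) {
            size_t copy = iov->iov_len - base < bytes ?
                          iov->iov_len - base : bytes;
            count -= copy;
            bytes -= copy;
            base += copy;
            if (iov->iov_len == base) {     /* segment exhausted, move on */
                iov++;
                base = 0;
            }
        }
        printf("segment %td, offset %zu\n", iov - v, base); /* 2, 1 */
        assert(iov - v == 2 && base == 1);
        return 0;
    }
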
2119 | 2119 | ||
2120 | /* | 2120 | /* |
2121 | * Fault in the first iovec of the given iov_iter, to a maximum length | 2121 | * Fault in the first iovec of the given iov_iter, to a maximum length |
2122 | * of bytes. Returns 0 on success, or non-zero if the memory could not be | 2122 | * of bytes. Returns 0 on success, or non-zero if the memory could not be |
2123 | * accessed (i.e. because it is an invalid address). | 2123 | * accessed (i.e. because it is an invalid address). |
2124 | * | 2124 | * |
2125 | * writev-intensive code may want this to prefault several iovecs -- that | 2125 | * writev-intensive code may want this to prefault several iovecs -- that |
2126 | * would be possible (callers must not rely on the fact that _only_ the | 2126 | * would be possible (callers must not rely on the fact that _only_ the |
2127 | * first iovec will be faulted with the current implementation). | 2127 | * first iovec will be faulted with the current implementation). |
2128 | */ | 2128 | */ |
2129 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) | 2129 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) |
2130 | { | 2130 | { |
2131 | char __user *buf = i->iov->iov_base + i->iov_offset; | 2131 | char __user *buf = i->iov->iov_base + i->iov_offset; |
2132 | bytes = min(bytes, i->iov->iov_len - i->iov_offset); | 2132 | bytes = min(bytes, i->iov->iov_len - i->iov_offset); |
2133 | return fault_in_pages_readable(buf, bytes); | 2133 | return fault_in_pages_readable(buf, bytes); |
2134 | } | 2134 | } |
2135 | EXPORT_SYMBOL(iov_iter_fault_in_readable); | 2135 | EXPORT_SYMBOL(iov_iter_fault_in_readable); |
2136 | 2136 | ||
2137 | /* | 2137 | /* |
2138 | * Return the count of just the current iov_iter segment. | 2138 | * Return the count of just the current iov_iter segment. |
2139 | */ | 2139 | */ |
2140 | size_t iov_iter_single_seg_count(struct iov_iter *i) | 2140 | size_t iov_iter_single_seg_count(struct iov_iter *i) |
2141 | { | 2141 | { |
2142 | const struct iovec *iov = i->iov; | 2142 | const struct iovec *iov = i->iov; |
2143 | if (i->nr_segs == 1) | 2143 | if (i->nr_segs == 1) |
2144 | return i->count; | 2144 | return i->count; |
2145 | else | 2145 | else |
2146 | return min(i->count, iov->iov_len - i->iov_offset); | 2146 | return min(i->count, iov->iov_len - i->iov_offset); |
2147 | } | 2147 | } |
2148 | EXPORT_SYMBOL(iov_iter_single_seg_count); | 2148 | EXPORT_SYMBOL(iov_iter_single_seg_count); |
2149 | 2149 | ||
2150 | /* | 2150 | /* |
2151 | * Performs necessary checks before doing a write | 2151 | * Performs necessary checks before doing a write |
2152 | * | 2152 | * |
2153 | * Can adjust writing position or amount of bytes to write. | 2153 | * Can adjust writing position or amount of bytes to write. |
2154 | * Returns appropriate error code that caller should return or | 2154 | * Returns appropriate error code that caller should return or |
2155 | * zero in case that write should be allowed. | 2155 | * zero in case that write should be allowed. |
2156 | */ | 2156 | */ |
2157 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) | 2157 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) |
2158 | { | 2158 | { |
2159 | struct inode *inode = file->f_mapping->host; | 2159 | struct inode *inode = file->f_mapping->host; |
2160 | unsigned long limit = rlimit(RLIMIT_FSIZE); | 2160 | unsigned long limit = rlimit(RLIMIT_FSIZE); |
2161 | 2161 | ||
2162 | if (unlikely(*pos < 0)) | 2162 | if (unlikely(*pos < 0)) |
2163 | return -EINVAL; | 2163 | return -EINVAL; |
2164 | 2164 | ||
2165 | if (!isblk) { | 2165 | if (!isblk) { |
2166 | /* FIXME: this is for backwards compatibility with 2.4 */ | 2166 | /* FIXME: this is for backwards compatibility with 2.4 */ |
2167 | if (file->f_flags & O_APPEND) | 2167 | if (file->f_flags & O_APPEND) |
2168 | *pos = i_size_read(inode); | 2168 | *pos = i_size_read(inode); |
2169 | 2169 | ||
2170 | if (limit != RLIM_INFINITY) { | 2170 | if (limit != RLIM_INFINITY) { |
2171 | if (*pos >= limit) { | 2171 | if (*pos >= limit) { |
2172 | send_sig(SIGXFSZ, current, 0); | 2172 | send_sig(SIGXFSZ, current, 0); |
2173 | return -EFBIG; | 2173 | return -EFBIG; |
2174 | } | 2174 | } |
2175 | if (*count > limit - (typeof(limit))*pos) { | 2175 | if (*count > limit - (typeof(limit))*pos) { |
2176 | *count = limit - (typeof(limit))*pos; | 2176 | *count = limit - (typeof(limit))*pos; |
2177 | } | 2177 | } |
2178 | } | 2178 | } |
2179 | } | 2179 | } |
2180 | 2180 | ||
2181 | /* | 2181 | /* |
2182 | * LFS rule | 2182 | * LFS rule |
2183 | */ | 2183 | */ |
2184 | if (unlikely(*pos + *count > MAX_NON_LFS && | 2184 | if (unlikely(*pos + *count > MAX_NON_LFS && |
2185 | !(file->f_flags & O_LARGEFILE))) { | 2185 | !(file->f_flags & O_LARGEFILE))) { |
2186 | if (*pos >= MAX_NON_LFS) { | 2186 | if (*pos >= MAX_NON_LFS) { |
2187 | return -EFBIG; | 2187 | return -EFBIG; |
2188 | } | 2188 | } |
2189 | if (*count > MAX_NON_LFS - (unsigned long)*pos) { | 2189 | if (*count > MAX_NON_LFS - (unsigned long)*pos) { |
2190 | *count = MAX_NON_LFS - (unsigned long)*pos; | 2190 | *count = MAX_NON_LFS - (unsigned long)*pos; |
2191 | } | 2191 | } |
2192 | } | 2192 | } |
2193 | 2193 | ||
2194 | /* | 2194 | /* |
2195 | * Are we about to exceed the fs block limit? | 2195 | * Are we about to exceed the fs block limit? |
2196 | * | 2196 | * |
2197 | * If we have written data it becomes a short write. If we have | 2197 | * If we have written data it becomes a short write. If we have |
2198 | * exceeded without writing data we send a signal and return EFBIG. | 2198 | * exceeded without writing data we send a signal and return EFBIG. |
2199 | * Linus' frestrict idea will clean these up nicely... | 2199 | * Linus' frestrict idea will clean these up nicely... |
2200 | */ | 2200 | */ |
2201 | if (likely(!isblk)) { | 2201 | if (likely(!isblk)) { |
2202 | if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { | 2202 | if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { |
2203 | if (*count || *pos > inode->i_sb->s_maxbytes) { | 2203 | if (*count || *pos > inode->i_sb->s_maxbytes) { |
2204 | return -EFBIG; | 2204 | return -EFBIG; |
2205 | } | 2205 | } |
2206 | /* zero-length writes at ->s_maxbytes are OK */ | 2206 | /* zero-length writes at ->s_maxbytes are OK */ |
2207 | } | 2207 | } |
2208 | 2208 | ||
2209 | if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) | 2209 | if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) |
2210 | *count = inode->i_sb->s_maxbytes - *pos; | 2210 | *count = inode->i_sb->s_maxbytes - *pos; |
2211 | } else { | 2211 | } else { |
2212 | #ifdef CONFIG_BLOCK | 2212 | #ifdef CONFIG_BLOCK |
2213 | loff_t isize; | 2213 | loff_t isize; |
2214 | if (bdev_read_only(I_BDEV(inode))) | 2214 | if (bdev_read_only(I_BDEV(inode))) |
2215 | return -EPERM; | 2215 | return -EPERM; |
2216 | isize = i_size_read(inode); | 2216 | isize = i_size_read(inode); |
2217 | if (*pos >= isize) { | 2217 | if (*pos >= isize) { |
2218 | if (*count || *pos > isize) | 2218 | if (*count || *pos > isize) |
2219 | return -ENOSPC; | 2219 | return -ENOSPC; |
2220 | } | 2220 | } |
2221 | 2221 | ||
2222 | if (*pos + *count > isize) | 2222 | if (*pos + *count > isize) |
2223 | *count = isize - *pos; | 2223 | *count = isize - *pos; |
2224 | #else | 2224 | #else |
2225 | return -EPERM; | 2225 | return -EPERM; |
2226 | #endif | 2226 | #endif |
2227 | } | 2227 | } |
2228 | return 0; | 2228 | return 0; |
2229 | } | 2229 | } |
2230 | EXPORT_SYMBOL(generic_write_checks); | 2230 | EXPORT_SYMBOL(generic_write_checks); |
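
The net effect of these checks is consistent throughout: fail outright when the starting position is already at or past a limit, otherwise clamp *count so the write stops exactly at the limit and becomes a short write. A hedged userspace sketch of just the RLIMIT_FSIZE branch, with the SIGXFSZ delivery left out:

    #include <errno.h>
    #include <stdio.h>

    /* Sketch of the RLIMIT_FSIZE clamp in generic_write_checks(). */
    static int check_write(long long *pos, unsigned long *count, unsigned long limit)
    {
        if (*pos < 0)
            return -EINVAL;
        if ((unsigned long long)*pos >= limit)
            return -EFBIG;          /* the kernel also raises SIGXFSZ here */
        if (*count > limit - (unsigned long)*pos)
            *count = limit - (unsigned long)*pos;   /* clamp: short write */
        return 0;
    }

    int main(void)
    {
        long long pos = 900;
        unsigned long count = 400, limit = 1000;

        if (check_write(&pos, &count, limit) == 0)
            printf("write %lu bytes at %lld\n", count, pos); /* 100 at 900 */
        return 0;
    }
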
2231 | 2231 | ||
2232 | int pagecache_write_begin(struct file *file, struct address_space *mapping, | 2232 | int pagecache_write_begin(struct file *file, struct address_space *mapping, |
2233 | loff_t pos, unsigned len, unsigned flags, | 2233 | loff_t pos, unsigned len, unsigned flags, |
2234 | struct page **pagep, void **fsdata) | 2234 | struct page **pagep, void **fsdata) |
2235 | { | 2235 | { |
2236 | const struct address_space_operations *aops = mapping->a_ops; | 2236 | const struct address_space_operations *aops = mapping->a_ops; |
2237 | 2237 | ||
2238 | return aops->write_begin(file, mapping, pos, len, flags, | 2238 | return aops->write_begin(file, mapping, pos, len, flags, |
2239 | pagep, fsdata); | 2239 | pagep, fsdata); |
2240 | } | 2240 | } |
2241 | EXPORT_SYMBOL(pagecache_write_begin); | 2241 | EXPORT_SYMBOL(pagecache_write_begin); |
2242 | 2242 | ||
2243 | int pagecache_write_end(struct file *file, struct address_space *mapping, | 2243 | int pagecache_write_end(struct file *file, struct address_space *mapping, |
2244 | loff_t pos, unsigned len, unsigned copied, | 2244 | loff_t pos, unsigned len, unsigned copied, |
2245 | struct page *page, void *fsdata) | 2245 | struct page *page, void *fsdata) |
2246 | { | 2246 | { |
2247 | const struct address_space_operations *aops = mapping->a_ops; | 2247 | const struct address_space_operations *aops = mapping->a_ops; |
2248 | 2248 | ||
2249 | mark_page_accessed(page); | 2249 | mark_page_accessed(page); |
2250 | return aops->write_end(file, mapping, pos, len, copied, page, fsdata); | 2250 | return aops->write_end(file, mapping, pos, len, copied, page, fsdata); |
2251 | } | 2251 | } |
2252 | EXPORT_SYMBOL(pagecache_write_end); | 2252 | EXPORT_SYMBOL(pagecache_write_end); |
2253 | 2253 | ||
2254 | ssize_t | 2254 | ssize_t |
2255 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | 2255 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, |
2256 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, | 2256 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, |
2257 | size_t count, size_t ocount) | 2257 | size_t count, size_t ocount) |
2258 | { | 2258 | { |
2259 | struct file *file = iocb->ki_filp; | 2259 | struct file *file = iocb->ki_filp; |
2260 | struct address_space *mapping = file->f_mapping; | 2260 | struct address_space *mapping = file->f_mapping; |
2261 | struct inode *inode = mapping->host; | 2261 | struct inode *inode = mapping->host; |
2262 | ssize_t written; | 2262 | ssize_t written; |
2263 | size_t write_len; | 2263 | size_t write_len; |
2264 | pgoff_t end; | 2264 | pgoff_t end; |
2265 | 2265 | ||
2266 | if (count != ocount) | 2266 | if (count != ocount) |
2267 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); | 2267 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); |
2268 | 2268 | ||
2269 | write_len = iov_length(iov, *nr_segs); | 2269 | write_len = iov_length(iov, *nr_segs); |
2270 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; | 2270 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; |
2271 | 2271 | ||
2272 | written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); | 2272 | written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); |
2273 | if (written) | 2273 | if (written) |
2274 | goto out; | 2274 | goto out; |
2275 | 2275 | ||
2276 | /* | 2276 | /* |
2277 | * After a write we want buffered reads to be sure to go to disk to get | 2277 | * After a write we want buffered reads to be sure to go to disk to get |
2278 | * the new data. We invalidate clean cached pages from the region we're | 2278 | * the new data. We invalidate clean cached pages from the region we're |
2279 | * about to write. We do this *before* the write so that we can return | 2279 | * about to write. We do this *before* the write so that we can return |
2280 | * without clobbering -EIOCBQUEUED from ->direct_IO(). | 2280 | * without clobbering -EIOCBQUEUED from ->direct_IO(). |
2281 | */ | 2281 | */ |
2282 | if (mapping->nrpages) { | 2282 | if (mapping->nrpages) { |
2283 | written = invalidate_inode_pages2_range(mapping, | 2283 | written = invalidate_inode_pages2_range(mapping, |
2284 | pos >> PAGE_CACHE_SHIFT, end); | 2284 | pos >> PAGE_CACHE_SHIFT, end); |
2285 | /* | 2285 | /* |
2286 | * If a page cannot be invalidated, return 0 to fall back | 2286 | * If a page cannot be invalidated, return 0 to fall back |
2287 | * to buffered write. | 2287 | * to buffered write. |
2288 | */ | 2288 | */ |
2289 | if (written) { | 2289 | if (written) { |
2290 | if (written == -EBUSY) | 2290 | if (written == -EBUSY) |
2291 | return 0; | 2291 | return 0; |
2292 | goto out; | 2292 | goto out; |
2293 | } | 2293 | } |
2294 | } | 2294 | } |
2295 | 2295 | ||
2296 | written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); | 2296 | written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); |
2297 | 2297 | ||
2298 | /* | 2298 | /* |
2299 | * Finally, try again to invalidate clean pages which might have been | 2299 | * Finally, try again to invalidate clean pages which might have been |
2300 | * cached by non-direct readahead, or faulted in by get_user_pages() | 2300 | * cached by non-direct readahead, or faulted in by get_user_pages() |
2301 | * if the source of the write was an mmap'ed region of the file | 2301 | * if the source of the write was an mmap'ed region of the file |
2302 | * we're writing. Either one is a pretty crazy thing to do, | 2302 | * we're writing. Either one is a pretty crazy thing to do, |
2303 | * so we don't support it 100%. If this invalidation | 2303 | * so we don't support it 100%. If this invalidation |
2304 | * fails, tough, the write still worked... | 2304 | * fails, tough, the write still worked... |
2305 | */ | 2305 | */ |
2306 | if (mapping->nrpages) { | 2306 | if (mapping->nrpages) { |
2307 | invalidate_inode_pages2_range(mapping, | 2307 | invalidate_inode_pages2_range(mapping, |
2308 | pos >> PAGE_CACHE_SHIFT, end); | 2308 | pos >> PAGE_CACHE_SHIFT, end); |
2309 | } | 2309 | } |
2310 | 2310 | ||
2311 | if (written > 0) { | 2311 | if (written > 0) { |
2312 | pos += written; | 2312 | pos += written; |
2313 | if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { | 2313 | if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { |
2314 | i_size_write(inode, pos); | 2314 | i_size_write(inode, pos); |
2315 | mark_inode_dirty(inode); | 2315 | mark_inode_dirty(inode); |
2316 | } | 2316 | } |
2317 | *ppos = pos; | 2317 | *ppos = pos; |
2318 | } | 2318 | } |
2319 | out: | 2319 | out: |
2320 | return written; | 2320 | return written; |
2321 | } | 2321 | } |
2322 | EXPORT_SYMBOL(generic_file_direct_write); | 2322 | EXPORT_SYMBOL(generic_file_direct_write); |
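
The invalidation range above is expressed in page indices: the first index is pos >> PAGE_CACHE_SHIFT and the last is (pos + write_len - 1) >> PAGE_CACHE_SHIFT, i.e. the page containing the final byte of the write. A quick check of the arithmetic, assuming the usual 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumed: 4 KiB pages */

    int main(void)
    {
        long long pos = 5000, write_len = 9000;
        unsigned long first = pos >> PAGE_SHIFT;
        unsigned long end = (pos + write_len - 1) >> PAGE_SHIFT;

        /* bytes 5000..13999 span pages 1..3 */
        printf("invalidate pages %lu..%lu\n", first, end);
        return 0;
    }
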
2323 | 2323 | ||
2324 | /* | 2324 | /* |
2325 | * Find or create a page at the given pagecache position. Return the locked | 2325 | * Find or create a page at the given pagecache position. Return the locked |
2326 | * page. This function is specifically for buffered writes. | 2326 | * page. This function is specifically for buffered writes. |
2327 | */ | 2327 | */ |
2328 | struct page *grab_cache_page_write_begin(struct address_space *mapping, | 2328 | struct page *grab_cache_page_write_begin(struct address_space *mapping, |
2329 | pgoff_t index, unsigned flags) | 2329 | pgoff_t index, unsigned flags) |
2330 | { | 2330 | { |
2331 | int status; | 2331 | int status; |
2332 | struct page *page; | 2332 | struct page *page; |
2333 | gfp_t gfp_notmask = 0; | 2333 | gfp_t gfp_notmask = 0; |
2334 | if (flags & AOP_FLAG_NOFS) | 2334 | if (flags & AOP_FLAG_NOFS) |
2335 | gfp_notmask = __GFP_FS; | 2335 | gfp_notmask = __GFP_FS; |
2336 | repeat: | 2336 | repeat: |
2337 | page = find_lock_page(mapping, index); | 2337 | page = find_lock_page(mapping, index); |
2338 | if (page) | 2338 | if (page) |
2339 | goto found; | 2339 | goto found; |
2340 | 2340 | ||
2341 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); | 2341 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); |
2342 | if (!page) | 2342 | if (!page) |
2343 | return NULL; | 2343 | return NULL; |
2344 | status = add_to_page_cache_lru(page, mapping, index, | 2344 | status = add_to_page_cache_lru(page, mapping, index, |
2345 | GFP_KERNEL & ~gfp_notmask); | 2345 | GFP_KERNEL & ~gfp_notmask); |
2346 | if (unlikely(status)) { | 2346 | if (unlikely(status)) { |
2347 | page_cache_release(page); | 2347 | page_cache_release(page); |
2348 | if (status == -EEXIST) | 2348 | if (status == -EEXIST) |
2349 | goto repeat; | 2349 | goto repeat; |
2350 | return NULL; | 2350 | return NULL; |
2351 | } | 2351 | } |
2352 | found: | 2352 | found: |
2353 | wait_on_page_writeback(page); | 2353 | wait_on_page_writeback(page); |
2354 | return page; | 2354 | return page; |
2355 | } | 2355 | } |
2356 | EXPORT_SYMBOL(grab_cache_page_write_begin); | 2356 | EXPORT_SYMBOL(grab_cache_page_write_begin); |
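
The -EEXIST retry above is the classic optimistic-insert pattern: look up first; if the page is missing, allocate outside any lock and try to insert; if another task won the race, throw the allocation away and start over, at which point find_lock_page() finds the winner's page. A hedged, single-threaded sketch of the same shape, where the cache_* helpers are hypothetical stand-ins for the page-cache calls:

    #include <errno.h>
    #include <stdlib.h>

    /* Hypothetical single-slot "cache" standing in for the page cache. */
    static void *slot;

    static void *cache_lookup(void) { return slot; }

    static int cache_insert(void *obj)
    {
        if (slot)
            return -EEXIST;         /* somebody else inserted first */
        slot = obj;
        return 0;
    }

    /* Same shape as grab_cache_page_write_begin(): lookup, else
     * allocate and insert, retrying if the insert loses a race. */
    static void *grab_object(void)
    {
        void *obj;
    repeat:
        obj = cache_lookup();
        if (obj)
            return obj;
        obj = malloc(64);
        if (!obj)
            return NULL;
        if (cache_insert(obj)) {    /* -EEXIST: lost the race */
            free(obj);
            goto repeat;
        }
        return obj;
    }

    int main(void)
    {
        void *a = grab_object();    /* allocates and inserts */
        void *b = grab_object();    /* lookup hit: same object back */
        return a && a == b ? 0 : 1;
    }
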
2357 | 2357 | ||
2358 | static ssize_t generic_perform_write(struct file *file, | 2358 | static ssize_t generic_perform_write(struct file *file, |
2359 | struct iov_iter *i, loff_t pos) | 2359 | struct iov_iter *i, loff_t pos) |
2360 | { | 2360 | { |
2361 | struct address_space *mapping = file->f_mapping; | 2361 | struct address_space *mapping = file->f_mapping; |
2362 | const struct address_space_operations *a_ops = mapping->a_ops; | 2362 | const struct address_space_operations *a_ops = mapping->a_ops; |
2363 | long status = 0; | 2363 | long status = 0; |
2364 | ssize_t written = 0; | 2364 | ssize_t written = 0; |
2365 | unsigned int flags = 0; | 2365 | unsigned int flags = 0; |
2366 | 2366 | ||
2367 | /* | 2367 | /* |
2368 | * Copies from kernel address space cannot fail (NFSD is a big user). | 2368 | * Copies from kernel address space cannot fail (NFSD is a big user). |
2369 | */ | 2369 | */ |
2370 | if (segment_eq(get_fs(), KERNEL_DS)) | 2370 | if (segment_eq(get_fs(), KERNEL_DS)) |
2371 | flags |= AOP_FLAG_UNINTERRUPTIBLE; | 2371 | flags |= AOP_FLAG_UNINTERRUPTIBLE; |
2372 | 2372 | ||
2373 | do { | 2373 | do { |
2374 | struct page *page; | 2374 | struct page *page; |
2375 | unsigned long offset; /* Offset into pagecache page */ | 2375 | unsigned long offset; /* Offset into pagecache page */ |
2376 | unsigned long bytes; /* Bytes to write to page */ | 2376 | unsigned long bytes; /* Bytes to write to page */ |
2377 | size_t copied; /* Bytes copied from user */ | 2377 | size_t copied; /* Bytes copied from user */ |
2378 | void *fsdata; | 2378 | void *fsdata; |
2379 | 2379 | ||
2380 | offset = (pos & (PAGE_CACHE_SIZE - 1)); | 2380 | offset = (pos & (PAGE_CACHE_SIZE - 1)); |
2381 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | 2381 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, |
2382 | iov_iter_count(i)); | 2382 | iov_iter_count(i)); |
2383 | 2383 | ||
2384 | again: | 2384 | again: |
2385 | 2385 | ||
2386 | /* | 2386 | /* |
2387 | * Bring in the user page that we will copy from _first_. | 2387 | * Bring in the user page that we will copy from _first_. |
2388 | * Otherwise there's a nasty deadlock on copying from the | 2388 | * Otherwise there's a nasty deadlock on copying from the |
2389 | * same page as we're writing to, without it being marked | 2389 | * same page as we're writing to, without it being marked |
2390 | * up-to-date. | 2390 | * up-to-date. |
2391 | * | 2391 | * |
2392 | * Not only is this an optimisation, but it is also required | 2392 | * Not only is this an optimisation, but it is also required |
2393 | * to check that the address is actually valid, when atomic | 2393 | * to check that the address is actually valid, when atomic |
2394 | * usercopies are used, below. | 2394 | * usercopies are used, below. |
2395 | */ | 2395 | */ |
2396 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | 2396 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { |
2397 | status = -EFAULT; | 2397 | status = -EFAULT; |
2398 | break; | 2398 | break; |
2399 | } | 2399 | } |
2400 | 2400 | ||
2401 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, | 2401 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, |
2402 | &page, &fsdata); | 2402 | &page, &fsdata); |
2403 | if (unlikely(status)) | 2403 | if (unlikely(status)) |
2404 | break; | 2404 | break; |
2405 | 2405 | ||
2406 | if (mapping_writably_mapped(mapping)) | 2406 | if (mapping_writably_mapped(mapping)) |
2407 | flush_dcache_page(page); | 2407 | flush_dcache_page(page); |
2408 | 2408 | ||
2409 | pagefault_disable(); | 2409 | pagefault_disable(); |
2410 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | 2410 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); |
2411 | pagefault_enable(); | 2411 | pagefault_enable(); |
2412 | flush_dcache_page(page); | 2412 | flush_dcache_page(page); |
2413 | 2413 | ||
2414 | mark_page_accessed(page); | 2414 | mark_page_accessed(page); |
2415 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | 2415 | status = a_ops->write_end(file, mapping, pos, bytes, copied, |
2416 | page, fsdata); | 2416 | page, fsdata); |
2417 | if (unlikely(status < 0)) | 2417 | if (unlikely(status < 0)) |
2418 | break; | 2418 | break; |
2419 | copied = status; | 2419 | copied = status; |
2420 | 2420 | ||
2421 | cond_resched(); | 2421 | cond_resched(); |
2422 | 2422 | ||
2423 | iov_iter_advance(i, copied); | 2423 | iov_iter_advance(i, copied); |
2424 | if (unlikely(copied == 0)) { | 2424 | if (unlikely(copied == 0)) { |
2425 | /* | 2425 | /* |
2426 | * If we were unable to copy any data at all, we must | 2426 | * If we were unable to copy any data at all, we must |
2427 | * fall back to a single segment length write. | 2427 | * fall back to a single segment length write. |
2428 | * | 2428 | * |
2429 | * If we didn't fall back here, we could livelock | 2429 | * If we didn't fall back here, we could livelock |
2430 | * because not all segments in the iov can be copied at | 2430 | * because not all segments in the iov can be copied at |
2431 | * once without a pagefault. | 2431 | * once without a pagefault. |
2432 | */ | 2432 | */ |
2433 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | 2433 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, |
2434 | iov_iter_single_seg_count(i)); | 2434 | iov_iter_single_seg_count(i)); |
2435 | goto again; | 2435 | goto again; |
2436 | } | 2436 | } |
2437 | pos += copied; | 2437 | pos += copied; |
2438 | written += copied; | 2438 | written += copied; |
2439 | 2439 | ||
2440 | balance_dirty_pages_ratelimited(mapping); | 2440 | balance_dirty_pages_ratelimited(mapping); |
2441 | 2441 | ||
2442 | } while (iov_iter_count(i)); | 2442 | } while (iov_iter_count(i)); |
2443 | 2443 | ||
2444 | return written ? written : status; | 2444 | return written ? written : status; |
2445 | } | 2445 | } |
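
The zero-copy fallback is the subtle part of this loop: the atomic usercopy can legitimately copy nothing (the prefaulted page may be reclaimed again before the copy runs, or only the first of several segments is resident), and retrying with the same multi-segment length could spin forever. Restricting the retry to a single segment guarantees forward progress once that one segment is faulted in. A userspace cartoon of the structure, with a deliberately unhelpful "atomic" copy that refuses to cross segment boundaries (purely illustrative):

    #include <stdio.h>
    #include <string.h>
    #include <sys/uio.h>

    /* Cartoon "atomic" copy: faults (copies nothing) whenever the request
     * would cross a segment boundary, standing in for a non-resident page. */
    static size_t copy_atomic(char *dst, const struct iovec *iov, size_t base,
                              size_t bytes)
    {
        if (bytes > iov->iov_len - base)
            return 0;               /* would touch a second segment */
        memcpy(dst, (char *)iov->iov_base + base, bytes);
        return bytes;
    }

    int main(void)
    {
        char a[] = "abcd", b[] = "efgh", out[9] = "";
        struct iovec v[2] = { { a, 4 }, { b, 4 } };
        size_t seg = 0, base = 0, count = 8, pos = 0;

        while (count) {
            size_t bytes = count;   /* optimistic: whole remainder */
            size_t copied;
    again:
            copied = copy_atomic(out + pos, &v[seg], base, bytes);
            if (copied == 0) {
                /* fall back to a single-segment length, as the kernel
                 * loop does, so the retry cannot livelock */
                bytes = v[seg].iov_len - base;
                goto again;
            }
            pos += copied;
            count -= copied;
            base += copied;
            if (base == v[seg].iov_len) {
                seg++;
                base = 0;
            }
        }
        printf("%s\n", out);        /* abcdefgh */
        return 0;
    }
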
2446 | 2446 | ||
2447 | ssize_t | 2447 | ssize_t |
2448 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | 2448 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, |
2449 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | 2449 | unsigned long nr_segs, loff_t pos, loff_t *ppos, |
2450 | size_t count, ssize_t written) | 2450 | size_t count, ssize_t written) |
2451 | { | 2451 | { |
2452 | struct file *file = iocb->ki_filp; | 2452 | struct file *file = iocb->ki_filp; |
2453 | ssize_t status; | 2453 | ssize_t status; |
2454 | struct iov_iter i; | 2454 | struct iov_iter i; |
2455 | 2455 | ||
2456 | iov_iter_init(&i, iov, nr_segs, count, written); | 2456 | iov_iter_init(&i, iov, nr_segs, count, written); |
2457 | status = generic_perform_write(file, &i, pos); | 2457 | status = generic_perform_write(file, &i, pos); |
2458 | 2458 | ||
2459 | if (likely(status >= 0)) { | 2459 | if (likely(status >= 0)) { |
2460 | written += status; | 2460 | written += status; |
2461 | *ppos = pos + status; | 2461 | *ppos = pos + status; |
2462 | } | 2462 | } |
2463 | 2463 | ||
2464 | return written ? written : status; | 2464 | return written ? written : status; |
2465 | } | 2465 | } |
2466 | EXPORT_SYMBOL(generic_file_buffered_write); | 2466 | EXPORT_SYMBOL(generic_file_buffered_write); |
2467 | 2467 | ||
2468 | /** | 2468 | /** |
2469 | * __generic_file_aio_write - write data to a file | 2469 | * __generic_file_aio_write - write data to a file |
2470 | * @iocb: IO state structure (file, offset, etc.) | 2470 | * @iocb: IO state structure (file, offset, etc.) |
2471 | * @iov: vector with data to write | 2471 | * @iov: vector with data to write |
2472 | * @nr_segs: number of segments in the vector | 2472 | * @nr_segs: number of segments in the vector |
2473 | * @ppos: position where to write | 2473 | * @ppos: position where to write |
2474 | * | 2474 | * |
2475 | * This function does all the work needed for actually writing data to a | 2475 | * This function does all the work needed for actually writing data to a |
2476 | * file. It does all basic checks, removes SUID from the file, updates | 2476 | * file. It does all basic checks, removes SUID from the file, updates |
2477 | * modification times and calls proper subroutines depending on whether we | 2477 | * modification times and calls proper subroutines depending on whether we |
2478 | * do direct IO or a standard buffered write. | 2478 | * do direct IO or a standard buffered write. |
2479 | * | 2479 | * |
2480 | * It expects i_mutex to be grabbed unless we work on a block device or similar | 2480 | * It expects i_mutex to be grabbed unless we work on a block device or similar |
2481 | * object which does not need locking at all. | 2481 | * object which does not need locking at all. |
2482 | * | 2482 | * |
2483 | * This function does *not* take care of syncing data in case of O_SYNC write. | 2483 | * This function does *not* take care of syncing data in case of O_SYNC write. |
2484 | * A caller has to handle it. This is mainly due to the fact that we want to | 2484 | * A caller has to handle it. This is mainly due to the fact that we want to |
2485 | * avoid syncing under i_mutex. | 2485 | * avoid syncing under i_mutex. |
2486 | */ | 2486 | */ |
2487 | ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | 2487 | ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, |
2488 | unsigned long nr_segs, loff_t *ppos) | 2488 | unsigned long nr_segs, loff_t *ppos) |
2489 | { | 2489 | { |
2490 | struct file *file = iocb->ki_filp; | 2490 | struct file *file = iocb->ki_filp; |
2491 | struct address_space * mapping = file->f_mapping; | 2491 | struct address_space * mapping = file->f_mapping; |
2492 | size_t ocount; /* original count */ | 2492 | size_t ocount; /* original count */ |
2493 | size_t count; /* after file limit checks */ | 2493 | size_t count; /* after file limit checks */ |
2494 | struct inode *inode = mapping->host; | 2494 | struct inode *inode = mapping->host; |
2495 | loff_t pos; | 2495 | loff_t pos; |
2496 | ssize_t written; | 2496 | ssize_t written; |
2497 | ssize_t err; | 2497 | ssize_t err; |
2498 | 2498 | ||
2499 | ocount = 0; | 2499 | ocount = 0; |
2500 | err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); | 2500 | err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); |
2501 | if (err) | 2501 | if (err) |
2502 | return err; | 2502 | return err; |
2503 | 2503 | ||
2504 | count = ocount; | 2504 | count = ocount; |
2505 | pos = *ppos; | 2505 | pos = *ppos; |
2506 | 2506 | ||
2507 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | 2507 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); |
2508 | 2508 | ||
2509 | /* We can write back this queue in page reclaim */ | 2509 | /* We can write back this queue in page reclaim */ |
2510 | current->backing_dev_info = mapping->backing_dev_info; | 2510 | current->backing_dev_info = mapping->backing_dev_info; |
2511 | written = 0; | 2511 | written = 0; |
2512 | 2512 | ||
2513 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); | 2513 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); |
2514 | if (err) | 2514 | if (err) |
2515 | goto out; | 2515 | goto out; |
2516 | 2516 | ||
2517 | if (count == 0) | 2517 | if (count == 0) |
2518 | goto out; | 2518 | goto out; |
2519 | 2519 | ||
2520 | err = file_remove_suid(file); | 2520 | err = file_remove_suid(file); |
2521 | if (err) | 2521 | if (err) |
2522 | goto out; | 2522 | goto out; |
2523 | 2523 | ||
2524 | file_update_time(file); | 2524 | file_update_time(file); |
2525 | 2525 | ||
2526 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 2526 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
2527 | if (unlikely(file->f_flags & O_DIRECT)) { | 2527 | if (unlikely(file->f_flags & O_DIRECT)) { |
2528 | loff_t endbyte; | 2528 | loff_t endbyte; |
2529 | ssize_t written_buffered; | 2529 | ssize_t written_buffered; |
2530 | 2530 | ||
2531 | written = generic_file_direct_write(iocb, iov, &nr_segs, pos, | 2531 | written = generic_file_direct_write(iocb, iov, &nr_segs, pos, |
2532 | ppos, count, ocount); | 2532 | ppos, count, ocount); |
2533 | if (written < 0 || written == count) | 2533 | if (written < 0 || written == count) |
2534 | goto out; | 2534 | goto out; |
2535 | /* | 2535 | /* |
2536 | * direct-io write to a hole: fall through to buffered I/O | 2536 | * direct-io write to a hole: fall through to buffered I/O |
2537 | * for completing the rest of the request. | 2537 | * for completing the rest of the request. |
2538 | */ | 2538 | */ |
2539 | pos += written; | 2539 | pos += written; |
2540 | count -= written; | 2540 | count -= written; |
2541 | written_buffered = generic_file_buffered_write(iocb, iov, | 2541 | written_buffered = generic_file_buffered_write(iocb, iov, |
2542 | nr_segs, pos, ppos, count, | 2542 | nr_segs, pos, ppos, count, |
2543 | written); | 2543 | written); |
2544 | /* | 2544 | /* |
2545 | * If generic_file_buffered_write() returned a synchronous error | 2545 | * If generic_file_buffered_write() returned a synchronous error |
2546 | * then we want to return the number of bytes which were | 2546 | * then we want to return the number of bytes which were |
2547 | * direct-written, or the error code if that was zero. Note | 2547 | * direct-written, or the error code if that was zero. Note |
2548 | * that this differs from normal direct-io semantics, which | 2548 | * that this differs from normal direct-io semantics, which |
2549 | * will return -EFOO even if some bytes were written. | 2549 | * will return -EFOO even if some bytes were written. |
2550 | */ | 2550 | */ |
2551 | if (written_buffered < 0) { | 2551 | if (written_buffered < 0) { |
2552 | err = written_buffered; | 2552 | err = written_buffered; |
2553 | goto out; | 2553 | goto out; |
2554 | } | 2554 | } |
2555 | 2555 | ||
2556 | /* | 2556 | /* |
2557 | * We need to ensure that the page cache pages are written to | 2557 | * We need to ensure that the page cache pages are written to |
2558 | * disk and invalidated to preserve the expected O_DIRECT | 2558 | * disk and invalidated to preserve the expected O_DIRECT |
2559 | * semantics. | 2559 | * semantics. |
2560 | */ | 2560 | */ |
2561 | endbyte = pos + written_buffered - written - 1; | 2561 | endbyte = pos + written_buffered - written - 1; |
2562 | err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); | 2562 | err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); |
2563 | if (err == 0) { | 2563 | if (err == 0) { |
2564 | written = written_buffered; | 2564 | written = written_buffered; |
2565 | invalidate_mapping_pages(mapping, | 2565 | invalidate_mapping_pages(mapping, |
2566 | pos >> PAGE_CACHE_SHIFT, | 2566 | pos >> PAGE_CACHE_SHIFT, |
2567 | endbyte >> PAGE_CACHE_SHIFT); | 2567 | endbyte >> PAGE_CACHE_SHIFT); |
2568 | } else { | 2568 | } else { |
2569 | /* | 2569 | /* |
2570 | * We don't know how much we wrote, so just return | 2570 | * We don't know how much we wrote, so just return |
2571 | * the number of bytes which were direct-written | 2571 | * the number of bytes which were direct-written |
2572 | */ | 2572 | */ |
2573 | } | 2573 | } |
2574 | } else { | 2574 | } else { |
2575 | written = generic_file_buffered_write(iocb, iov, nr_segs, | 2575 | written = generic_file_buffered_write(iocb, iov, nr_segs, |
2576 | pos, ppos, count, written); | 2576 | pos, ppos, count, written); |
2577 | } | 2577 | } |
2578 | out: | 2578 | out: |
2579 | current->backing_dev_info = NULL; | 2579 | current->backing_dev_info = NULL; |
2580 | return written ? written : err; | 2580 | return written ? written : err; |
2581 | } | 2581 | } |
2582 | EXPORT_SYMBOL(__generic_file_aio_write); | 2582 | EXPORT_SYMBOL(__generic_file_aio_write); |
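
The bookkeeping after the buffered fallback is easy to misread: pos was already advanced by the direct-written bytes, and generic_file_buffered_write() is handed the running total in `written`, so written_buffered comes back as direct plus buffered bytes. The buffered region on disk is therefore [pos, pos + written_buffered - written - 1]. A quick numeric check:

    #include <stdio.h>

    int main(void)
    {
        long long start = 1000;     /* original file position */
        long long written = 300;    /* bytes the direct IO managed */
        long long buffered = 200;   /* bytes the buffered fallback added */

        long long pos = start + written;                 /* 1300 */
        long long written_buffered = written + buffered; /* 500: running total */
        long long endbyte = pos + written_buffered - written - 1;

        /* the buffered bytes occupy offsets 1300..1499 */
        printf("flush and invalidate %lld..%lld\n", pos, endbyte);
        return 0;
    }
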
2583 | 2583 | ||
2584 | /** | 2584 | /** |
2585 | * generic_file_aio_write - write data to a file | 2585 | * generic_file_aio_write - write data to a file |
2586 | * @iocb: IO state structure | 2586 | * @iocb: IO state structure |
2587 | * @iov: vector with data to write | 2587 | * @iov: vector with data to write |
2588 | * @nr_segs: number of segments in the vector | 2588 | * @nr_segs: number of segments in the vector |
2589 | * @pos: position in file where to write | 2589 | * @pos: position in file where to write |
2590 | * | 2590 | * |
2591 | * This is a wrapper around __generic_file_aio_write() to be used by most | 2591 | * This is a wrapper around __generic_file_aio_write() to be used by most |
2592 | * filesystems. It takes care of syncing the file in case of O_SYNC file | 2592 | * filesystems. It takes care of syncing the file in case of O_SYNC file |
2593 | * and acquires i_mutex as needed. | 2593 | * and acquires i_mutex as needed. |
2594 | */ | 2594 | */ |
2595 | ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | 2595 | ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, |
2596 | unsigned long nr_segs, loff_t pos) | 2596 | unsigned long nr_segs, loff_t pos) |
2597 | { | 2597 | { |
2598 | struct file *file = iocb->ki_filp; | 2598 | struct file *file = iocb->ki_filp; |
2599 | struct inode *inode = file->f_mapping->host; | 2599 | struct inode *inode = file->f_mapping->host; |
2600 | struct blk_plug plug; | 2600 | struct blk_plug plug; |
2601 | ssize_t ret; | 2601 | ssize_t ret; |
2602 | 2602 | ||
2603 | BUG_ON(iocb->ki_pos != pos); | 2603 | BUG_ON(iocb->ki_pos != pos); |
2604 | 2604 | ||
2605 | mutex_lock(&inode->i_mutex); | 2605 | mutex_lock(&inode->i_mutex); |
2606 | blk_start_plug(&plug); | 2606 | blk_start_plug(&plug); |
2607 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 2607 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
2608 | mutex_unlock(&inode->i_mutex); | 2608 | mutex_unlock(&inode->i_mutex); |
2609 | 2609 | ||
2610 | if (ret > 0 || ret == -EIOCBQUEUED) { | 2610 | if (ret > 0 || ret == -EIOCBQUEUED) { |
2611 | ssize_t err; | 2611 | ssize_t err; |
2612 | 2612 | ||
2613 | err = generic_write_sync(file, pos, ret); | 2613 | err = generic_write_sync(file, pos, ret); |
2614 | if (err < 0 && ret > 0) | 2614 | if (err < 0 && ret > 0) |
2615 | ret = err; | 2615 | ret = err; |
2616 | } | 2616 | } |
2617 | blk_finish_plug(&plug); | 2617 | blk_finish_plug(&plug); |
2618 | return ret; | 2618 | return ret; |
2619 | } | 2619 | } |
2620 | EXPORT_SYMBOL(generic_file_aio_write); | 2620 | EXPORT_SYMBOL(generic_file_aio_write); |
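
Most disk filesystems of this era use the wrapper directly as their write method. A sketch of the typical wiring follows; the struct name is a placeholder, and simple filesystems such as ext2 hook up essentially this set of generic helpers:

    #include <linux/fs.h>

    /* Sketch: wiring generic_file_aio_write() into a filesystem. */
    const struct file_operations example_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = generic_file_aio_read,
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
        .fsync          = generic_file_fsync,
        .splice_read    = generic_file_splice_read,
        .splice_write   = generic_file_splice_write,
    };
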
2621 | 2621 | ||
2622 | /** | 2622 | /** |
2623 | * try_to_release_page() - release old fs-specific metadata on a page | 2623 | * try_to_release_page() - release old fs-specific metadata on a page |
2624 | * | 2624 | * |
2625 | * @page: the page which the kernel is trying to free | 2625 | * @page: the page which the kernel is trying to free |
2626 | * @gfp_mask: memory allocation flags (and I/O mode) | 2626 | * @gfp_mask: memory allocation flags (and I/O mode) |
2627 | * | 2627 | * |
2628 | * The address_space is asked to try to release any data against the page | 2628 | * The address_space is asked to try to release any data against the page |
2629 | * (presumably at page->private). If the release was successful, return `1'. | 2629 | * (presumably at page->private). If the release was successful, return `1'. |
2630 | * Otherwise return zero. | 2630 | * Otherwise return zero. |
2631 | * | 2631 | * |
2632 | * This may also be called if PG_fscache is set on a page, indicating that the | 2632 | * This may also be called if PG_fscache is set on a page, indicating that the |
2633 | * page is known to the local caching routines. | 2633 | * page is known to the local caching routines. |
2634 | * | 2634 | * |
2635 | * The @gfp_mask argument specifies whether I/O may be performed to release | 2635 | * The @gfp_mask argument specifies whether I/O may be performed to release |
2636 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). | 2636 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). |
2637 | * | 2637 | * |
2638 | */ | 2638 | */ |
2639 | int try_to_release_page(struct page *page, gfp_t gfp_mask) | 2639 | int try_to_release_page(struct page *page, gfp_t gfp_mask) |
2640 | { | 2640 | { |
2641 | struct address_space * const mapping = page->mapping; | 2641 | struct address_space * const mapping = page->mapping; |
2642 | 2642 | ||
2643 | BUG_ON(!PageLocked(page)); | 2643 | BUG_ON(!PageLocked(page)); |
2644 | if (PageWriteback(page)) | 2644 | if (PageWriteback(page)) |
2645 | return 0; | 2645 | return 0; |
2646 | 2646 | ||
2647 | if (mapping && mapping->a_ops->releasepage) | 2647 | if (mapping && mapping->a_ops->releasepage) |
2648 | return mapping->a_ops->releasepage(page, gfp_mask); | 2648 | return mapping->a_ops->releasepage(page, gfp_mask); |
2649 | return try_to_free_buffers(page); | 2649 | return try_to_free_buffers(page); |
2650 | } | 2650 | } |
2651 | 2651 | ||
2652 | EXPORT_SYMBOL(try_to_release_page); | 2652 | EXPORT_SYMBOL(try_to_release_page); |
2653 | 2653 |
mm/page-writeback.c
1 | /* | 1 | /* |
2 | * mm/page-writeback.c | 2 | * mm/page-writeback.c |
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds. | 4 | * Copyright (C) 2002, Linus Torvalds. |
5 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 5 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
6 | * | 6 | * |
7 | * Contains functions related to writing back dirty pages at the | 7 | * Contains functions related to writing back dirty pages at the |
8 | * address_space level. | 8 | * address_space level. |
9 | * | 9 | * |
10 | * 10Apr2002 Andrew Morton | 10 | * 10Apr2002 Andrew Morton |
11 | * Initial version | 11 | * Initial version |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/kernel.h> | 14 | #include <linux/kernel.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/spinlock.h> | 16 | #include <linux/spinlock.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
20 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/writeback.h> | 22 | #include <linux/writeback.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/backing-dev.h> | 24 | #include <linux/backing-dev.h> |
25 | #include <linux/task_io_accounting_ops.h> | 25 | #include <linux/task_io_accounting_ops.h> |
26 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
27 | #include <linux/mpage.h> | 27 | #include <linux/mpage.h> |
28 | #include <linux/rmap.h> | 28 | #include <linux/rmap.h> |
29 | #include <linux/percpu.h> | 29 | #include <linux/percpu.h> |
30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
31 | #include <linux/smp.h> | 31 | #include <linux/smp.h> |
32 | #include <linux/sysctl.h> | 32 | #include <linux/sysctl.h> |
33 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/buffer_head.h> | 35 | #include <linux/buffer_head.h> |
36 | #include <linux/pagevec.h> | 36 | #include <linux/pagevec.h> |
37 | #include <trace/events/writeback.h> | 37 | #include <trace/events/writeback.h> |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * Sleep at most 200ms at a time in balance_dirty_pages(). | ||
41 | */ | ||
42 | #define MAX_PAUSE max(HZ/5, 1) | ||
43 | |||
44 | /* | ||
45 | * Estimate write bandwidth at 200ms intervals. | ||
46 | */ | ||
47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) | ||
48 | |||
49 | /* | ||
40 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 50 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited |
41 | * will look to see if it needs to force writeback or throttling. | 51 | * will look to see if it needs to force writeback or throttling. |
42 | */ | 52 | */ |
43 | static long ratelimit_pages = 32; | 53 | static long ratelimit_pages = 32; |
44 | 54 | ||
45 | /* | 55 | /* |
46 | * When balance_dirty_pages decides that the caller needs to perform some | 56 | * When balance_dirty_pages decides that the caller needs to perform some |
47 | * non-background writeback, this is how many pages it will attempt to write. | 57 | * non-background writeback, this is how many pages it will attempt to write. |
48 | * It should be somewhat larger than dirtied pages to ensure that reasonably | 58 | * It should be somewhat larger than dirtied pages to ensure that reasonably |
49 | * large amounts of I/O are submitted. | 59 | * large amounts of I/O are submitted. |
50 | */ | 60 | */ |
51 | static inline long sync_writeback_pages(unsigned long dirtied) | 61 | static inline long sync_writeback_pages(unsigned long dirtied) |
52 | { | 62 | { |
53 | if (dirtied < ratelimit_pages) | 63 | if (dirtied < ratelimit_pages) |
54 | dirtied = ratelimit_pages; | 64 | dirtied = ratelimit_pages; |
55 | 65 | ||
56 | return dirtied + dirtied / 2; | 66 | return dirtied + dirtied / 2; |
57 | } | 67 | } |
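
As a worked example: with the default ratelimit_pages of 32, a task that dirtied anything up to 32 pages is asked to write back 48, and one that dirtied 1024 is asked for 1536. The 3/2 factor is what makes the submitted IO "somewhat larger" than the pages actually dirtied.
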
58 | 68 | ||
59 | /* The following parameters are exported via /proc/sys/vm */ | 69 | /* The following parameters are exported via /proc/sys/vm */ |
60 | 70 | ||
61 | /* | 71 | /* |
62 | * Start background writeback (via writeback threads) at this percentage | 72 | * Start background writeback (via writeback threads) at this percentage |
63 | */ | 73 | */ |
64 | int dirty_background_ratio = 10; | 74 | int dirty_background_ratio = 10; |
65 | 75 | ||
66 | /* | 76 | /* |
67 | * dirty_background_bytes starts at 0 (disabled) so that it is a function of | 77 | * dirty_background_bytes starts at 0 (disabled) so that it is a function of |
68 | * dirty_background_ratio * the amount of dirtyable memory | 78 | * dirty_background_ratio * the amount of dirtyable memory |
69 | */ | 79 | */ |
70 | unsigned long dirty_background_bytes; | 80 | unsigned long dirty_background_bytes; |
71 | 81 | ||
72 | /* | 82 | /* |
73 | * free highmem will not be subtracted from the total free memory | 83 | * free highmem will not be subtracted from the total free memory |
74 | * for calculating free ratios if vm_highmem_is_dirtyable is true | 84 | * for calculating free ratios if vm_highmem_is_dirtyable is true |
75 | */ | 85 | */ |
76 | int vm_highmem_is_dirtyable; | 86 | int vm_highmem_is_dirtyable; |
77 | 87 | ||
78 | /* | 88 | /* |
79 | * The generator of dirty data starts writeback at this percentage | 89 | * The generator of dirty data starts writeback at this percentage |
80 | */ | 90 | */ |
81 | int vm_dirty_ratio = 20; | 91 | int vm_dirty_ratio = 20; |
82 | 92 | ||
83 | /* | 93 | /* |
84 | * vm_dirty_bytes starts at 0 (disabled) so that it is a function of | 94 | * vm_dirty_bytes starts at 0 (disabled) so that it is a function of |
85 | * vm_dirty_ratio * the amount of dirtyable memory | 95 | * vm_dirty_ratio * the amount of dirtyable memory |
86 | */ | 96 | */ |
87 | unsigned long vm_dirty_bytes; | 97 | unsigned long vm_dirty_bytes; |
88 | 98 | ||
89 | /* | 99 | /* |
90 | * The interval between `kupdate'-style writebacks | 100 | * The interval between `kupdate'-style writebacks |
91 | */ | 101 | */ |
92 | unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ | 102 | unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ |
93 | 103 | ||
94 | /* | 104 | /* |
95 | * The longest time for which data is allowed to remain dirty | 105 | * The longest time for which data is allowed to remain dirty |
96 | */ | 106 | */ |
97 | unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ | 107 | unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ |
98 | 108 | ||
99 | /* | 109 | /* |
100 | * Flag that makes the machine dump writes/reads and block dirtyings. | 110 | * Flag that makes the machine dump writes/reads and block dirtyings. |
101 | */ | 111 | */ |
102 | int block_dump; | 112 | int block_dump; |
103 | 113 | ||
104 | /* | 114 | /* |
105 | * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: | 115 | * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: |
106 | * a full sync is triggered after this time elapses without any disk activity. | 116 | * a full sync is triggered after this time elapses without any disk activity. |
107 | */ | 117 | */ |
108 | int laptop_mode; | 118 | int laptop_mode; |
109 | 119 | ||
110 | EXPORT_SYMBOL(laptop_mode); | 120 | EXPORT_SYMBOL(laptop_mode); |
111 | 121 | ||
112 | /* End of sysctl-exported parameters */ | 122 | /* End of sysctl-exported parameters */ |
113 | 123 | ||
124 | unsigned long global_dirty_limit; | ||
114 | 125 | ||
115 | /* | 126 | /* |
116 | * Scale the writeback cache size proportional to the relative writeout speeds. | 127 | * Scale the writeback cache size proportional to the relative writeout speeds. |
117 | * | 128 | * |
118 | * We do this by keeping a floating proportion between BDIs, based on page | 129 | * We do this by keeping a floating proportion between BDIs, based on page |
119 | * writeback completions [end_page_writeback()]. Those devices that write out | 130 | * writeback completions [end_page_writeback()]. Those devices that write out |
120 | * pages fastest will get the larger share, while the slower will get a smaller | 131 | * pages fastest will get the larger share, while the slower will get a smaller |
121 | * share. | 132 | * share. |
122 | * | 133 | * |
123 | * We use page writeout completions because we are interested in getting rid of | 134 | * We use page writeout completions because we are interested in getting rid of |
124 | * dirty pages. Having them written out is the primary goal. | 135 | * dirty pages. Having them written out is the primary goal. |
125 | * | 136 | * |
126 | * We introduce a concept of time, a period over which we measure these events, | 137 | * We introduce a concept of time, a period over which we measure these events, |
127 | * because demand can/will vary over time. The length of this period itself is | 138 | * because demand can/will vary over time. The length of this period itself is |
128 | * measured in page writeback completions. | 139 | * measured in page writeback completions. |
129 | * | 140 | * |
130 | */ | 141 | */ |
131 | static struct prop_descriptor vm_completions; | 142 | static struct prop_descriptor vm_completions; |
132 | static struct prop_descriptor vm_dirties; | 143 | static struct prop_descriptor vm_dirties; |
133 | 144 | ||
134 | /* | 145 | /* |
135 | * couple the period to the dirty_ratio: | 146 | * couple the period to the dirty_ratio: |
136 | * | 147 | * |
137 | * period/2 ~ roundup_pow_of_two(dirty limit) | 148 | * period/2 ~ roundup_pow_of_two(dirty limit) |
138 | */ | 149 | */ |
139 | static int calc_period_shift(void) | 150 | static int calc_period_shift(void) |
140 | { | 151 | { |
141 | unsigned long dirty_total; | 152 | unsigned long dirty_total; |
142 | 153 | ||
143 | if (vm_dirty_bytes) | 154 | if (vm_dirty_bytes) |
144 | dirty_total = vm_dirty_bytes / PAGE_SIZE; | 155 | dirty_total = vm_dirty_bytes / PAGE_SIZE; |
145 | else | 156 | else |
146 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / | 157 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / |
147 | 100; | 158 | 100; |
148 | return 2 + ilog2(dirty_total - 1); | 159 | return 2 + ilog2(dirty_total - 1); |
149 | } | 160 | } |
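
A worked example of the shift calculation, assuming vm_dirty_ratio = 20 and 4 GiB of dirtyable memory in 4 KiB pages (an assumed figure): dirty_total is 209,715 pages, ilog2(209,714) is 17, so the shift is 19 and period/2 = 2^18 = 262,144, which is exactly roundup_pow_of_two(dirty limit) as the comment above promises.

    #include <stdio.h>

    /* Worked example of calc_period_shift() for 20% of 1,048,576 pages. */
    static int ilog2_ul(unsigned long v)
    {
        int l = -1;
        while (v) {                 /* floor(log2(v)) */
            v >>= 1;
            l++;
        }
        return l;
    }

    int main(void)
    {
        unsigned long dirtyable = 1048576;  /* assumed: 4 GiB of 4 KiB pages */
        unsigned long dirty_total = 20 * dirtyable / 100;   /* 209715 */
        int shift = 2 + ilog2_ul(dirty_total - 1);          /* 19 */

        printf("dirty_total=%lu shift=%d period/2=%lu\n",
               dirty_total, shift, 1UL << (shift - 1));     /* 262144 */
        return 0;
    }
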
150 | 161 | ||
151 | /* | 162 | /* |
152 | * update the period when the dirty threshold changes. | 163 | * update the period when the dirty threshold changes. |
153 | */ | 164 | */ |
154 | static void update_completion_period(void) | 165 | static void update_completion_period(void) |
155 | { | 166 | { |
156 | int shift = calc_period_shift(); | 167 | int shift = calc_period_shift(); |
157 | prop_change_shift(&vm_completions, shift); | 168 | prop_change_shift(&vm_completions, shift); |
158 | prop_change_shift(&vm_dirties, shift); | 169 | prop_change_shift(&vm_dirties, shift); |
159 | } | 170 | } |
160 | 171 | ||
161 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 172 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
162 | void __user *buffer, size_t *lenp, | 173 | void __user *buffer, size_t *lenp, |
163 | loff_t *ppos) | 174 | loff_t *ppos) |
164 | { | 175 | { |
165 | int ret; | 176 | int ret; |
166 | 177 | ||
167 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 178 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
168 | if (ret == 0 && write) | 179 | if (ret == 0 && write) |
169 | dirty_background_bytes = 0; | 180 | dirty_background_bytes = 0; |
170 | return ret; | 181 | return ret; |
171 | } | 182 | } |
172 | 183 | ||
173 | int dirty_background_bytes_handler(struct ctl_table *table, int write, | 184 | int dirty_background_bytes_handler(struct ctl_table *table, int write, |
174 | void __user *buffer, size_t *lenp, | 185 | void __user *buffer, size_t *lenp, |
175 | loff_t *ppos) | 186 | loff_t *ppos) |
176 | { | 187 | { |
177 | int ret; | 188 | int ret; |
178 | 189 | ||
179 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); | 190 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
180 | if (ret == 0 && write) | 191 | if (ret == 0 && write) |
181 | dirty_background_ratio = 0; | 192 | dirty_background_ratio = 0; |
182 | return ret; | 193 | return ret; |
183 | } | 194 | } |
184 | 195 | ||
185 | int dirty_ratio_handler(struct ctl_table *table, int write, | 196 | int dirty_ratio_handler(struct ctl_table *table, int write, |
186 | void __user *buffer, size_t *lenp, | 197 | void __user *buffer, size_t *lenp, |
187 | loff_t *ppos) | 198 | loff_t *ppos) |
188 | { | 199 | { |
189 | int old_ratio = vm_dirty_ratio; | 200 | int old_ratio = vm_dirty_ratio; |
190 | int ret; | 201 | int ret; |
191 | 202 | ||
192 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 203 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
193 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | 204 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
194 | update_completion_period(); | 205 | update_completion_period(); |
195 | vm_dirty_bytes = 0; | 206 | vm_dirty_bytes = 0; |
196 | } | 207 | } |
197 | return ret; | 208 | return ret; |
198 | } | 209 | } |
199 | 210 | ||
200 | 211 | ||
201 | int dirty_bytes_handler(struct ctl_table *table, int write, | 212 | int dirty_bytes_handler(struct ctl_table *table, int write, |
202 | void __user *buffer, size_t *lenp, | 213 | void __user *buffer, size_t *lenp, |
203 | loff_t *ppos) | 214 | loff_t *ppos) |
204 | { | 215 | { |
205 | unsigned long old_bytes = vm_dirty_bytes; | 216 | unsigned long old_bytes = vm_dirty_bytes; |
206 | int ret; | 217 | int ret; |
207 | 218 | ||
208 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); | 219 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
209 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { | 220 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { |
210 | update_completion_period(); | 221 | update_completion_period(); |
211 | vm_dirty_ratio = 0; | 222 | vm_dirty_ratio = 0; |
212 | } | 223 | } |
213 | return ret; | 224 | return ret; |
214 | } | 225 | } |
215 | 226 | ||
216 | /* | 227 | /* |
217 | * Increment the BDI's writeout completion count and the global writeout | 228 | * Increment the BDI's writeout completion count and the global writeout |
218 | * completion count. Called from test_clear_page_writeback(). | 229 | * completion count. Called from test_clear_page_writeback(). |
219 | */ | 230 | */ |
220 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | 231 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) |
221 | { | 232 | { |
233 | __inc_bdi_stat(bdi, BDI_WRITTEN); | ||
222 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, | 234 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, |
223 | bdi->max_prop_frac); | 235 | bdi->max_prop_frac); |
224 | } | 236 | } |
225 | 237 | ||
226 | void bdi_writeout_inc(struct backing_dev_info *bdi) | 238 | void bdi_writeout_inc(struct backing_dev_info *bdi) |
227 | { | 239 | { |
228 | unsigned long flags; | 240 | unsigned long flags; |
229 | 241 | ||
230 | local_irq_save(flags); | 242 | local_irq_save(flags); |
231 | __bdi_writeout_inc(bdi); | 243 | __bdi_writeout_inc(bdi); |
232 | local_irq_restore(flags); | 244 | local_irq_restore(flags); |
233 | } | 245 | } |
234 | EXPORT_SYMBOL_GPL(bdi_writeout_inc); | 246 | EXPORT_SYMBOL_GPL(bdi_writeout_inc); |
235 | 247 | ||
236 | void task_dirty_inc(struct task_struct *tsk) | 248 | void task_dirty_inc(struct task_struct *tsk) |
237 | { | 249 | { |
238 | prop_inc_single(&vm_dirties, &tsk->dirties); | 250 | prop_inc_single(&vm_dirties, &tsk->dirties); |
239 | } | 251 | } |
240 | 252 | ||
241 | /* | 253 | /* |
242 | * Obtain an accurate fraction of the BDI's portion. | 254 | * Obtain an accurate fraction of the BDI's portion. |
243 | */ | 255 | */ |
244 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | 256 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, |
245 | long *numerator, long *denominator) | 257 | long *numerator, long *denominator) |
246 | { | 258 | { |
247 | if (bdi_cap_writeback_dirty(bdi)) { | 259 | prop_fraction_percpu(&vm_completions, &bdi->completions, |
248 | prop_fraction_percpu(&vm_completions, &bdi->completions, | ||
249 | numerator, denominator); | 260 | numerator, denominator); |
250 | } else { | ||
251 | *numerator = 0; | ||
252 | *denominator = 1; | ||
253 | } | ||
254 | } | 261 | } |
255 | 262 | ||
256 | static inline void task_dirties_fraction(struct task_struct *tsk, | 263 | static inline void task_dirties_fraction(struct task_struct *tsk, |
257 | long *numerator, long *denominator) | 264 | long *numerator, long *denominator) |
258 | { | 265 | { |
259 | prop_fraction_single(&vm_dirties, &tsk->dirties, | 266 | prop_fraction_single(&vm_dirties, &tsk->dirties, |
260 | numerator, denominator); | 267 | numerator, denominator); |
261 | } | 268 | } |
262 | 269 | ||
263 | /* | 270 | /* |
264 | * task_dirty_limit - scale down dirty throttling threshold for one task | 271 | * task_dirty_limit - scale down dirty throttling threshold for one task |
265 | * | 272 | * |
266 | * task specific dirty limit: | 273 | * task specific dirty limit: |
267 | * | 274 | * |
268 | * dirty -= (dirty/8) * p_{t} | 275 | * dirty -= (dirty/8) * p_{t} |
269 | * | 276 | * |
270 | * To protect light/slow dirtying tasks from heavier/fast ones, we start | 277 | * To protect light/slow dirtying tasks from heavier/fast ones, we start |
271 | * throttling individual tasks before reaching the bdi dirty limit. | 278 | * throttling individual tasks before reaching the bdi dirty limit. |
272 | * Relatively low thresholds will be allocated to heavy dirtiers. So when | 279 | * Relatively low thresholds will be allocated to heavy dirtiers. So when |
273 | * dirty pages grow large, heavy dirtiers will be throttled first, which will | 280 | * dirty pages grow large, heavy dirtiers will be throttled first, which will |
274 | * effectively curb the growth of dirty pages. Light dirtiers with high enough | 281 | * effectively curb the growth of dirty pages. Light dirtiers with high enough |
275 | * dirty threshold may never get throttled. | 282 | * dirty threshold may never get throttled. |
276 | */ | 283 | */ |
284 | #define TASK_LIMIT_FRACTION 8 | ||
277 | static unsigned long task_dirty_limit(struct task_struct *tsk, | 285 | static unsigned long task_dirty_limit(struct task_struct *tsk, |
278 | unsigned long bdi_dirty) | 286 | unsigned long bdi_dirty) |
279 | { | 287 | { |
280 | long numerator, denominator; | 288 | long numerator, denominator; |
281 | unsigned long dirty = bdi_dirty; | 289 | unsigned long dirty = bdi_dirty; |
282 | u64 inv = dirty >> 3; | 290 | u64 inv = dirty / TASK_LIMIT_FRACTION; |
283 | 291 | ||
284 | task_dirties_fraction(tsk, &numerator, &denominator); | 292 | task_dirties_fraction(tsk, &numerator, &denominator); |
285 | inv *= numerator; | 293 | inv *= numerator; |
286 | do_div(inv, denominator); | 294 | do_div(inv, denominator); |
287 | 295 | ||
288 | dirty -= inv; | 296 | dirty -= inv; |
289 | 297 | ||
290 | return max(dirty, bdi_dirty/2); | 298 | return max(dirty, bdi_dirty/2); |
291 | } | 299 | } |
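Worked numbers for the formula: with TASK_LIMIT_FRACTION = 8, a task that did fraction p of the recent dirtying has its limit cut by (bdi_dirty/8) * p, floored at half the bdi limit. A minimal standalone sketch mirroring the arithmetic above:

/* Standalone sketch of the task_dirty_limit() arithmetic. */
#include <stdio.h>

#define TASK_LIMIT_FRACTION 8

static unsigned long task_limit(unsigned long bdi_dirty,
				long numerator, long denominator)
{
	unsigned long dirty = bdi_dirty;
	unsigned long inv = dirty / TASK_LIMIT_FRACTION;

	inv = inv * numerator / denominator;	/* (dirty/8) * p_{t} */
	dirty -= inv;
	return dirty > bdi_dirty / 2 ? dirty : bdi_dirty / 2;
}

int main(void)
{
	/* A task doing all the dirtying (p = 1) is cut to 7/8: 700. */
	printf("%lu\n", task_limit(800, 1, 1));
	/* A light task (p = 1/10) keeps nearly the full limit: 790. */
	printf("%lu\n", task_limit(800, 1, 10));
	return 0;
}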
292 | 300 | ||
301 | /* Minimum limit for any task */ | ||
302 | static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) | ||
303 | { | ||
304 | return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION; | ||
305 | } | ||
306 | |||
293 | /* | 307 | /* |
294 | * | 308 | * |
295 | */ | 309 | */ |
296 | static unsigned int bdi_min_ratio; | 310 | static unsigned int bdi_min_ratio; |
297 | 311 | ||
298 | int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) | 312 | int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) |
299 | { | 313 | { |
300 | int ret = 0; | 314 | int ret = 0; |
301 | 315 | ||
302 | spin_lock_bh(&bdi_lock); | 316 | spin_lock_bh(&bdi_lock); |
303 | if (min_ratio > bdi->max_ratio) { | 317 | if (min_ratio > bdi->max_ratio) { |
304 | ret = -EINVAL; | 318 | ret = -EINVAL; |
305 | } else { | 319 | } else { |
306 | min_ratio -= bdi->min_ratio; | 320 | min_ratio -= bdi->min_ratio; |
307 | if (bdi_min_ratio + min_ratio < 100) { | 321 | if (bdi_min_ratio + min_ratio < 100) { |
308 | bdi_min_ratio += min_ratio; | 322 | bdi_min_ratio += min_ratio; |
309 | bdi->min_ratio += min_ratio; | 323 | bdi->min_ratio += min_ratio; |
310 | } else { | 324 | } else { |
311 | ret = -EINVAL; | 325 | ret = -EINVAL; |
312 | } | 326 | } |
313 | } | 327 | } |
314 | spin_unlock_bh(&bdi_lock); | 328 | spin_unlock_bh(&bdi_lock); |
315 | 329 | ||
316 | return ret; | 330 | return ret; |
317 | } | 331 | } |
318 | 332 | ||
319 | int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) | 333 | int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) |
320 | { | 334 | { |
321 | int ret = 0; | 335 | int ret = 0; |
322 | 336 | ||
323 | if (max_ratio > 100) | 337 | if (max_ratio > 100) |
324 | return -EINVAL; | 338 | return -EINVAL; |
325 | 339 | ||
326 | spin_lock_bh(&bdi_lock); | 340 | spin_lock_bh(&bdi_lock); |
327 | if (bdi->min_ratio > max_ratio) { | 341 | if (bdi->min_ratio > max_ratio) { |
328 | ret = -EINVAL; | 342 | ret = -EINVAL; |
329 | } else { | 343 | } else { |
330 | bdi->max_ratio = max_ratio; | 344 | bdi->max_ratio = max_ratio; |
331 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; | 345 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; |
332 | } | 346 | } |
333 | spin_unlock_bh(&bdi_lock); | 347 | spin_unlock_bh(&bdi_lock); |
334 | 348 | ||
335 | return ret; | 349 | return ret; |
336 | } | 350 | } |
337 | EXPORT_SYMBOL(bdi_set_max_ratio); | 351 | EXPORT_SYMBOL(bdi_set_max_ratio); |
338 | 352 | ||
339 | /* | 353 | /* |
340 | * Work out the current dirty-memory clamping and background writeout | 354 | * Work out the current dirty-memory clamping and background writeout |
341 | * thresholds. | 355 | * thresholds. |
342 | * | 356 | * |
343 | * The main aim here is to lower them aggressively if there is a lot of mapped | 357 | * The main aim here is to lower them aggressively if there is a lot of mapped |
344 | * memory around, to avoid stressing page reclaim with lots of unreclaimable | 358 | * memory around, to avoid stressing page reclaim with lots of unreclaimable |
345 | * pages. It is better to clamp down on writers than to start swapping and | 359 | * pages. It is better to clamp down on writers than to start swapping and |
346 | * performing lots of scanning. | 360 | * performing lots of scanning. |
347 | * | 361 | * |
348 | * We only allow 1/2 of the currently-unmapped memory to be dirtied. | 362 | * We only allow 1/2 of the currently-unmapped memory to be dirtied. |
349 | * | 363 | * |
350 | * We don't permit the clamping level to fall below 5% - that is getting rather | 364 | * We don't permit the clamping level to fall below 5% - that is getting rather |
351 | * excessive. | 365 | * excessive. |
352 | * | 366 | * |
353 | * We make sure that the background writeout level is below the adjusted | 367 | * We make sure that the background writeout level is below the adjusted |
354 | * clamping level. | 368 | * clamping level. |
355 | */ | 369 | */ |
356 | 370 | ||
357 | static unsigned long highmem_dirtyable_memory(unsigned long total) | 371 | static unsigned long highmem_dirtyable_memory(unsigned long total) |
358 | { | 372 | { |
359 | #ifdef CONFIG_HIGHMEM | 373 | #ifdef CONFIG_HIGHMEM |
360 | int node; | 374 | int node; |
361 | unsigned long x = 0; | 375 | unsigned long x = 0; |
362 | 376 | ||
363 | for_each_node_state(node, N_HIGH_MEMORY) { | 377 | for_each_node_state(node, N_HIGH_MEMORY) { |
364 | struct zone *z = | 378 | struct zone *z = |
365 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | 379 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
366 | 380 | ||
367 | x += zone_page_state(z, NR_FREE_PAGES) + | 381 | x += zone_page_state(z, NR_FREE_PAGES) + |
368 | zone_reclaimable_pages(z); | 382 | zone_reclaimable_pages(z); |
369 | } | 383 | } |
370 | /* | 384 | /* |
371 | * Make sure that the number of highmem pages is never larger | 385 | * Make sure that the number of highmem pages is never larger |
372 | * than the total amount of dirtyable memory. This can only | 386 | * than the total amount of dirtyable memory. This can only |
373 | * happen in very strange VM situations, but we want to guard | 387 | * happen in very strange VM situations, but we want to guard |
374 | * against it anyway. | 388 | * against it anyway. |
375 | */ | 389 | */ |
376 | return min(x, total); | 390 | return min(x, total); |
377 | #else | 391 | #else |
378 | return 0; | 392 | return 0; |
379 | #endif | 393 | #endif |
380 | } | 394 | } |
381 | 395 | ||
382 | /** | 396 | /** |
383 | * determine_dirtyable_memory - amount of memory that may be used | 397 | * determine_dirtyable_memory - amount of memory that may be used |
384 | * | 398 | * |
385 | * Returns the number of pages that can currently be freed and used | 399 | * Returns the number of pages that can currently be freed and used |
386 | * by the kernel for direct mappings. | 400 | * by the kernel for direct mappings. |
387 | */ | 401 | */ |
388 | unsigned long determine_dirtyable_memory(void) | 402 | unsigned long determine_dirtyable_memory(void) |
389 | { | 403 | { |
390 | unsigned long x; | 404 | unsigned long x; |
391 | 405 | ||
392 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); | 406 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); |
393 | 407 | ||
394 | if (!vm_highmem_is_dirtyable) | 408 | if (!vm_highmem_is_dirtyable) |
395 | x -= highmem_dirtyable_memory(x); | 409 | x -= highmem_dirtyable_memory(x); |
396 | 410 | ||
397 | return x + 1; /* Ensure that we never return 0 */ | 411 | return x + 1; /* Ensure that we never return 0 */ |
398 | } | 412 | } |
399 | 413 | ||
414 | static unsigned long hard_dirty_limit(unsigned long thresh) | ||
415 | { | ||
416 | return max(thresh, global_dirty_limit); | ||
417 | } | ||
418 | |||
400 | /* | 419 | /* |
401 | * global_dirty_limits - background-writeback and dirty-throttling thresholds | 420 | * global_dirty_limits - background-writeback and dirty-throttling thresholds |
402 | * | 421 | * |
403 | * Calculate the dirty thresholds based on sysctl parameters | 422 | * Calculate the dirty thresholds based on sysctl parameters |
404 | * - vm.dirty_background_ratio or vm.dirty_background_bytes | 423 | * - vm.dirty_background_ratio or vm.dirty_background_bytes |
405 | * - vm.dirty_ratio or vm.dirty_bytes | 424 | * - vm.dirty_ratio or vm.dirty_bytes |
406 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and | 425 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and |
407 | * real-time tasks. | 426 | * real-time tasks. |
408 | */ | 427 | */ |
409 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | 428 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) |
410 | { | 429 | { |
411 | unsigned long background; | 430 | unsigned long background; |
412 | unsigned long dirty; | 431 | unsigned long dirty; |
413 | unsigned long uninitialized_var(available_memory); | 432 | unsigned long uninitialized_var(available_memory); |
414 | struct task_struct *tsk; | 433 | struct task_struct *tsk; |
415 | 434 | ||
416 | if (!vm_dirty_bytes || !dirty_background_bytes) | 435 | if (!vm_dirty_bytes || !dirty_background_bytes) |
417 | available_memory = determine_dirtyable_memory(); | 436 | available_memory = determine_dirtyable_memory(); |
418 | 437 | ||
419 | if (vm_dirty_bytes) | 438 | if (vm_dirty_bytes) |
420 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); | 439 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); |
421 | else | 440 | else |
422 | dirty = (vm_dirty_ratio * available_memory) / 100; | 441 | dirty = (vm_dirty_ratio * available_memory) / 100; |
423 | 442 | ||
424 | if (dirty_background_bytes) | 443 | if (dirty_background_bytes) |
425 | background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); | 444 | background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); |
426 | else | 445 | else |
427 | background = (dirty_background_ratio * available_memory) / 100; | 446 | background = (dirty_background_ratio * available_memory) / 100; |
428 | 447 | ||
429 | if (background >= dirty) | 448 | if (background >= dirty) |
430 | background = dirty / 2; | 449 | background = dirty / 2; |
431 | tsk = current; | 450 | tsk = current; |
432 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | 451 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { |
433 | background += background / 4; | 452 | background += background / 4; |
434 | dirty += dirty / 4; | 453 | dirty += dirty / 4; |
435 | } | 454 | } |
436 | *pbackground = background; | 455 | *pbackground = background; |
437 | *pdirty = dirty; | 456 | *pdirty = dirty; |
457 | trace_global_dirty_state(background, dirty); | ||
438 | } | 458 | } |
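To make the arithmetic concrete, a standalone sketch with illustrative numbers (1,000,000 dirtyable pages and the default knobs, dirty_ratio = 20 and dirty_background_ratio = 10):

/* Standalone sketch of the global threshold computation above. */
#include <stdio.h>

int main(void)
{
	unsigned long available_memory = 1000000;
	unsigned long dirty = 20 * available_memory / 100;	/* 200000 */
	unsigned long background = 10 * available_memory / 100;	/* 100000 */

	if (background >= dirty)
		background = dirty / 2;
	/* PF_LESS_THROTTLE (e.g. nfsd) and rt tasks get 1/4 more headroom. */
	printf("background=%lu dirty=%lu (lifted: %lu/%lu)\n",
	       background, dirty,
	       background + background / 4, dirty + dirty / 4);
	return 0;
}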
439 | 459 | ||
440 | /* | 460 | /** |
441 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold | 461 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold |
462 | * @bdi: the backing_dev_info to query | ||
463 | * @dirty: global dirty limit in pages | ||
442 | * | 464 | * |
443 | * Allocate high/low dirty limits to fast/slow devices, in order to prevent | 465 | * Returns @bdi's dirty limit in pages. The term "dirty" in the context of |
466 | * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. | ||
467 | * The "limit" in the name is not taken as a hard limit in | ||
468 | * balance_dirty_pages(). | ||
469 | * | ||
470 | * It allocates high/low dirty limits to fast/slow devices, in order to prevent | ||
444 | * - starving fast devices | 471 | * - starving fast devices |
445 | * - piling up dirty pages (that will take long time to sync) on slow devices | 472 | * - piling up dirty pages (that will take long time to sync) on slow devices |
446 | * | 473 | * |
447 | * The bdi's share of dirty limit will be adapting to its throughput and | 474 | * The bdi's share of dirty limit will be adapting to its throughput and |
448 | * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. | 475 | * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. |
449 | */ | 476 | */ |
450 | unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | 477 | unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) |
451 | { | 478 | { |
452 | u64 bdi_dirty; | 479 | u64 bdi_dirty; |
453 | long numerator, denominator; | 480 | long numerator, denominator; |
454 | 481 | ||
455 | /* | 482 | /* |
456 | * Calculate this BDI's share of the dirty ratio. | 483 | * Calculate this BDI's share of the dirty ratio. |
457 | */ | 484 | */ |
458 | bdi_writeout_fraction(bdi, &numerator, &denominator); | 485 | bdi_writeout_fraction(bdi, &numerator, &denominator); |
459 | 486 | ||
460 | bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; | 487 | bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; |
461 | bdi_dirty *= numerator; | 488 | bdi_dirty *= numerator; |
462 | do_div(bdi_dirty, denominator); | 489 | do_div(bdi_dirty, denominator); |
463 | 490 | ||
464 | bdi_dirty += (dirty * bdi->min_ratio) / 100; | 491 | bdi_dirty += (dirty * bdi->min_ratio) / 100; |
465 | if (bdi_dirty > (dirty * bdi->max_ratio) / 100) | 492 | if (bdi_dirty > (dirty * bdi->max_ratio) / 100) |
466 | bdi_dirty = dirty * bdi->max_ratio / 100; | 493 | bdi_dirty = dirty * bdi->max_ratio / 100; |
467 | 494 | ||
468 | return bdi_dirty; | 495 | return bdi_dirty; |
469 | } | 496 | } |
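A standalone sketch of the share computation with assumed numbers: a 1000-page global limit, a bdi that completed 1/4 of the recent writeout, min_ratio = 5 (so the accumulated bdi_min_ratio is also 5 here) and max_ratio = 50:

/* Standalone sketch of bdi_dirty_limit() with assumed inputs. */
#include <stdio.h>

int main(void)
{
	unsigned long dirty = 1000;
	unsigned int bdi_min_ratio = 5, min_ratio = 5, max_ratio = 50;
	long numerator = 1, denominator = 4;
	unsigned long bdi_dirty;

	bdi_dirty = dirty * (100 - bdi_min_ratio) / 100;	/* 950 */
	bdi_dirty = bdi_dirty * numerator / denominator;	/* 237 */
	bdi_dirty += dirty * min_ratio / 100;			/* +50 = 287 */
	if (bdi_dirty > dirty * max_ratio / 100)		/* cap at 500 */
		bdi_dirty = dirty * max_ratio / 100;
	printf("bdi share = %lu pages\n", bdi_dirty);
	return 0;
}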
470 | 497 | ||
498 | static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, | ||
499 | unsigned long elapsed, | ||
500 | unsigned long written) | ||
501 | { | ||
502 | const unsigned long period = roundup_pow_of_two(3 * HZ); | ||
503 | unsigned long avg = bdi->avg_write_bandwidth; | ||
504 | unsigned long old = bdi->write_bandwidth; | ||
505 | u64 bw; | ||
506 | |||
507 | /* | ||
508 | * bw = written * HZ / elapsed | ||
509 | * | ||
510 | * bw * elapsed + write_bandwidth * (period - elapsed) | ||
511 | * write_bandwidth = --------------------------------------------------- | ||
512 | * period | ||
513 | */ | ||
514 | bw = written - bdi->written_stamp; | ||
515 | bw *= HZ; | ||
516 | if (unlikely(elapsed > period)) { | ||
517 | do_div(bw, elapsed); | ||
518 | avg = bw; | ||
519 | goto out; | ||
520 | } | ||
521 | bw += (u64)bdi->write_bandwidth * (period - elapsed); | ||
522 | bw >>= ilog2(period); | ||
523 | |||
524 | /* | ||
525 | * one more level of smoothing, for filtering out sudden spikes | ||
526 | */ | ||
527 | if (avg > old && old >= (unsigned long)bw) | ||
528 | avg -= (avg - old) >> 3; | ||
529 | |||
530 | if (avg < old && old <= (unsigned long)bw) | ||
531 | avg += (old - avg) >> 3; | ||
532 | |||
533 | out: | ||
534 | bdi->write_bandwidth = bw; | ||
535 | bdi->avg_write_bandwidth = avg; | ||
536 | } | ||
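One smoothing step of the formula in the comment, as a standalone sketch. It assumes HZ = 1000, so period = roundup_pow_of_two(3 * HZ) = 4096 jiffies and the division by period becomes a shift by ilog2(4096) = 12:

/* Standalone sketch of one write_bandwidth smoothing step. */
#include <stdio.h>

int main(void)
{
	const unsigned long hz = 1000, period = 4096;
	unsigned long elapsed = 200;	/* jiffies since the last update */
	unsigned long written = 25600;	/* pages completed in that window */
	unsigned long old = 100000;	/* previous estimate, pages/second */
	unsigned long long bw;

	/* written * hz == instantaneous bw (128000 pages/s) * elapsed */
	bw = (unsigned long long)written * hz;
	bw += (unsigned long long)old * (period - elapsed);
	bw >>= 12;			/* ilog2(4096), i.e. / period */
	printf("new estimate: %llu pages/s\n", bw);	/* ~101367 */
	return 0;
}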
537 | |||
471 | /* | 538 | /* |
539 | * The global dirtyable memory and dirty threshold could be suddenly knocked | ||
540 | * down by a large amount (eg. on the startup of KVM in a swapless system). | ||
541 | * This may throw the system into deep dirty exceeded state and throttle | ||
542 | * heavy/light dirtiers alike. To retain good responsiveness, maintain | ||
543 | * global_dirty_limit, which tracks slowly down to the knocked-down dirty | ||
544 | * threshold. | ||
545 | */ | ||
546 | static void update_dirty_limit(unsigned long thresh, unsigned long dirty) | ||
547 | { | ||
548 | unsigned long limit = global_dirty_limit; | ||
549 | |||
550 | /* | ||
551 | * Follow up in one step. | ||
552 | */ | ||
553 | if (limit < thresh) { | ||
554 | limit = thresh; | ||
555 | goto update; | ||
556 | } | ||
557 | |||
558 | /* | ||
559 | * Follow down slowly. Use the higher one as the target, because thresh | ||
560 | * may drop below dirty. This is exactly the reason to introduce | ||
561 | * global_dirty_limit which is guaranteed to lie above the dirty pages. | ||
562 | */ | ||
563 | thresh = max(thresh, dirty); | ||
564 | if (limit > thresh) { | ||
565 | limit -= (limit - thresh) >> 5; | ||
566 | goto update; | ||
567 | } | ||
568 | return; | ||
569 | update: | ||
570 | global_dirty_limit = limit; | ||
571 | } | ||
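The follow-down is geometric: each update moves global_dirty_limit 1/32 of the way toward max(thresh, dirty) and effectively stops once the gap is under 32 pages. A standalone sketch of the trajectory:

/* Standalone sketch: global_dirty_limit trailing a knocked-down thresh.
 * The kernel performs one such update per BANDWIDTH_INTERVAL. */
#include <stdio.h>

int main(void)
{
	unsigned long limit = 1000;	/* old global_dirty_limit */
	unsigned long thresh = 200;	/* suddenly lowered threshold */
	unsigned long dirty = 300;	/* current dirty pages */
	unsigned long target = thresh > dirty ? thresh : dirty;
	int steps = 0;

	while (limit > target) {
		unsigned long step = (limit - target) >> 5;

		if (!step)		/* the kernel stops adjusting here too */
			break;
		limit -= step;
		steps++;
	}
	printf("settled at %lu pages after %d updates\n", limit, steps);
	return 0;
}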
572 | |||
573 | static void global_update_bandwidth(unsigned long thresh, | ||
574 | unsigned long dirty, | ||
575 | unsigned long now) | ||
576 | { | ||
577 | static DEFINE_SPINLOCK(dirty_lock); | ||
578 | static unsigned long update_time; | ||
579 | |||
580 | /* | ||
581 | * check locklessly first to optimize away locking for the most time | ||
582 | */ | ||
583 | if (time_before(now, update_time + BANDWIDTH_INTERVAL)) | ||
584 | return; | ||
585 | |||
586 | spin_lock(&dirty_lock); | ||
587 | if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { | ||
588 | update_dirty_limit(thresh, dirty); | ||
589 | update_time = now; | ||
590 | } | ||
591 | spin_unlock(&dirty_lock); | ||
592 | } | ||
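This is the check-then-lock-then-recheck idiom: the unlocked time test keeps the spinlock off the fast path, and the recheck under the lock guarantees at most one update per interval. A simplified userspace sketch (plain comparisons here; the kernel's time_before()/time_after_eq() also cope with jiffies wraparound):

/* Simplified userspace sketch of the pattern used above. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long update_time;	/* last update, in fake jiffies */

static void maybe_update(unsigned long now, unsigned long interval)
{
	if (now < update_time + interval)	/* cheap lockless test */
		return;
	pthread_mutex_lock(&lock);
	if (now >= update_time + interval) {	/* recheck under the lock */
		printf("update at %lu\n", now);
		update_time = now;
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	maybe_update(250, 200);	/* fires */
	maybe_update(300, 200);	/* filtered by the lockless test */
	return 0;
}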
593 | |||
594 | void __bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
595 | unsigned long thresh, | ||
596 | unsigned long dirty, | ||
597 | unsigned long bdi_thresh, | ||
598 | unsigned long bdi_dirty, | ||
599 | unsigned long start_time) | ||
600 | { | ||
601 | unsigned long now = jiffies; | ||
602 | unsigned long elapsed = now - bdi->bw_time_stamp; | ||
603 | unsigned long written; | ||
604 | |||
605 | /* | ||
606 | * rate-limit, only update once every 200ms. | ||
607 | */ | ||
608 | if (elapsed < BANDWIDTH_INTERVAL) | ||
609 | return; | ||
610 | |||
611 | written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); | ||
612 | |||
613 | /* | ||
614 | * Skip quiet periods when disk bandwidth is under-utilized. | ||
615 | * (at least 1s idle time between two flusher runs) | ||
616 | */ | ||
617 | if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) | ||
618 | goto snapshot; | ||
619 | |||
620 | if (thresh) | ||
621 | global_update_bandwidth(thresh, dirty, now); | ||
622 | |||
623 | bdi_update_write_bandwidth(bdi, elapsed, written); | ||
624 | |||
625 | snapshot: | ||
626 | bdi->written_stamp = written; | ||
627 | bdi->bw_time_stamp = now; | ||
628 | } | ||
629 | |||
630 | static void bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
631 | unsigned long thresh, | ||
632 | unsigned long dirty, | ||
633 | unsigned long bdi_thresh, | ||
634 | unsigned long bdi_dirty, | ||
635 | unsigned long start_time) | ||
636 | { | ||
637 | if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) | ||
638 | return; | ||
639 | spin_lock(&bdi->wb.list_lock); | ||
640 | __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, | ||
641 | start_time); | ||
642 | spin_unlock(&bdi->wb.list_lock); | ||
643 | } | ||
644 | |||
645 | /* | ||
472 | * balance_dirty_pages() must be called by processes which are generating dirty | 646 | * balance_dirty_pages() must be called by processes which are generating dirty |
473 | * data. It looks at the number of dirty pages in the machine and will force | 647 | * data. It looks at the number of dirty pages in the machine and will force |
474 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. | 648 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. |
475 | * If we're over `background_thresh' then the writeback threads are woken to | 649 | * If we're over `background_thresh' then the writeback threads are woken to |
476 | * perform some writeout. | 650 | * perform some writeout. |
477 | */ | 651 | */ |
478 | static void balance_dirty_pages(struct address_space *mapping, | 652 | static void balance_dirty_pages(struct address_space *mapping, |
479 | unsigned long write_chunk) | 653 | unsigned long write_chunk) |
480 | { | 654 | { |
481 | long nr_reclaimable, bdi_nr_reclaimable; | 655 | unsigned long nr_reclaimable, bdi_nr_reclaimable; |
482 | long nr_writeback, bdi_nr_writeback; | 656 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ |
657 | unsigned long bdi_dirty; | ||
483 | unsigned long background_thresh; | 658 | unsigned long background_thresh; |
484 | unsigned long dirty_thresh; | 659 | unsigned long dirty_thresh; |
485 | unsigned long bdi_thresh; | 660 | unsigned long bdi_thresh; |
661 | unsigned long task_bdi_thresh; | ||
662 | unsigned long min_task_bdi_thresh; | ||
486 | unsigned long pages_written = 0; | 663 | unsigned long pages_written = 0; |
487 | unsigned long pause = 1; | 664 | unsigned long pause = 1; |
488 | bool dirty_exceeded = false; | 665 | bool dirty_exceeded = false; |
666 | bool clear_dirty_exceeded = true; | ||
489 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 667 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
668 | unsigned long start_time = jiffies; | ||
490 | 669 | ||
491 | for (;;) { | 670 | for (;;) { |
492 | struct writeback_control wbc = { | ||
493 | .sync_mode = WB_SYNC_NONE, | ||
494 | .older_than_this = NULL, | ||
495 | .nr_to_write = write_chunk, | ||
496 | .range_cyclic = 1, | ||
497 | }; | ||
498 | |||
499 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 671 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
500 | global_page_state(NR_UNSTABLE_NFS); | 672 | global_page_state(NR_UNSTABLE_NFS); |
501 | nr_writeback = global_page_state(NR_WRITEBACK); | 673 | nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); |
502 | 674 | ||
503 | global_dirty_limits(&background_thresh, &dirty_thresh); | 675 | global_dirty_limits(&background_thresh, &dirty_thresh); |
504 | 676 | ||
505 | /* | 677 | /* |
506 | * Throttle it only when the background writeback cannot | 678 | * Throttle it only when the background writeback cannot |
507 | * catch-up. This avoids (excessively) small writeouts | 679 | * catch-up. This avoids (excessively) small writeouts |
508 | * when the bdi limits are ramping up. | 680 | * when the bdi limits are ramping up. |
509 | */ | 681 | */ |
510 | if (nr_reclaimable + nr_writeback <= | 682 | if (nr_dirty <= (background_thresh + dirty_thresh) / 2) |
511 | (background_thresh + dirty_thresh) / 2) | ||
512 | break; | 683 | break; |
513 | 684 | ||
514 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 685 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
515 | bdi_thresh = task_dirty_limit(current, bdi_thresh); | 686 | min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh); |
687 | task_bdi_thresh = task_dirty_limit(current, bdi_thresh); | ||
516 | 688 | ||
517 | /* | 689 | /* |
518 | * In order to avoid the stacked BDI deadlock we need | 690 | * In order to avoid the stacked BDI deadlock we need |
519 | * to ensure we accurately count the 'dirty' pages when | 691 | * to ensure we accurately count the 'dirty' pages when |
520 | * the threshold is low. | 692 | * the threshold is low. |
521 | * | 693 | * |
522 | * Otherwise it would be possible to get thresh+n pages | 694 | * Otherwise it would be possible to get thresh+n pages |
523 | * reported dirty, even though there are thresh-m pages | 695 | * reported dirty, even though there are thresh-m pages |
524 | * actually dirty; with m+n sitting in the percpu | 696 | * actually dirty; with m+n sitting in the percpu |
525 | * deltas. | 697 | * deltas. |
526 | */ | 698 | */ |
527 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { | 699 | if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { |
528 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | 700 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); |
529 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | 701 | bdi_dirty = bdi_nr_reclaimable + |
702 | bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
530 | } else { | 703 | } else { |
531 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | 704 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
532 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | 705 | bdi_dirty = bdi_nr_reclaimable + |
706 | bdi_stat(bdi, BDI_WRITEBACK); | ||
533 | } | 707 | } |
534 | 708 | ||
535 | /* | 709 | /* |
536 | * The bdi thresh is, in effect, a "soft" limit derived from the | 710 | * The bdi thresh is, in effect, a "soft" limit derived from the |
537 | * global "hard" limit. The former helps to prevent a heavy-IO | 711 | * global "hard" limit. The former helps to prevent a heavy-IO |
538 | * bdi or process from holding back light ones; the latter is | 712 | * bdi or process from holding back light ones; the latter is |
539 | * the last-resort safeguard. | 713 | * the last-resort safeguard. |
540 | */ | 714 | */ |
541 | dirty_exceeded = | 715 | dirty_exceeded = (bdi_dirty > task_bdi_thresh) || |
542 | (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) | 716 | (nr_dirty > dirty_thresh); |
543 | || (nr_reclaimable + nr_writeback > dirty_thresh); | 717 | clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && |
718 | (nr_dirty <= dirty_thresh); | ||
544 | 719 | ||
545 | if (!dirty_exceeded) | 720 | if (!dirty_exceeded) |
546 | break; | 721 | break; |
547 | 722 | ||
548 | if (!bdi->dirty_exceeded) | 723 | if (!bdi->dirty_exceeded) |
549 | bdi->dirty_exceeded = 1; | 724 | bdi->dirty_exceeded = 1; |
550 | 725 | ||
726 | bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, | ||
727 | bdi_thresh, bdi_dirty, start_time); | ||
728 | |||
551 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 729 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. |
552 | * Unstable writes are a feature of certain networked | 730 | * Unstable writes are a feature of certain networked |
553 | * filesystems (i.e. NFS) in which data may have been | 731 | * filesystems (i.e. NFS) in which data may have been |
554 | * written to the server's write cache, but has not yet | 732 | * written to the server's write cache, but has not yet |
555 | * been flushed to permanent storage. | 733 | * been flushed to permanent storage. |
556 | * Only move pages to writeback if this bdi is over its | 734 | * Only move pages to writeback if this bdi is over its |
557 | * threshold otherwise wait until the disk writes catch | 735 | * threshold otherwise wait until the disk writes catch |
558 | * up. | 736 | * up. |
559 | */ | 737 | */ |
560 | trace_wbc_balance_dirty_start(&wbc, bdi); | 738 | trace_balance_dirty_start(bdi); |
561 | if (bdi_nr_reclaimable > bdi_thresh) { | 739 | if (bdi_nr_reclaimable > task_bdi_thresh) { |
562 | writeback_inodes_wb(&bdi->wb, &wbc); | 740 | pages_written += writeback_inodes_wb(&bdi->wb, |
563 | pages_written += write_chunk - wbc.nr_to_write; | 741 | write_chunk); |
564 | trace_wbc_balance_dirty_written(&wbc, bdi); | 742 | trace_balance_dirty_written(bdi, pages_written); |
565 | if (pages_written >= write_chunk) | 743 | if (pages_written >= write_chunk) |
566 | break; /* We've done our duty */ | 744 | break; /* We've done our duty */ |
567 | } | 745 | } |
568 | trace_wbc_balance_dirty_wait(&wbc, bdi); | ||
569 | __set_current_state(TASK_UNINTERRUPTIBLE); | 746 | __set_current_state(TASK_UNINTERRUPTIBLE); |
570 | io_schedule_timeout(pause); | 747 | io_schedule_timeout(pause); |
748 | trace_balance_dirty_wait(bdi); | ||
571 | 749 | ||
750 | dirty_thresh = hard_dirty_limit(dirty_thresh); | ||
572 | /* | 751 | /* |
752 | * max-pause area. If dirty exceeded but still within this | ||
753 | * area, no need to sleep for more than 200ms: (a) 8 pages per | ||
754 | * 200ms is typically more than enough to curb heavy dirtiers; | ||
755 | * (b) the pause time limit makes the dirtiers more responsive. | ||
756 | */ | ||
757 | if (nr_dirty < dirty_thresh + | ||
758 | dirty_thresh / DIRTY_MAXPAUSE_AREA && | ||
759 | time_after(jiffies, start_time + MAX_PAUSE)) | ||
760 | break; | ||
761 | /* | ||
762 | * pass-good area. When some bdi gets blocked (eg. NFS server | ||
763 | * not responding), or write bandwidth dropped dramatically due | ||
764 | * to concurrent reads, or dirty threshold suddenly dropped and | ||
765 | * the dirty pages cannot be brought down anytime soon (eg. on | ||
766 | * slow USB stick), at least let go of the good bdi's. | ||
767 | */ | ||
768 | if (nr_dirty < dirty_thresh + | ||
769 | dirty_thresh / DIRTY_PASSGOOD_AREA && | ||
770 | bdi_dirty < bdi_thresh) | ||
771 | break; | ||
772 | |||
773 | /* | ||
573 | * Increase the delay for each loop, up to our previous | 774 | * Increase the delay for each loop, up to our previous |
574 | * default of taking a 100ms nap. | 775 | * default of taking a 100ms nap. |
575 | */ | 776 | */ |
576 | pause <<= 1; | 777 | pause <<= 1; |
577 | if (pause > HZ / 10) | 778 | if (pause > HZ / 10) |
578 | pause = HZ / 10; | 779 | pause = HZ / 10; |
579 | } | 780 | } |
580 | 781 | ||
581 | if (!dirty_exceeded && bdi->dirty_exceeded) | 782 | /* Clear dirty_exceeded flag only when no task can exceed the limit */ |
783 | if (clear_dirty_exceeded && bdi->dirty_exceeded) | ||
582 | bdi->dirty_exceeded = 0; | 784 | bdi->dirty_exceeded = 0; |
583 | 785 | ||
584 | if (writeback_in_progress(bdi)) | 786 | if (writeback_in_progress(bdi)) |
585 | return; | 787 | return; |
586 | 788 | ||
587 | /* | 789 | /* |
588 | * In laptop mode, we wait until hitting the higher threshold before | 790 | * In laptop mode, we wait until hitting the higher threshold before |
589 | * starting background writeout, and then write out all the way down | 791 | * starting background writeout, and then write out all the way down |
590 | * to the lower threshold. So slow writers cause minimal disk activity. | 792 | * to the lower threshold. So slow writers cause minimal disk activity. |
591 | * | 793 | * |
592 | * In normal mode, we start background writeout at the lower | 794 | * In normal mode, we start background writeout at the lower |
593 | * background_thresh, to keep the amount of dirty memory low. | 795 | * background_thresh, to keep the amount of dirty memory low. |
594 | */ | 796 | */ |
595 | if ((laptop_mode && pages_written) || | 797 | if ((laptop_mode && pages_written) || |
596 | (!laptop_mode && (nr_reclaimable > background_thresh))) | 798 | (!laptop_mode && (nr_reclaimable > background_thresh))) |
597 | bdi_start_background_writeback(bdi); | 799 | bdi_start_background_writeback(bdi); |
598 | } | 800 | } |
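Worked numbers for the max-pause and pass-good break-out checks in the loop above. The DIRTY_MAXPAUSE_AREA and DIRTY_PASSGOOD_AREA divisors live in the writeback headers added by this series; the sketch assumes the values 16 and 8:

/* Standalone sketch of the two cut-off levels; the AREA divisors are
 * assumed values, not copied from this diff. */
#include <stdio.h>

#define DIRTY_MAXPAUSE_AREA	16
#define DIRTY_PASSGOOD_AREA	8

int main(void)
{
	unsigned long thresh = 1600;	/* hard dirty limit, in pages */

	printf("max-pause breaks below %lu pages\n",
	       thresh + thresh / DIRTY_MAXPAUSE_AREA);	/* 1700 */
	printf("pass-good breaks below %lu pages\n",
	       thresh + thresh / DIRTY_PASSGOOD_AREA);	/* 1800 */
	return 0;
}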
599 | 801 | ||
600 | void set_page_dirty_balance(struct page *page, int page_mkwrite) | 802 | void set_page_dirty_balance(struct page *page, int page_mkwrite) |
601 | { | 803 | { |
602 | if (set_page_dirty(page) || page_mkwrite) { | 804 | if (set_page_dirty(page) || page_mkwrite) { |
603 | struct address_space *mapping = page_mapping(page); | 805 | struct address_space *mapping = page_mapping(page); |
604 | 806 | ||
605 | if (mapping) | 807 | if (mapping) |
606 | balance_dirty_pages_ratelimited(mapping); | 808 | balance_dirty_pages_ratelimited(mapping); |
607 | } | 809 | } |
608 | } | 810 | } |
609 | 811 | ||
610 | static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | 812 | static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; |
611 | 813 | ||
612 | /** | 814 | /** |
613 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 815 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
614 | * @mapping: address_space which was dirtied | 816 | * @mapping: address_space which was dirtied |
615 | * @nr_pages_dirtied: number of pages which the caller has just dirtied | 817 | * @nr_pages_dirtied: number of pages which the caller has just dirtied |
616 | * | 818 | * |
617 | * Processes which are dirtying memory should call in here once for each page | 819 | * Processes which are dirtying memory should call in here once for each page |
618 | * which was newly dirtied. The function will periodically check the system's | 820 | * which was newly dirtied. The function will periodically check the system's |
619 | * dirty state and will initiate writeback if needed. | 821 | * dirty state and will initiate writeback if needed. |
620 | * | 822 | * |
621 | * On really big machines, get_writeback_state is expensive, so try to avoid | 823 | * On really big machines, get_writeback_state is expensive, so try to avoid |
622 | * calling it too often (ratelimiting). But once we're over the dirty memory | 824 | * calling it too often (ratelimiting). But once we're over the dirty memory |
623 | * limit we decrease the ratelimiting by a lot, to prevent individual processes | 825 | * limit we decrease the ratelimiting by a lot, to prevent individual processes |
624 | * from overshooting the limit by (ratelimit_pages) each. | 826 | * from overshooting the limit by (ratelimit_pages) each. |
625 | */ | 827 | */ |
626 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 828 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
627 | unsigned long nr_pages_dirtied) | 829 | unsigned long nr_pages_dirtied) |
628 | { | 830 | { |
831 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
629 | unsigned long ratelimit; | 832 | unsigned long ratelimit; |
630 | unsigned long *p; | 833 | unsigned long *p; |
631 | 834 | ||
835 | if (!bdi_cap_account_dirty(bdi)) | ||
836 | return; | ||
837 | |||
632 | ratelimit = ratelimit_pages; | 838 | ratelimit = ratelimit_pages; |
633 | if (mapping->backing_dev_info->dirty_exceeded) | 839 | if (mapping->backing_dev_info->dirty_exceeded) |
634 | ratelimit = 8; | 840 | ratelimit = 8; |
635 | 841 | ||
636 | /* | 842 | /* |
637 | * Check the rate limiting. Also, we do not want to throttle real-time | 843 | * Check the rate limiting. Also, we do not want to throttle real-time |
638 | * tasks in balance_dirty_pages(). Period. | 844 | * tasks in balance_dirty_pages(). Period. |
639 | */ | 845 | */ |
640 | preempt_disable(); | 846 | preempt_disable(); |
641 | p = &__get_cpu_var(bdp_ratelimits); | 847 | p = &__get_cpu_var(bdp_ratelimits); |
642 | *p += nr_pages_dirtied; | 848 | *p += nr_pages_dirtied; |
643 | if (unlikely(*p >= ratelimit)) { | 849 | if (unlikely(*p >= ratelimit)) { |
644 | ratelimit = sync_writeback_pages(*p); | 850 | ratelimit = sync_writeback_pages(*p); |
645 | *p = 0; | 851 | *p = 0; |
646 | preempt_enable(); | 852 | preempt_enable(); |
647 | balance_dirty_pages(mapping, ratelimit); | 853 | balance_dirty_pages(mapping, ratelimit); |
648 | return; | 854 | return; |
649 | } | 855 | } |
650 | preempt_enable(); | 856 | preempt_enable(); |
651 | } | 857 | } |
652 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | 858 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); |
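The per-CPU counter keeps the fast path cheap: a dirtier only falls into balance_dirty_pages() once per `ratelimit' dirtied pages (and the ratelimit collapses to 8 once the bdi is marked dirty_exceeded). A single-threaded sketch of the accounting (the kernel keeps one such counter per CPU, under preempt_disable()):

/* Standalone sketch of the bdp_ratelimits accounting. */
#include <stdio.h>

static unsigned long bdp_ratelimit_counter;

static int should_balance(unsigned long nr_dirtied, unsigned long ratelimit)
{
	bdp_ratelimit_counter += nr_dirtied;
	if (bdp_ratelimit_counter >= ratelimit) {
		bdp_ratelimit_counter = 0;
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long page;
	int calls = 0;

	for (page = 0; page < 100000; page++)
		calls += should_balance(1, 1024);
	printf("%d balance_dirty_pages() calls for 100000 pages\n", calls);
	return 0;
}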
653 | 859 | ||
654 | void throttle_vm_writeout(gfp_t gfp_mask) | 860 | void throttle_vm_writeout(gfp_t gfp_mask) |
655 | { | 861 | { |
656 | unsigned long background_thresh; | 862 | unsigned long background_thresh; |
657 | unsigned long dirty_thresh; | 863 | unsigned long dirty_thresh; |
658 | 864 | ||
659 | for ( ; ; ) { | 865 | for ( ; ; ) { |
660 | global_dirty_limits(&background_thresh, &dirty_thresh); | 866 | global_dirty_limits(&background_thresh, &dirty_thresh); |
661 | 867 | ||
662 | /* | 868 | /* |
663 | * Boost the allowable dirty threshold a bit for page | 869 | * Boost the allowable dirty threshold a bit for page |
664 | * allocators so they don't get DoS'ed by heavy writers | 870 | * allocators so they don't get DoS'ed by heavy writers |
665 | */ | 871 | */ |
666 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ | 872 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ |
667 | 873 | ||
668 | if (global_page_state(NR_UNSTABLE_NFS) + | 874 | if (global_page_state(NR_UNSTABLE_NFS) + |
669 | global_page_state(NR_WRITEBACK) <= dirty_thresh) | 875 | global_page_state(NR_WRITEBACK) <= dirty_thresh) |
670 | break; | 876 | break; |
671 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 877 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
672 | 878 | ||
673 | /* | 879 | /* |
674 | * The caller might hold locks which can prevent IO completion | 880 | * The caller might hold locks which can prevent IO completion |
675 | * or progress in the filesystem. So we cannot just sit here | 881 | * or progress in the filesystem. So we cannot just sit here |
676 | * waiting for IO to complete. | 882 | * waiting for IO to complete. |
677 | */ | 883 | */ |
678 | if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) | 884 | if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) |
679 | break; | 885 | break; |
680 | } | 886 | } |
681 | } | 887 | } |
682 | 888 | ||
683 | /* | 889 | /* |
684 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | 890 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
685 | */ | 891 | */ |
686 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, | 892 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, |
687 | void __user *buffer, size_t *length, loff_t *ppos) | 893 | void __user *buffer, size_t *length, loff_t *ppos) |
688 | { | 894 | { |
689 | proc_dointvec(table, write, buffer, length, ppos); | 895 | proc_dointvec(table, write, buffer, length, ppos); |
690 | bdi_arm_supers_timer(); | 896 | bdi_arm_supers_timer(); |
691 | return 0; | 897 | return 0; |
692 | } | 898 | } |
693 | 899 | ||
694 | #ifdef CONFIG_BLOCK | 900 | #ifdef CONFIG_BLOCK |
695 | void laptop_mode_timer_fn(unsigned long data) | 901 | void laptop_mode_timer_fn(unsigned long data) |
696 | { | 902 | { |
697 | struct request_queue *q = (struct request_queue *)data; | 903 | struct request_queue *q = (struct request_queue *)data; |
698 | int nr_pages = global_page_state(NR_FILE_DIRTY) + | 904 | int nr_pages = global_page_state(NR_FILE_DIRTY) + |
699 | global_page_state(NR_UNSTABLE_NFS); | 905 | global_page_state(NR_UNSTABLE_NFS); |
700 | 906 | ||
701 | /* | 907 | /* |
702 | * We want to write everything out, not just down to the dirty | 908 | * We want to write everything out, not just down to the dirty |
703 | * threshold | 909 | * threshold |
704 | */ | 910 | */ |
705 | if (bdi_has_dirty_io(&q->backing_dev_info)) | 911 | if (bdi_has_dirty_io(&q->backing_dev_info)) |
706 | bdi_start_writeback(&q->backing_dev_info, nr_pages); | 912 | bdi_start_writeback(&q->backing_dev_info, nr_pages); |
707 | } | 913 | } |
708 | 914 | ||
709 | /* | 915 | /* |
710 | * We've spun up the disk and we're in laptop mode: schedule writeback | 916 | * We've spun up the disk and we're in laptop mode: schedule writeback |
711 | * of all dirty data a few seconds from now. If the flush is already scheduled | 917 | * of all dirty data a few seconds from now. If the flush is already scheduled |
712 | * then push it back - the user is still using the disk. | 918 | * then push it back - the user is still using the disk. |
713 | */ | 919 | */ |
714 | void laptop_io_completion(struct backing_dev_info *info) | 920 | void laptop_io_completion(struct backing_dev_info *info) |
715 | { | 921 | { |
716 | mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); | 922 | mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); |
717 | } | 923 | } |
718 | 924 | ||
719 | /* | 925 | /* |
720 | * We're in laptop mode and we've just synced. The sync's writes will have | 926 | * We're in laptop mode and we've just synced. The sync's writes will have |
721 | * caused another writeback to be scheduled by laptop_io_completion. | 927 | * caused another writeback to be scheduled by laptop_io_completion. |
722 | * Nothing needs to be written back anymore, so we unschedule the writeback. | 928 | * Nothing needs to be written back anymore, so we unschedule the writeback. |
723 | */ | 929 | */ |
724 | void laptop_sync_completion(void) | 930 | void laptop_sync_completion(void) |
725 | { | 931 | { |
726 | struct backing_dev_info *bdi; | 932 | struct backing_dev_info *bdi; |
727 | 933 | ||
728 | rcu_read_lock(); | 934 | rcu_read_lock(); |
729 | 935 | ||
730 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) | 936 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) |
731 | del_timer(&bdi->laptop_mode_wb_timer); | 937 | del_timer(&bdi->laptop_mode_wb_timer); |
732 | 938 | ||
733 | rcu_read_unlock(); | 939 | rcu_read_unlock(); |
734 | } | 940 | } |
735 | #endif | 941 | #endif |
736 | 942 | ||
737 | /* | 943 | /* |
738 | * If ratelimit_pages is too high then we can get into dirty-data overload | 944 | * If ratelimit_pages is too high then we can get into dirty-data overload |
739 | * if a large number of processes all perform writes at the same time. | 945 | * if a large number of processes all perform writes at the same time. |
740 | * If it is too low then SMP machines will call the (expensive) | 946 | * If it is too low then SMP machines will call the (expensive) |
741 | * get_writeback_state too often. | 947 | * get_writeback_state too often. |
742 | * | 948 | * |
743 | * Here we set ratelimit_pages to a level which ensures that when all CPUs are | 949 | * Here we set ratelimit_pages to a level which ensures that when all CPUs are |
744 | * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory | 950 | * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory |
745 | * thresholds before writeback cuts in. | 951 | * thresholds before writeback cuts in. |
746 | * | 952 | * |
747 | * But the limit should not be set too high. Because it also controls the | 953 | * But the limit should not be set too high. Because it also controls the |
748 | * amount of memory which the balance_dirty_pages() caller has to write back. | 954 | * amount of memory which the balance_dirty_pages() caller has to write back. |
749 | * If this is too large then the caller will block on the IO queue all the | 955 | * If this is too large then the caller will block on the IO queue all the |
750 | * time. So limit it to four megabytes - the balance_dirty_pages() caller | 956 | * time. So limit it to four megabytes - the balance_dirty_pages() caller |
751 | * will write six megabyte chunks, max. | 957 | * will write six megabyte chunks, max. |
752 | */ | 958 | */ |
753 | 959 | ||
754 | void writeback_set_ratelimit(void) | 960 | void writeback_set_ratelimit(void) |
755 | { | 961 | { |
756 | ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); | 962 | ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); |
757 | if (ratelimit_pages < 16) | 963 | if (ratelimit_pages < 16) |
758 | ratelimit_pages = 16; | 964 | ratelimit_pages = 16; |
759 | if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) | 965 | if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) |
760 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; | 966 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; |
761 | } | 967 | } |
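Worked numbers for the clamping above, assuming 4KB pages: on a 4GB machine with 4 CPUs, vm_total_pages / (4 * 32) = 8192 pages would be 32MB per chunk, so the 4MB cap takes effect:

/* Standalone sketch of the ratelimit_pages sizing, 4KB pages assumed. */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long vm_total_pages = 1048576;	/* 4GB / 4KB */
	int num_online_cpus = 4;
	unsigned long ratelimit_pages;

	ratelimit_pages = vm_total_pages / (num_online_cpus * 32);	/* 8192 */
	if (ratelimit_pages < 16)
		ratelimit_pages = 16;
	if (ratelimit_pages * page_size > 4096 * 1024)	/* 32MB > 4MB cap */
		ratelimit_pages = (4096 * 1024) / page_size;
	printf("ratelimit_pages = %lu (%lu KB per chunk)\n",
	       ratelimit_pages, ratelimit_pages * page_size / 1024);
	return 0;
}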
762 | 968 | ||
763 | static int __cpuinit | 969 | static int __cpuinit |
764 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) | 970 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) |
765 | { | 971 | { |
766 | writeback_set_ratelimit(); | 972 | writeback_set_ratelimit(); |
767 | return NOTIFY_DONE; | 973 | return NOTIFY_DONE; |
768 | } | 974 | } |
769 | 975 | ||
770 | static struct notifier_block __cpuinitdata ratelimit_nb = { | 976 | static struct notifier_block __cpuinitdata ratelimit_nb = { |
771 | .notifier_call = ratelimit_handler, | 977 | .notifier_call = ratelimit_handler, |
772 | .next = NULL, | 978 | .next = NULL, |
773 | }; | 979 | }; |
774 | 980 | ||
775 | /* | 981 | /* |
776 | * Called early on to tune the page writeback dirty limits. | 982 | * Called early on to tune the page writeback dirty limits. |
777 | * | 983 | * |
778 | * We used to scale dirty pages according to how total memory | 984 | * We used to scale dirty pages according to how total memory |
779 | * related to pages that could be allocated for buffers (by | 985 | * related to pages that could be allocated for buffers (by |
780 | * comparing nr_free_buffer_pages() to vm_total_pages). | 986 | * comparing nr_free_buffer_pages() to vm_total_pages). |
781 | * | 987 | * |
782 | * However, that was when we used "dirty_ratio" to scale with | 988 | * However, that was when we used "dirty_ratio" to scale with |
783 | * all memory, and we don't do that any more. "dirty_ratio" | 989 | * all memory, and we don't do that any more. "dirty_ratio" |
784 | * is now applied to total non-HIGHPAGE memory (by subtracting | 990 | * is now applied to total non-HIGHPAGE memory (by subtracting |
785 | * totalhigh_pages from vm_total_pages), and as such we can't | 991 | * totalhigh_pages from vm_total_pages), and as such we can't |
786 | * get into the old insane situation any more where we had | 992 | * get into the old insane situation any more where we had |
787 | * large amounts of dirty pages compared to a small amount of | 993 | * large amounts of dirty pages compared to a small amount of |
788 | * non-HIGHMEM memory. | 994 | * non-HIGHMEM memory. |
789 | * | 995 | * |
790 | * But we might still want to scale the dirty_ratio by how | 996 | * But we might still want to scale the dirty_ratio by how |
791 | * much memory the box has.. | 997 | * much memory the box has.. |
792 | */ | 998 | */ |
793 | void __init page_writeback_init(void) | 999 | void __init page_writeback_init(void) |
794 | { | 1000 | { |
795 | int shift; | 1001 | int shift; |
796 | 1002 | ||
797 | writeback_set_ratelimit(); | 1003 | writeback_set_ratelimit(); |
798 | register_cpu_notifier(&ratelimit_nb); | 1004 | register_cpu_notifier(&ratelimit_nb); |
799 | 1005 | ||
800 | shift = calc_period_shift(); | 1006 | shift = calc_period_shift(); |
801 | prop_descriptor_init(&vm_completions, shift); | 1007 | prop_descriptor_init(&vm_completions, shift); |
802 | prop_descriptor_init(&vm_dirties, shift); | 1008 | prop_descriptor_init(&vm_dirties, shift); |
803 | } | 1009 | } |
804 | 1010 | ||
805 | /** | 1011 | /** |
806 | * tag_pages_for_writeback - tag pages to be written by write_cache_pages | 1012 | * tag_pages_for_writeback - tag pages to be written by write_cache_pages |
807 | * @mapping: address space structure to write | 1013 | * @mapping: address space structure to write |
808 | * @start: starting page index | 1014 | * @start: starting page index |
809 | * @end: ending page index (inclusive) | 1015 | * @end: ending page index (inclusive) |
810 | * | 1016 | * |
811 | * This function scans the page range from @start to @end (inclusive) and tags | 1017 | * This function scans the page range from @start to @end (inclusive) and tags |
812 | * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is | 1018 | * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is |
813 | * that write_cache_pages (or whoever calls this function) will then use | 1019 | * that write_cache_pages (or whoever calls this function) will then use |
814 | * TOWRITE tag to identify pages eligible for writeback. This mechanism is | 1020 | * TOWRITE tag to identify pages eligible for writeback. This mechanism is |
815 | * used to avoid livelocking of writeback by a process steadily creating new | 1021 | * used to avoid livelocking of writeback by a process steadily creating new |
816 | * dirty pages in the file (thus it is important for this function to be quick | 1022 | * dirty pages in the file (thus it is important for this function to be quick |
817 | * so that it can tag pages faster than a dirtying process can create them). | 1023 | * so that it can tag pages faster than a dirtying process can create them). |
818 | */ | 1024 | */ |
819 | /* | 1025 | /* |
820 | * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. | 1026 | * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. |
821 | */ | 1027 | */ |
822 | void tag_pages_for_writeback(struct address_space *mapping, | 1028 | void tag_pages_for_writeback(struct address_space *mapping, |
823 | pgoff_t start, pgoff_t end) | 1029 | pgoff_t start, pgoff_t end) |
824 | { | 1030 | { |
825 | #define WRITEBACK_TAG_BATCH 4096 | 1031 | #define WRITEBACK_TAG_BATCH 4096 |
826 | unsigned long tagged; | 1032 | unsigned long tagged; |
827 | 1033 | ||
828 | do { | 1034 | do { |
829 | spin_lock_irq(&mapping->tree_lock); | 1035 | spin_lock_irq(&mapping->tree_lock); |
830 | tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, | 1036 | tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, |
831 | &start, end, WRITEBACK_TAG_BATCH, | 1037 | &start, end, WRITEBACK_TAG_BATCH, |
832 | PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); | 1038 | PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); |
833 | spin_unlock_irq(&mapping->tree_lock); | 1039 | spin_unlock_irq(&mapping->tree_lock); |
834 | WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); | 1040 | WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); |
835 | cond_resched(); | 1041 | cond_resched(); |
836 | /* We check 'start' to handle wrapping when end == ~0UL */ | 1042 | /* We check 'start' to handle wrapping when end == ~0UL */ |
837 | } while (tagged >= WRITEBACK_TAG_BATCH && start); | 1043 | } while (tagged >= WRITEBACK_TAG_BATCH && start); |
838 | } | 1044 | } |
839 | EXPORT_SYMBOL(tag_pages_for_writeback); | 1045 | EXPORT_SYMBOL(tag_pages_for_writeback); |
840 | 1046 | ||
841 | /** | 1047 | /** |
842 | * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. | 1048 | * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. |
843 | * @mapping: address space structure to write | 1049 | * @mapping: address space structure to write |
844 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | 1050 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write |
845 | * @writepage: function called for each page | 1051 | * @writepage: function called for each page |
846 | * @data: data passed to writepage function | 1052 | * @data: data passed to writepage function |
847 | * | 1053 | * |
848 | * If a page is already under I/O, write_cache_pages() skips it, even | 1054 | * If a page is already under I/O, write_cache_pages() skips it, even |
849 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, | 1055 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, |
850 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() | 1056 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() |
851 | * and msync() need to guarantee that all the data which was dirty at the time | 1057 | * and msync() need to guarantee that all the data which was dirty at the time |
852 | * the call was made get new I/O started against them. If wbc->sync_mode is | 1058 | * the call was made get new I/O started against them. If wbc->sync_mode is |
853 | * WB_SYNC_ALL then we were called for data integrity and we must wait for | 1059 | * WB_SYNC_ALL then we were called for data integrity and we must wait for |
854 | * existing IO to complete. | 1060 | * existing IO to complete. |
855 | * | 1061 | * |
856 | * To avoid livelocks (when other process dirties new pages), we first tag | 1062 | * To avoid livelocks (when other process dirties new pages), we first tag |
857 | * pages which should be written back with TOWRITE tag and only then start | 1063 | * pages which should be written back with TOWRITE tag and only then start |
858 | * writing them. For data-integrity sync we have to be careful so that we do | 1064 | * writing them. For data-integrity sync we have to be careful so that we do |
859 | * not miss some pages (e.g., because some other process has cleared TOWRITE | 1065 | * not miss some pages (e.g., because some other process has cleared TOWRITE |
860 | * tag we set). The rule we follow is that TOWRITE tag can be cleared only | 1066 | * tag we set). The rule we follow is that TOWRITE tag can be cleared only |
861 | * by the process clearing the DIRTY tag (and submitting the page for IO). | 1067 | * by the process clearing the DIRTY tag (and submitting the page for IO). |
862 | */ | 1068 | */ |
863 | int write_cache_pages(struct address_space *mapping, | 1069 | int write_cache_pages(struct address_space *mapping, |
864 | struct writeback_control *wbc, writepage_t writepage, | 1070 | struct writeback_control *wbc, writepage_t writepage, |
865 | void *data) | 1071 | void *data) |
866 | { | 1072 | { |
867 | int ret = 0; | 1073 | int ret = 0; |
868 | int done = 0; | 1074 | int done = 0; |
869 | struct pagevec pvec; | 1075 | struct pagevec pvec; |
870 | int nr_pages; | 1076 | int nr_pages; |
871 | pgoff_t uninitialized_var(writeback_index); | 1077 | pgoff_t uninitialized_var(writeback_index); |
872 | pgoff_t index; | 1078 | pgoff_t index; |
873 | pgoff_t end; /* Inclusive */ | 1079 | pgoff_t end; /* Inclusive */ |
874 | pgoff_t done_index; | 1080 | pgoff_t done_index; |
875 | int cycled; | 1081 | int cycled; |
876 | int range_whole = 0; | 1082 | int range_whole = 0; |
877 | int tag; | 1083 | int tag; |
878 | 1084 | ||
879 | pagevec_init(&pvec, 0); | 1085 | pagevec_init(&pvec, 0); |
880 | if (wbc->range_cyclic) { | 1086 | if (wbc->range_cyclic) { |
881 | writeback_index = mapping->writeback_index; /* prev offset */ | 1087 | writeback_index = mapping->writeback_index; /* prev offset */ |
882 | index = writeback_index; | 1088 | index = writeback_index; |
883 | if (index == 0) | 1089 | if (index == 0) |
884 | cycled = 1; | 1090 | cycled = 1; |
885 | else | 1091 | else |
886 | cycled = 0; | 1092 | cycled = 0; |
887 | end = -1; | 1093 | end = -1; |
888 | } else { | 1094 | } else { |
889 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 1095 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
890 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 1096 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
891 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 1097 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
892 | range_whole = 1; | 1098 | range_whole = 1; |
893 | cycled = 1; /* ignore range_cyclic tests */ | 1099 | cycled = 1; /* ignore range_cyclic tests */ |
894 | } | 1100 | } |
895 | if (wbc->sync_mode == WB_SYNC_ALL) | 1101 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
896 | tag = PAGECACHE_TAG_TOWRITE; | 1102 | tag = PAGECACHE_TAG_TOWRITE; |
897 | else | 1103 | else |
898 | tag = PAGECACHE_TAG_DIRTY; | 1104 | tag = PAGECACHE_TAG_DIRTY; |
899 | retry: | 1105 | retry: |
900 | if (wbc->sync_mode == WB_SYNC_ALL) | 1106 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
901 | tag_pages_for_writeback(mapping, index, end); | 1107 | tag_pages_for_writeback(mapping, index, end); |
902 | done_index = index; | 1108 | done_index = index; |
903 | while (!done && (index <= end)) { | 1109 | while (!done && (index <= end)) { |
904 | int i; | 1110 | int i; |
905 | 1111 | ||
906 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, | 1112 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, |
907 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | 1113 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); |
908 | if (nr_pages == 0) | 1114 | if (nr_pages == 0) |
909 | break; | 1115 | break; |
910 | 1116 | ||
911 | for (i = 0; i < nr_pages; i++) { | 1117 | for (i = 0; i < nr_pages; i++) { |
912 | struct page *page = pvec.pages[i]; | 1118 | struct page *page = pvec.pages[i]; |
913 | 1119 | ||
914 | /* | 1120 | /* |
915 | * At this point, the page may be truncated or | 1121 | * At this point, the page may be truncated or |
916 | * invalidated (changing page->mapping to NULL), or | 1122 | * invalidated (changing page->mapping to NULL), or |
917 | * even swizzled back from swapper_space to tmpfs file | 1123 | * even swizzled back from swapper_space to tmpfs file |
918 | * mapping. However, page->index will not change | 1124 | * mapping. However, page->index will not change |
919 | * because we have a reference on the page. | 1125 | * because we have a reference on the page. |
920 | */ | 1126 | */ |
921 | if (page->index > end) { | 1127 | if (page->index > end) { |
922 | /* | 1128 | /* |
923 | * can't be range_cyclic (1st pass) because | 1129 | * can't be range_cyclic (1st pass) because |
924 | * end == -1 in that case. | 1130 | * end == -1 in that case. |
925 | */ | 1131 | */ |
926 | done = 1; | 1132 | done = 1; |
927 | break; | 1133 | break; |
928 | } | 1134 | } |
929 | 1135 | ||
930 | done_index = page->index; | 1136 | done_index = page->index; |
931 | 1137 | ||
932 | lock_page(page); | 1138 | lock_page(page); |
933 | 1139 | ||
934 | /* | 1140 | /* |
935 | * Page truncated or invalidated. We can freely skip it | 1141 | * Page truncated or invalidated. We can freely skip it |
936 | * then, even for data integrity operations: the page | 1142 | * then, even for data integrity operations: the page |
937 | * has disappeared concurrently, so there could be no | 1143 | * has disappeared concurrently, so there could be no |
938 | * real expectation of this data integrity operation | 1144 | * real expectation of this data integrity operation |
939 | * even if there is now a new, dirty page at the same | 1145 | * even if there is now a new, dirty page at the same |
940 | * pagecache address. | 1146 | * pagecache address. |
941 | */ | 1147 | */ |
942 | if (unlikely(page->mapping != mapping)) { | 1148 | if (unlikely(page->mapping != mapping)) { |
943 | continue_unlock: | 1149 | continue_unlock: |
944 | unlock_page(page); | 1150 | unlock_page(page); |
945 | continue; | 1151 | continue; |
946 | } | 1152 | } |
947 | 1153 | ||
948 | if (!PageDirty(page)) { | 1154 | if (!PageDirty(page)) { |
949 | /* someone wrote it for us */ | 1155 | /* someone wrote it for us */ |
950 | goto continue_unlock; | 1156 | goto continue_unlock; |
951 | } | 1157 | } |
952 | 1158 | ||
953 | if (PageWriteback(page)) { | 1159 | if (PageWriteback(page)) { |
954 | if (wbc->sync_mode != WB_SYNC_NONE) | 1160 | if (wbc->sync_mode != WB_SYNC_NONE) |
955 | wait_on_page_writeback(page); | 1161 | wait_on_page_writeback(page); |
956 | else | 1162 | else |
957 | goto continue_unlock; | 1163 | goto continue_unlock; |
958 | } | 1164 | } |
959 | 1165 | ||
960 | BUG_ON(PageWriteback(page)); | 1166 | BUG_ON(PageWriteback(page)); |
961 | if (!clear_page_dirty_for_io(page)) | 1167 | if (!clear_page_dirty_for_io(page)) |
962 | goto continue_unlock; | 1168 | goto continue_unlock; |
963 | 1169 | ||
964 | trace_wbc_writepage(wbc, mapping->backing_dev_info); | 1170 | trace_wbc_writepage(wbc, mapping->backing_dev_info); |
965 | ret = (*writepage)(page, wbc, data); | 1171 | ret = (*writepage)(page, wbc, data); |
966 | if (unlikely(ret)) { | 1172 | if (unlikely(ret)) { |
967 | if (ret == AOP_WRITEPAGE_ACTIVATE) { | 1173 | if (ret == AOP_WRITEPAGE_ACTIVATE) { |
968 | unlock_page(page); | 1174 | unlock_page(page); |
969 | ret = 0; | 1175 | ret = 0; |
970 | } else { | 1176 | } else { |
971 | /* | 1177 | /* |
972 | * done_index is set past this page, | 1178 | * done_index is set past this page, |
973 | * so media errors will not choke | 1179 | * so media errors will not choke |
974 | * background writeout for the entire | 1180 | * background writeout for the entire |
975 | * file. This has consequences for | 1181 | * file. This has consequences for |
976 | * range_cyclic semantics (ie. it may | 1182 | * range_cyclic semantics (ie. it may |
977 | * not be suitable for data integrity | 1183 | * not be suitable for data integrity |
978 | * writeout). | 1184 | * writeout). |
979 | */ | 1185 | */ |
980 | done_index = page->index + 1; | 1186 | done_index = page->index + 1; |
981 | done = 1; | 1187 | done = 1; |
982 | break; | 1188 | break; |
983 | } | 1189 | } |
984 | } | 1190 | } |
985 | 1191 | ||
986 | /* | 1192 | /* |
987 | * We stop writing back only if we are not doing | 1193 | * We stop writing back only if we are not doing |
988 | * integrity sync. In case of integrity sync we have to | 1194 | * integrity sync. In case of integrity sync we have to |
989 | * keep going until we have written all the pages | 1195 | * keep going until we have written all the pages |
990 | * we tagged for writeback prior to entering this loop. | 1196 | * we tagged for writeback prior to entering this loop. |
991 | */ | 1197 | */ |
992 | if (--wbc->nr_to_write <= 0 && | 1198 | if (--wbc->nr_to_write <= 0 && |
993 | wbc->sync_mode == WB_SYNC_NONE) { | 1199 | wbc->sync_mode == WB_SYNC_NONE) { |
994 | done = 1; | 1200 | done = 1; |
995 | break; | 1201 | break; |
996 | } | 1202 | } |
997 | } | 1203 | } |
998 | pagevec_release(&pvec); | 1204 | pagevec_release(&pvec); |
999 | cond_resched(); | 1205 | cond_resched(); |
1000 | } | 1206 | } |
1001 | if (!cycled && !done) { | 1207 | if (!cycled && !done) { |
1002 | /* | 1208 | /* |
1003 | * range_cyclic: | 1209 | * range_cyclic: |
1004 | * We hit the last page and there is more work to be done: wrap | 1210 | * We hit the last page and there is more work to be done: wrap |
1005 | * back to the start of the file | 1211 | * back to the start of the file |
1006 | */ | 1212 | */ |
1007 | cycled = 1; | 1213 | cycled = 1; |
1008 | index = 0; | 1214 | index = 0; |
1009 | end = writeback_index - 1; | 1215 | end = writeback_index - 1; |
1010 | goto retry; | 1216 | goto retry; |
1011 | } | 1217 | } |
1012 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 1218 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
1013 | mapping->writeback_index = done_index; | 1219 | mapping->writeback_index = done_index; |
1014 | 1220 | ||
1015 | return ret; | 1221 | return ret; |
1016 | } | 1222 | } |
1017 | EXPORT_SYMBOL(write_cache_pages); | 1223 | EXPORT_SYMBOL(write_cache_pages); |
1018 | 1224 | ||
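The opaque 'data' cookie is how a ->writepages implementation threads private state into its per-page callback. A minimal sketch in kernel context (the myfs_* names are hypothetical, not part of this commit):

static int myfs_wb_one(struct page *page, struct writeback_control *wbc,
                       void *data)
{
        int *pages_written = data;      /* illustrative private state */

        (*pages_written)++;
        return page->mapping->a_ops->writepage(page, wbc);
}

static int myfs_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        int pages_written = 0;

        return write_cache_pages(mapping, wbc, myfs_wb_one, &pages_written);
}

__writepage() below is the in-tree instance of exactly this pattern, with the mapping itself passed as the cookie.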
1019 | /* | 1225 | /* |
1020 | * Function used by generic_writepages to call the real writepage | 1226 | * Function used by generic_writepages to call the real writepage |
1021 | * function and set the mapping flags on error | 1227 | * function and set the mapping flags on error |
1022 | */ | 1228 | */ |
1023 | static int __writepage(struct page *page, struct writeback_control *wbc, | 1229 | static int __writepage(struct page *page, struct writeback_control *wbc, |
1024 | void *data) | 1230 | void *data) |
1025 | { | 1231 | { |
1026 | struct address_space *mapping = data; | 1232 | struct address_space *mapping = data; |
1027 | int ret = mapping->a_ops->writepage(page, wbc); | 1233 | int ret = mapping->a_ops->writepage(page, wbc); |
1028 | mapping_set_error(mapping, ret); | 1234 | mapping_set_error(mapping, ret); |
1029 | return ret; | 1235 | return ret; |
1030 | } | 1236 | } |
1031 | 1237 | ||
1032 | /** | 1238 | /** |
1033 | * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. | 1239 | * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. |
1034 | * @mapping: address space structure to write | 1240 | * @mapping: address space structure to write |
1035 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | 1241 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write |
1036 | * | 1242 | * |
1037 | * This is a library function, which implements the writepages() | 1243 | * This is a library function, which implements the writepages() |
1038 | * address_space_operation. | 1244 | * address_space_operation. |
1039 | */ | 1245 | */ |
1040 | int generic_writepages(struct address_space *mapping, | 1246 | int generic_writepages(struct address_space *mapping, |
1041 | struct writeback_control *wbc) | 1247 | struct writeback_control *wbc) |
1042 | { | 1248 | { |
1043 | struct blk_plug plug; | 1249 | struct blk_plug plug; |
1044 | int ret; | 1250 | int ret; |
1045 | 1251 | ||
1046 | /* deal with chardevs and other special files */ | 1252 | /* deal with chardevs and other special files */ |
1047 | if (!mapping->a_ops->writepage) | 1253 | if (!mapping->a_ops->writepage) |
1048 | return 0; | 1254 | return 0; |
1049 | 1255 | ||
1050 | blk_start_plug(&plug); | 1256 | blk_start_plug(&plug); |
1051 | ret = write_cache_pages(mapping, wbc, __writepage, mapping); | 1257 | ret = write_cache_pages(mapping, wbc, __writepage, mapping); |
1052 | blk_finish_plug(&plug); | 1258 | blk_finish_plug(&plug); |
1053 | return ret; | 1259 | return ret; |
1054 | } | 1260 | } |
1055 | 1261 | ||
1056 | EXPORT_SYMBOL(generic_writepages); | 1262 | EXPORT_SYMBOL(generic_writepages); |
1057 | 1263 | ||
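Because generic_writepages() has the writepages() signature, a filesystem with no batching needs of its own can point its address_space_operations straight at it. A hedged sketch (the simplefs_* names are hypothetical):

static const struct address_space_operations simplefs_aops = {
        .writepage  = simplefs_writepage,       /* hypothetical ->writepage */
        .writepages = generic_writepages,
};

Leaving .writepages NULL has the same effect, since do_writepages() below falls back to generic_writepages().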
1058 | int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | 1264 | int do_writepages(struct address_space *mapping, struct writeback_control *wbc) |
1059 | { | 1265 | { |
1060 | int ret; | 1266 | int ret; |
1061 | 1267 | ||
1062 | if (wbc->nr_to_write <= 0) | 1268 | if (wbc->nr_to_write <= 0) |
1063 | return 0; | 1269 | return 0; |
1064 | if (mapping->a_ops->writepages) | 1270 | if (mapping->a_ops->writepages) |
1065 | ret = mapping->a_ops->writepages(mapping, wbc); | 1271 | ret = mapping->a_ops->writepages(mapping, wbc); |
1066 | else | 1272 | else |
1067 | ret = generic_writepages(mapping, wbc); | 1273 | ret = generic_writepages(mapping, wbc); |
1068 | return ret; | 1274 | return ret; |
1069 | } | 1275 | } |
1070 | 1276 | ||
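Callers drive do_writepages() through a writeback_control that says how much to write and how urgently. A sketch modelled on __filemap_fdatawrite_range(); the wrapper name is illustrative:

static int flush_mapping_range(struct address_space *mapping,
                               loff_t start, loff_t end, int sync_mode)
{
        struct writeback_control wbc = {
                .sync_mode   = sync_mode,       /* WB_SYNC_ALL or WB_SYNC_NONE */
                .nr_to_write = LONG_MAX,
                .range_start = start,
                .range_end   = end,
        };

        return do_writepages(mapping, &wbc);
}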
1071 | /** | 1277 | /** |
1072 | * write_one_page - write out a single page and optionally wait on I/O | 1278 | * write_one_page - write out a single page and optionally wait on I/O |
1073 | * @page: the page to write | 1279 | * @page: the page to write |
1074 | * @wait: if true, wait on writeout | 1280 | * @wait: if true, wait on writeout |
1075 | * | 1281 | * |
1076 | * The page must be locked by the caller and will be unlocked upon return. | 1282 | * The page must be locked by the caller and will be unlocked upon return. |
1077 | * | 1283 | * |
1078 | * write_one_page() returns a negative error code if I/O failed. | 1284 | * write_one_page() returns a negative error code if I/O failed. |
1079 | */ | 1285 | */ |
1080 | int write_one_page(struct page *page, int wait) | 1286 | int write_one_page(struct page *page, int wait) |
1081 | { | 1287 | { |
1082 | struct address_space *mapping = page->mapping; | 1288 | struct address_space *mapping = page->mapping; |
1083 | int ret = 0; | 1289 | int ret = 0; |
1084 | struct writeback_control wbc = { | 1290 | struct writeback_control wbc = { |
1085 | .sync_mode = WB_SYNC_ALL, | 1291 | .sync_mode = WB_SYNC_ALL, |
1086 | .nr_to_write = 1, | 1292 | .nr_to_write = 1, |
1087 | }; | 1293 | }; |
1088 | 1294 | ||
1089 | BUG_ON(!PageLocked(page)); | 1295 | BUG_ON(!PageLocked(page)); |
1090 | 1296 | ||
1091 | if (wait) | 1297 | if (wait) |
1092 | wait_on_page_writeback(page); | 1298 | wait_on_page_writeback(page); |
1093 | 1299 | ||
1094 | if (clear_page_dirty_for_io(page)) { | 1300 | if (clear_page_dirty_for_io(page)) { |
1095 | page_cache_get(page); | 1301 | page_cache_get(page); |
1096 | ret = mapping->a_ops->writepage(page, &wbc); | 1302 | ret = mapping->a_ops->writepage(page, &wbc); |
1097 | if (ret == 0 && wait) { | 1303 | if (ret == 0 && wait) { |
1098 | wait_on_page_writeback(page); | 1304 | wait_on_page_writeback(page); |
1099 | if (PageError(page)) | 1305 | if (PageError(page)) |
1100 | ret = -EIO; | 1306 | ret = -EIO; |
1101 | } | 1307 | } |
1102 | page_cache_release(page); | 1308 | page_cache_release(page); |
1103 | } else { | 1309 | } else { |
1104 | unlock_page(page); | 1310 | unlock_page(page); |
1105 | } | 1311 | } |
1106 | return ret; | 1312 | return ret; |
1107 | } | 1313 | } |
1108 | EXPORT_SYMBOL(write_one_page); | 1314 | EXPORT_SYMBOL(write_one_page); |
1109 | 1315 | ||
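As the kernel-doc above says, the page must come in locked and write_one_page() unlocks it on return. An illustrative synchronous wrapper (hypothetical name):

static int flush_single_page(struct page *page)
{
        lock_page(page);                /* write_one_page() unlocks it */
        return write_one_page(page, 1); /* wait == 1: wait for the I/O */
}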
1110 | /* | 1316 | /* |
1111 | * For address_spaces which use neither buffers nor writeback. | 1317 | * For address_spaces which use neither buffers nor writeback. |
1112 | */ | 1318 | */ |
1113 | int __set_page_dirty_no_writeback(struct page *page) | 1319 | int __set_page_dirty_no_writeback(struct page *page) |
1114 | { | 1320 | { |
1115 | if (!PageDirty(page)) | 1321 | if (!PageDirty(page)) |
1116 | return !TestSetPageDirty(page); | 1322 | return !TestSetPageDirty(page); |
1117 | return 0; | 1323 | return 0; |
1118 | } | 1324 | } |
1119 | 1325 | ||
1120 | /* | 1326 | /* |
1121 | * Helper function for set_page_dirty family. | 1327 | * Helper function for set_page_dirty family. |
1122 | * NOTE: This relies on being atomic wrt interrupts. | 1328 | * NOTE: This relies on being atomic wrt interrupts. |
1123 | */ | 1329 | */ |
1124 | void account_page_dirtied(struct page *page, struct address_space *mapping) | 1330 | void account_page_dirtied(struct page *page, struct address_space *mapping) |
1125 | { | 1331 | { |
1126 | if (mapping_cap_account_dirty(mapping)) { | 1332 | if (mapping_cap_account_dirty(mapping)) { |
1127 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 1333 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
1128 | __inc_zone_page_state(page, NR_DIRTIED); | 1334 | __inc_zone_page_state(page, NR_DIRTIED); |
1129 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 1335 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); |
1130 | task_dirty_inc(current); | 1336 | task_dirty_inc(current); |
1131 | task_io_account_write(PAGE_CACHE_SIZE); | 1337 | task_io_account_write(PAGE_CACHE_SIZE); |
1132 | } | 1338 | } |
1133 | } | 1339 | } |
1134 | EXPORT_SYMBOL(account_page_dirtied); | 1340 | EXPORT_SYMBOL(account_page_dirtied); |
1135 | 1341 | ||
1136 | /* | 1342 | /* |
1137 | * Helper function for set_page_writeback family. | 1343 | * Helper function for set_page_writeback family. |
1138 | * NOTE: Unlike account_page_dirtied this does not rely on being atomic | 1344 | * NOTE: Unlike account_page_dirtied this does not rely on being atomic |
1139 | * wrt interrupts. | 1345 | * wrt interrupts. |
1140 | */ | 1346 | */ |
1141 | void account_page_writeback(struct page *page) | 1347 | void account_page_writeback(struct page *page) |
1142 | { | 1348 | { |
1143 | inc_zone_page_state(page, NR_WRITEBACK); | 1349 | inc_zone_page_state(page, NR_WRITEBACK); |
1144 | } | 1350 | } |
1145 | EXPORT_SYMBOL(account_page_writeback); | 1351 | EXPORT_SYMBOL(account_page_writeback); |
1146 | 1352 | ||
1147 | /* | 1353 | /* |
1148 | * For address_spaces which do not use buffers. Just tag the page as dirty in | 1354 | * For address_spaces which do not use buffers. Just tag the page as dirty in |
1149 | * its radix tree. | 1355 | * its radix tree. |
1150 | * | 1356 | * |
1151 | * This is also used when a single buffer is being dirtied: we want to set the | 1357 | * This is also used when a single buffer is being dirtied: we want to set the |
1152 | * page dirty in that case, but not all the buffers. This is a "bottom-up" | 1358 | * page dirty in that case, but not all the buffers. This is a "bottom-up" |
1153 | * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. | 1359 | * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. |
1154 | * | 1360 | * |
1155 | * Most callers have locked the page, which pins the address_space in memory. | 1361 | * Most callers have locked the page, which pins the address_space in memory. |
1156 | * zap_pte_range(), however, does not lock the page; in that case the | 1362 | * zap_pte_range(), however, does not lock the page; in that case the |
1157 | * mapping is pinned by the vma's ->vm_file reference. | 1363 | * mapping is pinned by the vma's ->vm_file reference. |
1158 | * | 1364 | * |
1159 | * We take care to handle the case where the page was truncated from the | 1365 | * We take care to handle the case where the page was truncated from the |
1160 | * mapping by re-checking page_mapping() inside tree_lock. | 1366 | * mapping by re-checking page_mapping() inside tree_lock. |
1161 | */ | 1367 | */ |
1162 | int __set_page_dirty_nobuffers(struct page *page) | 1368 | int __set_page_dirty_nobuffers(struct page *page) |
1163 | { | 1369 | { |
1164 | if (!TestSetPageDirty(page)) { | 1370 | if (!TestSetPageDirty(page)) { |
1165 | struct address_space *mapping = page_mapping(page); | 1371 | struct address_space *mapping = page_mapping(page); |
1166 | struct address_space *mapping2; | 1372 | struct address_space *mapping2; |
1167 | 1373 | ||
1168 | if (!mapping) | 1374 | if (!mapping) |
1169 | return 1; | 1375 | return 1; |
1170 | 1376 | ||
1171 | spin_lock_irq(&mapping->tree_lock); | 1377 | spin_lock_irq(&mapping->tree_lock); |
1172 | mapping2 = page_mapping(page); | 1378 | mapping2 = page_mapping(page); |
1173 | if (mapping2) { /* Race with truncate? */ | 1379 | if (mapping2) { /* Race with truncate? */ |
1174 | BUG_ON(mapping2 != mapping); | 1380 | BUG_ON(mapping2 != mapping); |
1175 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); | 1381 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); |
1176 | account_page_dirtied(page, mapping); | 1382 | account_page_dirtied(page, mapping); |
1177 | radix_tree_tag_set(&mapping->page_tree, | 1383 | radix_tree_tag_set(&mapping->page_tree, |
1178 | page_index(page), PAGECACHE_TAG_DIRTY); | 1384 | page_index(page), PAGECACHE_TAG_DIRTY); |
1179 | } | 1385 | } |
1180 | spin_unlock_irq(&mapping->tree_lock); | 1386 | spin_unlock_irq(&mapping->tree_lock); |
1181 | if (mapping->host) { | 1387 | if (mapping->host) { |
1182 | /* !PageAnon && !swapper_space */ | 1388 | /* !PageAnon && !swapper_space */ |
1183 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 1389 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
1184 | } | 1390 | } |
1185 | return 1; | 1391 | return 1; |
1186 | } | 1392 | } |
1187 | return 0; | 1393 | return 0; |
1188 | } | 1394 | } |
1189 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); | 1395 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); |
1190 | 1396 | ||
1191 | /* | 1397 | /* |
1192 | * When a writepage implementation decides that it doesn't want to write this | 1398 | * When a writepage implementation decides that it doesn't want to write this |
1193 | * page for some reason, it should redirty the locked page via | 1399 | * page for some reason, it should redirty the locked page via |
1194 | * redirty_page_for_writepage(); it should then unlock the page and return 0. | 1400 | * redirty_page_for_writepage(); it should then unlock the page and return 0. |
1195 | */ | 1401 | */ |
1196 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) | 1402 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) |
1197 | { | 1403 | { |
1198 | wbc->pages_skipped++; | 1404 | wbc->pages_skipped++; |
1199 | return __set_page_dirty_nobuffers(page); | 1405 | return __set_page_dirty_nobuffers(page); |
1200 | } | 1406 | } |
1201 | EXPORT_SYMBOL(redirty_page_for_writepage); | 1407 | EXPORT_SYMBOL(redirty_page_for_writepage); |
1202 | 1408 | ||
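The pattern the comment describes, sketched for a hypothetical filesystem (the examplefs_* helpers are illustrative): under WB_SYNC_NONE a ->writepage may decline, redirty the page, unlock it, and return 0.

static int examplefs_writepage(struct page *page,
                               struct writeback_control *wbc)
{
        /* examplefs_would_block()/examplefs_do_writepage() are hypothetical */
        if (wbc->sync_mode == WB_SYNC_NONE && examplefs_would_block(page)) {
                /* keep the page dirty; writeback will retry later */
                redirty_page_for_writepage(wbc, page);
                unlock_page(page);
                return 0;
        }
        return examplefs_do_writepage(page, wbc);
}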
1203 | /* | 1409 | /* |
1204 | * Dirty a page. | 1410 | * Dirty a page. |
1205 | * | 1411 | * |
1206 | * For pages with a mapping this should be done under the page lock | 1412 | * For pages with a mapping this should be done under the page lock |
1207 | * for the benefit of asynchronous memory errors who prefer a consistent | 1413 | * for the benefit of asynchronous memory errors who prefer a consistent |
1208 | * dirty state. This rule can be broken in some special cases, | 1414 | * dirty state. This rule can be broken in some special cases, |
1209 | * but should be better not to. | 1415 | * but should be better not to. |
1210 | * | 1416 | * |
1211 | * If the mapping doesn't provide a set_page_dirty a_op, then | 1417 | * If the mapping doesn't provide a set_page_dirty a_op, then |
1212 | * just fall through and assume that it wants buffer_heads. | 1418 | * just fall through and assume that it wants buffer_heads. |
1213 | */ | 1419 | */ |
1214 | int set_page_dirty(struct page *page) | 1420 | int set_page_dirty(struct page *page) |
1215 | { | 1421 | { |
1216 | struct address_space *mapping = page_mapping(page); | 1422 | struct address_space *mapping = page_mapping(page); |
1217 | 1423 | ||
1218 | if (likely(mapping)) { | 1424 | if (likely(mapping)) { |
1219 | int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; | 1425 | int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; |
1220 | /* | 1426 | /* |
1221 | * readahead/lru_deactivate_page could leave PG_readahead/PG_reclaim | 1427 | * readahead/lru_deactivate_page could leave PG_readahead/PG_reclaim |
1222 | * set due to a race with end_page_writeback(). | 1428 | * set due to a race with end_page_writeback(). |
1223 | * For readahead: if the page is written, the flags are reset, so | 1429 | * For readahead: if the page is written, the flags are reset, so |
1224 | * there is no problem. | 1430 | * there is no problem. |
1225 | * For lru_deactivate_page: if the page is redirtied, the flag is | 1431 | * For lru_deactivate_page: if the page is redirtied, the flag is |
1226 | * reset as well; but if a page with a stale flag is used for | 1432 | * reset as well; but if a page with a stale flag is used for |
1227 | * readahead, it confuses readahead and makes it restart the size | 1433 | * readahead, it confuses readahead and makes it restart the size |
1228 | * ramp-up process. That is only a minor problem, though. | 1434 | * ramp-up process. That is only a minor problem, though. |
1229 | */ | 1435 | */ |
1230 | ClearPageReclaim(page); | 1436 | ClearPageReclaim(page); |
1231 | #ifdef CONFIG_BLOCK | 1437 | #ifdef CONFIG_BLOCK |
1232 | if (!spd) | 1438 | if (!spd) |
1233 | spd = __set_page_dirty_buffers; | 1439 | spd = __set_page_dirty_buffers; |
1234 | #endif | 1440 | #endif |
1235 | return (*spd)(page); | 1441 | return (*spd)(page); |
1236 | } | 1442 | } |
1237 | if (!PageDirty(page)) { | 1443 | if (!PageDirty(page)) { |
1238 | if (!TestSetPageDirty(page)) | 1444 | if (!TestSetPageDirty(page)) |
1239 | return 1; | 1445 | return 1; |
1240 | } | 1446 | } |
1241 | return 0; | 1447 | return 0; |
1242 | } | 1448 | } |
1243 | EXPORT_SYMBOL(set_page_dirty); | 1449 | EXPORT_SYMBOL(set_page_dirty); |
1244 | 1450 | ||
1245 | /* | 1451 | /* |
1246 | * set_page_dirty() is racy if the caller has no reference against | 1452 | * set_page_dirty() is racy if the caller has no reference against |
1247 | * page->mapping->host, and if the page is unlocked. This is because another | 1453 | * page->mapping->host, and if the page is unlocked. This is because another |
1248 | * CPU could truncate the page off the mapping and then free the mapping. | 1454 | * CPU could truncate the page off the mapping and then free the mapping. |
1249 | * | 1455 | * |
1250 | * Usually, the page _is_ locked, or the caller is a user-space process which | 1456 | * Usually, the page _is_ locked, or the caller is a user-space process which |
1251 | * holds a reference on the inode by having an open file. | 1457 | * holds a reference on the inode by having an open file. |
1252 | * | 1458 | * |
1253 | * In other cases, the page should be locked before running set_page_dirty(). | 1459 | * In other cases, the page should be locked before running set_page_dirty(). |
1254 | */ | 1460 | */ |
1255 | int set_page_dirty_lock(struct page *page) | 1461 | int set_page_dirty_lock(struct page *page) |
1256 | { | 1462 | { |
1257 | int ret; | 1463 | int ret; |
1258 | 1464 | ||
1259 | lock_page(page); | 1465 | lock_page(page); |
1260 | ret = set_page_dirty(page); | 1466 | ret = set_page_dirty(page); |
1261 | unlock_page(page); | 1467 | unlock_page(page); |
1262 | return ret; | 1468 | return ret; |
1263 | } | 1469 | } |
1264 | EXPORT_SYMBOL(set_page_dirty_lock); | 1470 | EXPORT_SYMBOL(set_page_dirty_lock); |
1265 | 1471 | ||
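Per the comment above, a caller holding only a page reference and no page lock (e.g. pages pinned with get_user_pages()) should use this locked variant; an illustrative fragment (helper name hypothetical):

static void release_user_page(struct page *page, int dirtied)
{
        if (dirtied)
                set_page_dirty_lock(page);      /* page is not locked here */
        page_cache_release(page);
}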
1266 | /* | 1472 | /* |
1267 | * Clear a page's dirty flag, while caring for dirty memory accounting. | 1473 | * Clear a page's dirty flag, while caring for dirty memory accounting. |
1268 | * Returns true if the page was previously dirty. | 1474 | * Returns true if the page was previously dirty. |
1269 | * | 1475 | * |
1270 | * This is for preparing to put the page under writeout. We leave the page | 1476 | * This is for preparing to put the page under writeout. We leave the page |
1271 | * tagged as dirty in the radix tree so that a concurrent write-for-sync | 1477 | * tagged as dirty in the radix tree so that a concurrent write-for-sync |
1272 | * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage | 1478 | * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage |
1273 | * implementation will run either set_page_writeback() or set_page_dirty(), | 1479 | * implementation will run either set_page_writeback() or set_page_dirty(), |
1274 | * at which stage we bring the page's dirty flag and radix-tree dirty tag | 1480 | * at which stage we bring the page's dirty flag and radix-tree dirty tag |
1275 | * back into sync. | 1481 | * back into sync. |
1276 | * | 1482 | * |
1277 | * This incoherency between the page's dirty flag and radix-tree tag is | 1483 | * This incoherency between the page's dirty flag and radix-tree tag is |
1278 | * unfortunate, but it only exists while the page is locked. | 1484 | * unfortunate, but it only exists while the page is locked. |
1279 | */ | 1485 | */ |
1280 | int clear_page_dirty_for_io(struct page *page) | 1486 | int clear_page_dirty_for_io(struct page *page) |
1281 | { | 1487 | { |
1282 | struct address_space *mapping = page_mapping(page); | 1488 | struct address_space *mapping = page_mapping(page); |
1283 | 1489 | ||
1284 | BUG_ON(!PageLocked(page)); | 1490 | BUG_ON(!PageLocked(page)); |
1285 | 1491 | ||
1286 | if (mapping && mapping_cap_account_dirty(mapping)) { | 1492 | if (mapping && mapping_cap_account_dirty(mapping)) { |
1287 | /* | 1493 | /* |
1288 | * Yes, Virginia, this is indeed insane. | 1494 | * Yes, Virginia, this is indeed insane. |
1289 | * | 1495 | * |
1290 | * We use this sequence to make sure that | 1496 | * We use this sequence to make sure that |
1291 | * (a) we account for dirty stats properly | 1497 | * (a) we account for dirty stats properly |
1292 | * (b) we tell the low-level filesystem to | 1498 | * (b) we tell the low-level filesystem to |
1293 | * mark the whole page dirty if it was | 1499 | * mark the whole page dirty if it was |
1294 | * dirty in a pagetable. Only to then | 1500 | * dirty in a pagetable. Only to then |
1295 | * (c) clean the page again and return 1 to | 1501 | * (c) clean the page again and return 1 to |
1296 | * cause the writeback. | 1502 | * cause the writeback. |
1297 | * | 1503 | * |
1298 | * This way we avoid all nasty races with the | 1504 | * This way we avoid all nasty races with the |
1299 | * dirty bit in multiple places and clearing | 1505 | * dirty bit in multiple places and clearing |
1300 | * them concurrently from different threads. | 1506 | * them concurrently from different threads. |
1301 | * | 1507 | * |
1302 | * Note! Normally the "set_page_dirty(page)" | 1508 | * Note! Normally the "set_page_dirty(page)" |
1303 | * has no effect on the actual dirty bit - since | 1509 | * has no effect on the actual dirty bit - since |
1304 | * that will already usually be set. But we | 1510 | * that will already usually be set. But we |
1305 | * need the side effects, and it can help us | 1511 | * need the side effects, and it can help us |
1306 | * avoid races. | 1512 | * avoid races. |
1307 | * | 1513 | * |
1308 | * We basically use the page "master dirty bit" | 1514 | * We basically use the page "master dirty bit" |
1309 | * as a serialization point for all the different | 1515 | * as a serialization point for all the different |
1310 | * threads doing their things. | 1516 | * threads doing their things. |
1311 | */ | 1517 | */ |
1312 | if (page_mkclean(page)) | 1518 | if (page_mkclean(page)) |
1313 | set_page_dirty(page); | 1519 | set_page_dirty(page); |
1314 | /* | 1520 | /* |
1315 | * We carefully synchronise fault handlers against | 1521 | * We carefully synchronise fault handlers against |
1316 | * installing a dirty pte and marking the page dirty | 1522 | * installing a dirty pte and marking the page dirty |
1317 | * at this point. We do this by having them hold the | 1523 | * at this point. We do this by having them hold the |
1318 | * page lock at some point after installing their | 1524 | * page lock at some point after installing their |
1319 | * pte, but before marking the page dirty. | 1525 | * pte, but before marking the page dirty. |
1320 | * Pages are always locked coming in here, so we get | 1526 | * Pages are always locked coming in here, so we get |
1321 | * the desired exclusion. See mm/memory.c:do_wp_page() | 1527 | * the desired exclusion. See mm/memory.c:do_wp_page() |
1322 | * for more comments. | 1528 | * for more comments. |
1323 | */ | 1529 | */ |
1324 | if (TestClearPageDirty(page)) { | 1530 | if (TestClearPageDirty(page)) { |
1325 | dec_zone_page_state(page, NR_FILE_DIRTY); | 1531 | dec_zone_page_state(page, NR_FILE_DIRTY); |
1326 | dec_bdi_stat(mapping->backing_dev_info, | 1532 | dec_bdi_stat(mapping->backing_dev_info, |
1327 | BDI_RECLAIMABLE); | 1533 | BDI_RECLAIMABLE); |
1328 | return 1; | 1534 | return 1; |
1329 | } | 1535 | } |
1330 | return 0; | 1536 | return 0; |
1331 | } | 1537 | } |
1332 | return TestClearPageDirty(page); | 1538 | return TestClearPageDirty(page); |
1333 | } | 1539 | } |
1334 | EXPORT_SYMBOL(clear_page_dirty_for_io); | 1540 | EXPORT_SYMBOL(clear_page_dirty_for_io); |
1335 | 1541 | ||
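Sketched end to end, the writeout sequence this comment describes: the dirty flag is cleared while the radix-tree dirty tag stays set, then set_page_writeback() brings flag and tag back into sync (helper name illustrative):

static void start_writeout(struct page *page)
{
        if (clear_page_dirty_for_io(page)) {
                set_page_writeback(page);
                /* ...submit the I/O; end_page_writeback() on completion */
        }
        unlock_page(page);
}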
1336 | int test_clear_page_writeback(struct page *page) | 1542 | int test_clear_page_writeback(struct page *page) |
1337 | { | 1543 | { |
1338 | struct address_space *mapping = page_mapping(page); | 1544 | struct address_space *mapping = page_mapping(page); |
1339 | int ret; | 1545 | int ret; |
1340 | 1546 | ||
1341 | if (mapping) { | 1547 | if (mapping) { |
1342 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1548 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1343 | unsigned long flags; | 1549 | unsigned long flags; |
1344 | 1550 | ||
1345 | spin_lock_irqsave(&mapping->tree_lock, flags); | 1551 | spin_lock_irqsave(&mapping->tree_lock, flags); |
1346 | ret = TestClearPageWriteback(page); | 1552 | ret = TestClearPageWriteback(page); |
1347 | if (ret) { | 1553 | if (ret) { |
1348 | radix_tree_tag_clear(&mapping->page_tree, | 1554 | radix_tree_tag_clear(&mapping->page_tree, |
1349 | page_index(page), | 1555 | page_index(page), |
1350 | PAGECACHE_TAG_WRITEBACK); | 1556 | PAGECACHE_TAG_WRITEBACK); |
1351 | if (bdi_cap_account_writeback(bdi)) { | 1557 | if (bdi_cap_account_writeback(bdi)) { |
1352 | __dec_bdi_stat(bdi, BDI_WRITEBACK); | 1558 | __dec_bdi_stat(bdi, BDI_WRITEBACK); |
1353 | __bdi_writeout_inc(bdi); | 1559 | __bdi_writeout_inc(bdi); |
1354 | } | 1560 | } |
1355 | } | 1561 | } |
1356 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 1562 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
1357 | } else { | 1563 | } else { |
1358 | ret = TestClearPageWriteback(page); | 1564 | ret = TestClearPageWriteback(page); |
1359 | } | 1565 | } |
1360 | if (ret) { | 1566 | if (ret) { |
1361 | dec_zone_page_state(page, NR_WRITEBACK); | 1567 | dec_zone_page_state(page, NR_WRITEBACK); |
1362 | inc_zone_page_state(page, NR_WRITTEN); | 1568 | inc_zone_page_state(page, NR_WRITTEN); |
1363 | } | 1569 | } |
1364 | return ret; | 1570 | return ret; |
1365 | } | 1571 | } |
1366 | 1572 | ||
1367 | int test_set_page_writeback(struct page *page) | 1573 | int test_set_page_writeback(struct page *page) |
1368 | { | 1574 | { |
1369 | struct address_space *mapping = page_mapping(page); | 1575 | struct address_space *mapping = page_mapping(page); |
1370 | int ret; | 1576 | int ret; |
1371 | 1577 | ||
1372 | if (mapping) { | 1578 | if (mapping) { |
1373 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1579 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1374 | unsigned long flags; | 1580 | unsigned long flags; |
1375 | 1581 | ||
1376 | spin_lock_irqsave(&mapping->tree_lock, flags); | 1582 | spin_lock_irqsave(&mapping->tree_lock, flags); |
1377 | ret = TestSetPageWriteback(page); | 1583 | ret = TestSetPageWriteback(page); |
1378 | if (!ret) { | 1584 | if (!ret) { |
1379 | radix_tree_tag_set(&mapping->page_tree, | 1585 | radix_tree_tag_set(&mapping->page_tree, |
1380 | page_index(page), | 1586 | page_index(page), |
1381 | PAGECACHE_TAG_WRITEBACK); | 1587 | PAGECACHE_TAG_WRITEBACK); |
1382 | if (bdi_cap_account_writeback(bdi)) | 1588 | if (bdi_cap_account_writeback(bdi)) |
1383 | __inc_bdi_stat(bdi, BDI_WRITEBACK); | 1589 | __inc_bdi_stat(bdi, BDI_WRITEBACK); |
1384 | } | 1590 | } |
1385 | if (!PageDirty(page)) | 1591 | if (!PageDirty(page)) |
1386 | radix_tree_tag_clear(&mapping->page_tree, | 1592 | radix_tree_tag_clear(&mapping->page_tree, |
1387 | page_index(page), | 1593 | page_index(page), |
1388 | PAGECACHE_TAG_DIRTY); | 1594 | PAGECACHE_TAG_DIRTY); |
1389 | radix_tree_tag_clear(&mapping->page_tree, | 1595 | radix_tree_tag_clear(&mapping->page_tree, |
1390 | page_index(page), | 1596 | page_index(page), |
1391 | PAGECACHE_TAG_TOWRITE); | 1597 | PAGECACHE_TAG_TOWRITE); |
1392 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 1598 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
1393 | } else { | 1599 | } else { |
1394 | ret = TestSetPageWriteback(page); | 1600 | ret = TestSetPageWriteback(page); |
1395 | } | 1601 | } |
1396 | if (!ret) | 1602 | if (!ret) |
1397 | account_page_writeback(page); | 1603 | account_page_writeback(page); |
1398 | return ret; | 1604 | return ret; |
mm/rmap.c
1 | /* | 1 | /* |
2 | * mm/rmap.c - physical to virtual reverse mappings | 2 | * mm/rmap.c - physical to virtual reverse mappings |
3 | * | 3 | * |
4 | * Copyright 2001, Rik van Riel <riel@conectiva.com.br> | 4 | * Copyright 2001, Rik van Riel <riel@conectiva.com.br> |
5 | * Released under the General Public License (GPL). | 5 | * Released under the General Public License (GPL). |
6 | * | 6 | * |
7 | * Simple, low overhead reverse mapping scheme. | 7 | * Simple, low overhead reverse mapping scheme. |
8 | * Please try to keep this thing as modular as possible. | 8 | * Please try to keep this thing as modular as possible. |
9 | * | 9 | * |
10 | * Provides methods for unmapping each kind of mapped page: | 10 | * Provides methods for unmapping each kind of mapped page: |
11 | * the anon methods track anonymous pages, and | 11 | * the anon methods track anonymous pages, and |
12 | * the file methods track pages belonging to an inode. | 12 | * the file methods track pages belonging to an inode. |
13 | * | 13 | * |
14 | * Original design by Rik van Riel <riel@conectiva.com.br> 2001 | 14 | * Original design by Rik van Riel <riel@conectiva.com.br> 2001 |
15 | * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 | 15 | * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 |
16 | * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 | 16 | * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 |
17 | * Contributions by Hugh Dickins 2003, 2004 | 17 | * Contributions by Hugh Dickins 2003, 2004 |
18 | */ | 18 | */ |
19 | 19 | ||
20 | /* | 20 | /* |
21 | * Lock ordering in mm: | 21 | * Lock ordering in mm: |
22 | * | 22 | * |
23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_mutex |
27 | * anon_vma->mutex | 27 | * anon_vma->mutex |
28 | * mm->page_table_lock or pte_lock | 28 | * mm->page_table_lock or pte_lock |
29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
30 | * swap_lock (in swap_duplicate, swap_info_get) | 30 | * swap_lock (in swap_duplicate, swap_info_get) |
31 | * mmlist_lock (in mmput, drain_mmlist and others) | 31 | * mmlist_lock (in mmput, drain_mmlist and others) |
32 | * mapping->private_lock (in __set_page_dirty_buffers) | 32 | * mapping->private_lock (in __set_page_dirty_buffers) |
33 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) | 33 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
34 | * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) | 34 | * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) |
35 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 35 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_wb_list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
39 | * | 39 | * |
40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | * ->tasklist_lock | 41 | * ->tasklist_lock |
42 | * pte map lock | 42 | * pte map lock |
43 | */ | 43 | */ |
44 | 44 | ||
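As a hedged illustration of the ordering above (not code from this file): mmap_sem is taken outermost, the page lock nests inside it, and the pte lock is innermost.

static void ordering_example(struct mm_struct *mm, pmd_t *pmd,
                             struct page *page, unsigned long addr)
{
        pte_t *pte;
        spinlock_t *ptl;

        down_read(&mm->mmap_sem);               /* outermost */
        lock_page(page);                        /* PG_locked inside mmap_sem */
        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        /* ... operate on the mapping ... */
        pte_unmap_unlock(pte, ptl);             /* pte lock innermost */
        unlock_page(page);
        up_read(&mm->mmap_sem);
}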
45 | #include <linux/mm.h> | 45 | #include <linux/mm.h> |
46 | #include <linux/pagemap.h> | 46 | #include <linux/pagemap.h> |
47 | #include <linux/swap.h> | 47 | #include <linux/swap.h> |
48 | #include <linux/swapops.h> | 48 | #include <linux/swapops.h> |
49 | #include <linux/slab.h> | 49 | #include <linux/slab.h> |
50 | #include <linux/init.h> | 50 | #include <linux/init.h> |
51 | #include <linux/ksm.h> | 51 | #include <linux/ksm.h> |
52 | #include <linux/rmap.h> | 52 | #include <linux/rmap.h> |
53 | #include <linux/rcupdate.h> | 53 | #include <linux/rcupdate.h> |
54 | #include <linux/module.h> | 54 | #include <linux/module.h> |
55 | #include <linux/memcontrol.h> | 55 | #include <linux/memcontrol.h> |
56 | #include <linux/mmu_notifier.h> | 56 | #include <linux/mmu_notifier.h> |
57 | #include <linux/migrate.h> | 57 | #include <linux/migrate.h> |
58 | #include <linux/hugetlb.h> | 58 | #include <linux/hugetlb.h> |
59 | 59 | ||
60 | #include <asm/tlbflush.h> | 60 | #include <asm/tlbflush.h> |
61 | 61 | ||
62 | #include "internal.h" | 62 | #include "internal.h" |
63 | 63 | ||
64 | static struct kmem_cache *anon_vma_cachep; | 64 | static struct kmem_cache *anon_vma_cachep; |
65 | static struct kmem_cache *anon_vma_chain_cachep; | 65 | static struct kmem_cache *anon_vma_chain_cachep; |
66 | 66 | ||
67 | static inline struct anon_vma *anon_vma_alloc(void) | 67 | static inline struct anon_vma *anon_vma_alloc(void) |
68 | { | 68 | { |
69 | struct anon_vma *anon_vma; | 69 | struct anon_vma *anon_vma; |
70 | 70 | ||
71 | anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | 71 | anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); |
72 | if (anon_vma) { | 72 | if (anon_vma) { |
73 | atomic_set(&anon_vma->refcount, 1); | 73 | atomic_set(&anon_vma->refcount, 1); |
74 | /* | 74 | /* |
75 | * Initialise the anon_vma root to point to itself. If called | 75 | * Initialise the anon_vma root to point to itself. If called |
76 | * from fork, the root will be reset to the parent's anon_vma. | 76 | * from fork, the root will be reset to the parent's anon_vma. |
77 | */ | 77 | */ |
78 | anon_vma->root = anon_vma; | 78 | anon_vma->root = anon_vma; |
79 | } | 79 | } |
80 | 80 | ||
81 | return anon_vma; | 81 | return anon_vma; |
82 | } | 82 | } |
83 | 83 | ||
84 | static inline void anon_vma_free(struct anon_vma *anon_vma) | 84 | static inline void anon_vma_free(struct anon_vma *anon_vma) |
85 | { | 85 | { |
86 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); | 86 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); |
87 | 87 | ||
88 | /* | 88 | /* |
89 | * Synchronize against page_lock_anon_vma() such that | 89 | * Synchronize against page_lock_anon_vma() such that |
90 | * we can safely hold the lock without the anon_vma getting | 90 | * we can safely hold the lock without the anon_vma getting |
91 | * freed. | 91 | * freed. |
92 | * | 92 | * |
93 | * Relies on the full mb implied by the atomic_dec_and_test() from | 93 | * Relies on the full mb implied by the atomic_dec_and_test() from |
94 | * put_anon_vma() against the acquire barrier implied by | 94 | * put_anon_vma() against the acquire barrier implied by |
95 | * mutex_trylock() from page_lock_anon_vma(). This orders: | 95 | * mutex_trylock() from page_lock_anon_vma(). This orders: |
96 | * | 96 | * |
97 | * page_lock_anon_vma() VS put_anon_vma() | 97 | * page_lock_anon_vma() VS put_anon_vma() |
98 | * mutex_trylock() atomic_dec_and_test() | 98 | * mutex_trylock() atomic_dec_and_test() |
99 | * LOCK MB | 99 | * LOCK MB |
100 | * atomic_read() mutex_is_locked() | 100 | * atomic_read() mutex_is_locked() |
101 | * | 101 | * |
102 | * LOCK should suffice since the actual taking of the lock must | 102 | * LOCK should suffice since the actual taking of the lock must |
103 | * happen _before_ what follows. | 103 | * happen _before_ what follows. |
104 | */ | 104 | */ |
105 | if (mutex_is_locked(&anon_vma->root->mutex)) { | 105 | if (mutex_is_locked(&anon_vma->root->mutex)) { |
106 | anon_vma_lock(anon_vma); | 106 | anon_vma_lock(anon_vma); |
107 | anon_vma_unlock(anon_vma); | 107 | anon_vma_unlock(anon_vma); |
108 | } | 108 | } |
109 | 109 | ||
110 | kmem_cache_free(anon_vma_cachep, anon_vma); | 110 | kmem_cache_free(anon_vma_cachep, anon_vma); |
111 | } | 111 | } |
112 | 112 | ||
113 | static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) | 113 | static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) |
114 | { | 114 | { |
115 | return kmem_cache_alloc(anon_vma_chain_cachep, gfp); | 115 | return kmem_cache_alloc(anon_vma_chain_cachep, gfp); |
116 | } | 116 | } |
117 | 117 | ||
118 | static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | 118 | static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) |
119 | { | 119 | { |
120 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | 120 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); |
121 | } | 121 | } |
122 | 122 | ||
123 | /** | 123 | /** |
124 | * anon_vma_prepare - attach an anon_vma to a memory region | 124 | * anon_vma_prepare - attach an anon_vma to a memory region |
125 | * @vma: the memory region in question | 125 | * @vma: the memory region in question |
126 | * | 126 | * |
127 | * This makes sure the memory mapping described by 'vma' has | 127 | * This makes sure the memory mapping described by 'vma' has |
128 | * an 'anon_vma' attached to it, so that we can associate the | 128 | * an 'anon_vma' attached to it, so that we can associate the |
129 | * anonymous pages mapped into it with that anon_vma. | 129 | * anonymous pages mapped into it with that anon_vma. |
130 | * | 130 | * |
131 | * The common case will be that we already have one, but if | 131 | * The common case will be that we already have one, but if |
132 | * not we either need to find an adjacent mapping that we | 132 | * not we either need to find an adjacent mapping that we |
133 | * can re-use the anon_vma from (very common when the only | 133 | * can re-use the anon_vma from (very common when the only |
134 | * reason for splitting a vma has been mprotect()), or we | 134 | * reason for splitting a vma has been mprotect()), or we |
135 | * allocate a new one. | 135 | * allocate a new one. |
136 | * | 136 | * |
137 | * Anon-vma allocations are very subtle, because we may have | 137 | * Anon-vma allocations are very subtle, because we may have |
138 | * optimistically looked up an anon_vma in page_lock_anon_vma() | 138 | * optimistically looked up an anon_vma in page_lock_anon_vma() |
139 | * and that may actually touch the spinlock even in the newly | 139 | * and that may actually touch the spinlock even in the newly |
140 | * allocated vma (it depends on RCU to make sure that the | 140 | * allocated vma (it depends on RCU to make sure that the |
141 | * anon_vma isn't actually destroyed). | 141 | * anon_vma isn't actually destroyed). |
142 | * | 142 | * |
143 | * As a result, we need to do proper anon_vma locking even | 143 | * As a result, we need to do proper anon_vma locking even |
144 | * for the new allocation. At the same time, we do not want | 144 | * for the new allocation. At the same time, we do not want |
145 | * to do any locking for the common case of already having | 145 | * to do any locking for the common case of already having |
146 | * an anon_vma. | 146 | * an anon_vma. |
147 | * | 147 | * |
148 | * This must be called with the mmap_sem held for reading. | 148 | * This must be called with the mmap_sem held for reading. |
149 | */ | 149 | */ |
150 | int anon_vma_prepare(struct vm_area_struct *vma) | 150 | int anon_vma_prepare(struct vm_area_struct *vma) |
151 | { | 151 | { |
152 | struct anon_vma *anon_vma = vma->anon_vma; | 152 | struct anon_vma *anon_vma = vma->anon_vma; |
153 | struct anon_vma_chain *avc; | 153 | struct anon_vma_chain *avc; |
154 | 154 | ||
155 | might_sleep(); | 155 | might_sleep(); |
156 | if (unlikely(!anon_vma)) { | 156 | if (unlikely(!anon_vma)) { |
157 | struct mm_struct *mm = vma->vm_mm; | 157 | struct mm_struct *mm = vma->vm_mm; |
158 | struct anon_vma *allocated; | 158 | struct anon_vma *allocated; |
159 | 159 | ||
160 | avc = anon_vma_chain_alloc(GFP_KERNEL); | 160 | avc = anon_vma_chain_alloc(GFP_KERNEL); |
161 | if (!avc) | 161 | if (!avc) |
162 | goto out_enomem; | 162 | goto out_enomem; |
163 | 163 | ||
164 | anon_vma = find_mergeable_anon_vma(vma); | 164 | anon_vma = find_mergeable_anon_vma(vma); |
165 | allocated = NULL; | 165 | allocated = NULL; |
166 | if (!anon_vma) { | 166 | if (!anon_vma) { |
167 | anon_vma = anon_vma_alloc(); | 167 | anon_vma = anon_vma_alloc(); |
168 | if (unlikely(!anon_vma)) | 168 | if (unlikely(!anon_vma)) |
169 | goto out_enomem_free_avc; | 169 | goto out_enomem_free_avc; |
170 | allocated = anon_vma; | 170 | allocated = anon_vma; |
171 | } | 171 | } |
172 | 172 | ||
173 | anon_vma_lock(anon_vma); | 173 | anon_vma_lock(anon_vma); |
174 | /* page_table_lock to protect against threads */ | 174 | /* page_table_lock to protect against threads */ |
175 | spin_lock(&mm->page_table_lock); | 175 | spin_lock(&mm->page_table_lock); |
176 | if (likely(!vma->anon_vma)) { | 176 | if (likely(!vma->anon_vma)) { |
177 | vma->anon_vma = anon_vma; | 177 | vma->anon_vma = anon_vma; |
178 | avc->anon_vma = anon_vma; | 178 | avc->anon_vma = anon_vma; |
179 | avc->vma = vma; | 179 | avc->vma = vma; |
180 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 180 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
181 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | 181 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); |
182 | allocated = NULL; | 182 | allocated = NULL; |
183 | avc = NULL; | 183 | avc = NULL; |
184 | } | 184 | } |
185 | spin_unlock(&mm->page_table_lock); | 185 | spin_unlock(&mm->page_table_lock); |
186 | anon_vma_unlock(anon_vma); | 186 | anon_vma_unlock(anon_vma); |
187 | 187 | ||
188 | if (unlikely(allocated)) | 188 | if (unlikely(allocated)) |
189 | put_anon_vma(allocated); | 189 | put_anon_vma(allocated); |
190 | if (unlikely(avc)) | 190 | if (unlikely(avc)) |
191 | anon_vma_chain_free(avc); | 191 | anon_vma_chain_free(avc); |
192 | } | 192 | } |
193 | return 0; | 193 | return 0; |
194 | 194 | ||
195 | out_enomem_free_avc: | 195 | out_enomem_free_avc: |
196 | anon_vma_chain_free(avc); | 196 | anon_vma_chain_free(avc); |
197 | out_enomem: | 197 | out_enomem: |
198 | return -ENOMEM; | 198 | return -ENOMEM; |
199 | } | 199 | } |
200 | 200 | ||
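A sketch of the fault-path contract (cf. do_anonymous_page(); the wrapper name is illustrative): prepare the anon_vma with mmap_sem held for read, then add the new page's rmap.

static int install_new_anon_page(struct vm_area_struct *vma,
                                 unsigned long address, struct page *page)
{
        if (unlikely(anon_vma_prepare(vma)))    /* mmap_sem held for read */
                return -ENOMEM;
        page_add_new_anon_rmap(page, vma, address);
        return 0;
}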
201 | /* | 201 | /* |
202 | * This is a useful helper function for locking the anon_vma root as | 202 | * This is a useful helper function for locking the anon_vma root as |
203 | * we traverse the vma->anon_vma_chain, looping over anon_vma's that | 203 | * we traverse the vma->anon_vma_chain, looping over anon_vma's that |
204 | * have the same vma. | 204 | * have the same vma. |
205 | * | 205 | * |
206 | * Such anon_vma's should have the same root, so you'd expect to see | 206 | * Such anon_vma's should have the same root, so you'd expect to see |
207 | * just a single mutex_lock for the whole traversal. | 207 | * just a single mutex_lock for the whole traversal. |
208 | */ | 208 | */ |
209 | static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) | 209 | static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) |
210 | { | 210 | { |
211 | struct anon_vma *new_root = anon_vma->root; | 211 | struct anon_vma *new_root = anon_vma->root; |
212 | if (new_root != root) { | 212 | if (new_root != root) { |
213 | if (WARN_ON_ONCE(root)) | 213 | if (WARN_ON_ONCE(root)) |
214 | mutex_unlock(&root->mutex); | 214 | mutex_unlock(&root->mutex); |
215 | root = new_root; | 215 | root = new_root; |
216 | mutex_lock(&root->mutex); | 216 | mutex_lock(&root->mutex); |
217 | } | 217 | } |
218 | return root; | 218 | return root; |
219 | } | 219 | } |
220 | 220 | ||
221 | static inline void unlock_anon_vma_root(struct anon_vma *root) | 221 | static inline void unlock_anon_vma_root(struct anon_vma *root) |
222 | { | 222 | { |
223 | if (root) | 223 | if (root) |
224 | mutex_unlock(&root->mutex); | 224 | mutex_unlock(&root->mutex); |
225 | } | 225 | } |
226 | 226 | ||
227 | static void anon_vma_chain_link(struct vm_area_struct *vma, | 227 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
228 | struct anon_vma_chain *avc, | 228 | struct anon_vma_chain *avc, |
229 | struct anon_vma *anon_vma) | 229 | struct anon_vma *anon_vma) |
230 | { | 230 | { |
231 | avc->vma = vma; | 231 | avc->vma = vma; |
232 | avc->anon_vma = anon_vma; | 232 | avc->anon_vma = anon_vma; |
233 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 233 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
234 | 234 | ||
235 | /* | 235 | /* |
236 | * It's critical to add new vmas to the tail of the anon_vma, | 236 | * It's critical to add new vmas to the tail of the anon_vma, |
237 | * see comment in huge_memory.c:__split_huge_page(). | 237 | * see comment in huge_memory.c:__split_huge_page(). |
238 | */ | 238 | */ |
239 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | 239 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); |
240 | } | 240 | } |
241 | 241 | ||
242 | /* | 242 | /* |
243 | * Attach the anon_vmas from src to dst. | 243 | * Attach the anon_vmas from src to dst. |
244 | * Returns 0 on success, -ENOMEM on failure. | 244 | * Returns 0 on success, -ENOMEM on failure. |
245 | */ | 245 | */ |
246 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | 246 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) |
247 | { | 247 | { |
248 | struct anon_vma_chain *avc, *pavc; | 248 | struct anon_vma_chain *avc, *pavc; |
249 | struct anon_vma *root = NULL; | 249 | struct anon_vma *root = NULL; |
250 | 250 | ||
251 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { | 251 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { |
252 | struct anon_vma *anon_vma; | 252 | struct anon_vma *anon_vma; |
253 | 253 | ||
254 | avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); | 254 | avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); |
255 | if (unlikely(!avc)) { | 255 | if (unlikely(!avc)) { |
256 | unlock_anon_vma_root(root); | 256 | unlock_anon_vma_root(root); |
257 | root = NULL; | 257 | root = NULL; |
258 | avc = anon_vma_chain_alloc(GFP_KERNEL); | 258 | avc = anon_vma_chain_alloc(GFP_KERNEL); |
259 | if (!avc) | 259 | if (!avc) |
260 | goto enomem_failure; | 260 | goto enomem_failure; |
261 | } | 261 | } |
262 | anon_vma = pavc->anon_vma; | 262 | anon_vma = pavc->anon_vma; |
263 | root = lock_anon_vma_root(root, anon_vma); | 263 | root = lock_anon_vma_root(root, anon_vma); |
264 | anon_vma_chain_link(dst, avc, anon_vma); | 264 | anon_vma_chain_link(dst, avc, anon_vma); |
265 | } | 265 | } |
266 | unlock_anon_vma_root(root); | 266 | unlock_anon_vma_root(root); |
267 | return 0; | 267 | return 0; |
268 | 268 | ||
269 | enomem_failure: | 269 | enomem_failure: |
270 | unlink_anon_vmas(dst); | 270 | unlink_anon_vmas(dst); |
271 | return -ENOMEM; | 271 | return -ENOMEM; |
272 | } | 272 | } |
273 | 273 | ||
274 | /* | 274 | /* |
275 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | 275 | * Attach vma to its own anon_vma, as well as to the anon_vmas that |
276 | * the corresponding VMA in the parent process is attached to. | 276 | * the corresponding VMA in the parent process is attached to. |
277 | * Returns 0 on success, non-zero on failure. | 277 | * Returns 0 on success, non-zero on failure. |
278 | */ | 278 | */ |
279 | int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | 279 | int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) |
280 | { | 280 | { |
281 | struct anon_vma_chain *avc; | 281 | struct anon_vma_chain *avc; |
282 | struct anon_vma *anon_vma; | 282 | struct anon_vma *anon_vma; |
283 | 283 | ||
284 | /* Don't bother if the parent process has no anon_vma here. */ | 284 | /* Don't bother if the parent process has no anon_vma here. */ |
285 | if (!pvma->anon_vma) | 285 | if (!pvma->anon_vma) |
286 | return 0; | 286 | return 0; |
287 | 287 | ||
288 | /* | 288 | /* |
289 | * First, attach the new VMA to the parent VMA's anon_vmas, | 289 | * First, attach the new VMA to the parent VMA's anon_vmas, |
290 | * so rmap can find non-COWed pages in child processes. | 290 | * so rmap can find non-COWed pages in child processes. |
291 | */ | 291 | */ |
292 | if (anon_vma_clone(vma, pvma)) | 292 | if (anon_vma_clone(vma, pvma)) |
293 | return -ENOMEM; | 293 | return -ENOMEM; |
294 | 294 | ||
295 | /* Then add our own anon_vma. */ | 295 | /* Then add our own anon_vma. */ |
296 | anon_vma = anon_vma_alloc(); | 296 | anon_vma = anon_vma_alloc(); |
297 | if (!anon_vma) | 297 | if (!anon_vma) |
298 | goto out_error; | 298 | goto out_error; |
299 | avc = anon_vma_chain_alloc(GFP_KERNEL); | 299 | avc = anon_vma_chain_alloc(GFP_KERNEL); |
300 | if (!avc) | 300 | if (!avc) |
301 | goto out_error_free_anon_vma; | 301 | goto out_error_free_anon_vma; |
302 | 302 | ||
303 | /* | 303 | /* |
304 | * The root anon_vma's spinlock is the lock actually used when we | 304 | * The root anon_vma's spinlock is the lock actually used when we |
305 | * lock any of the anon_vmas in this anon_vma tree. | 305 | * lock any of the anon_vmas in this anon_vma tree. |
306 | */ | 306 | */ |
307 | anon_vma->root = pvma->anon_vma->root; | 307 | anon_vma->root = pvma->anon_vma->root; |
308 | /* | 308 | /* |
309 | * With refcounts, an anon_vma can stay around longer than the | 309 | * With refcounts, an anon_vma can stay around longer than the |
310 | * process it belongs to. The root anon_vma needs to be pinned until | 310 | * process it belongs to. The root anon_vma needs to be pinned until |
311 | * this anon_vma is freed, because the lock lives in the root. | 311 | * this anon_vma is freed, because the lock lives in the root. |
312 | */ | 312 | */ |
313 | get_anon_vma(anon_vma->root); | 313 | get_anon_vma(anon_vma->root); |
314 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 314 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
315 | vma->anon_vma = anon_vma; | 315 | vma->anon_vma = anon_vma; |
316 | anon_vma_lock(anon_vma); | 316 | anon_vma_lock(anon_vma); |
317 | anon_vma_chain_link(vma, avc, anon_vma); | 317 | anon_vma_chain_link(vma, avc, anon_vma); |
318 | anon_vma_unlock(anon_vma); | 318 | anon_vma_unlock(anon_vma); |
319 | 319 | ||
320 | return 0; | 320 | return 0; |
321 | 321 | ||
322 | out_error_free_anon_vma: | 322 | out_error_free_anon_vma: |
323 | put_anon_vma(anon_vma); | 323 | put_anon_vma(anon_vma); |
324 | out_error: | 324 | out_error: |
325 | unlink_anon_vmas(vma); | 325 | unlink_anon_vmas(vma); |
326 | return -ENOMEM; | 326 | return -ENOMEM; |
327 | } | 327 | } |
328 | 328 | ||
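In the fork path each new child vma first gets an empty chain and is then attached to the parent's anon_vmas plus a fresh one of its own; a rough sketch of the call site (cf. dup_mmap(); the wrapper is hypothetical):

static int dup_one_vma_rmap(struct vm_area_struct *child,
                            struct vm_area_struct *parent)
{
        INIT_LIST_HEAD(&child->anon_vma_chain);
        return anon_vma_fork(child, parent);    /* 0 on success, -ENOMEM */
}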
329 | void unlink_anon_vmas(struct vm_area_struct *vma) | 329 | void unlink_anon_vmas(struct vm_area_struct *vma) |
330 | { | 330 | { |
331 | struct anon_vma_chain *avc, *next; | 331 | struct anon_vma_chain *avc, *next; |
332 | struct anon_vma *root = NULL; | 332 | struct anon_vma *root = NULL; |
333 | 333 | ||
334 | /* | 334 | /* |
335 | * Unlink each anon_vma chained to the VMA. This list is ordered | 335 | * Unlink each anon_vma chained to the VMA. This list is ordered |
336 | * from newest to oldest, ensuring the root anon_vma gets freed last. | 336 | * from newest to oldest, ensuring the root anon_vma gets freed last. |
337 | */ | 337 | */ |
338 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 338 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
339 | struct anon_vma *anon_vma = avc->anon_vma; | 339 | struct anon_vma *anon_vma = avc->anon_vma; |
340 | 340 | ||
341 | root = lock_anon_vma_root(root, anon_vma); | 341 | root = lock_anon_vma_root(root, anon_vma); |
342 | list_del(&avc->same_anon_vma); | 342 | list_del(&avc->same_anon_vma); |
343 | 343 | ||
344 | /* | 344 | /* |
345 | * Leave empty anon_vmas on the list - we'll need | 345 | * Leave empty anon_vmas on the list - we'll need |
346 | * to free them outside the lock. | 346 | * to free them outside the lock. |
347 | */ | 347 | */ |
348 | if (list_empty(&anon_vma->head)) | 348 | if (list_empty(&anon_vma->head)) |
349 | continue; | 349 | continue; |
350 | 350 | ||
351 | list_del(&avc->same_vma); | 351 | list_del(&avc->same_vma); |
352 | anon_vma_chain_free(avc); | 352 | anon_vma_chain_free(avc); |
353 | } | 353 | } |
354 | unlock_anon_vma_root(root); | 354 | unlock_anon_vma_root(root); |
355 | 355 | ||
356 | /* | 356 | /* |
357 | * Iterate the list once more; it now contains only empty and unlinked | 357 | * Iterate the list once more; it now contains only empty and unlinked |
358 | * anon_vmas. Destroy them: we could not do so earlier because | 358 | * anon_vmas. Destroy them: we could not do so earlier because |
359 | * __put_anon_vma() needs to acquire the anon_vma->root->mutex. | 359 | * __put_anon_vma() needs to acquire the anon_vma->root->mutex. |
360 | */ | 360 | */ |
361 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 361 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
362 | struct anon_vma *anon_vma = avc->anon_vma; | 362 | struct anon_vma *anon_vma = avc->anon_vma; |
363 | 363 | ||
364 | put_anon_vma(anon_vma); | 364 | put_anon_vma(anon_vma); |
365 | 365 | ||
366 | list_del(&avc->same_vma); | 366 | list_del(&avc->same_vma); |
367 | anon_vma_chain_free(avc); | 367 | anon_vma_chain_free(avc); |
368 | } | 368 | } |
369 | } | 369 | } |
370 | 370 | ||
371 | static void anon_vma_ctor(void *data) | 371 | static void anon_vma_ctor(void *data) |
372 | { | 372 | { |
373 | struct anon_vma *anon_vma = data; | 373 | struct anon_vma *anon_vma = data; |
374 | 374 | ||
375 | mutex_init(&anon_vma->mutex); | 375 | mutex_init(&anon_vma->mutex); |
376 | atomic_set(&anon_vma->refcount, 0); | 376 | atomic_set(&anon_vma->refcount, 0); |
377 | INIT_LIST_HEAD(&anon_vma->head); | 377 | INIT_LIST_HEAD(&anon_vma->head); |
378 | } | 378 | } |
379 | 379 | ||
380 | void __init anon_vma_init(void) | 380 | void __init anon_vma_init(void) |
381 | { | 381 | { |
382 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 382 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
383 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); | 383 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); |
384 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); | 384 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); |
385 | } | 385 | } |
386 | 386 | ||
387 | /* | 387 | /* |
388 | * Getting a lock on a stable anon_vma from a page off the LRU is tricky! | 388 | * Getting a lock on a stable anon_vma from a page off the LRU is tricky! |
389 | * | 389 | * |
390 | * Since there is no serialization whatsoever against page_remove_rmap(), | 390 | * Since there is no serialization whatsoever against page_remove_rmap(), |
391 | * the best this function can do is return a locked anon_vma that might | 391 | * the best this function can do is return a locked anon_vma that might |
392 | * have been relevant to this page. | 392 | * have been relevant to this page. |
393 | * | 393 | * |
394 | * The page might have been remapped to a different anon_vma or the anon_vma | 394 | * The page might have been remapped to a different anon_vma or the anon_vma |
395 | * returned may already be freed (and even reused). | 395 | * returned may already be freed (and even reused). |
396 | * | 396 | * |
397 | * In case it was remapped to a different anon_vma, the new anon_vma will be a | 397 | * In case it was remapped to a different anon_vma, the new anon_vma will be a |
398 | * child of the old anon_vma, and the anon_vma lifetime rules will therefore | 398 | * child of the old anon_vma, and the anon_vma lifetime rules will therefore |
399 | * ensure that any anon_vma obtained from the page will still be valid for as | 399 | * ensure that any anon_vma obtained from the page will still be valid for as |
400 | * long as we observe page_mapped() [ hence all those page_mapped() tests ]. | 400 | * long as we observe page_mapped() [ hence all those page_mapped() tests ]. |
401 | * | 401 | * |
402 | * All users of this function must be very careful when walking the anon_vma | 402 | * All users of this function must be very careful when walking the anon_vma |
403 | * chain and verify that the page in question is indeed mapped in it | 403 | * chain and verify that the page in question is indeed mapped in it |
404 | * [ something equivalent to page_mapped_in_vma() ]. | 404 | * [ something equivalent to page_mapped_in_vma() ]. |
405 | * | 405 | * |
406 | * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() | 406 | * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() |
407 | * that the anon_vma pointer from page->mapping is valid if there is a | 407 | * that the anon_vma pointer from page->mapping is valid if there is a |
408 | * mapcount, we can dereference the anon_vma after observing those. | 408 | * mapcount, we can dereference the anon_vma after observing those. |
409 | */ | 409 | */ |
410 | struct anon_vma *page_get_anon_vma(struct page *page) | 410 | struct anon_vma *page_get_anon_vma(struct page *page) |
411 | { | 411 | { |
412 | struct anon_vma *anon_vma = NULL; | 412 | struct anon_vma *anon_vma = NULL; |
413 | unsigned long anon_mapping; | 413 | unsigned long anon_mapping; |
414 | 414 | ||
415 | rcu_read_lock(); | 415 | rcu_read_lock(); |
416 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); | 416 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); |
417 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | 417 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
418 | goto out; | 418 | goto out; |
419 | if (!page_mapped(page)) | 419 | if (!page_mapped(page)) |
420 | goto out; | 420 | goto out; |
421 | 421 | ||
422 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 422 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
423 | if (!atomic_inc_not_zero(&anon_vma->refcount)) { | 423 | if (!atomic_inc_not_zero(&anon_vma->refcount)) { |
424 | anon_vma = NULL; | 424 | anon_vma = NULL; |
425 | goto out; | 425 | goto out; |
426 | } | 426 | } |
427 | 427 | ||
428 | /* | 428 | /* |
429 | * If this page is still mapped, then its anon_vma cannot have been | 429 | * If this page is still mapped, then its anon_vma cannot have been |
430 | * freed. But if it has been unmapped, we have no security against the | 430 | * freed. But if it has been unmapped, we have no security against the |
431 | * anon_vma structure being freed and reused (for another anon_vma: | 431 | * anon_vma structure being freed and reused (for another anon_vma: |
432 | * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() | 432 | * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() |
433 | * above cannot corrupt). | 433 | * above cannot corrupt). |
434 | */ | 434 | */ |
435 | if (!page_mapped(page)) { | 435 | if (!page_mapped(page)) { |
436 | put_anon_vma(anon_vma); | 436 | put_anon_vma(anon_vma); |
437 | anon_vma = NULL; | 437 | anon_vma = NULL; |
438 | } | 438 | } |
439 | out: | 439 | out: |
440 | rcu_read_unlock(); | 440 | rcu_read_unlock(); |
441 | 441 | ||
442 | return anon_vma; | 442 | return anon_vma; |
443 | } | 443 | } |
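/*
 * Editor's sketch (not part of the original file): the contract above is
 * "pinned anon_vma or NULL".  A caller -- page migration is one real
 * user -- holds the pin across its work and then drops it:
 */
static void __maybe_unused example_pin_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = page_get_anon_vma(page);

	if (!anon_vma)
		return;			/* page was (or is being) unmapped */
	/* ... anon_vma cannot be freed or reused while we hold the pin ... */
	put_anon_vma(anon_vma);		/* drop the reference taken above */
}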
444 | 444 | ||
445 | /* | 445 | /* |
446 | * Similar to page_get_anon_vma() except it locks the anon_vma. | 446 | * Similar to page_get_anon_vma() except it locks the anon_vma. |
447 | * | 447 | * |
448 | * It's a little more complex as it tries to keep the fast path to a single | 448 | * It's a little more complex as it tries to keep the fast path to a single |
449 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a | 449 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a |
450 | * reference like with page_get_anon_vma() and then block on the mutex. | 450 | * reference like with page_get_anon_vma() and then block on the mutex. |
451 | */ | 451 | */ |
452 | struct anon_vma *page_lock_anon_vma(struct page *page) | 452 | struct anon_vma *page_lock_anon_vma(struct page *page) |
453 | { | 453 | { |
454 | struct anon_vma *anon_vma = NULL; | 454 | struct anon_vma *anon_vma = NULL; |
455 | struct anon_vma *root_anon_vma; | 455 | struct anon_vma *root_anon_vma; |
456 | unsigned long anon_mapping; | 456 | unsigned long anon_mapping; |
457 | 457 | ||
458 | rcu_read_lock(); | 458 | rcu_read_lock(); |
459 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); | 459 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); |
460 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | 460 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
461 | goto out; | 461 | goto out; |
462 | if (!page_mapped(page)) | 462 | if (!page_mapped(page)) |
463 | goto out; | 463 | goto out; |
464 | 464 | ||
465 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 465 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
466 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 466 | root_anon_vma = ACCESS_ONCE(anon_vma->root); |
467 | if (mutex_trylock(&root_anon_vma->mutex)) { | 467 | if (mutex_trylock(&root_anon_vma->mutex)) { |
468 | /* | 468 | /* |
469 | * If the page is still mapped, then this anon_vma is still | 469 | * If the page is still mapped, then this anon_vma is still |
470 | * its anon_vma, and holding the mutex ensures that it will | 470 | * its anon_vma, and holding the mutex ensures that it will |
471 | * not go away, see anon_vma_free(). | 471 | * not go away, see anon_vma_free(). |
472 | */ | 472 | */ |
473 | if (!page_mapped(page)) { | 473 | if (!page_mapped(page)) { |
474 | mutex_unlock(&root_anon_vma->mutex); | 474 | mutex_unlock(&root_anon_vma->mutex); |
475 | anon_vma = NULL; | 475 | anon_vma = NULL; |
476 | } | 476 | } |
477 | goto out; | 477 | goto out; |
478 | } | 478 | } |
479 | 479 | ||
480 | /* trylock failed, we have to sleep */ | 480 | /* trylock failed, we have to sleep */ |
481 | if (!atomic_inc_not_zero(&anon_vma->refcount)) { | 481 | if (!atomic_inc_not_zero(&anon_vma->refcount)) { |
482 | anon_vma = NULL; | 482 | anon_vma = NULL; |
483 | goto out; | 483 | goto out; |
484 | } | 484 | } |
485 | 485 | ||
486 | if (!page_mapped(page)) { | 486 | if (!page_mapped(page)) { |
487 | put_anon_vma(anon_vma); | 487 | put_anon_vma(anon_vma); |
488 | anon_vma = NULL; | 488 | anon_vma = NULL; |
489 | goto out; | 489 | goto out; |
490 | } | 490 | } |
491 | 491 | ||
492 | /* we pinned the anon_vma, it's safe to sleep */ | 492 | /* we pinned the anon_vma, it's safe to sleep */ |
493 | rcu_read_unlock(); | 493 | rcu_read_unlock(); |
494 | anon_vma_lock(anon_vma); | 494 | anon_vma_lock(anon_vma); |
495 | 495 | ||
496 | if (atomic_dec_and_test(&anon_vma->refcount)) { | 496 | if (atomic_dec_and_test(&anon_vma->refcount)) { |
497 | /* | 497 | /* |
498 | * Oops, we held the last refcount, release the lock | 498 | * Oops, we held the last refcount, release the lock |
499 | * and bail -- can't simply use put_anon_vma() because | 499 | * and bail -- can't simply use put_anon_vma() because |
500 | * we'll deadlock on the anon_vma_lock() recursion. | 500 | * we'll deadlock on the anon_vma_lock() recursion. |
501 | */ | 501 | */ |
502 | anon_vma_unlock(anon_vma); | 502 | anon_vma_unlock(anon_vma); |
503 | __put_anon_vma(anon_vma); | 503 | __put_anon_vma(anon_vma); |
504 | anon_vma = NULL; | 504 | anon_vma = NULL; |
505 | } | 505 | } |
506 | 506 | ||
507 | return anon_vma; | 507 | return anon_vma; |
508 | 508 | ||
509 | out: | 509 | out: |
510 | rcu_read_unlock(); | 510 | rcu_read_unlock(); |
511 | return anon_vma; | 511 | return anon_vma; |
512 | } | 512 | } |
513 | 513 | ||
514 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 514 | void page_unlock_anon_vma(struct anon_vma *anon_vma) |
515 | { | 515 | { |
516 | anon_vma_unlock(anon_vma); | 516 | anon_vma_unlock(anon_vma); |
517 | } | 517 | } |
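/*
 * Editor's sketch (not part of the original file): the lock/unlock pair
 * above brackets an rmap walk over every VMA on the anon_vma's chain;
 * page_referenced_anon() further down is the real instance of this
 * pattern.
 */
static void __maybe_unused example_walk_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = page_lock_anon_vma(page);
	struct anon_vma_chain *avc;

	if (!anon_vma)
		return;		/* not (or no longer) a mapped anon page */
	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		/* ... inspect avc->vma under the root anon_vma mutex ... */
	}
	page_unlock_anon_vma(anon_vma);
}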
518 | 518 | ||
519 | /* | 519 | /* |
520 | * At what user virtual address is page expected in @vma? | 520 | * At what user virtual address is page expected in @vma? |
521 | * Returns virtual address or -EFAULT if page's index/offset is not | 521 | * Returns virtual address or -EFAULT if page's index/offset is not |
522 | * within the range mapped by the @vma. | 522 | * within the range mapped by the @vma. |
523 | */ | 523 | */ |
524 | inline unsigned long | 524 | inline unsigned long |
525 | vma_address(struct page *page, struct vm_area_struct *vma) | 525 | vma_address(struct page *page, struct vm_area_struct *vma) |
526 | { | 526 | { |
527 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 527 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
528 | unsigned long address; | 528 | unsigned long address; |
529 | 529 | ||
530 | if (unlikely(is_vm_hugetlb_page(vma))) | 530 | if (unlikely(is_vm_hugetlb_page(vma))) |
531 | pgoff = page->index << huge_page_order(page_hstate(page)); | 531 | pgoff = page->index << huge_page_order(page_hstate(page)); |
532 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 532 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
533 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | 533 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { |
534 | /* page should be within @vma mapping range */ | 534 | /* page should be within @vma mapping range */ |
535 | return -EFAULT; | 535 | return -EFAULT; |
536 | } | 536 | } |
537 | return address; | 537 | return address; |
538 | } | 538 | } |
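/*
 * Editor's note (worked example, not part of the original file; assumes
 * 4K pages, so PAGE_SHIFT == 12): with vma->vm_start == 0x400000,
 * vma->vm_pgoff == 8 and page->index == 16, the computation above gives
 *
 *	0x400000 + ((16 - 8) << 12) == 0x408000
 *
 * which is a valid answer for any mapping of at least nine pages.  An
 * index below vm_pgoff, or one beyond the end of the mapping, lands
 * outside [vm_start, vm_end) and returns -EFAULT instead.
 */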
539 | 539 | ||
540 | /* | 540 | /* |
541 | * At what user virtual address is page expected in vma? | 541 | * At what user virtual address is page expected in vma? |
542 | * Caller should check the page is actually part of the vma. | 542 | * Caller should check the page is actually part of the vma. |
543 | */ | 543 | */ |
544 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 544 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
545 | { | 545 | { |
546 | if (PageAnon(page)) { | 546 | if (PageAnon(page)) { |
547 | struct anon_vma *page__anon_vma = page_anon_vma(page); | 547 | struct anon_vma *page__anon_vma = page_anon_vma(page); |
548 | /* | 548 | /* |
549 | * Note: swapoff's unuse_vma() is more efficient with this | 549 | * Note: swapoff's unuse_vma() is more efficient with this |
550 | * check, and needs it to match anon_vma when KSM is active. | 550 | * check, and needs it to match anon_vma when KSM is active. |
551 | */ | 551 | */ |
552 | if (!vma->anon_vma || !page__anon_vma || | 552 | if (!vma->anon_vma || !page__anon_vma || |
553 | vma->anon_vma->root != page__anon_vma->root) | 553 | vma->anon_vma->root != page__anon_vma->root) |
554 | return -EFAULT; | 554 | return -EFAULT; |
555 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 555 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { |
556 | if (!vma->vm_file || | 556 | if (!vma->vm_file || |
557 | vma->vm_file->f_mapping != page->mapping) | 557 | vma->vm_file->f_mapping != page->mapping) |
558 | return -EFAULT; | 558 | return -EFAULT; |
559 | } else | 559 | } else |
560 | return -EFAULT; | 560 | return -EFAULT; |
561 | return vma_address(page, vma); | 561 | return vma_address(page, vma); |
562 | } | 562 | } |
563 | 563 | ||
564 | /* | 564 | /* |
565 | * Check that @page is mapped at @address into @mm. | 565 | * Check that @page is mapped at @address into @mm. |
566 | * | 566 | * |
567 | * If @sync is false, page_check_address may perform a racy check to avoid | 567 | * If @sync is false, page_check_address may perform a racy check to avoid |
568 | * the page table lock when the pte is not present (helpful when reclaiming | 568 | * the page table lock when the pte is not present (helpful when reclaiming |
569 | * highly shared pages). | 569 | * highly shared pages). |
570 | * | 570 | * |
571 | * On success returns with pte mapped and locked. | 571 | * On success returns with pte mapped and locked. |
572 | */ | 572 | */ |
573 | pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | 573 | pte_t *__page_check_address(struct page *page, struct mm_struct *mm, |
574 | unsigned long address, spinlock_t **ptlp, int sync) | 574 | unsigned long address, spinlock_t **ptlp, int sync) |
575 | { | 575 | { |
576 | pgd_t *pgd; | 576 | pgd_t *pgd; |
577 | pud_t *pud; | 577 | pud_t *pud; |
578 | pmd_t *pmd; | 578 | pmd_t *pmd; |
579 | pte_t *pte; | 579 | pte_t *pte; |
580 | spinlock_t *ptl; | 580 | spinlock_t *ptl; |
581 | 581 | ||
582 | if (unlikely(PageHuge(page))) { | 582 | if (unlikely(PageHuge(page))) { |
583 | pte = huge_pte_offset(mm, address); | 583 | pte = huge_pte_offset(mm, address); |
584 | ptl = &mm->page_table_lock; | 584 | ptl = &mm->page_table_lock; |
585 | goto check; | 585 | goto check; |
586 | } | 586 | } |
587 | 587 | ||
588 | pgd = pgd_offset(mm, address); | 588 | pgd = pgd_offset(mm, address); |
589 | if (!pgd_present(*pgd)) | 589 | if (!pgd_present(*pgd)) |
590 | return NULL; | 590 | return NULL; |
591 | 591 | ||
592 | pud = pud_offset(pgd, address); | 592 | pud = pud_offset(pgd, address); |
593 | if (!pud_present(*pud)) | 593 | if (!pud_present(*pud)) |
594 | return NULL; | 594 | return NULL; |
595 | 595 | ||
596 | pmd = pmd_offset(pud, address); | 596 | pmd = pmd_offset(pud, address); |
597 | if (!pmd_present(*pmd)) | 597 | if (!pmd_present(*pmd)) |
598 | return NULL; | 598 | return NULL; |
599 | if (pmd_trans_huge(*pmd)) | 599 | if (pmd_trans_huge(*pmd)) |
600 | return NULL; | 600 | return NULL; |
601 | 601 | ||
602 | pte = pte_offset_map(pmd, address); | 602 | pte = pte_offset_map(pmd, address); |
603 | /* Make a quick check before getting the lock */ | 603 | /* Make a quick check before getting the lock */ |
604 | if (!sync && !pte_present(*pte)) { | 604 | if (!sync && !pte_present(*pte)) { |
605 | pte_unmap(pte); | 605 | pte_unmap(pte); |
606 | return NULL; | 606 | return NULL; |
607 | } | 607 | } |
608 | 608 | ||
609 | ptl = pte_lockptr(mm, pmd); | 609 | ptl = pte_lockptr(mm, pmd); |
610 | check: | 610 | check: |
611 | spin_lock(ptl); | 611 | spin_lock(ptl); |
612 | if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { | 612 | if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { |
613 | *ptlp = ptl; | 613 | *ptlp = ptl; |
614 | return pte; | 614 | return pte; |
615 | } | 615 | } |
616 | pte_unmap_unlock(pte, ptl); | 616 | pte_unmap_unlock(pte, ptl); |
617 | return NULL; | 617 | return NULL; |
618 | } | 618 | } |
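/*
 * Editor's sketch (not part of the original file): every user in this
 * file follows the same shape through the page_check_address() wrapper
 * -- look the pte up, act while holding the pte lock, then drop the
 * mapping and the lock together:
 */
static int __maybe_unused example_touch_pte(struct page *page,
					    struct mm_struct *mm,
					    unsigned long address)
{
	spinlock_t *ptl;
	pte_t *pte = page_check_address(page, mm, address, &ptl, 0);

	if (!pte)
		return 0;	/* page not mapped at this address */
	/* ... examine or modify *pte; ptl is held, pte is mapped ... */
	pte_unmap_unlock(pte, ptl);
	return 1;
}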
619 | 619 | ||
620 | /** | 620 | /** |
621 | * page_mapped_in_vma - check whether a page is really mapped in a VMA | 621 | * page_mapped_in_vma - check whether a page is really mapped in a VMA |
622 | * @page: the page to test | 622 | * @page: the page to test |
623 | * @vma: the VMA to test | 623 | * @vma: the VMA to test |
624 | * | 624 | * |
625 | * Returns 1 if the page is mapped into the page tables of the VMA, 0 | 625 | * Returns 1 if the page is mapped into the page tables of the VMA, 0 |
626 | * if the page is not mapped into the page tables of this VMA. Only | 626 | * if the page is not mapped into the page tables of this VMA. Only |
627 | * valid for normal file or anonymous VMAs. | 627 | * valid for normal file or anonymous VMAs. |
628 | */ | 628 | */ |
629 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | 629 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) |
630 | { | 630 | { |
631 | unsigned long address; | 631 | unsigned long address; |
632 | pte_t *pte; | 632 | pte_t *pte; |
633 | spinlock_t *ptl; | 633 | spinlock_t *ptl; |
634 | 634 | ||
635 | address = vma_address(page, vma); | 635 | address = vma_address(page, vma); |
636 | if (address == -EFAULT) /* out of vma range */ | 636 | if (address == -EFAULT) /* out of vma range */ |
637 | return 0; | 637 | return 0; |
638 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); | 638 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); |
639 | if (!pte) /* the page is not in this mm */ | 639 | if (!pte) /* the page is not in this mm */ |
640 | return 0; | 640 | return 0; |
641 | pte_unmap_unlock(pte, ptl); | 641 | pte_unmap_unlock(pte, ptl); |
642 | 642 | ||
643 | return 1; | 643 | return 1; |
644 | } | 644 | } |
645 | 645 | ||
646 | /* | 646 | /* |
647 | * Subfunctions of page_referenced: page_referenced_one called | 647 | * Subfunctions of page_referenced: page_referenced_one called |
648 | * repeatedly from either page_referenced_anon or page_referenced_file. | 648 | * repeatedly from either page_referenced_anon or page_referenced_file. |
649 | */ | 649 | */ |
650 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, | 650 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
651 | unsigned long address, unsigned int *mapcount, | 651 | unsigned long address, unsigned int *mapcount, |
652 | unsigned long *vm_flags) | 652 | unsigned long *vm_flags) |
653 | { | 653 | { |
654 | struct mm_struct *mm = vma->vm_mm; | 654 | struct mm_struct *mm = vma->vm_mm; |
655 | int referenced = 0; | 655 | int referenced = 0; |
656 | 656 | ||
657 | if (unlikely(PageTransHuge(page))) { | 657 | if (unlikely(PageTransHuge(page))) { |
658 | pmd_t *pmd; | 658 | pmd_t *pmd; |
659 | 659 | ||
660 | spin_lock(&mm->page_table_lock); | 660 | spin_lock(&mm->page_table_lock); |
661 | /* | 661 | /* |
662 | * rmap might return false positives; we must filter | 662 | * rmap might return false positives; we must filter |
663 | * these out using page_check_address_pmd(). | 663 | * these out using page_check_address_pmd(). |
664 | */ | 664 | */ |
665 | pmd = page_check_address_pmd(page, mm, address, | 665 | pmd = page_check_address_pmd(page, mm, address, |
666 | PAGE_CHECK_ADDRESS_PMD_FLAG); | 666 | PAGE_CHECK_ADDRESS_PMD_FLAG); |
667 | if (!pmd) { | 667 | if (!pmd) { |
668 | spin_unlock(&mm->page_table_lock); | 668 | spin_unlock(&mm->page_table_lock); |
669 | goto out; | 669 | goto out; |
670 | } | 670 | } |
671 | 671 | ||
672 | if (vma->vm_flags & VM_LOCKED) { | 672 | if (vma->vm_flags & VM_LOCKED) { |
673 | spin_unlock(&mm->page_table_lock); | 673 | spin_unlock(&mm->page_table_lock); |
674 | *mapcount = 0; /* break early from loop */ | 674 | *mapcount = 0; /* break early from loop */ |
675 | *vm_flags |= VM_LOCKED; | 675 | *vm_flags |= VM_LOCKED; |
676 | goto out; | 676 | goto out; |
677 | } | 677 | } |
678 | 678 | ||
679 | /* go ahead even if the pmd is pmd_trans_splitting() */ | 679 | /* go ahead even if the pmd is pmd_trans_splitting() */ |
680 | if (pmdp_clear_flush_young_notify(vma, address, pmd)) | 680 | if (pmdp_clear_flush_young_notify(vma, address, pmd)) |
681 | referenced++; | 681 | referenced++; |
682 | spin_unlock(&mm->page_table_lock); | 682 | spin_unlock(&mm->page_table_lock); |
683 | } else { | 683 | } else { |
684 | pte_t *pte; | 684 | pte_t *pte; |
685 | spinlock_t *ptl; | 685 | spinlock_t *ptl; |
686 | 686 | ||
687 | /* | 687 | /* |
688 | * rmap might return false positives; we must filter | 688 | * rmap might return false positives; we must filter |
689 | * these out using page_check_address(). | 689 | * these out using page_check_address(). |
690 | */ | 690 | */ |
691 | pte = page_check_address(page, mm, address, &ptl, 0); | 691 | pte = page_check_address(page, mm, address, &ptl, 0); |
692 | if (!pte) | 692 | if (!pte) |
693 | goto out; | 693 | goto out; |
694 | 694 | ||
695 | if (vma->vm_flags & VM_LOCKED) { | 695 | if (vma->vm_flags & VM_LOCKED) { |
696 | pte_unmap_unlock(pte, ptl); | 696 | pte_unmap_unlock(pte, ptl); |
697 | *mapcount = 0; /* break early from loop */ | 697 | *mapcount = 0; /* break early from loop */ |
698 | *vm_flags |= VM_LOCKED; | 698 | *vm_flags |= VM_LOCKED; |
699 | goto out; | 699 | goto out; |
700 | } | 700 | } |
701 | 701 | ||
702 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 702 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
703 | /* | 703 | /* |
704 | * Don't treat a reference through a sequentially read | 704 | * Don't treat a reference through a sequentially read |
705 | * mapping as such. If the page has been used in | 705 | * mapping as such. If the page has been used in |
706 | * another mapping, we will catch it; if this other | 706 | * another mapping, we will catch it; if this other |
707 | * mapping is already gone, the unmap path will have | 707 | * mapping is already gone, the unmap path will have |
708 | * set PG_referenced or activated the page. | 708 | * set PG_referenced or activated the page. |
709 | */ | 709 | */ |
710 | if (likely(!VM_SequentialReadHint(vma))) | 710 | if (likely(!VM_SequentialReadHint(vma))) |
711 | referenced++; | 711 | referenced++; |
712 | } | 712 | } |
713 | pte_unmap_unlock(pte, ptl); | 713 | pte_unmap_unlock(pte, ptl); |
714 | } | 714 | } |
715 | 715 | ||
716 | /* Pretend the page is referenced if the task has the | 716 | /* Pretend the page is referenced if the task has the |
717 | * swap token and is in the middle of a page fault. */ | 717 | * swap token and is in the middle of a page fault. */ |
718 | if (mm != current->mm && has_swap_token(mm) && | 718 | if (mm != current->mm && has_swap_token(mm) && |
719 | rwsem_is_locked(&mm->mmap_sem)) | 719 | rwsem_is_locked(&mm->mmap_sem)) |
720 | referenced++; | 720 | referenced++; |
721 | 721 | ||
722 | (*mapcount)--; | 722 | (*mapcount)--; |
723 | 723 | ||
724 | if (referenced) | 724 | if (referenced) |
725 | *vm_flags |= vma->vm_flags; | 725 | *vm_flags |= vma->vm_flags; |
726 | out: | 726 | out: |
727 | return referenced; | 727 | return referenced; |
728 | } | 728 | } |
729 | 729 | ||
730 | static int page_referenced_anon(struct page *page, | 730 | static int page_referenced_anon(struct page *page, |
731 | struct mem_cgroup *mem_cont, | 731 | struct mem_cgroup *mem_cont, |
732 | unsigned long *vm_flags) | 732 | unsigned long *vm_flags) |
733 | { | 733 | { |
734 | unsigned int mapcount; | 734 | unsigned int mapcount; |
735 | struct anon_vma *anon_vma; | 735 | struct anon_vma *anon_vma; |
736 | struct anon_vma_chain *avc; | 736 | struct anon_vma_chain *avc; |
737 | int referenced = 0; | 737 | int referenced = 0; |
738 | 738 | ||
739 | anon_vma = page_lock_anon_vma(page); | 739 | anon_vma = page_lock_anon_vma(page); |
740 | if (!anon_vma) | 740 | if (!anon_vma) |
741 | return referenced; | 741 | return referenced; |
742 | 742 | ||
743 | mapcount = page_mapcount(page); | 743 | mapcount = page_mapcount(page); |
744 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 744 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
745 | struct vm_area_struct *vma = avc->vma; | 745 | struct vm_area_struct *vma = avc->vma; |
746 | unsigned long address = vma_address(page, vma); | 746 | unsigned long address = vma_address(page, vma); |
747 | if (address == -EFAULT) | 747 | if (address == -EFAULT) |
748 | continue; | 748 | continue; |
749 | /* | 749 | /* |
750 | * If we are reclaiming on behalf of a cgroup, skip | 750 | * If we are reclaiming on behalf of a cgroup, skip |
751 | * counting on behalf of references from different | 751 | * counting on behalf of references from different |
752 | * cgroups | 752 | * cgroups |
753 | */ | 753 | */ |
754 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 754 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
755 | continue; | 755 | continue; |
756 | referenced += page_referenced_one(page, vma, address, | 756 | referenced += page_referenced_one(page, vma, address, |
757 | &mapcount, vm_flags); | 757 | &mapcount, vm_flags); |
758 | if (!mapcount) | 758 | if (!mapcount) |
759 | break; | 759 | break; |
760 | } | 760 | } |
761 | 761 | ||
762 | page_unlock_anon_vma(anon_vma); | 762 | page_unlock_anon_vma(anon_vma); |
763 | return referenced; | 763 | return referenced; |
764 | } | 764 | } |
765 | 765 | ||
766 | /** | 766 | /** |
767 | * page_referenced_file - referenced check for object-based rmap | 767 | * page_referenced_file - referenced check for object-based rmap |
768 | * @page: the page we're checking references on. | 768 | * @page: the page we're checking references on. |
769 | * @mem_cont: target memory controller | 769 | * @mem_cont: target memory controller |
770 | * @vm_flags: collect the vm_flags of VMAs which actually referenced the page | 770 | * @vm_flags: collect the vm_flags of VMAs which actually referenced the page |
771 | * | 771 | * |
772 | * For an object-based mapped page, find all the places it is mapped and | 772 | * For an object-based mapped page, find all the places it is mapped and |
773 | * check/clear the referenced flag. This is done by following the page->mapping | 773 | * check/clear the referenced flag. This is done by following the page->mapping |
774 | * pointer, then walking the chain of vmas it holds. It returns the number | 774 | * pointer, then walking the chain of vmas it holds. It returns the number |
775 | * of references it found. | 775 | * of references it found. |
776 | * | 776 | * |
777 | * This function is only called from page_referenced for object-based pages. | 777 | * This function is only called from page_referenced for object-based pages. |
778 | */ | 778 | */ |
779 | static int page_referenced_file(struct page *page, | 779 | static int page_referenced_file(struct page *page, |
780 | struct mem_cgroup *mem_cont, | 780 | struct mem_cgroup *mem_cont, |
781 | unsigned long *vm_flags) | 781 | unsigned long *vm_flags) |
782 | { | 782 | { |
783 | unsigned int mapcount; | 783 | unsigned int mapcount; |
784 | struct address_space *mapping = page->mapping; | 784 | struct address_space *mapping = page->mapping; |
785 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 785 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
786 | struct vm_area_struct *vma; | 786 | struct vm_area_struct *vma; |
787 | struct prio_tree_iter iter; | 787 | struct prio_tree_iter iter; |
788 | int referenced = 0; | 788 | int referenced = 0; |
789 | 789 | ||
790 | /* | 790 | /* |
791 | * The caller's checks on page->mapping and !PageAnon have made | 791 | * The caller's checks on page->mapping and !PageAnon have made |
792 | * sure that this is a file page: the check for page->mapping | 792 | * sure that this is a file page: the check for page->mapping |
793 | * excludes the case just before it gets set on an anon page. | 793 | * excludes the case just before it gets set on an anon page. |
794 | */ | 794 | */ |
795 | BUG_ON(PageAnon(page)); | 795 | BUG_ON(PageAnon(page)); |
796 | 796 | ||
797 | /* | 797 | /* |
798 | * The page lock not only makes sure that page->mapping cannot | 798 | * The page lock not only makes sure that page->mapping cannot |
799 | * suddenly be NULLified by truncation, it makes sure that the | 799 | * suddenly be NULLified by truncation, it makes sure that the |
800 | * structure at mapping cannot be freed and reused yet, | 800 | * structure at mapping cannot be freed and reused yet, |
801 | * so we can safely take mapping->i_mmap_mutex. | 801 | * so we can safely take mapping->i_mmap_mutex. |
802 | */ | 802 | */ |
803 | BUG_ON(!PageLocked(page)); | 803 | BUG_ON(!PageLocked(page)); |
804 | 804 | ||
805 | mutex_lock(&mapping->i_mmap_mutex); | 805 | mutex_lock(&mapping->i_mmap_mutex); |
806 | 806 | ||
807 | /* | 807 | /* |
808 | * i_mmap_mutex does not stabilize mapcount at all, but mapcount | 808 | * i_mmap_mutex does not stabilize mapcount at all, but mapcount |
809 | * is more likely to be accurate if we note it after spinning. | 809 | * is more likely to be accurate if we note it after spinning. |
810 | */ | 810 | */ |
811 | mapcount = page_mapcount(page); | 811 | mapcount = page_mapcount(page); |
812 | 812 | ||
813 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 813 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
814 | unsigned long address = vma_address(page, vma); | 814 | unsigned long address = vma_address(page, vma); |
815 | if (address == -EFAULT) | 815 | if (address == -EFAULT) |
816 | continue; | 816 | continue; |
817 | /* | 817 | /* |
818 | * If we are reclaiming on behalf of a cgroup, skip | 818 | * If we are reclaiming on behalf of a cgroup, skip |
819 | * counting on behalf of references from different | 819 | * counting on behalf of references from different |
820 | * cgroups | 820 | * cgroups |
821 | */ | 821 | */ |
822 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 822 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
823 | continue; | 823 | continue; |
824 | referenced += page_referenced_one(page, vma, address, | 824 | referenced += page_referenced_one(page, vma, address, |
825 | &mapcount, vm_flags); | 825 | &mapcount, vm_flags); |
826 | if (!mapcount) | 826 | if (!mapcount) |
827 | break; | 827 | break; |
828 | } | 828 | } |
829 | 829 | ||
830 | mutex_unlock(&mapping->i_mmap_mutex); | 830 | mutex_unlock(&mapping->i_mmap_mutex); |
831 | return referenced; | 831 | return referenced; |
832 | } | 832 | } |
833 | 833 | ||
834 | /** | 834 | /** |
835 | * page_referenced - test if the page was referenced | 835 | * page_referenced - test if the page was referenced |
836 | * @page: the page to test | 836 | * @page: the page to test |
837 | * @is_locked: caller holds lock on the page | 837 | * @is_locked: caller holds lock on the page |
838 | * @mem_cont: target memory controller | 838 | * @mem_cont: target memory controller |
839 | * @vm_flags: collect the vm_flags of VMAs which actually referenced the page | 839 | * @vm_flags: collect the vm_flags of VMAs which actually referenced the page |
840 | * | 840 | * |
841 | * Quick test_and_clear_referenced for all mappings to a page, | 841 | * Quick test_and_clear_referenced for all mappings to a page, |
842 | * returns the number of ptes which referenced the page. | 842 | * returns the number of ptes which referenced the page. |
843 | */ | 843 | */ |
844 | int page_referenced(struct page *page, | 844 | int page_referenced(struct page *page, |
845 | int is_locked, | 845 | int is_locked, |
846 | struct mem_cgroup *mem_cont, | 846 | struct mem_cgroup *mem_cont, |
847 | unsigned long *vm_flags) | 847 | unsigned long *vm_flags) |
848 | { | 848 | { |
849 | int referenced = 0; | 849 | int referenced = 0; |
850 | int we_locked = 0; | 850 | int we_locked = 0; |
851 | 851 | ||
852 | *vm_flags = 0; | 852 | *vm_flags = 0; |
853 | if (page_mapped(page) && page_rmapping(page)) { | 853 | if (page_mapped(page) && page_rmapping(page)) { |
854 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { | 854 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
855 | we_locked = trylock_page(page); | 855 | we_locked = trylock_page(page); |
856 | if (!we_locked) { | 856 | if (!we_locked) { |
857 | referenced++; | 857 | referenced++; |
858 | goto out; | 858 | goto out; |
859 | } | 859 | } |
860 | } | 860 | } |
861 | if (unlikely(PageKsm(page))) | 861 | if (unlikely(PageKsm(page))) |
862 | referenced += page_referenced_ksm(page, mem_cont, | 862 | referenced += page_referenced_ksm(page, mem_cont, |
863 | vm_flags); | 863 | vm_flags); |
864 | else if (PageAnon(page)) | 864 | else if (PageAnon(page)) |
865 | referenced += page_referenced_anon(page, mem_cont, | 865 | referenced += page_referenced_anon(page, mem_cont, |
866 | vm_flags); | 866 | vm_flags); |
867 | else if (page->mapping) | 867 | else if (page->mapping) |
868 | referenced += page_referenced_file(page, mem_cont, | 868 | referenced += page_referenced_file(page, mem_cont, |
869 | vm_flags); | 869 | vm_flags); |
870 | if (we_locked) | 870 | if (we_locked) |
871 | unlock_page(page); | 871 | unlock_page(page); |
872 | 872 | ||
873 | if (page_test_and_clear_young(page_to_pfn(page))) | 873 | if (page_test_and_clear_young(page_to_pfn(page))) |
874 | referenced++; | 874 | referenced++; |
875 | } | 875 | } |
876 | out: | 876 | out: |
877 | return referenced; | 877 | return referenced; |
878 | } | 878 | } |
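/*
 * Editor's sketch (not part of the original file): vmscan is the main
 * consumer of page_referenced().  A reclaim path consults it roughly
 * like this; the keep/cull decisions are placeholders:
 */
static void __maybe_unused example_reclaim_check(struct page *page,
						 int locked)
{
	unsigned long vm_flags;
	int referenced = page_referenced(page, locked, NULL, &vm_flags);

	if (vm_flags & VM_LOCKED)
		return;		/* mlocked: cull to the unevictable list */
	if (referenced)
		return;		/* recently used: keep the page */
	/* ... otherwise the page is a candidate for reclaim ... */
}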
879 | 879 | ||
880 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | 880 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
881 | unsigned long address) | 881 | unsigned long address) |
882 | { | 882 | { |
883 | struct mm_struct *mm = vma->vm_mm; | 883 | struct mm_struct *mm = vma->vm_mm; |
884 | pte_t *pte; | 884 | pte_t *pte; |
885 | spinlock_t *ptl; | 885 | spinlock_t *ptl; |
886 | int ret = 0; | 886 | int ret = 0; |
887 | 887 | ||
888 | pte = page_check_address(page, mm, address, &ptl, 1); | 888 | pte = page_check_address(page, mm, address, &ptl, 1); |
889 | if (!pte) | 889 | if (!pte) |
890 | goto out; | 890 | goto out; |
891 | 891 | ||
892 | if (pte_dirty(*pte) || pte_write(*pte)) { | 892 | if (pte_dirty(*pte) || pte_write(*pte)) { |
893 | pte_t entry; | 893 | pte_t entry; |
894 | 894 | ||
895 | flush_cache_page(vma, address, pte_pfn(*pte)); | 895 | flush_cache_page(vma, address, pte_pfn(*pte)); |
896 | entry = ptep_clear_flush_notify(vma, address, pte); | 896 | entry = ptep_clear_flush_notify(vma, address, pte); |
897 | entry = pte_wrprotect(entry); | 897 | entry = pte_wrprotect(entry); |
898 | entry = pte_mkclean(entry); | 898 | entry = pte_mkclean(entry); |
899 | set_pte_at(mm, address, pte, entry); | 899 | set_pte_at(mm, address, pte, entry); |
900 | ret = 1; | 900 | ret = 1; |
901 | } | 901 | } |
902 | 902 | ||
903 | pte_unmap_unlock(pte, ptl); | 903 | pte_unmap_unlock(pte, ptl); |
904 | out: | 904 | out: |
905 | return ret; | 905 | return ret; |
906 | } | 906 | } |
907 | 907 | ||
908 | static int page_mkclean_file(struct address_space *mapping, struct page *page) | 908 | static int page_mkclean_file(struct address_space *mapping, struct page *page) |
909 | { | 909 | { |
910 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 910 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
911 | struct vm_area_struct *vma; | 911 | struct vm_area_struct *vma; |
912 | struct prio_tree_iter iter; | 912 | struct prio_tree_iter iter; |
913 | int ret = 0; | 913 | int ret = 0; |
914 | 914 | ||
915 | BUG_ON(PageAnon(page)); | 915 | BUG_ON(PageAnon(page)); |
916 | 916 | ||
917 | mutex_lock(&mapping->i_mmap_mutex); | 917 | mutex_lock(&mapping->i_mmap_mutex); |
918 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 918 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
919 | if (vma->vm_flags & VM_SHARED) { | 919 | if (vma->vm_flags & VM_SHARED) { |
920 | unsigned long address = vma_address(page, vma); | 920 | unsigned long address = vma_address(page, vma); |
921 | if (address == -EFAULT) | 921 | if (address == -EFAULT) |
922 | continue; | 922 | continue; |
923 | ret += page_mkclean_one(page, vma, address); | 923 | ret += page_mkclean_one(page, vma, address); |
924 | } | 924 | } |
925 | } | 925 | } |
926 | mutex_unlock(&mapping->i_mmap_mutex); | 926 | mutex_unlock(&mapping->i_mmap_mutex); |
927 | return ret; | 927 | return ret; |
928 | } | 928 | } |
929 | 929 | ||
930 | int page_mkclean(struct page *page) | 930 | int page_mkclean(struct page *page) |
931 | { | 931 | { |
932 | int ret = 0; | 932 | int ret = 0; |
933 | 933 | ||
934 | BUG_ON(!PageLocked(page)); | 934 | BUG_ON(!PageLocked(page)); |
935 | 935 | ||
936 | if (page_mapped(page)) { | 936 | if (page_mapped(page)) { |
937 | struct address_space *mapping = page_mapping(page); | 937 | struct address_space *mapping = page_mapping(page); |
938 | if (mapping) { | 938 | if (mapping) { |
939 | ret = page_mkclean_file(mapping, page); | 939 | ret = page_mkclean_file(mapping, page); |
940 | if (page_test_and_clear_dirty(page_to_pfn(page), 1)) | 940 | if (page_test_and_clear_dirty(page_to_pfn(page), 1)) |
941 | ret = 1; | 941 | ret = 1; |
942 | } | 942 | } |
943 | } | 943 | } |
944 | 944 | ||
945 | return ret; | 945 | return ret; |
946 | } | 946 | } |
947 | EXPORT_SYMBOL_GPL(page_mkclean); | 947 | EXPORT_SYMBOL_GPL(page_mkclean); |
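/*
 * Editor's note (not part of the original file): the export above is
 * consumed by the dirty-tracking side of writeback.
 * clear_page_dirty_for_io() in mm/page-writeback.c uses page_mkclean()
 * to write-protect every pte first, so that a later store refaults and
 * re-dirties the page; approximately:
 *
 *	if (page_mkclean(page))
 *		set_page_dirty(page);
 *	if (TestClearPageDirty(page)) {
 *		dec_zone_page_state(page, NR_FILE_DIRTY);
 *		return 1;
 *	}
 */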
948 | 948 | ||
949 | /** | 949 | /** |
950 | * page_move_anon_rmap - move a page to our anon_vma | 950 | * page_move_anon_rmap - move a page to our anon_vma |
951 | * @page: the page to move to our anon_vma | 951 | * @page: the page to move to our anon_vma |
952 | * @vma: the vma the page belongs to | 952 | * @vma: the vma the page belongs to |
953 | * @address: the user virtual address mapped | 953 | * @address: the user virtual address mapped |
954 | * | 954 | * |
955 | * When a page belongs exclusively to one process after a COW event, | 955 | * When a page belongs exclusively to one process after a COW event, |
956 | * that page can be moved into the anon_vma that belongs to just that | 956 | * that page can be moved into the anon_vma that belongs to just that |
957 | * process, so the rmap code will not search the parent or sibling | 957 | * process, so the rmap code will not search the parent or sibling |
958 | * processes. | 958 | * processes. |
959 | */ | 959 | */ |
960 | void page_move_anon_rmap(struct page *page, | 960 | void page_move_anon_rmap(struct page *page, |
961 | struct vm_area_struct *vma, unsigned long address) | 961 | struct vm_area_struct *vma, unsigned long address) |
962 | { | 962 | { |
963 | struct anon_vma *anon_vma = vma->anon_vma; | 963 | struct anon_vma *anon_vma = vma->anon_vma; |
964 | 964 | ||
965 | VM_BUG_ON(!PageLocked(page)); | 965 | VM_BUG_ON(!PageLocked(page)); |
966 | VM_BUG_ON(!anon_vma); | 966 | VM_BUG_ON(!anon_vma); |
967 | VM_BUG_ON(page->index != linear_page_index(vma, address)); | 967 | VM_BUG_ON(page->index != linear_page_index(vma, address)); |
968 | 968 | ||
969 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 969 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
970 | page->mapping = (struct address_space *) anon_vma; | 970 | page->mapping = (struct address_space *) anon_vma; |
971 | } | 971 | } |
972 | 972 | ||
973 | /** | 973 | /** |
974 | * __page_set_anon_rmap - set up new anonymous rmap | 974 | * __page_set_anon_rmap - set up new anonymous rmap |
975 | * @page: Page to add to rmap | 975 | * @page: Page to add to rmap |
976 | * @vma: VM area to add page to. | 976 | * @vma: VM area to add page to. |
977 | * @address: User virtual address of the mapping | 977 | * @address: User virtual address of the mapping |
978 | * @exclusive: the page is exclusively owned by the current process | 978 | * @exclusive: the page is exclusively owned by the current process |
979 | */ | 979 | */ |
980 | static void __page_set_anon_rmap(struct page *page, | 980 | static void __page_set_anon_rmap(struct page *page, |
981 | struct vm_area_struct *vma, unsigned long address, int exclusive) | 981 | struct vm_area_struct *vma, unsigned long address, int exclusive) |
982 | { | 982 | { |
983 | struct anon_vma *anon_vma = vma->anon_vma; | 983 | struct anon_vma *anon_vma = vma->anon_vma; |
984 | 984 | ||
985 | BUG_ON(!anon_vma); | 985 | BUG_ON(!anon_vma); |
986 | 986 | ||
987 | if (PageAnon(page)) | 987 | if (PageAnon(page)) |
988 | return; | 988 | return; |
989 | 989 | ||
990 | /* | 990 | /* |
991 | * If the page isn't exclusively mapped into this vma, | 991 | * If the page isn't exclusively mapped into this vma, |
992 | * we must use the _oldest_ possible anon_vma for the | 992 | * we must use the _oldest_ possible anon_vma for the |
993 | * page mapping! | 993 | * page mapping! |
994 | */ | 994 | */ |
995 | if (!exclusive) | 995 | if (!exclusive) |
996 | anon_vma = anon_vma->root; | 996 | anon_vma = anon_vma->root; |
997 | 997 | ||
998 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 998 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
999 | page->mapping = (struct address_space *) anon_vma; | 999 | page->mapping = (struct address_space *) anon_vma; |
1000 | page->index = linear_page_index(vma, address); | 1000 | page->index = linear_page_index(vma, address); |
1001 | } | 1001 | } |
1002 | 1002 | ||
1003 | /** | 1003 | /** |
1004 | * __page_check_anon_rmap - sanity check anonymous rmap addition | 1004 | * __page_check_anon_rmap - sanity check anonymous rmap addition |
1005 | * @page: the page to add the mapping to | 1005 | * @page: the page to add the mapping to |
1006 | * @vma: the vm area in which the mapping is added | 1006 | * @vma: the vm area in which the mapping is added |
1007 | * @address: the user virtual address mapped | 1007 | * @address: the user virtual address mapped |
1008 | */ | 1008 | */ |
1009 | static void __page_check_anon_rmap(struct page *page, | 1009 | static void __page_check_anon_rmap(struct page *page, |
1010 | struct vm_area_struct *vma, unsigned long address) | 1010 | struct vm_area_struct *vma, unsigned long address) |
1011 | { | 1011 | { |
1012 | #ifdef CONFIG_DEBUG_VM | 1012 | #ifdef CONFIG_DEBUG_VM |
1013 | /* | 1013 | /* |
1014 | * The page's anon-rmap details (mapping and index) are guaranteed to | 1014 | * The page's anon-rmap details (mapping and index) are guaranteed to |
1015 | * be set up correctly at this point. | 1015 | * be set up correctly at this point. |
1016 | * | 1016 | * |
1017 | * We have exclusion against page_add_anon_rmap because the caller | 1017 | * We have exclusion against page_add_anon_rmap because the caller |
1018 | * always holds the page locked, except if called from page_dup_rmap, | 1018 | * always holds the page locked, except if called from page_dup_rmap, |
1019 | * in which case the page is already known to be setup. | 1019 | * in which case the page is already known to be setup. |
1020 | * | 1020 | * |
1021 | * We have exclusion against page_add_new_anon_rmap because those pages | 1021 | * We have exclusion against page_add_new_anon_rmap because those pages |
1022 | * are initially only visible via the pagetables, and the pte is locked | 1022 | * are initially only visible via the pagetables, and the pte is locked |
1023 | * over the call to page_add_new_anon_rmap. | 1023 | * over the call to page_add_new_anon_rmap. |
1024 | */ | 1024 | */ |
1025 | BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); | 1025 | BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); |
1026 | BUG_ON(page->index != linear_page_index(vma, address)); | 1026 | BUG_ON(page->index != linear_page_index(vma, address)); |
1027 | #endif | 1027 | #endif |
1028 | } | 1028 | } |
1029 | 1029 | ||
1030 | /** | 1030 | /** |
1031 | * page_add_anon_rmap - add pte mapping to an anonymous page | 1031 | * page_add_anon_rmap - add pte mapping to an anonymous page |
1032 | * @page: the page to add the mapping to | 1032 | * @page: the page to add the mapping to |
1033 | * @vma: the vm area in which the mapping is added | 1033 | * @vma: the vm area in which the mapping is added |
1034 | * @address: the user virtual address mapped | 1034 | * @address: the user virtual address mapped |
1035 | * | 1035 | * |
1036 | * The caller needs to hold the pte lock, and the page must be locked in | 1036 | * The caller needs to hold the pte lock, and the page must be locked in |
1037 | * the anon_vma case: to serialize mapping/index checking after setting, | 1037 | * the anon_vma case: to serialize mapping/index checking after setting, |
1038 | * and to ensure that PageAnon is not being upgraded racily to PageKsm | 1038 | * and to ensure that PageAnon is not being upgraded racily to PageKsm |
1039 | * (but PageKsm is never downgraded to PageAnon). | 1039 | * (but PageKsm is never downgraded to PageAnon). |
1040 | */ | 1040 | */ |
1041 | void page_add_anon_rmap(struct page *page, | 1041 | void page_add_anon_rmap(struct page *page, |
1042 | struct vm_area_struct *vma, unsigned long address) | 1042 | struct vm_area_struct *vma, unsigned long address) |
1043 | { | 1043 | { |
1044 | do_page_add_anon_rmap(page, vma, address, 0); | 1044 | do_page_add_anon_rmap(page, vma, address, 0); |
1045 | } | 1045 | } |
1046 | 1046 | ||
1047 | /* | 1047 | /* |
1048 | * Special version of the above for do_swap_page, which often runs | 1048 | * Special version of the above for do_swap_page, which often runs |
1049 | * into pages that are exclusively owned by the current process. | 1049 | * into pages that are exclusively owned by the current process. |
1050 | * Everybody else should continue to use page_add_anon_rmap above. | 1050 | * Everybody else should continue to use page_add_anon_rmap above. |
1051 | */ | 1051 | */ |
1052 | void do_page_add_anon_rmap(struct page *page, | 1052 | void do_page_add_anon_rmap(struct page *page, |
1053 | struct vm_area_struct *vma, unsigned long address, int exclusive) | 1053 | struct vm_area_struct *vma, unsigned long address, int exclusive) |
1054 | { | 1054 | { |
1055 | int first = atomic_inc_and_test(&page->_mapcount); | 1055 | int first = atomic_inc_and_test(&page->_mapcount); |
1056 | if (first) { | 1056 | if (first) { |
1057 | if (!PageTransHuge(page)) | 1057 | if (!PageTransHuge(page)) |
1058 | __inc_zone_page_state(page, NR_ANON_PAGES); | 1058 | __inc_zone_page_state(page, NR_ANON_PAGES); |
1059 | else | 1059 | else |
1060 | __inc_zone_page_state(page, | 1060 | __inc_zone_page_state(page, |
1061 | NR_ANON_TRANSPARENT_HUGEPAGES); | 1061 | NR_ANON_TRANSPARENT_HUGEPAGES); |
1062 | } | 1062 | } |
1063 | if (unlikely(PageKsm(page))) | 1063 | if (unlikely(PageKsm(page))) |
1064 | return; | 1064 | return; |
1065 | 1065 | ||
1066 | VM_BUG_ON(!PageLocked(page)); | 1066 | VM_BUG_ON(!PageLocked(page)); |
1067 | /* address might be in next vma when migration races vma_adjust */ | 1067 | /* address might be in next vma when migration races vma_adjust */ |
1068 | if (first) | 1068 | if (first) |
1069 | __page_set_anon_rmap(page, vma, address, exclusive); | 1069 | __page_set_anon_rmap(page, vma, address, exclusive); |
1070 | else | 1070 | else |
1071 | __page_check_anon_rmap(page, vma, address); | 1071 | __page_check_anon_rmap(page, vma, address); |
1072 | } | 1072 | } |
1073 | 1073 | ||
1074 | /** | 1074 | /** |
1075 | * page_add_new_anon_rmap - add pte mapping to a new anonymous page | 1075 | * page_add_new_anon_rmap - add pte mapping to a new anonymous page |
1076 | * @page: the page to add the mapping to | 1076 | * @page: the page to add the mapping to |
1077 | * @vma: the vm area in which the mapping is added | 1077 | * @vma: the vm area in which the mapping is added |
1078 | * @address: the user virtual address mapped | 1078 | * @address: the user virtual address mapped |
1079 | * | 1079 | * |
1080 | * Same as page_add_anon_rmap but must only be called on *new* pages. | 1080 | * Same as page_add_anon_rmap but must only be called on *new* pages. |
1081 | * This means the inc-and-test can be bypassed. | 1081 | * This means the inc-and-test can be bypassed. |
1082 | * Page does not have to be locked. | 1082 | * Page does not have to be locked. |
1083 | */ | 1083 | */ |
1084 | void page_add_new_anon_rmap(struct page *page, | 1084 | void page_add_new_anon_rmap(struct page *page, |
1085 | struct vm_area_struct *vma, unsigned long address) | 1085 | struct vm_area_struct *vma, unsigned long address) |
1086 | { | 1086 | { |
1087 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1087 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
1088 | SetPageSwapBacked(page); | 1088 | SetPageSwapBacked(page); |
1089 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 1089 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
1090 | if (!PageTransHuge(page)) | 1090 | if (!PageTransHuge(page)) |
1091 | __inc_zone_page_state(page, NR_ANON_PAGES); | 1091 | __inc_zone_page_state(page, NR_ANON_PAGES); |
1092 | else | 1092 | else |
1093 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1093 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1094 | __page_set_anon_rmap(page, vma, address, 1); | 1094 | __page_set_anon_rmap(page, vma, address, 1); |
1095 | if (page_evictable(page, vma)) | 1095 | if (page_evictable(page, vma)) |
1096 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 1096 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
1097 | else | 1097 | else |
1098 | add_page_to_unevictable_list(page); | 1098 | add_page_to_unevictable_list(page); |
1099 | } | 1099 | } |
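/*
 * Editor's sketch (not part of the original file): the anonymous fault
 * path is the typical caller -- a freshly allocated page, not yet
 * visible to any other thread, has its rmap and pte set up under the
 * pte lock, roughly:
 *
 *	entry = mk_pte(page, vma->vm_page_prot);
 *	...
 *	page_add_new_anon_rmap(page, vma, address);
 *	set_pte_at(mm, address, page_table, entry);
 */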
1100 | 1100 | ||
1101 | /** | 1101 | /** |
1102 | * page_add_file_rmap - add pte mapping to a file page | 1102 | * page_add_file_rmap - add pte mapping to a file page |
1103 | * @page: the page to add the mapping to | 1103 | * @page: the page to add the mapping to |
1104 | * | 1104 | * |
1105 | * The caller needs to hold the pte lock. | 1105 | * The caller needs to hold the pte lock. |
1106 | */ | 1106 | */ |
1107 | void page_add_file_rmap(struct page *page) | 1107 | void page_add_file_rmap(struct page *page) |
1108 | { | 1108 | { |
1109 | if (atomic_inc_and_test(&page->_mapcount)) { | 1109 | if (atomic_inc_and_test(&page->_mapcount)) { |
1110 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1110 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1111 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1111 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); |
1112 | } | 1112 | } |
1113 | } | 1113 | } |
1114 | 1114 | ||
1115 | /** | 1115 | /** |
1116 | * page_remove_rmap - take down pte mapping from a page | 1116 | * page_remove_rmap - take down pte mapping from a page |
1117 | * @page: page to remove mapping from | 1117 | * @page: page to remove mapping from |
1118 | * | 1118 | * |
1119 | * The caller needs to hold the pte lock. | 1119 | * The caller needs to hold the pte lock. |
1120 | */ | 1120 | */ |
1121 | void page_remove_rmap(struct page *page) | 1121 | void page_remove_rmap(struct page *page) |
1122 | { | 1122 | { |
1123 | /* page still mapped by someone else? */ | 1123 | /* page still mapped by someone else? */ |
1124 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1124 | if (!atomic_add_negative(-1, &page->_mapcount)) |
1125 | return; | 1125 | return; |
1126 | 1126 | ||
1127 | /* | 1127 | /* |
1128 | * Now that the last pte has gone, s390 must transfer dirty | 1128 | * Now that the last pte has gone, s390 must transfer dirty |
1129 | * flag from storage key to struct page. We can usually skip | 1129 | * flag from storage key to struct page. We can usually skip |
1130 | * this if the page is anon, so about to be freed; but perhaps | 1130 | * this if the page is anon, so about to be freed; but perhaps |
1131 | * not if it's in swapcache - there might be another pte slot | 1131 | * not if it's in swapcache - there might be another pte slot |
1132 | * containing the swap entry, with the page not yet written to swap. | 1132 | * containing the swap entry, with the page not yet written to swap. |
1133 | */ | 1133 | */ |
1134 | if ((!PageAnon(page) || PageSwapCache(page)) && | 1134 | if ((!PageAnon(page) || PageSwapCache(page)) && |
1135 | page_test_and_clear_dirty(page_to_pfn(page), 1)) | 1135 | page_test_and_clear_dirty(page_to_pfn(page), 1)) |
1136 | set_page_dirty(page); | 1136 | set_page_dirty(page); |
1137 | /* | 1137 | /* |
1138 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED | 1138 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED |
1139 | * and not charged by memcg for now. | 1139 | * and not charged by memcg for now. |
1140 | */ | 1140 | */ |
1141 | if (unlikely(PageHuge(page))) | 1141 | if (unlikely(PageHuge(page))) |
1142 | return; | 1142 | return; |
1143 | if (PageAnon(page)) { | 1143 | if (PageAnon(page)) { |
1144 | mem_cgroup_uncharge_page(page); | 1144 | mem_cgroup_uncharge_page(page); |
1145 | if (!PageTransHuge(page)) | 1145 | if (!PageTransHuge(page)) |
1146 | __dec_zone_page_state(page, NR_ANON_PAGES); | 1146 | __dec_zone_page_state(page, NR_ANON_PAGES); |
1147 | else | 1147 | else |
1148 | __dec_zone_page_state(page, | 1148 | __dec_zone_page_state(page, |
1149 | NR_ANON_TRANSPARENT_HUGEPAGES); | 1149 | NR_ANON_TRANSPARENT_HUGEPAGES); |
1150 | } else { | 1150 | } else { |
1151 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1151 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
1152 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1152 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); |
1153 | } | 1153 | } |
1154 | /* | 1154 | /* |
1155 | * It would be tidy to reset the PageAnon mapping here, | 1155 | * It would be tidy to reset the PageAnon mapping here, |
1156 | * but that might overwrite a racing page_add_anon_rmap | 1156 | * but that might overwrite a racing page_add_anon_rmap |
1157 | * which increments mapcount after us but sets mapping | 1157 | * which increments mapcount after us but sets mapping |
1158 | * before us: so leave the reset to free_hot_cold_page, | 1158 | * before us: so leave the reset to free_hot_cold_page, |
1159 | * and remember that it's only reliable while mapped. | 1159 | * and remember that it's only reliable while mapped. |
1160 | * Leaving it set also helps swapoff to reinstate ptes | 1160 | * Leaving it set also helps swapoff to reinstate ptes |
1161 | * faster for those pages still in swapcache. | 1161 | * faster for those pages still in swapcache. |
1162 | */ | 1162 | */ |
1163 | } | 1163 | } |
1164 | 1164 | ||
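The two helpers above lean on the fact that _mapcount is biased to start at -1: the first mapper is the one whose increment lands on zero, and the last unmapper is the one whose decrement goes negative, so both transitions are detected from the atomic result alone, with no extra lock. A minimal userspace sketch of the same idiom, using C11 atomics rather than the kernel's atomic_t (all names here are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Biased counter: -1 means "unmapped", as with page->_mapcount. */
static atomic_int mapcount = -1;

static bool map_one(void)	/* true iff this was the first mapping */
{
	return atomic_fetch_add(&mapcount, 1) + 1 == 0;
}

static bool unmap_one(void)	/* true iff this was the last mapping */
{
	return atomic_fetch_sub(&mapcount, 1) - 1 < 0;
}

int main(void)
{
	printf("first? %d\n", map_one());	/* 1: -1 -> 0 */
	printf("first? %d\n", map_one());	/* 0:  0 -> 1 */
	printf("last?  %d\n", unmap_one());	/* 0:  1 -> 0 */
	printf("last?  %d\n", unmap_one());	/* 1:  0 -> -1 */
	return 0;
}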
/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long address, enum ttu_flags flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	int ret = SWAP_AGAIN;

	pte = page_check_address(page, mm, address, &ptl, 0);
	if (!pte)
		goto out;

	/*
	 * If the page is mlock()d, we cannot swap it out.
	 * If it's recently referenced (perhaps page_referenced
	 * skipped over this mm) then we should reactivate it.
	 */
	if (!(flags & TTU_IGNORE_MLOCK)) {
		if (vma->vm_flags & VM_LOCKED)
			goto out_mlock;

		if (TTU_ACTION(flags) == TTU_MUNLOCK)
			goto out_unmap;
	}
	if (!(flags & TTU_IGNORE_ACCESS)) {
		if (ptep_clear_flush_young_notify(vma, address, pte)) {
			ret = SWAP_FAIL;
			goto out_unmap;
		}
	}

	/* Nuke the page table entry. */
	flush_cache_page(vma, address, page_to_pfn(page));
	pteval = ptep_clear_flush_notify(vma, address, pte);

	/* Move the dirty bit to the physical page now the pte is gone. */
	if (pte_dirty(pteval))
		set_page_dirty(page);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
		set_pte_at(mm, address, pte,
			   swp_entry_to_pte(make_hwpoison_entry(page)));
	} else if (PageAnon(page)) {
		swp_entry_t entry = { .val = page_private(page) };

		if (PageSwapCache(page)) {
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			if (swap_duplicate(entry) < 0) {
				set_pte_at(mm, address, pte, pteval);
				ret = SWAP_FAIL;
				goto out_unmap;
			}
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, MM_ANONPAGES);
			inc_mm_counter(mm, MM_SWAPENTS);
		} else if (PAGE_MIGRATION) {
			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
			entry = make_migration_entry(page, pte_write(pteval));
		}
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
		BUG_ON(pte_file(*pte));
	} else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
		/* Establish migration entry for a file page */
		swp_entry_t entry;
		entry = make_migration_entry(page, pte_write(pteval));
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
	} else
		dec_mm_counter(mm, MM_FILEPAGES);

	page_remove_rmap(page);
	page_cache_release(page);

out_unmap:
	pte_unmap_unlock(pte, ptl);
out:
	return ret;

out_mlock:
	pte_unmap_unlock(pte, ptl);

	/*
	 * We need mmap_sem locking; otherwise the VM_LOCKED check gives an
	 * unstable, racy result. Plus, we can't wait here because we now
	 * hold anon_vma->mutex or mapping->i_mmap_mutex. If the trylock
	 * fails, the page remains on the evictable lru, and vmscan can
	 * later retry moving it to the unevictable lru if the page is
	 * actually mlocked.
	 */
	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
		if (vma->vm_flags & VM_LOCKED) {
			mlock_vma_page(page);
			ret = SWAP_MLOCK;
		}
		up_read(&vma->vm_mm->mmap_sem);
	}
	return ret;
}
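One detail worth calling out: the mmlist handling above uses a check / lock / re-check pattern, so the common case (mm already on the list) pays only an unlocked read, and the lock is taken at most once per mm. A standalone sketch of that pattern with POSIX threads, using a C11 atomic in place of the list_empty() test (all names here are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t mmlist_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool on_list;	/* plays the role of !list_empty(&mm->mmlist) */

static void add_to_mmlist_once(void)
{
	if (!atomic_load(&on_list)) {		/* unlocked fast path */
		pthread_mutex_lock(&mmlist_lock);
		if (!atomic_load(&on_list))	/* re-check under the lock */
			atomic_store(&on_list, true);	/* list_add() here */
		pthread_mutex_unlock(&mmlist_lock);
	}
}

int main(void)
{
	add_to_mmlist_once();	/* takes the lock, does the insertion */
	add_to_mmlist_once();	/* unlocked read only: already on the list */
	return 0;
}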
/*
 * objrmap doesn't work for nonlinear VMAs because the assumption that
 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 * Consequently, given a particular page and its ->index, we cannot locate the
 * ptes which are mapping that page without an exhaustive linear search.
 *
 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 * maps the file to which the target page belongs.  The ->vm_private_data field
 * holds the current cursor into that scan.  Successive searches will circulate
 * around the vma's virtual address space.
 *
 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 * more scanning pressure is placed against them as well.  Eventually pages
 * will become fully unmapped and are eligible for eviction.
 *
 * For very sparsely populated VMAs this is a little inefficient - chances are
 * there won't be many ptes located within the scan cluster.  In this case
 * maybe we could scan further - to the end of the pte page, perhaps.
 *
 * Mlocked pages:  check VM_LOCKED under mmap_sem held for read, if we can
 * acquire it without blocking.  If vma locked, mlock the pages in the cluster,
 * rather than unmapping them.  If we encounter the "check_page" that vmscan is
 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
 */
#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
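Because 32*PAGE_SIZE is a power of two on common configurations, CLUSTER_MASK aligns an address down to a cluster boundary with a single AND, and the same constant rounds a size up via the usual (x + align - 1) & ~(align - 1) trick used later in try_to_unmap_file(). A standalone sketch of both computations, assuming 4 KiB pages and 2 MiB PMDs as on x86-64 (the addresses are made up for illustration):

#include <stdio.h>

#define PAGE_SIZE	4096UL			/* assumption: 4 KiB pages */
#define PMD_SIZE	(2UL * 1024 * 1024)	/* assumption: 2 MiB PMDs */
#define min(a, b)	((a) < (b) ? (a) : (b))

#define CLUSTER_SIZE	min(32 * PAGE_SIZE, PMD_SIZE)	/* 128 KiB here */
#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x7f0000001000UL;
	unsigned long cursor = 0x23456;

	/* Align the scan start down, as try_to_unmap_cluster() does. */
	unsigned long address = (vm_start + cursor) & CLUSTER_MASK;

	/* Round a size up to whole clusters, as try_to_unmap_file() does. */
	unsigned long size = 0x31000;
	unsigned long rounded = (size + CLUSTER_SIZE - 1) & CLUSTER_MASK;

	printf("cluster size %lu KiB\n", CLUSTER_SIZE / 1024);	/* 128 */
	printf("scan from %#lx\n", address);		/* 0x7f0000020000 */
	printf("%#lx rounds up to %#lx\n", size, rounded);	/* 0x40000 */
	return 0;
}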
static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
		struct vm_area_struct *vma, struct page *check_page)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	struct page *page;
	unsigned long address;
	unsigned long end;
	int ret = SWAP_AGAIN;
	int locked_vma = 0;

	address = (vma->vm_start + cursor) & CLUSTER_MASK;
	end = address + CLUSTER_SIZE;
	if (address < vma->vm_start)
		address = vma->vm_start;
	if (end > vma->vm_end)
		end = vma->vm_end;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return ret;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return ret;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return ret;

	/*
	 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
	 * keep the sem while scanning the cluster for mlocking pages.
	 */
	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
		locked_vma = (vma->vm_flags & VM_LOCKED);
		if (!locked_vma)
			up_read(&vma->vm_mm->mmap_sem);	/* don't need it */
	}

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (; address < end; pte++, address += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, address, *pte);
		BUG_ON(!page || PageAnon(page));

		if (locked_vma) {
			mlock_vma_page(page);	/* no-op if already mlocked */
			if (page == check_page)
				ret = SWAP_MLOCK;
			continue;	/* don't unmap */
		}

		if (ptep_clear_flush_young_notify(vma, address, pte))
			continue;

		/* Nuke the page table entry. */
		flush_cache_page(vma, address, pte_pfn(*pte));
		pteval = ptep_clear_flush_notify(vma, address, pte);

		/* If nonlinear, store the file page offset in the pte. */
		if (page->index != linear_page_index(vma, address))
			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));

		/* Move the dirty bit to the physical page now the pte is gone. */
		if (pte_dirty(pteval))
			set_page_dirty(page);

		page_remove_rmap(page);
		page_cache_release(page);
		dec_mm_counter(mm, MM_FILEPAGES);
		(*mapcount)--;
	}
	pte_unmap_unlock(pte - 1, ptl);
	if (locked_vma)
		up_read(&vma->vm_mm->mmap_sem);
	return ret;
}
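The three early returns at the top are the page-table descent bailing out as soon as any level is not present: an absent pgd/pud/pmd means no ptes can exist in the cluster, so there is nothing to scan. A toy userspace model of that walk, collapsed from four levels to two for brevity (entry counts, struct layouts, and names are assumptions for illustration, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

#define ENTRIES	512	/* assumption: 512 entries per level, x86-64 style */

struct pte { unsigned long pfn; int present; };
struct pmd { struct pte *ptes; };	/* NULL plays !pmd_present() */

/* Mirrors the early-bail descent at the top of try_to_unmap_cluster(). */
static struct pte *lookup_pte(struct pmd *pmd_dir, unsigned long addr)
{
	struct pmd *pmd = &pmd_dir[(addr >> 21) & (ENTRIES - 1)];

	if (!pmd->ptes)		/* like !pmd_present(*pmd): nothing mapped */
		return NULL;
	return &pmd->ptes[(addr >> 12) & (ENTRIES - 1)];
}

int main(void)
{
	struct pmd *dir = calloc(ENTRIES, sizeof(*dir));
	unsigned long addr = 0x200000;	/* 2 MiB: second pmd slot */

	printf("before: pte %p\n", (void *)lookup_pte(dir, addr));
	dir[(addr >> 21) & (ENTRIES - 1)].ptes =
				calloc(ENTRIES, sizeof(struct pte));
	printf("after:  pte %p\n", (void *)lookup_pte(dir, addr));
	free(dir[1].ptes);
	free(dir);
	return 0;
}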
bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);

	if (!maybe_stack)
		return false;

	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
						VM_STACK_INCOMPLETE_SETUP)
		return true;

	return false;
}
/**
 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
 * rmap method
 * @page: the page to unmap/unlock
 * @flags: action and flags
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to.
 *
 * This function is only called from try_to_unmap/try_to_munlock for
 * anonymous pages.
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * VM_LOCKED.
 */
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
	struct anon_vma *anon_vma;
	struct anon_vma_chain *avc;
	int ret = SWAP_AGAIN;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return ret;

	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address;

		/*
		 * During exec, a temporary VMA is set up and later moved.
		 * The VMA is moved under the anon_vma lock but not the
		 * page tables leading to a race where migration cannot
		 * find the migration ptes. Rather than increasing the
		 * locking requirements of exec(), migration skips
		 * temporary VMAs until after exec() completes.
		 */
		if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
				is_vma_temporary_stack(vma))
			continue;

		address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = try_to_unmap_one(page, vma, address, flags);
		if (ret != SWAP_AGAIN || !page_mapped(page))
			break;
	}

	page_unlock_anon_vma(anon_vma);
	return ret;
}
/**
 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
 * @page: the page to unmap/unlock
 * @flags: action and flags
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * This function is only called from try_to_unmap/try_to_munlock for
 * object-based pages.
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * VM_LOCKED.
 */
static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;
	unsigned long cursor;
	unsigned long max_nl_cursor = 0;
	unsigned long max_nl_size = 0;
	unsigned int mapcount;

	mutex_lock(&mapping->i_mmap_mutex);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = try_to_unmap_one(page, vma, address, flags);
		if (ret != SWAP_AGAIN || !page_mapped(page))
			goto out;
	}

	if (list_empty(&mapping->i_mmap_nonlinear))
		goto out;

	/*
	 * We don't bother to try to find the munlocked page in nonlinears.
	 * It's costly. Instead, later, page reclaim logic may call
	 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
	 */
	if (TTU_ACTION(flags) == TTU_MUNLOCK)
		goto out;

	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
		cursor = (unsigned long) vma->vm_private_data;
		if (cursor > max_nl_cursor)
			max_nl_cursor = cursor;
		cursor = vma->vm_end - vma->vm_start;
		if (cursor > max_nl_size)
			max_nl_size = cursor;
	}

	if (max_nl_size == 0) {	/* all nonlinears locked or reserved ? */
		ret = SWAP_FAIL;
		goto out;
	}

	/*
	 * We don't try to search for this page in the nonlinear vmas,
	 * and page_referenced wouldn't have found it anyway.  Instead
	 * just walk the nonlinear vmas trying to age and unmap some.
	 * The mapcount of the page we came in with is irrelevant,
	 * but even so use it as a guide to how hard we should try?
	 */
	mapcount = page_mapcount(page);
	if (!mapcount)
		goto out;
	cond_resched();

	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
	if (max_nl_cursor == 0)
		max_nl_cursor = CLUSTER_SIZE;

	do {
		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
			cursor = (unsigned long) vma->vm_private_data;
			while (cursor < max_nl_cursor &&
				cursor < vma->vm_end - vma->vm_start) {
				if (try_to_unmap_cluster(cursor, &mapcount,
						vma, page) == SWAP_MLOCK)
					ret = SWAP_MLOCK;
				cursor += CLUSTER_SIZE;
				vma->vm_private_data = (void *) cursor;
				if ((int)mapcount <= 0)
					goto out;
			}
			vma->vm_private_data = (void *) max_nl_cursor;
		}
		cond_resched();
		max_nl_cursor += CLUSTER_SIZE;
	} while (max_nl_cursor <= max_nl_size);

	/*
	 * Don't loop forever (perhaps all the remaining pages are
	 * in locked vmas).  Reset cursor on all unreserved nonlinear
	 * vmas, now forgetting on which ones it had fallen behind.
	 */
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		vma->vm_private_data = NULL;
out:
	mutex_unlock(&mapping->i_mmap_mutex);
	return ret;
}
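The do/while above ratchets max_nl_cursor upward one cluster per pass, letting each nonlinear vma catch its private cursor up to the shared high-water mark before the mark advances again; short vmas drop out early while long ones keep taking scan pressure. A toy userspace model of that sweep (the vma sizes and the 128 KiB cluster are assumptions for illustration):

#include <stdio.h>

#define CLUSTER	0x20000UL	/* 128 KiB, as computed earlier */

int main(void)
{
	unsigned long size[2]   = { 3 * CLUSTER, CLUSTER };	/* vma lengths */
	unsigned long cursor[2] = { 0, 0 };	/* per-vma scan cursors */
	unsigned long max_nl_cursor = CLUSTER;
	unsigned long max_nl_size = 3 * CLUSTER;

	do {
		for (int i = 0; i < 2; i++) {
			while (cursor[i] < max_nl_cursor &&
			       cursor[i] < size[i]) {
				printf("vma %d: scan cluster at %#lx\n",
				       i, cursor[i]);
				cursor[i] += CLUSTER;
			}
			cursor[i] = max_nl_cursor;	/* catch up, as above */
		}
		max_nl_cursor += CLUSTER;
	} while (max_nl_cursor <= max_nl_size);
	return 0;
}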
/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a mapping, try again later
 * SWAP_FAIL	- the page is unswappable
 * SWAP_MLOCK	- page is mlocked.
 */
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
	int ret;

	BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));

	if (unlikely(PageKsm(page)))
		ret = try_to_unmap_ksm(page, flags);
	else if (PageAnon(page))
		ret = try_to_unmap_anon(page, flags);
	else
		ret = try_to_unmap_file(page, flags);
	if (ret != SWAP_MLOCK && !page_mapped(page))
		ret = SWAP_SUCCESS;
	return ret;
}
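For a sense of how a reclaim-side caller might act on these four outcomes, here is a schematic dispatch; it is not the real vmscan logic, just the documented contract restated as code (the enum values and decision strings are illustrative):

#include <stdio.h>

enum ttu_result { SWAP_SUCCESS, SWAP_AGAIN, SWAP_FAIL, SWAP_MLOCK };

static const char *reclaim_decision(enum ttu_result ret)
{
	switch (ret) {
	case SWAP_SUCCESS:
		return "all ptes removed: page can go to the pageout path";
	case SWAP_AGAIN:
		return "missed a mapping: keep the page, retry later";
	case SWAP_FAIL:
		return "unswappable: activate the page instead";
	case SWAP_MLOCK:
		return "mlocked: put the page on the unevictable list";
	}
	return "unreachable";
}

int main(void)
{
	for (enum ttu_result r = SWAP_SUCCESS; r <= SWAP_MLOCK; r++)
		printf("%d -> %s\n", r, reclaim_decision(r));
	return 0;
}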
/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 *
 * Return values are:
 *
 * SWAP_AGAIN	- no vma is holding page mlocked, or,
 * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
 * SWAP_FAIL	- page cannot be located at present
 * SWAP_MLOCK	- page is now mlocked.
 */
int try_to_munlock(struct page *page)
{
	VM_BUG_ON(!PageLocked(page) || PageLRU(page));

	if (unlikely(PageKsm(page)))
		return try_to_unmap_ksm(page, TTU_MUNLOCK);
	else if (PageAnon(page))
		return try_to_unmap_anon(page, TTU_MUNLOCK);
	else
		return try_to_unmap_file(page, TTU_MUNLOCK);
}
void __put_anon_vma(struct anon_vma *anon_vma)
{
	struct anon_vma *root = anon_vma->root;

	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
		anon_vma_free(root);

	anon_vma_free(anon_vma);
}
#ifdef CONFIG_MIGRATION
/*
 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
 * Called by migrate.c to remove migration ptes, but might be used more later.
 */
static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct anon_vma *anon_vma;
	struct anon_vma_chain *avc;
	int ret = SWAP_AGAIN;

	/*
	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_sem.  Users without mmap_sem are required to
	 * take a reference count to prevent the anon_vma from disappearing.
	 */
	anon_vma = page_anon_vma(page);
	if (!anon_vma)
		return ret;
	anon_vma_lock(anon_vma);
	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	anon_vma_unlock(anon_vma);
	return ret;
}

static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;

	if (!mapping)
		return ret;
	mutex_lock(&mapping->i_mmap_mutex);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	/*
	 * No nonlinear handling: being always shared, nonlinear vmas
	 * never contain migration ptes.  Decide what to do about this
	 * limitation to linear when we need rmap_walk() on nonlinear.
	 */
	mutex_unlock(&mapping->i_mmap_mutex);
	return ret;
}

int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	VM_BUG_ON(!PageLocked(page));

	if (unlikely(PageKsm(page)))
		return rmap_walk_ksm(page, rmap_one, arg);
	else if (PageAnon(page))
		return rmap_walk_anon(page, rmap_one, arg);
	else
		return rmap_walk_file(page, rmap_one, arg);
}
#endif /* CONFIG_MIGRATION */
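rmap_walk() is deliberately generic: the caller supplies any rmap_one callback matching the signature above, SWAP_AGAIN is the "keep walking" sentinel, and any other return value aborts the walk. A self-contained userspace model of that contract, with stub struct page / vm_area_struct types standing in for the kernel's (everything here is illustrative):

#include <stdio.h>

#define SWAP_AGAIN 1

struct page { int id; };
struct vm_area_struct { unsigned long vm_start; };

/* Same shape as the rmap_one parameter of rmap_walk() above. */
static int print_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	printf("page %d mapped at %#lx in walk '%s'\n",
	       page->id, address, (const char *)arg);
	return SWAP_AGAIN;	/* keep walking; anything else stops */
}

static int toy_walk(struct page *page,
		    int (*rmap_one)(struct page *, struct vm_area_struct *,
				    unsigned long, void *),
		    void *arg)
{
	struct vm_area_struct vmas[2] = { { 0x1000 }, { 0x8000 } };
	int ret = SWAP_AGAIN;

	for (int i = 0; i < 2 && ret == SWAP_AGAIN; i++)
		ret = rmap_one(page, &vmas[i], vmas[i].vm_start, arg);
	return ret;
}

int main(void)
{
	struct page p = { 42 };

	return toy_walk(&p, print_one, "demo") == SWAP_AGAIN ? 0 : 1;
}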
#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following three functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
static void __hugepage_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

void hugepage_add_anon_rmap(struct page *page,
			    struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int first;

	BUG_ON(!PageLocked(page));
	BUG_ON(!anon_vma);
	/* address might be in next vma when migration races vma_adjust */
	first = atomic_inc_and_test(&page->_mapcount);
	if (first)
		__hugepage_set_anon_rmap(page, vma, address, 0);
}

void hugepage_add_new_anon_rmap(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	atomic_set(&page->_mapcount, 0);
	__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */