Commit 4195f73d1329e49727bcceb028e58cb38376c2b0
Committed by Al Viro
1 parent 545b9fd3d7
Exists in master and in 7 other branches
fs: block_dump missing dentry locking
I think the block_dump output in __mark_inode_dirty is missing dentry locking. Surely the i_dentry list can change any time, so we may not even *get* a dentry there. If we do get one by chance, then it would appear to be able to go away or get renamed at any time...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
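The crux of the fix, as the hunk below shows, is to stop peeking at inode->i_dentry unlocked and instead pin a dentry with d_find_alias() and hold dentry->d_lock while the name is read, so the dentry can neither be freed nor renamed under us. A condensed sketch of the new helper follows (the bdev special-case check is omitted for brevity, and it assumes the includes already at the top of fs/fs-writeback.c; see block_dump___mark_inode_dirty() in the diff for the actual code):

	static void block_dump_dirty_inode_sketch(struct inode *inode)
	{
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);	/* takes a reference; may return NULL */
		if (dentry) {
			/* d_lock keeps d_name stable while we print it */
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}

		printk(KERN_DEBUG "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);

		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);	/* drop the reference from d_find_alias() */
		}
	}

The old code took whatever happened to be first on inode->i_dentry with no reference and no lock; the helper above (and the real one in the diff) is safe against concurrent dput() and rename, at the cost of one dcache lookup per block_dump message.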
Showing 1 changed file with 24 additions and 17 deletions
fs/fs-writeback.c
1 | /* | 1 | /* |
2 | * fs/fs-writeback.c | 2 | * fs/fs-writeback.c |
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds. | 4 | * Copyright (C) 2002, Linus Torvalds. |
5 | * | 5 | * |
6 | * Contains all the functions related to writing back and waiting | 6 | * Contains all the functions related to writing back and waiting |
7 | * upon dirty inodes against superblocks, and writing back dirty | 7 | * upon dirty inodes against superblocks, and writing back dirty |
8 | * pages against inodes. ie: data writeback. Writeout of the | 8 | * pages against inodes. ie: data writeback. Writeout of the |
9 | * inode itself is not handled here. | 9 | * inode itself is not handled here. |
10 | * | 10 | * |
11 | * 10Apr2002 Andrew Morton | 11 | * 10Apr2002 Andrew Morton |
12 | * Split out of fs/inode.c | 12 | * Split out of fs/inode.c |
13 | * Additions for address_space-based writeback | 13 | * Additions for address_space-based writeback |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/spinlock.h> | 18 | #include <linux/spinlock.h> |
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/writeback.h> | 22 | #include <linux/writeback.h> |
23 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
24 | #include <linux/backing-dev.h> | 24 | #include <linux/backing-dev.h> |
25 | #include <linux/buffer_head.h> | 25 | #include <linux/buffer_head.h> |
26 | #include "internal.h" | 26 | #include "internal.h" |
27 | 27 | ||
28 | 28 | ||
29 | /** | 29 | /** |
30 | * writeback_acquire - attempt to get exclusive writeback access to a device | 30 | * writeback_acquire - attempt to get exclusive writeback access to a device |
31 | * @bdi: the device's backing_dev_info structure | 31 | * @bdi: the device's backing_dev_info structure |
32 | * | 32 | * |
33 | * It is a waste of resources to have more than one pdflush thread blocked on | 33 | * It is a waste of resources to have more than one pdflush thread blocked on |
34 | * a single request queue. Exclusion at the request_queue level is obtained | 34 | * a single request queue. Exclusion at the request_queue level is obtained |
35 | * via a flag in the request_queue's backing_dev_info.state. | 35 | * via a flag in the request_queue's backing_dev_info.state. |
36 | * | 36 | * |
37 | * Non-request_queue-backed address_spaces will share default_backing_dev_info, | 37 | * Non-request_queue-backed address_spaces will share default_backing_dev_info, |
38 | * unless they implement their own. Which is somewhat inefficient, as this | 38 | * unless they implement their own. Which is somewhat inefficient, as this |
39 | * may prevent concurrent writeback against multiple devices. | 39 | * may prevent concurrent writeback against multiple devices. |
40 | */ | 40 | */ |
41 | static int writeback_acquire(struct backing_dev_info *bdi) | 41 | static int writeback_acquire(struct backing_dev_info *bdi) |
42 | { | 42 | { |
43 | return !test_and_set_bit(BDI_pdflush, &bdi->state); | 43 | return !test_and_set_bit(BDI_pdflush, &bdi->state); |
44 | } | 44 | } |
45 | 45 | ||
46 | /** | 46 | /** |
47 | * writeback_in_progress - determine whether there is writeback in progress | 47 | * writeback_in_progress - determine whether there is writeback in progress |
48 | * @bdi: the device's backing_dev_info structure. | 48 | * @bdi: the device's backing_dev_info structure. |
49 | * | 49 | * |
50 | * Determine whether there is writeback in progress against a backing device. | 50 | * Determine whether there is writeback in progress against a backing device. |
51 | */ | 51 | */ |
52 | int writeback_in_progress(struct backing_dev_info *bdi) | 52 | int writeback_in_progress(struct backing_dev_info *bdi) |
53 | { | 53 | { |
54 | return test_bit(BDI_pdflush, &bdi->state); | 54 | return test_bit(BDI_pdflush, &bdi->state); |
55 | } | 55 | } |
56 | 56 | ||
57 | /** | 57 | /** |
58 | * writeback_release - relinquish exclusive writeback access against a device. | 58 | * writeback_release - relinquish exclusive writeback access against a device. |
59 | * @bdi: the device's backing_dev_info structure | 59 | * @bdi: the device's backing_dev_info structure |
60 | */ | 60 | */ |
61 | static void writeback_release(struct backing_dev_info *bdi) | 61 | static void writeback_release(struct backing_dev_info *bdi) |
62 | { | 62 | { |
63 | BUG_ON(!writeback_in_progress(bdi)); | 63 | BUG_ON(!writeback_in_progress(bdi)); |
64 | clear_bit(BDI_pdflush, &bdi->state); | 64 | clear_bit(BDI_pdflush, &bdi->state); |
65 | } | 65 | } |
66 | 66 | ||
67 | static noinline void block_dump___mark_inode_dirty(struct inode *inode) | ||
68 | { | ||
69 | if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { | ||
70 | struct dentry *dentry; | ||
71 | const char *name = "?"; | ||
72 | |||
73 | dentry = d_find_alias(inode); | ||
74 | if (dentry) { | ||
75 | spin_lock(&dentry->d_lock); | ||
76 | name = (const char *) dentry->d_name.name; | ||
77 | } | ||
78 | printk(KERN_DEBUG | ||
79 | "%s(%d): dirtied inode %lu (%s) on %s\n", | ||
80 | current->comm, task_pid_nr(current), inode->i_ino, | ||
81 | name, inode->i_sb->s_id); | ||
82 | if (dentry) { | ||
83 | spin_unlock(&dentry->d_lock); | ||
84 | dput(dentry); | ||
85 | } | ||
86 | } | ||
87 | } | ||
88 | |||
67 | /** | 89 | /** |
68 | * __mark_inode_dirty - internal function | 90 | * __mark_inode_dirty - internal function |
69 | * @inode: inode to mark | 91 | * @inode: inode to mark |
70 | * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) | 92 | * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) |
71 | * Mark an inode as dirty. Callers should use mark_inode_dirty or | 93 | * Mark an inode as dirty. Callers should use mark_inode_dirty or |
72 | * mark_inode_dirty_sync. | 94 | * mark_inode_dirty_sync. |
73 | * | 95 | * |
74 | * Put the inode on the super block's dirty list. | 96 | * Put the inode on the super block's dirty list. |
75 | * | 97 | * |
76 | * CAREFUL! We mark it dirty unconditionally, but move it onto the | 98 | * CAREFUL! We mark it dirty unconditionally, but move it onto the |
77 | * dirty list only if it is hashed or if it refers to a blockdev. | 99 | * dirty list only if it is hashed or if it refers to a blockdev. |
78 | * If it was not hashed, it will never be added to the dirty list | 100 | * If it was not hashed, it will never be added to the dirty list |
79 | * even if it is later hashed, as it will have been marked dirty already. | 101 | * even if it is later hashed, as it will have been marked dirty already. |
80 | * | 102 | * |
81 | * In short, make sure you hash any inodes _before_ you start marking | 103 | * In short, make sure you hash any inodes _before_ you start marking |
82 | * them dirty. | 104 | * them dirty. |
83 | * | 105 | * |
84 | * This function *must* be atomic for the I_DIRTY_PAGES case - | 106 | * This function *must* be atomic for the I_DIRTY_PAGES case - |
85 | * set_page_dirty() is called under spinlock in several places. | 107 | * set_page_dirty() is called under spinlock in several places. |
86 | * | 108 | * |
87 | * Note that for blockdevs, inode->dirtied_when represents the dirtying time of | 109 | * Note that for blockdevs, inode->dirtied_when represents the dirtying time of |
88 | * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of | 110 | * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of |
89 | * the kernel-internal blockdev inode represents the dirtying time of the | 111 | * the kernel-internal blockdev inode represents the dirtying time of the |
90 | * blockdev's pages. This is why for I_DIRTY_PAGES we always use | 112 | * blockdev's pages. This is why for I_DIRTY_PAGES we always use |
91 | * page->mapping->host, so the page-dirtying time is recorded in the internal | 113 | * page->mapping->host, so the page-dirtying time is recorded in the internal |
92 | * blockdev inode. | 114 | * blockdev inode. |
93 | */ | 115 | */ |
94 | void __mark_inode_dirty(struct inode *inode, int flags) | 116 | void __mark_inode_dirty(struct inode *inode, int flags) |
95 | { | 117 | { |
96 | struct super_block *sb = inode->i_sb; | 118 | struct super_block *sb = inode->i_sb; |
97 | 119 | ||
98 | /* | 120 | /* |
99 | * Don't do this for I_DIRTY_PAGES - that doesn't actually | 121 | * Don't do this for I_DIRTY_PAGES - that doesn't actually |
100 | * dirty the inode itself | 122 | * dirty the inode itself |
101 | */ | 123 | */ |
102 | if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { | 124 | if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { |
103 | if (sb->s_op->dirty_inode) | 125 | if (sb->s_op->dirty_inode) |
104 | sb->s_op->dirty_inode(inode); | 126 | sb->s_op->dirty_inode(inode); |
105 | } | 127 | } |
106 | 128 | ||
107 | /* | 129 | /* |
108 | * make sure that changes are seen by all cpus before we test i_state | 130 | * make sure that changes are seen by all cpus before we test i_state |
109 | * -- mikulas | 131 | * -- mikulas |
110 | */ | 132 | */ |
111 | smp_mb(); | 133 | smp_mb(); |
112 | 134 | ||
113 | /* avoid the locking if we can */ | 135 | /* avoid the locking if we can */ |
114 | if ((inode->i_state & flags) == flags) | 136 | if ((inode->i_state & flags) == flags) |
115 | return; | 137 | return; |
116 | 138 | ||
117 | if (unlikely(block_dump)) { | 139 | if (unlikely(block_dump)) |
118 | struct dentry *dentry = NULL; | 140 | block_dump___mark_inode_dirty(inode); |
119 | const char *name = "?"; | ||
120 | |||
121 | if (!list_empty(&inode->i_dentry)) { | ||
122 | dentry = list_entry(inode->i_dentry.next, | ||
123 | struct dentry, d_alias); | ||
124 | if (dentry && dentry->d_name.name) | ||
125 | name = (const char *) dentry->d_name.name; | ||
126 | } | ||
127 | |||
128 | if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) | ||
129 | printk(KERN_DEBUG | ||
130 | "%s(%d): dirtied inode %lu (%s) on %s\n", | ||
131 | current->comm, task_pid_nr(current), inode->i_ino, | ||
132 | name, inode->i_sb->s_id); | ||
133 | } | ||
134 | 141 | ||
135 | spin_lock(&inode_lock); | 142 | spin_lock(&inode_lock); |
136 | if ((inode->i_state & flags) != flags) { | 143 | if ((inode->i_state & flags) != flags) { |
137 | const int was_dirty = inode->i_state & I_DIRTY; | 144 | const int was_dirty = inode->i_state & I_DIRTY; |
138 | 145 | ||
139 | inode->i_state |= flags; | 146 | inode->i_state |= flags; |
140 | 147 | ||
141 | /* | 148 | /* |
142 | * If the inode is being synced, just update its dirty state. | 149 | * If the inode is being synced, just update its dirty state. |
143 | * The unlocker will place the inode on the appropriate | 150 | * The unlocker will place the inode on the appropriate |
144 | * superblock list, based upon its state. | 151 | * superblock list, based upon its state. |
145 | */ | 152 | */ |
146 | if (inode->i_state & I_SYNC) | 153 | if (inode->i_state & I_SYNC) |
147 | goto out; | 154 | goto out; |
148 | 155 | ||
149 | /* | 156 | /* |
150 | * Only add valid (hashed) inodes to the superblock's | 157 | * Only add valid (hashed) inodes to the superblock's |
151 | * dirty list. Add blockdev inodes as well. | 158 | * dirty list. Add blockdev inodes as well. |
152 | */ | 159 | */ |
153 | if (!S_ISBLK(inode->i_mode)) { | 160 | if (!S_ISBLK(inode->i_mode)) { |
154 | if (hlist_unhashed(&inode->i_hash)) | 161 | if (hlist_unhashed(&inode->i_hash)) |
155 | goto out; | 162 | goto out; |
156 | } | 163 | } |
157 | if (inode->i_state & (I_FREEING|I_CLEAR)) | 164 | if (inode->i_state & (I_FREEING|I_CLEAR)) |
158 | goto out; | 165 | goto out; |
159 | 166 | ||
160 | /* | 167 | /* |
161 | * If the inode was already on s_dirty/s_io/s_more_io, don't | 168 | * If the inode was already on s_dirty/s_io/s_more_io, don't |
162 | * reposition it (that would break s_dirty time-ordering). | 169 | * reposition it (that would break s_dirty time-ordering). |
163 | */ | 170 | */ |
164 | if (!was_dirty) { | 171 | if (!was_dirty) { |
165 | inode->dirtied_when = jiffies; | 172 | inode->dirtied_when = jiffies; |
166 | list_move(&inode->i_list, &sb->s_dirty); | 173 | list_move(&inode->i_list, &sb->s_dirty); |
167 | } | 174 | } |
168 | } | 175 | } |
169 | out: | 176 | out: |
170 | spin_unlock(&inode_lock); | 177 | spin_unlock(&inode_lock); |
171 | } | 178 | } |
172 | 179 | ||
173 | EXPORT_SYMBOL(__mark_inode_dirty); | 180 | EXPORT_SYMBOL(__mark_inode_dirty); |
174 | 181 | ||
175 | static int write_inode(struct inode *inode, int sync) | 182 | static int write_inode(struct inode *inode, int sync) |
176 | { | 183 | { |
177 | if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) | 184 | if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) |
178 | return inode->i_sb->s_op->write_inode(inode, sync); | 185 | return inode->i_sb->s_op->write_inode(inode, sync); |
179 | return 0; | 186 | return 0; |
180 | } | 187 | } |
181 | 188 | ||
182 | /* | 189 | /* |
183 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the | 190 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the |
184 | * furthest end of its superblock's dirty-inode list. | 191 | * furthest end of its superblock's dirty-inode list. |
185 | * | 192 | * |
186 | * Before stamping the inode's ->dirtied_when, we check to see whether it is | 193 | * Before stamping the inode's ->dirtied_when, we check to see whether it is |
187 | * already the most-recently-dirtied inode on the s_dirty list. If that is | 194 | * already the most-recently-dirtied inode on the s_dirty list. If that is |
188 | * the case then the inode must have been redirtied while it was being written | 195 | * the case then the inode must have been redirtied while it was being written |
189 | * out and we don't reset its dirtied_when. | 196 | * out and we don't reset its dirtied_when. |
190 | */ | 197 | */ |
191 | static void redirty_tail(struct inode *inode) | 198 | static void redirty_tail(struct inode *inode) |
192 | { | 199 | { |
193 | struct super_block *sb = inode->i_sb; | 200 | struct super_block *sb = inode->i_sb; |
194 | 201 | ||
195 | if (!list_empty(&sb->s_dirty)) { | 202 | if (!list_empty(&sb->s_dirty)) { |
196 | struct inode *tail_inode; | 203 | struct inode *tail_inode; |
197 | 204 | ||
198 | tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); | 205 | tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); |
199 | if (time_before(inode->dirtied_when, | 206 | if (time_before(inode->dirtied_when, |
200 | tail_inode->dirtied_when)) | 207 | tail_inode->dirtied_when)) |
201 | inode->dirtied_when = jiffies; | 208 | inode->dirtied_when = jiffies; |
202 | } | 209 | } |
203 | list_move(&inode->i_list, &sb->s_dirty); | 210 | list_move(&inode->i_list, &sb->s_dirty); |
204 | } | 211 | } |
205 | 212 | ||
206 | /* | 213 | /* |
207 | * requeue inode for re-scanning after sb->s_io list is exhausted. | 214 | * requeue inode for re-scanning after sb->s_io list is exhausted. |
208 | */ | 215 | */ |
209 | static void requeue_io(struct inode *inode) | 216 | static void requeue_io(struct inode *inode) |
210 | { | 217 | { |
211 | list_move(&inode->i_list, &inode->i_sb->s_more_io); | 218 | list_move(&inode->i_list, &inode->i_sb->s_more_io); |
212 | } | 219 | } |
213 | 220 | ||
214 | static void inode_sync_complete(struct inode *inode) | 221 | static void inode_sync_complete(struct inode *inode) |
215 | { | 222 | { |
216 | /* | 223 | /* |
217 | * Prevent speculative execution through spin_unlock(&inode_lock); | 224 | * Prevent speculative execution through spin_unlock(&inode_lock); |
218 | */ | 225 | */ |
219 | smp_mb(); | 226 | smp_mb(); |
220 | wake_up_bit(&inode->i_state, __I_SYNC); | 227 | wake_up_bit(&inode->i_state, __I_SYNC); |
221 | } | 228 | } |
222 | 229 | ||
223 | static bool inode_dirtied_after(struct inode *inode, unsigned long t) | 230 | static bool inode_dirtied_after(struct inode *inode, unsigned long t) |
224 | { | 231 | { |
225 | bool ret = time_after(inode->dirtied_when, t); | 232 | bool ret = time_after(inode->dirtied_when, t); |
226 | #ifndef CONFIG_64BIT | 233 | #ifndef CONFIG_64BIT |
227 | /* | 234 | /* |
228 | * For inodes being constantly redirtied, dirtied_when can get stuck. | 235 | * For inodes being constantly redirtied, dirtied_when can get stuck. |
229 | * It _appears_ to be in the future, but is actually in distant past. | 236 | * It _appears_ to be in the future, but is actually in distant past. |
230 | * This test is necessary to prevent such wrapped-around relative times | 237 | * This test is necessary to prevent such wrapped-around relative times |
231 | * from permanently stopping the whole pdflush writeback. | 238 | * from permanently stopping the whole pdflush writeback. |
232 | */ | 239 | */ |
233 | ret = ret && time_before_eq(inode->dirtied_when, jiffies); | 240 | ret = ret && time_before_eq(inode->dirtied_when, jiffies); |
234 | #endif | 241 | #endif |
235 | return ret; | 242 | return ret; |
236 | } | 243 | } |
237 | 244 | ||
238 | /* | 245 | /* |
239 | * Move expired dirty inodes from @delaying_queue to @dispatch_queue. | 246 | * Move expired dirty inodes from @delaying_queue to @dispatch_queue. |
240 | */ | 247 | */ |
241 | static void move_expired_inodes(struct list_head *delaying_queue, | 248 | static void move_expired_inodes(struct list_head *delaying_queue, |
242 | struct list_head *dispatch_queue, | 249 | struct list_head *dispatch_queue, |
243 | unsigned long *older_than_this) | 250 | unsigned long *older_than_this) |
244 | { | 251 | { |
245 | while (!list_empty(delaying_queue)) { | 252 | while (!list_empty(delaying_queue)) { |
246 | struct inode *inode = list_entry(delaying_queue->prev, | 253 | struct inode *inode = list_entry(delaying_queue->prev, |
247 | struct inode, i_list); | 254 | struct inode, i_list); |
248 | if (older_than_this && | 255 | if (older_than_this && |
249 | inode_dirtied_after(inode, *older_than_this)) | 256 | inode_dirtied_after(inode, *older_than_this)) |
250 | break; | 257 | break; |
251 | list_move(&inode->i_list, dispatch_queue); | 258 | list_move(&inode->i_list, dispatch_queue); |
252 | } | 259 | } |
253 | } | 260 | } |
254 | 261 | ||
255 | /* | 262 | /* |
256 | * Queue all expired dirty inodes for io, eldest first. | 263 | * Queue all expired dirty inodes for io, eldest first. |
257 | */ | 264 | */ |
258 | static void queue_io(struct super_block *sb, | 265 | static void queue_io(struct super_block *sb, |
259 | unsigned long *older_than_this) | 266 | unsigned long *older_than_this) |
260 | { | 267 | { |
261 | list_splice_init(&sb->s_more_io, sb->s_io.prev); | 268 | list_splice_init(&sb->s_more_io, sb->s_io.prev); |
262 | move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); | 269 | move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); |
263 | } | 270 | } |
264 | 271 | ||
265 | int sb_has_dirty_inodes(struct super_block *sb) | 272 | int sb_has_dirty_inodes(struct super_block *sb) |
266 | { | 273 | { |
267 | return !list_empty(&sb->s_dirty) || | 274 | return !list_empty(&sb->s_dirty) || |
268 | !list_empty(&sb->s_io) || | 275 | !list_empty(&sb->s_io) || |
269 | !list_empty(&sb->s_more_io); | 276 | !list_empty(&sb->s_more_io); |
270 | } | 277 | } |
271 | EXPORT_SYMBOL(sb_has_dirty_inodes); | 278 | EXPORT_SYMBOL(sb_has_dirty_inodes); |
272 | 279 | ||
273 | /* | 280 | /* |
274 | * Write a single inode's dirty pages and inode data out to disk. | 281 | * Write a single inode's dirty pages and inode data out to disk. |
275 | * If `wait' is set, wait on the writeout. | 282 | * If `wait' is set, wait on the writeout. |
276 | * | 283 | * |
277 | * The whole writeout design is quite complex and fragile. We want to avoid | 284 | * The whole writeout design is quite complex and fragile. We want to avoid |
278 | * starvation of particular inodes when others are being redirtied, prevent | 285 | * starvation of particular inodes when others are being redirtied, prevent |
279 | * livelocks, etc. | 286 | * livelocks, etc. |
280 | * | 287 | * |
281 | * Called under inode_lock. | 288 | * Called under inode_lock. |
282 | */ | 289 | */ |
283 | static int | 290 | static int |
284 | __sync_single_inode(struct inode *inode, struct writeback_control *wbc) | 291 | __sync_single_inode(struct inode *inode, struct writeback_control *wbc) |
285 | { | 292 | { |
286 | unsigned dirty; | 293 | unsigned dirty; |
287 | struct address_space *mapping = inode->i_mapping; | 294 | struct address_space *mapping = inode->i_mapping; |
288 | int wait = wbc->sync_mode == WB_SYNC_ALL; | 295 | int wait = wbc->sync_mode == WB_SYNC_ALL; |
289 | int ret; | 296 | int ret; |
290 | 297 | ||
291 | BUG_ON(inode->i_state & I_SYNC); | 298 | BUG_ON(inode->i_state & I_SYNC); |
292 | 299 | ||
293 | /* Set I_SYNC, reset I_DIRTY */ | 300 | /* Set I_SYNC, reset I_DIRTY */ |
294 | dirty = inode->i_state & I_DIRTY; | 301 | dirty = inode->i_state & I_DIRTY; |
295 | inode->i_state |= I_SYNC; | 302 | inode->i_state |= I_SYNC; |
296 | inode->i_state &= ~I_DIRTY; | 303 | inode->i_state &= ~I_DIRTY; |
297 | 304 | ||
298 | spin_unlock(&inode_lock); | 305 | spin_unlock(&inode_lock); |
299 | 306 | ||
300 | ret = do_writepages(mapping, wbc); | 307 | ret = do_writepages(mapping, wbc); |
301 | 308 | ||
302 | /* Don't write the inode if only I_DIRTY_PAGES was set */ | 309 | /* Don't write the inode if only I_DIRTY_PAGES was set */ |
303 | if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { | 310 | if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { |
304 | int err = write_inode(inode, wait); | 311 | int err = write_inode(inode, wait); |
305 | if (ret == 0) | 312 | if (ret == 0) |
306 | ret = err; | 313 | ret = err; |
307 | } | 314 | } |
308 | 315 | ||
309 | if (wait) { | 316 | if (wait) { |
310 | int err = filemap_fdatawait(mapping); | 317 | int err = filemap_fdatawait(mapping); |
311 | if (ret == 0) | 318 | if (ret == 0) |
312 | ret = err; | 319 | ret = err; |
313 | } | 320 | } |
314 | 321 | ||
315 | spin_lock(&inode_lock); | 322 | spin_lock(&inode_lock); |
316 | inode->i_state &= ~I_SYNC; | 323 | inode->i_state &= ~I_SYNC; |
317 | if (!(inode->i_state & I_FREEING)) { | 324 | if (!(inode->i_state & I_FREEING)) { |
318 | if (!(inode->i_state & I_DIRTY) && | 325 | if (!(inode->i_state & I_DIRTY) && |
319 | mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | 326 | mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { |
320 | /* | 327 | /* |
321 | * We didn't write back all the pages. nfs_writepages() | 328 | * We didn't write back all the pages. nfs_writepages() |
322 | * sometimes bales out without doing anything. Redirty | 329 | * sometimes bales out without doing anything. Redirty |
323 | * the inode; Move it from s_io onto s_more_io/s_dirty. | 330 | * the inode; Move it from s_io onto s_more_io/s_dirty. |
324 | */ | 331 | */ |
325 | /* | 332 | /* |
326 | * akpm: if the caller was the kupdate function we put | 333 | * akpm: if the caller was the kupdate function we put |
327 | * this inode at the head of s_dirty so it gets first | 334 | * this inode at the head of s_dirty so it gets first |
328 | * consideration. Otherwise, move it to the tail, for | 335 | * consideration. Otherwise, move it to the tail, for |
329 | * the reasons described there. I'm not really sure | 336 | * the reasons described there. I'm not really sure |
330 | * how much sense this makes. Presumably I had a good | 337 | * how much sense this makes. Presumably I had a good |
331 | * reasons for doing it this way, and I'd rather not | 338 | * reasons for doing it this way, and I'd rather not |
332 | * muck with it at present. | 339 | * muck with it at present. |
333 | */ | 340 | */ |
334 | if (wbc->for_kupdate) { | 341 | if (wbc->for_kupdate) { |
335 | /* | 342 | /* |
336 | * For the kupdate function we move the inode | 343 | * For the kupdate function we move the inode |
337 | * to s_more_io so it will get more writeout as | 344 | * to s_more_io so it will get more writeout as |
338 | * soon as the queue becomes uncongested. | 345 | * soon as the queue becomes uncongested. |
339 | */ | 346 | */ |
340 | inode->i_state |= I_DIRTY_PAGES; | 347 | inode->i_state |= I_DIRTY_PAGES; |
341 | if (wbc->nr_to_write <= 0) { | 348 | if (wbc->nr_to_write <= 0) { |
342 | /* | 349 | /* |
343 | * slice used up: queue for next turn | 350 | * slice used up: queue for next turn |
344 | */ | 351 | */ |
345 | requeue_io(inode); | 352 | requeue_io(inode); |
346 | } else { | 353 | } else { |
347 | /* | 354 | /* |
348 | * somehow blocked: retry later | 355 | * somehow blocked: retry later |
349 | */ | 356 | */ |
350 | redirty_tail(inode); | 357 | redirty_tail(inode); |
351 | } | 358 | } |
352 | } else { | 359 | } else { |
353 | /* | 360 | /* |
354 | * Otherwise fully redirty the inode so that | 361 | * Otherwise fully redirty the inode so that |
355 | * other inodes on this superblock will get some | 362 | * other inodes on this superblock will get some |
356 | * writeout. Otherwise heavy writing to one | 363 | * writeout. Otherwise heavy writing to one |
357 | * file would indefinitely suspend writeout of | 364 | * file would indefinitely suspend writeout of |
358 | * all the other files. | 365 | * all the other files. |
359 | */ | 366 | */ |
360 | inode->i_state |= I_DIRTY_PAGES; | 367 | inode->i_state |= I_DIRTY_PAGES; |
361 | redirty_tail(inode); | 368 | redirty_tail(inode); |
362 | } | 369 | } |
363 | } else if (inode->i_state & I_DIRTY) { | 370 | } else if (inode->i_state & I_DIRTY) { |
364 | /* | 371 | /* |
365 | * Someone redirtied the inode while were writing back | 372 | * Someone redirtied the inode while were writing back |
366 | * the pages. | 373 | * the pages. |
367 | */ | 374 | */ |
368 | redirty_tail(inode); | 375 | redirty_tail(inode); |
369 | } else if (atomic_read(&inode->i_count)) { | 376 | } else if (atomic_read(&inode->i_count)) { |
370 | /* | 377 | /* |
371 | * The inode is clean, inuse | 378 | * The inode is clean, inuse |
372 | */ | 379 | */ |
373 | list_move(&inode->i_list, &inode_in_use); | 380 | list_move(&inode->i_list, &inode_in_use); |
374 | } else { | 381 | } else { |
375 | /* | 382 | /* |
376 | * The inode is clean, unused | 383 | * The inode is clean, unused |
377 | */ | 384 | */ |
378 | list_move(&inode->i_list, &inode_unused); | 385 | list_move(&inode->i_list, &inode_unused); |
379 | } | 386 | } |
380 | } | 387 | } |
381 | inode_sync_complete(inode); | 388 | inode_sync_complete(inode); |
382 | return ret; | 389 | return ret; |
383 | } | 390 | } |
384 | 391 | ||
385 | /* | 392 | /* |
386 | * Write out an inode's dirty pages. Called under inode_lock. Either the | 393 | * Write out an inode's dirty pages. Called under inode_lock. Either the |
387 | * caller has ref on the inode (either via __iget or via syscall against an fd) | 394 | * caller has ref on the inode (either via __iget or via syscall against an fd) |
388 | * or the inode has I_WILL_FREE set (via generic_forget_inode) | 395 | * or the inode has I_WILL_FREE set (via generic_forget_inode) |
389 | */ | 396 | */ |
390 | static int | 397 | static int |
391 | __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | 398 | __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) |
392 | { | 399 | { |
393 | wait_queue_head_t *wqh; | 400 | wait_queue_head_t *wqh; |
394 | 401 | ||
395 | if (!atomic_read(&inode->i_count)) | 402 | if (!atomic_read(&inode->i_count)) |
396 | WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); | 403 | WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); |
397 | else | 404 | else |
398 | WARN_ON(inode->i_state & I_WILL_FREE); | 405 | WARN_ON(inode->i_state & I_WILL_FREE); |
399 | 406 | ||
400 | if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) { | 407 | if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) { |
401 | /* | 408 | /* |
402 | * We're skipping this inode because it's locked, and we're not | 409 | * We're skipping this inode because it's locked, and we're not |
403 | * doing writeback-for-data-integrity. Move it to s_more_io so | 410 | * doing writeback-for-data-integrity. Move it to s_more_io so |
404 | * that writeback can proceed with the other inodes on s_io. | 411 | * that writeback can proceed with the other inodes on s_io. |
405 | * We'll have another go at writing back this inode when we | 412 | * We'll have another go at writing back this inode when we |
406 | * completed a full scan of s_io. | 413 | * completed a full scan of s_io. |
407 | */ | 414 | */ |
408 | requeue_io(inode); | 415 | requeue_io(inode); |
409 | return 0; | 416 | return 0; |
410 | } | 417 | } |
411 | 418 | ||
412 | /* | 419 | /* |
413 | * It's a data-integrity sync. We must wait. | 420 | * It's a data-integrity sync. We must wait. |
414 | */ | 421 | */ |
415 | if (inode->i_state & I_SYNC) { | 422 | if (inode->i_state & I_SYNC) { |
416 | DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); | 423 | DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); |
417 | 424 | ||
418 | wqh = bit_waitqueue(&inode->i_state, __I_SYNC); | 425 | wqh = bit_waitqueue(&inode->i_state, __I_SYNC); |
419 | do { | 426 | do { |
420 | spin_unlock(&inode_lock); | 427 | spin_unlock(&inode_lock); |
421 | __wait_on_bit(wqh, &wq, inode_wait, | 428 | __wait_on_bit(wqh, &wq, inode_wait, |
422 | TASK_UNINTERRUPTIBLE); | 429 | TASK_UNINTERRUPTIBLE); |
423 | spin_lock(&inode_lock); | 430 | spin_lock(&inode_lock); |
424 | } while (inode->i_state & I_SYNC); | 431 | } while (inode->i_state & I_SYNC); |
425 | } | 432 | } |
426 | return __sync_single_inode(inode, wbc); | 433 | return __sync_single_inode(inode, wbc); |
427 | } | 434 | } |
428 | 435 | ||
429 | /* | 436 | /* |
430 | * Write out a superblock's list of dirty inodes. A wait will be performed | 437 | * Write out a superblock's list of dirty inodes. A wait will be performed |
431 | * upon no inodes, all inodes or the final one, depending upon sync_mode. | 438 | * upon no inodes, all inodes or the final one, depending upon sync_mode. |
432 | * | 439 | * |
433 | * If older_than_this is non-NULL, then only write out inodes which | 440 | * If older_than_this is non-NULL, then only write out inodes which |
434 | * had their first dirtying at a time earlier than *older_than_this. | 441 | * had their first dirtying at a time earlier than *older_than_this. |
435 | * | 442 | * |
436 | * If we're a pdflush thread, then implement pdflush collision avoidance | 443 | * If we're a pdflush thread, then implement pdflush collision avoidance |
437 | * against the entire list. | 444 | * against the entire list. |
438 | * | 445 | * |
439 | * If `bdi' is non-zero then we're being asked to writeback a specific queue. | 446 | * If `bdi' is non-zero then we're being asked to writeback a specific queue. |
440 | * This function assumes that the blockdev superblock's inodes are backed by | 447 | * This function assumes that the blockdev superblock's inodes are backed by |
441 | * a variety of queues, so all inodes are searched. For other superblocks, | 448 | * a variety of queues, so all inodes are searched. For other superblocks, |
442 | * assume that all inodes are backed by the same queue. | 449 | * assume that all inodes are backed by the same queue. |
443 | * | 450 | * |
444 | * FIXME: this linear search could get expensive with many fileystems. But | 451 | * FIXME: this linear search could get expensive with many fileystems. But |
445 | * how to fix? We need to go from an address_space to all inodes which share | 452 | * how to fix? We need to go from an address_space to all inodes which share |
446 | * a queue with that address_space. (Easy: have a global "dirty superblocks" | 453 | * a queue with that address_space. (Easy: have a global "dirty superblocks" |
447 | * list). | 454 | * list). |
448 | * | 455 | * |
449 | * The inodes to be written are parked on sb->s_io. They are moved back onto | 456 | * The inodes to be written are parked on sb->s_io. They are moved back onto |
450 | * sb->s_dirty as they are selected for writing. This way, none can be missed | 457 | * sb->s_dirty as they are selected for writing. This way, none can be missed |
451 | * on the writer throttling path, and we get decent balancing between many | 458 | * on the writer throttling path, and we get decent balancing between many |
452 | * throttled threads: we don't want them all piling up on inode_sync_wait. | 459 | * throttled threads: we don't want them all piling up on inode_sync_wait. |
453 | */ | 460 | */ |
454 | void generic_sync_sb_inodes(struct super_block *sb, | 461 | void generic_sync_sb_inodes(struct super_block *sb, |
455 | struct writeback_control *wbc) | 462 | struct writeback_control *wbc) |
456 | { | 463 | { |
457 | const unsigned long start = jiffies; /* livelock avoidance */ | 464 | const unsigned long start = jiffies; /* livelock avoidance */ |
458 | int sync = wbc->sync_mode == WB_SYNC_ALL; | 465 | int sync = wbc->sync_mode == WB_SYNC_ALL; |
459 | 466 | ||
460 | spin_lock(&inode_lock); | 467 | spin_lock(&inode_lock); |
461 | if (!wbc->for_kupdate || list_empty(&sb->s_io)) | 468 | if (!wbc->for_kupdate || list_empty(&sb->s_io)) |
462 | queue_io(sb, wbc->older_than_this); | 469 | queue_io(sb, wbc->older_than_this); |
463 | 470 | ||
464 | while (!list_empty(&sb->s_io)) { | 471 | while (!list_empty(&sb->s_io)) { |
465 | struct inode *inode = list_entry(sb->s_io.prev, | 472 | struct inode *inode = list_entry(sb->s_io.prev, |
466 | struct inode, i_list); | 473 | struct inode, i_list); |
467 | struct address_space *mapping = inode->i_mapping; | 474 | struct address_space *mapping = inode->i_mapping; |
468 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 475 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
469 | long pages_skipped; | 476 | long pages_skipped; |
470 | 477 | ||
471 | if (!bdi_cap_writeback_dirty(bdi)) { | 478 | if (!bdi_cap_writeback_dirty(bdi)) { |
472 | redirty_tail(inode); | 479 | redirty_tail(inode); |
473 | if (sb_is_blkdev_sb(sb)) { | 480 | if (sb_is_blkdev_sb(sb)) { |
474 | /* | 481 | /* |
475 | * Dirty memory-backed blockdev: the ramdisk | 482 | * Dirty memory-backed blockdev: the ramdisk |
476 | * driver does this. Skip just this inode | 483 | * driver does this. Skip just this inode |
477 | */ | 484 | */ |
478 | continue; | 485 | continue; |
479 | } | 486 | } |
480 | /* | 487 | /* |
481 | * Dirty memory-backed inode against a filesystem other | 488 | * Dirty memory-backed inode against a filesystem other |
482 | * than the kernel-internal bdev filesystem. Skip the | 489 | * than the kernel-internal bdev filesystem. Skip the |
483 | * entire superblock. | 490 | * entire superblock. |
484 | */ | 491 | */ |
485 | break; | 492 | break; |
486 | } | 493 | } |
487 | 494 | ||
488 | if (inode->i_state & I_NEW) { | 495 | if (inode->i_state & I_NEW) { |
489 | requeue_io(inode); | 496 | requeue_io(inode); |
490 | continue; | 497 | continue; |
491 | } | 498 | } |
492 | 499 | ||
493 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 500 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
494 | wbc->encountered_congestion = 1; | 501 | wbc->encountered_congestion = 1; |
495 | if (!sb_is_blkdev_sb(sb)) | 502 | if (!sb_is_blkdev_sb(sb)) |
496 | break; /* Skip a congested fs */ | 503 | break; /* Skip a congested fs */ |
497 | requeue_io(inode); | 504 | requeue_io(inode); |
498 | continue; /* Skip a congested blockdev */ | 505 | continue; /* Skip a congested blockdev */ |
499 | } | 506 | } |
500 | 507 | ||
501 | if (wbc->bdi && bdi != wbc->bdi) { | 508 | if (wbc->bdi && bdi != wbc->bdi) { |
502 | if (!sb_is_blkdev_sb(sb)) | 509 | if (!sb_is_blkdev_sb(sb)) |
503 | break; /* fs has the wrong queue */ | 510 | break; /* fs has the wrong queue */ |
504 | requeue_io(inode); | 511 | requeue_io(inode); |
505 | continue; /* blockdev has wrong queue */ | 512 | continue; /* blockdev has wrong queue */ |
506 | } | 513 | } |
507 | 514 | ||
508 | /* | 515 | /* |
509 | * Was this inode dirtied after sync_sb_inodes was called? | 516 | * Was this inode dirtied after sync_sb_inodes was called? |
510 | * This keeps sync from extra jobs and livelock. | 517 | * This keeps sync from extra jobs and livelock. |
511 | */ | 518 | */ |
512 | if (inode_dirtied_after(inode, start)) | 519 | if (inode_dirtied_after(inode, start)) |
513 | break; | 520 | break; |
514 | 521 | ||
515 | /* Is another pdflush already flushing this queue? */ | 522 | /* Is another pdflush already flushing this queue? */ |
516 | if (current_is_pdflush() && !writeback_acquire(bdi)) | 523 | if (current_is_pdflush() && !writeback_acquire(bdi)) |
517 | break; | 524 | break; |
518 | 525 | ||
519 | BUG_ON(inode->i_state & I_FREEING); | 526 | BUG_ON(inode->i_state & I_FREEING); |
520 | __iget(inode); | 527 | __iget(inode); |
521 | pages_skipped = wbc->pages_skipped; | 528 | pages_skipped = wbc->pages_skipped; |
522 | __writeback_single_inode(inode, wbc); | 529 | __writeback_single_inode(inode, wbc); |
523 | if (current_is_pdflush()) | 530 | if (current_is_pdflush()) |
524 | writeback_release(bdi); | 531 | writeback_release(bdi); |
525 | if (wbc->pages_skipped != pages_skipped) { | 532 | if (wbc->pages_skipped != pages_skipped) { |
526 | /* | 533 | /* |
527 | * writeback is not making progress due to locked | 534 | * writeback is not making progress due to locked |
528 | * buffers. Skip this inode for now. | 535 | * buffers. Skip this inode for now. |
529 | */ | 536 | */ |
530 | redirty_tail(inode); | 537 | redirty_tail(inode); |
531 | } | 538 | } |
532 | spin_unlock(&inode_lock); | 539 | spin_unlock(&inode_lock); |
533 | iput(inode); | 540 | iput(inode); |
534 | cond_resched(); | 541 | cond_resched(); |
535 | spin_lock(&inode_lock); | 542 | spin_lock(&inode_lock); |
536 | if (wbc->nr_to_write <= 0) { | 543 | if (wbc->nr_to_write <= 0) { |
537 | wbc->more_io = 1; | 544 | wbc->more_io = 1; |
538 | break; | 545 | break; |
539 | } | 546 | } |
540 | if (!list_empty(&sb->s_more_io)) | 547 | if (!list_empty(&sb->s_more_io)) |
541 | wbc->more_io = 1; | 548 | wbc->more_io = 1; |
542 | } | 549 | } |
543 | 550 | ||
544 | if (sync) { | 551 | if (sync) { |
545 | struct inode *inode, *old_inode = NULL; | 552 | struct inode *inode, *old_inode = NULL; |
546 | 553 | ||
547 | /* | 554 | /* |
548 | * Data integrity sync. Must wait for all pages under writeback, | 555 | * Data integrity sync. Must wait for all pages under writeback, |
549 | * because there may have been pages dirtied before our sync | 556 | * because there may have been pages dirtied before our sync |
550 | * call, but which had writeout started before we write it out. | 557 | * call, but which had writeout started before we write it out. |
551 | * In which case, the inode may not be on the dirty list, but | 558 | * In which case, the inode may not be on the dirty list, but |
552 | * we still have to wait for that writeout. | 559 | * we still have to wait for that writeout. |
553 | */ | 560 | */ |
554 | list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { | 561 | list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { |
555 | struct address_space *mapping; | 562 | struct address_space *mapping; |
556 | 563 | ||
557 | if (inode->i_state & | 564 | if (inode->i_state & |
558 | (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) | 565 | (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) |
559 | continue; | 566 | continue; |
560 | mapping = inode->i_mapping; | 567 | mapping = inode->i_mapping; |
561 | if (mapping->nrpages == 0) | 568 | if (mapping->nrpages == 0) |
562 | continue; | 569 | continue; |
563 | __iget(inode); | 570 | __iget(inode); |
564 | spin_unlock(&inode_lock); | 571 | spin_unlock(&inode_lock); |
565 | /* | 572 | /* |
566 | * We hold a reference to 'inode' so it couldn't have | 573 | * We hold a reference to 'inode' so it couldn't have |
567 | * been removed from s_inodes list while we dropped the | 574 | * been removed from s_inodes list while we dropped the |
568 | * inode_lock. We cannot iput the inode now as we can | 575 | * inode_lock. We cannot iput the inode now as we can |
569 | * be holding the last reference and we cannot iput it | 576 | * be holding the last reference and we cannot iput it |
570 | * under inode_lock. So we keep the reference and iput | 577 | * under inode_lock. So we keep the reference and iput |
571 | * it later. | 578 | * it later. |
572 | */ | 579 | */ |
573 | iput(old_inode); | 580 | iput(old_inode); |
574 | old_inode = inode; | 581 | old_inode = inode; |
575 | 582 | ||
576 | filemap_fdatawait(mapping); | 583 | filemap_fdatawait(mapping); |
577 | 584 | ||
578 | cond_resched(); | 585 | cond_resched(); |
579 | 586 | ||
580 | spin_lock(&inode_lock); | 587 | spin_lock(&inode_lock); |
581 | } | 588 | } |
582 | spin_unlock(&inode_lock); | 589 | spin_unlock(&inode_lock); |
583 | iput(old_inode); | 590 | iput(old_inode); |
584 | } else | 591 | } else |
585 | spin_unlock(&inode_lock); | 592 | spin_unlock(&inode_lock); |
586 | 593 | ||
587 | return; /* Leave any unwritten inodes on s_io */ | 594 | return; /* Leave any unwritten inodes on s_io */ |
588 | } | 595 | } |
589 | EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); | 596 | EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); |
590 | 597 | ||
591 | static void sync_sb_inodes(struct super_block *sb, | 598 | static void sync_sb_inodes(struct super_block *sb, |
592 | struct writeback_control *wbc) | 599 | struct writeback_control *wbc) |
593 | { | 600 | { |
594 | generic_sync_sb_inodes(sb, wbc); | 601 | generic_sync_sb_inodes(sb, wbc); |
595 | } | 602 | } |
596 | 603 | ||
597 | /* | 604 | /* |
598 | * Start writeback of dirty pagecache data against all unlocked inodes. | 605 | * Start writeback of dirty pagecache data against all unlocked inodes. |
599 | * | 606 | * |
600 | * Note: | 607 | * Note: |
601 | * We don't need to grab a reference to superblock here. If it has non-empty | 608 | * We don't need to grab a reference to superblock here. If it has non-empty |
602 | * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed | 609 | * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed |
603 | * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all | 610 | * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all |
604 | * empty. Since __sync_single_inode() regains inode_lock before it finally moves | 611 | * empty. Since __sync_single_inode() regains inode_lock before it finally moves |
605 | * inode from superblock lists we are OK. | 612 | * inode from superblock lists we are OK. |
606 | * | 613 | * |
607 | * If `older_than_this' is non-zero then only flush inodes which have a | 614 | * If `older_than_this' is non-zero then only flush inodes which have a |
608 | * flushtime older than *older_than_this. | 615 | * flushtime older than *older_than_this. |
609 | * | 616 | * |
610 | * If `bdi' is non-zero then we will scan the first inode against each | 617 | * If `bdi' is non-zero then we will scan the first inode against each |
611 | * superblock until we find the matching ones. One group will be the dirty | 618 | * superblock until we find the matching ones. One group will be the dirty |
612 | * inodes against a filesystem. Then when we hit the dummy blockdev superblock, | 619 | * inodes against a filesystem. Then when we hit the dummy blockdev superblock, |
613 | * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not | 620 | * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not |
614 | * super-efficient but we're about to do a ton of I/O... | 621 | * super-efficient but we're about to do a ton of I/O... |
615 | */ | 622 | */ |
616 | void | 623 | void |
617 | writeback_inodes(struct writeback_control *wbc) | 624 | writeback_inodes(struct writeback_control *wbc) |
618 | { | 625 | { |
619 | struct super_block *sb; | 626 | struct super_block *sb; |
620 | 627 | ||
621 | might_sleep(); | 628 | might_sleep(); |
622 | spin_lock(&sb_lock); | 629 | spin_lock(&sb_lock); |
623 | restart: | 630 | restart: |
624 | list_for_each_entry_reverse(sb, &super_blocks, s_list) { | 631 | list_for_each_entry_reverse(sb, &super_blocks, s_list) { |
625 | if (sb_has_dirty_inodes(sb)) { | 632 | if (sb_has_dirty_inodes(sb)) { |
626 | /* we're making our own get_super here */ | 633 | /* we're making our own get_super here */ |
627 | sb->s_count++; | 634 | sb->s_count++; |
628 | spin_unlock(&sb_lock); | 635 | spin_unlock(&sb_lock); |
629 | /* | 636 | /* |
630 | * If we can't get the readlock, there's no sense in | 637 | * If we can't get the readlock, there's no sense in |
631 | * waiting around, most of the time the FS is going to | 638 | * waiting around, most of the time the FS is going to |
632 | * be unmounted by the time it is released. | 639 | * be unmounted by the time it is released. |
633 | */ | 640 | */ |
634 | if (down_read_trylock(&sb->s_umount)) { | 641 | if (down_read_trylock(&sb->s_umount)) { |
635 | if (sb->s_root) | 642 | if (sb->s_root) |
636 | sync_sb_inodes(sb, wbc); | 643 | sync_sb_inodes(sb, wbc); |
637 | up_read(&sb->s_umount); | 644 | up_read(&sb->s_umount); |
638 | } | 645 | } |
639 | spin_lock(&sb_lock); | 646 | spin_lock(&sb_lock); |
640 | if (__put_super_and_need_restart(sb)) | 647 | if (__put_super_and_need_restart(sb)) |
641 | goto restart; | 648 | goto restart; |
642 | } | 649 | } |
643 | if (wbc->nr_to_write <= 0) | 650 | if (wbc->nr_to_write <= 0) |
644 | break; | 651 | break; |
645 | } | 652 | } |
646 | spin_unlock(&sb_lock); | 653 | spin_unlock(&sb_lock); |
647 | } | 654 | } |
648 | 655 | ||
649 | /* | 656 | /* |
650 | * writeback and wait upon the filesystem's dirty inodes. The caller will | 657 | * writeback and wait upon the filesystem's dirty inodes. The caller will |
651 | * do this in two passes - one to write, and one to wait. | 658 | * do this in two passes - one to write, and one to wait. |
652 | * | 659 | * |
653 | * A finite limit is set on the number of pages which will be written. | 660 | * A finite limit is set on the number of pages which will be written. |
654 | * To prevent infinite livelock of sys_sync(). | 661 | * To prevent infinite livelock of sys_sync(). |
655 | * | 662 | * |
656 | * We add in the number of potentially dirty inodes, because each inode write | 663 | * We add in the number of potentially dirty inodes, because each inode write |
657 | * can dirty pagecache in the underlying blockdev. | 664 | * can dirty pagecache in the underlying blockdev. |
658 | */ | 665 | */ |
659 | void sync_inodes_sb(struct super_block *sb, int wait) | 666 | void sync_inodes_sb(struct super_block *sb, int wait) |
660 | { | 667 | { |
661 | struct writeback_control wbc = { | 668 | struct writeback_control wbc = { |
662 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, | 669 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, |
663 | .range_start = 0, | 670 | .range_start = 0, |
664 | .range_end = LLONG_MAX, | 671 | .range_end = LLONG_MAX, |
665 | }; | 672 | }; |
666 | 673 | ||
667 | if (!wait) { | 674 | if (!wait) { |
668 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); | 675 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); |
669 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); | 676 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); |
670 | 677 | ||
671 | wbc.nr_to_write = nr_dirty + nr_unstable + | 678 | wbc.nr_to_write = nr_dirty + nr_unstable + |
672 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 679 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
673 | } else | 680 | } else |
674 | wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ | 681 | wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ |
675 | 682 | ||
676 | sync_sb_inodes(sb, &wbc); | 683 | sync_sb_inodes(sb, &wbc); |
677 | } | 684 | } |
678 | 685 | ||
679 | /** | 686 | /** |
680 | * write_inode_now - write an inode to disk | 687 | * write_inode_now - write an inode to disk |
681 | * @inode: inode to write to disk | 688 | * @inode: inode to write to disk |
682 | * @sync: whether the write should be synchronous or not | 689 | * @sync: whether the write should be synchronous or not |
683 | * | 690 | * |
684 | * This function commits an inode to disk immediately if it is dirty. This is | 691 | * This function commits an inode to disk immediately if it is dirty. This is |
685 | * primarily needed by knfsd. | 692 | * primarily needed by knfsd. |
686 | * | 693 | * |
687 | * The caller must either have a ref on the inode or must have set I_WILL_FREE. | 694 | * The caller must either have a ref on the inode or must have set I_WILL_FREE. |
688 | */ | 695 | */ |
689 | int write_inode_now(struct inode *inode, int sync) | 696 | int write_inode_now(struct inode *inode, int sync) |
690 | { | 697 | { |
691 | int ret; | 698 | int ret; |
692 | struct writeback_control wbc = { | 699 | struct writeback_control wbc = { |
693 | .nr_to_write = LONG_MAX, | 700 | .nr_to_write = LONG_MAX, |
694 | .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, | 701 | .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, |
695 | .range_start = 0, | 702 | .range_start = 0, |
696 | .range_end = LLONG_MAX, | 703 | .range_end = LLONG_MAX, |
697 | }; | 704 | }; |
698 | 705 | ||
699 | if (!mapping_cap_writeback_dirty(inode->i_mapping)) | 706 | if (!mapping_cap_writeback_dirty(inode->i_mapping)) |
700 | wbc.nr_to_write = 0; | 707 | wbc.nr_to_write = 0; |
701 | 708 | ||
702 | might_sleep(); | 709 | might_sleep(); |
703 | spin_lock(&inode_lock); | 710 | spin_lock(&inode_lock); |
704 | ret = __writeback_single_inode(inode, &wbc); | 711 | ret = __writeback_single_inode(inode, &wbc); |
705 | spin_unlock(&inode_lock); | 712 | spin_unlock(&inode_lock); |
706 | if (sync) | 713 | if (sync) |
707 | inode_sync_wait(inode); | 714 | inode_sync_wait(inode); |
708 | return ret; | 715 | return ret; |
709 | } | 716 | } |
710 | EXPORT_SYMBOL(write_inode_now); | 717 | EXPORT_SYMBOL(write_inode_now); |
711 | 718 | ||
712 | /** | 719 | /** |
713 | * sync_inode - write an inode and its pages to disk. | 720 | * sync_inode - write an inode and its pages to disk. |
714 | * @inode: the inode to sync | 721 | * @inode: the inode to sync |
715 | * @wbc: controls the writeback mode | 722 | * @wbc: controls the writeback mode |
716 | * | 723 | * |
717 | * sync_inode() will write an inode and its pages to disk. It will also | 724 | * sync_inode() will write an inode and its pages to disk. It will also |
718 | * correctly update the inode on its superblock's dirty inode lists and will | 725 | * correctly update the inode on its superblock's dirty inode lists and will |
719 | * update inode->i_state. | 726 | * update inode->i_state. |
720 | * | 727 | * |
721 | * The caller must have a ref on the inode. | 728 | * The caller must have a ref on the inode. |
722 | */ | 729 | */ |
723 | int sync_inode(struct inode *inode, struct writeback_control *wbc) | 730 | int sync_inode(struct inode *inode, struct writeback_control *wbc) |
724 | { | 731 | { |
725 | int ret; | 732 | int ret; |
726 | 733 | ||
727 | spin_lock(&inode_lock); | 734 | spin_lock(&inode_lock); |
728 | ret = __writeback_single_inode(inode, wbc); | 735 | ret = __writeback_single_inode(inode, wbc); |
729 | spin_unlock(&inode_lock); | 736 | spin_unlock(&inode_lock); |
730 | return ret; | 737 | return ret; |
731 | } | 738 | } |
732 | EXPORT_SYMBOL(sync_inode); | 739 | EXPORT_SYMBOL(sync_inode); |
733 | 740 | ||
734 | /** | 741 | /** |
735 | * generic_osync_inode - flush all dirty data for a given inode to disk | 742 | * generic_osync_inode - flush all dirty data for a given inode to disk |
736 | * @inode: inode to write | 743 | * @inode: inode to write |
737 | * @mapping: the address_space that should be flushed | 744 | * @mapping: the address_space that should be flushed |
738 | * @what: what to write and wait upon | 745 | * @what: what to write and wait upon |
739 | * | 746 | * |
740 | * This can be called by file_write functions for files which have the | 747 | * This can be called by file_write functions for files which have the |
741 | * O_SYNC flag set, to flush dirty writes to disk. | 748 | * O_SYNC flag set, to flush dirty writes to disk. |
742 | * | 749 | * |
743 | * @what is a bitmask, specifying which part of the inode's data should be | 750 | * @what is a bitmask, specifying which part of the inode's data should be |
744 | * written and waited upon. | 751 | * written and waited upon. |
745 | * | 752 | * |
746 | * OSYNC_DATA: i_mapping's dirty data | 753 | * OSYNC_DATA: i_mapping's dirty data |
747 | * OSYNC_METADATA: the buffers at i_mapping->private_list | 754 | * OSYNC_METADATA: the buffers at i_mapping->private_list |
748 | * OSYNC_INODE: the inode itself | 755 | * OSYNC_INODE: the inode itself |
749 | */ | 756 | */ |
750 | 757 | ||
751 | int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what) | 758 | int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what) |
752 | { | 759 | { |
753 | int err = 0; | 760 | int err = 0; |
754 | int need_write_inode_now = 0; | 761 | int need_write_inode_now = 0; |
755 | int err2; | 762 | int err2; |
756 | 763 | ||
757 | if (what & OSYNC_DATA) | 764 | if (what & OSYNC_DATA) |
758 | err = filemap_fdatawrite(mapping); | 765 | err = filemap_fdatawrite(mapping); |
759 | if (what & (OSYNC_METADATA|OSYNC_DATA)) { | 766 | if (what & (OSYNC_METADATA|OSYNC_DATA)) { |
760 | err2 = sync_mapping_buffers(mapping); | 767 | err2 = sync_mapping_buffers(mapping); |
761 | if (!err) | 768 | if (!err) |
762 | err = err2; | 769 | err = err2; |
763 | } | 770 | } |
764 | if (what & OSYNC_DATA) { | 771 | if (what & OSYNC_DATA) { |
765 | err2 = filemap_fdatawait(mapping); | 772 | err2 = filemap_fdatawait(mapping); |
766 | if (!err) | 773 | if (!err) |
767 | err = err2; | 774 | err = err2; |
768 | } | 775 | } |
769 | 776 | ||
770 | spin_lock(&inode_lock); | 777 | spin_lock(&inode_lock); |
771 | if ((inode->i_state & I_DIRTY) && | 778 | if ((inode->i_state & I_DIRTY) && |
772 | ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) | 779 | ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) |