Commit ae1b7f7d4b9ea587fda95c38301f4e72e8146634
Committed by: Jens Axboe
1 parent: 7fbdea3232
Exists in: master and 7 other branches
writeback: cleanup writeback_single_inode()
Make the if-else straight in writeback_single_inode(). No behavior change.

Cc: Jan Kara <jack@suse.cz>
Cc: Michael Rubin <mrubin@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Showing 1 changed file with 7 additions and 8 deletions (inline diff)
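To make the intent of the change concrete before reading the diff, here is a minimal, self-contained sketch of the branch reordering. It is not kernel code: struct fake_inode, old_order() and new_order() are hypothetical stand-ins that only model the three conditions tested in writeback_single_inode() (I_DIRTY in i_state, a dirty-tagged mapping, and a nonzero i_count). The small driver checks that both orderings pick the same branch for every combination, which is what "No behavior change" claims.

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/*
 * Hypothetical stand-ins for the kernel-side tests -- NOT real kernel APIs,
 * just booleans that model the conditions used in writeback_single_inode().
 */
struct fake_inode {
	bool dirty;        /* models: inode->i_state & I_DIRTY                     */
	bool pages_tagged; /* models: mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) */
	bool in_use;       /* models: atomic_read(&inode->i_count) != 0            */
};

/* Branch order before the patch: the first test is a negated compound one. */
static const char *old_order(const struct fake_inode *i)
{
	if (!i->dirty && i->pages_tagged)
		return "redirty: pages left behind";
	else if (i->dirty)
		return "redirty: redirtied while writing";
	else if (i->in_use)
		return "clean, in use";
	else
		return "clean, unused";
}

/* Branch order after the patch: a straight if/else-if chain, same outcomes. */
static const char *new_order(const struct fake_inode *i)
{
	if (i->dirty)
		return "redirty: redirtied while writing";
	else if (i->pages_tagged)
		return "redirty: pages left behind";
	else if (i->in_use)
		return "clean, in use";
	else
		return "clean, unused";
}

int main(void)
{
	/* Walk all eight combinations to confirm the two orderings agree. */
	for (int mask = 0; mask < 8; mask++) {
		struct fake_inode i = {
			.dirty        = mask & 1,
			.pages_tagged = mask & 2,
			.in_use       = mask & 4,
		};
		const char *before = old_order(&i);
		const char *after  = new_order(&i);

		printf("dirty=%d tagged=%d in_use=%d -> %s%s\n",
		       i.dirty, i.pages_tagged, i.in_use, after,
		       strcmp(before, after) == 0 ? "" : "  (MISMATCH)");
	}
	return 0;
}

In the real function the payoff is readability: the I_DIRTY ("someone redirtied the inode") case is tested first, and the mapping_tagged() branch with its long nfs_writepages() comment becomes a plain else-if instead of sitting under a negated compound condition.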
fs/fs-writeback.c
1 | /* | 1 | /* |
2 | * fs/fs-writeback.c | 2 | * fs/fs-writeback.c |
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds. | 4 | * Copyright (C) 2002, Linus Torvalds. |
5 | * | 5 | * |
6 | * Contains all the functions related to writing back and waiting | 6 | * Contains all the functions related to writing back and waiting |
7 | * upon dirty inodes against superblocks, and writing back dirty | 7 | * upon dirty inodes against superblocks, and writing back dirty |
8 | * pages against inodes. ie: data writeback. Writeout of the | 8 | * pages against inodes. ie: data writeback. Writeout of the |
9 | * inode itself is not handled here. | 9 | * inode itself is not handled here. |
10 | * | 10 | * |
11 | * 10Apr2002 Andrew Morton | 11 | * 10Apr2002 Andrew Morton |
12 | * Split out of fs/inode.c | 12 | * Split out of fs/inode.c |
13 | * Additions for address_space-based writeback | 13 | * Additions for address_space-based writeback |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/spinlock.h> | 18 | #include <linux/spinlock.h> |
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/kthread.h> | 22 | #include <linux/kthread.h> |
23 | #include <linux/freezer.h> | 23 | #include <linux/freezer.h> |
24 | #include <linux/writeback.h> | 24 | #include <linux/writeback.h> |
25 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
26 | #include <linux/backing-dev.h> | 26 | #include <linux/backing-dev.h> |
27 | #include <linux/buffer_head.h> | 27 | #include <linux/buffer_head.h> |
28 | #include "internal.h" | 28 | #include "internal.h" |
29 | 29 | ||
30 | #define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) | 30 | #define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * We don't actually have pdflush, but this one is exported though /proc... | 33 | * We don't actually have pdflush, but this one is exported though /proc... |
34 | */ | 34 | */ |
35 | int nr_pdflush_threads; | 35 | int nr_pdflush_threads; |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * Passed into wb_writeback(), essentially a subset of writeback_control | 38 | * Passed into wb_writeback(), essentially a subset of writeback_control |
39 | */ | 39 | */ |
40 | struct wb_writeback_args { | 40 | struct wb_writeback_args { |
41 | long nr_pages; | 41 | long nr_pages; |
42 | struct super_block *sb; | 42 | struct super_block *sb; |
43 | enum writeback_sync_modes sync_mode; | 43 | enum writeback_sync_modes sync_mode; |
44 | int for_kupdate:1; | 44 | int for_kupdate:1; |
45 | int range_cyclic:1; | 45 | int range_cyclic:1; |
46 | int for_background:1; | 46 | int for_background:1; |
47 | }; | 47 | }; |
48 | 48 | ||
49 | /* | 49 | /* |
50 | * Work items for the bdi_writeback threads | 50 | * Work items for the bdi_writeback threads |
51 | */ | 51 | */ |
52 | struct bdi_work { | 52 | struct bdi_work { |
53 | struct list_head list; /* pending work list */ | 53 | struct list_head list; /* pending work list */ |
54 | struct rcu_head rcu_head; /* for RCU free/clear of work */ | 54 | struct rcu_head rcu_head; /* for RCU free/clear of work */ |
55 | 55 | ||
56 | unsigned long seen; /* threads that have seen this work */ | 56 | unsigned long seen; /* threads that have seen this work */ |
57 | atomic_t pending; /* number of threads still to do work */ | 57 | atomic_t pending; /* number of threads still to do work */ |
58 | 58 | ||
59 | struct wb_writeback_args args; /* writeback arguments */ | 59 | struct wb_writeback_args args; /* writeback arguments */ |
60 | 60 | ||
61 | unsigned long state; /* flag bits, see WS_* */ | 61 | unsigned long state; /* flag bits, see WS_* */ |
62 | }; | 62 | }; |
63 | 63 | ||
64 | enum { | 64 | enum { |
65 | WS_USED_B = 0, | 65 | WS_USED_B = 0, |
66 | WS_ONSTACK_B, | 66 | WS_ONSTACK_B, |
67 | }; | 67 | }; |
68 | 68 | ||
69 | #define WS_USED (1 << WS_USED_B) | 69 | #define WS_USED (1 << WS_USED_B) |
70 | #define WS_ONSTACK (1 << WS_ONSTACK_B) | 70 | #define WS_ONSTACK (1 << WS_ONSTACK_B) |
71 | 71 | ||
72 | static inline bool bdi_work_on_stack(struct bdi_work *work) | 72 | static inline bool bdi_work_on_stack(struct bdi_work *work) |
73 | { | 73 | { |
74 | return test_bit(WS_ONSTACK_B, &work->state); | 74 | return test_bit(WS_ONSTACK_B, &work->state); |
75 | } | 75 | } |
76 | 76 | ||
77 | static inline void bdi_work_init(struct bdi_work *work, | 77 | static inline void bdi_work_init(struct bdi_work *work, |
78 | struct wb_writeback_args *args) | 78 | struct wb_writeback_args *args) |
79 | { | 79 | { |
80 | INIT_RCU_HEAD(&work->rcu_head); | 80 | INIT_RCU_HEAD(&work->rcu_head); |
81 | work->args = *args; | 81 | work->args = *args; |
82 | work->state = WS_USED; | 82 | work->state = WS_USED; |
83 | } | 83 | } |
84 | 84 | ||
85 | /** | 85 | /** |
86 | * writeback_in_progress - determine whether there is writeback in progress | 86 | * writeback_in_progress - determine whether there is writeback in progress |
87 | * @bdi: the device's backing_dev_info structure. | 87 | * @bdi: the device's backing_dev_info structure. |
88 | * | 88 | * |
89 | * Determine whether there is writeback waiting to be handled against a | 89 | * Determine whether there is writeback waiting to be handled against a |
90 | * backing device. | 90 | * backing device. |
91 | */ | 91 | */ |
92 | int writeback_in_progress(struct backing_dev_info *bdi) | 92 | int writeback_in_progress(struct backing_dev_info *bdi) |
93 | { | 93 | { |
94 | return !list_empty(&bdi->work_list); | 94 | return !list_empty(&bdi->work_list); |
95 | } | 95 | } |
96 | 96 | ||
97 | static void bdi_work_clear(struct bdi_work *work) | 97 | static void bdi_work_clear(struct bdi_work *work) |
98 | { | 98 | { |
99 | clear_bit(WS_USED_B, &work->state); | 99 | clear_bit(WS_USED_B, &work->state); |
100 | smp_mb__after_clear_bit(); | 100 | smp_mb__after_clear_bit(); |
101 | /* | 101 | /* |
102 | * work can have disappeared at this point. bit waitq functions | 102 | * work can have disappeared at this point. bit waitq functions |
103 | * should be able to tolerate this, provided bdi_sched_wait does | 103 | * should be able to tolerate this, provided bdi_sched_wait does |
104 | * not dereference it's pointer argument. | 104 | * not dereference it's pointer argument. |
105 | */ | 105 | */ |
106 | wake_up_bit(&work->state, WS_USED_B); | 106 | wake_up_bit(&work->state, WS_USED_B); |
107 | } | 107 | } |
108 | 108 | ||
109 | static void bdi_work_free(struct rcu_head *head) | 109 | static void bdi_work_free(struct rcu_head *head) |
110 | { | 110 | { |
111 | struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); | 111 | struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); |
112 | 112 | ||
113 | if (!bdi_work_on_stack(work)) | 113 | if (!bdi_work_on_stack(work)) |
114 | kfree(work); | 114 | kfree(work); |
115 | else | 115 | else |
116 | bdi_work_clear(work); | 116 | bdi_work_clear(work); |
117 | } | 117 | } |
118 | 118 | ||
119 | static void wb_work_complete(struct bdi_work *work) | 119 | static void wb_work_complete(struct bdi_work *work) |
120 | { | 120 | { |
121 | const enum writeback_sync_modes sync_mode = work->args.sync_mode; | 121 | const enum writeback_sync_modes sync_mode = work->args.sync_mode; |
122 | int onstack = bdi_work_on_stack(work); | 122 | int onstack = bdi_work_on_stack(work); |
123 | 123 | ||
124 | /* | 124 | /* |
125 | * For allocated work, we can clear the done/seen bit right here. | 125 | * For allocated work, we can clear the done/seen bit right here. |
126 | * For on-stack work, we need to postpone both the clear and free | 126 | * For on-stack work, we need to postpone both the clear and free |
127 | * to after the RCU grace period, since the stack could be invalidated | 127 | * to after the RCU grace period, since the stack could be invalidated |
128 | * as soon as bdi_work_clear() has done the wakeup. | 128 | * as soon as bdi_work_clear() has done the wakeup. |
129 | */ | 129 | */ |
130 | if (!onstack) | 130 | if (!onstack) |
131 | bdi_work_clear(work); | 131 | bdi_work_clear(work); |
132 | if (sync_mode == WB_SYNC_NONE || onstack) | 132 | if (sync_mode == WB_SYNC_NONE || onstack) |
133 | call_rcu(&work->rcu_head, bdi_work_free); | 133 | call_rcu(&work->rcu_head, bdi_work_free); |
134 | } | 134 | } |
135 | 135 | ||
136 | static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) | 136 | static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) |
137 | { | 137 | { |
138 | /* | 138 | /* |
139 | * The caller has retrieved the work arguments from this work, | 139 | * The caller has retrieved the work arguments from this work, |
140 | * drop our reference. If this is the last ref, delete and free it | 140 | * drop our reference. If this is the last ref, delete and free it |
141 | */ | 141 | */ |
142 | if (atomic_dec_and_test(&work->pending)) { | 142 | if (atomic_dec_and_test(&work->pending)) { |
143 | struct backing_dev_info *bdi = wb->bdi; | 143 | struct backing_dev_info *bdi = wb->bdi; |
144 | 144 | ||
145 | spin_lock(&bdi->wb_lock); | 145 | spin_lock(&bdi->wb_lock); |
146 | list_del_rcu(&work->list); | 146 | list_del_rcu(&work->list); |
147 | spin_unlock(&bdi->wb_lock); | 147 | spin_unlock(&bdi->wb_lock); |
148 | 148 | ||
149 | wb_work_complete(work); | 149 | wb_work_complete(work); |
150 | } | 150 | } |
151 | } | 151 | } |
152 | 152 | ||
153 | static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) | 153 | static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) |
154 | { | 154 | { |
155 | work->seen = bdi->wb_mask; | 155 | work->seen = bdi->wb_mask; |
156 | BUG_ON(!work->seen); | 156 | BUG_ON(!work->seen); |
157 | atomic_set(&work->pending, bdi->wb_cnt); | 157 | atomic_set(&work->pending, bdi->wb_cnt); |
158 | BUG_ON(!bdi->wb_cnt); | 158 | BUG_ON(!bdi->wb_cnt); |
159 | 159 | ||
160 | /* | 160 | /* |
161 | * list_add_tail_rcu() contains the necessary barriers to | 161 | * list_add_tail_rcu() contains the necessary barriers to |
162 | * make sure the above stores are seen before the item is | 162 | * make sure the above stores are seen before the item is |
163 | * noticed on the list | 163 | * noticed on the list |
164 | */ | 164 | */ |
165 | spin_lock(&bdi->wb_lock); | 165 | spin_lock(&bdi->wb_lock); |
166 | list_add_tail_rcu(&work->list, &bdi->work_list); | 166 | list_add_tail_rcu(&work->list, &bdi->work_list); |
167 | spin_unlock(&bdi->wb_lock); | 167 | spin_unlock(&bdi->wb_lock); |
168 | 168 | ||
169 | /* | 169 | /* |
170 | * If the default thread isn't there, make sure we add it. When | 170 | * If the default thread isn't there, make sure we add it. When |
171 | * it gets created and wakes up, we'll run this work. | 171 | * it gets created and wakes up, we'll run this work. |
172 | */ | 172 | */ |
173 | if (unlikely(list_empty_careful(&bdi->wb_list))) | 173 | if (unlikely(list_empty_careful(&bdi->wb_list))) |
174 | wake_up_process(default_backing_dev_info.wb.task); | 174 | wake_up_process(default_backing_dev_info.wb.task); |
175 | else { | 175 | else { |
176 | struct bdi_writeback *wb = &bdi->wb; | 176 | struct bdi_writeback *wb = &bdi->wb; |
177 | 177 | ||
178 | if (wb->task) | 178 | if (wb->task) |
179 | wake_up_process(wb->task); | 179 | wake_up_process(wb->task); |
180 | } | 180 | } |
181 | } | 181 | } |
182 | 182 | ||
183 | /* | 183 | /* |
184 | * Used for on-stack allocated work items. The caller needs to wait until | 184 | * Used for on-stack allocated work items. The caller needs to wait until |
185 | * the wb threads have acked the work before it's safe to continue. | 185 | * the wb threads have acked the work before it's safe to continue. |
186 | */ | 186 | */ |
187 | static void bdi_wait_on_work_clear(struct bdi_work *work) | 187 | static void bdi_wait_on_work_clear(struct bdi_work *work) |
188 | { | 188 | { |
189 | wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, | 189 | wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, |
190 | TASK_UNINTERRUPTIBLE); | 190 | TASK_UNINTERRUPTIBLE); |
191 | } | 191 | } |
192 | 192 | ||
193 | static void bdi_alloc_queue_work(struct backing_dev_info *bdi, | 193 | static void bdi_alloc_queue_work(struct backing_dev_info *bdi, |
194 | struct wb_writeback_args *args) | 194 | struct wb_writeback_args *args) |
195 | { | 195 | { |
196 | struct bdi_work *work; | 196 | struct bdi_work *work; |
197 | 197 | ||
198 | /* | 198 | /* |
199 | * This is WB_SYNC_NONE writeback, so if allocation fails just | 199 | * This is WB_SYNC_NONE writeback, so if allocation fails just |
200 | * wakeup the thread for old dirty data writeback | 200 | * wakeup the thread for old dirty data writeback |
201 | */ | 201 | */ |
202 | work = kmalloc(sizeof(*work), GFP_ATOMIC); | 202 | work = kmalloc(sizeof(*work), GFP_ATOMIC); |
203 | if (work) { | 203 | if (work) { |
204 | bdi_work_init(work, args); | 204 | bdi_work_init(work, args); |
205 | bdi_queue_work(bdi, work); | 205 | bdi_queue_work(bdi, work); |
206 | } else { | 206 | } else { |
207 | struct bdi_writeback *wb = &bdi->wb; | 207 | struct bdi_writeback *wb = &bdi->wb; |
208 | 208 | ||
209 | if (wb->task) | 209 | if (wb->task) |
210 | wake_up_process(wb->task); | 210 | wake_up_process(wb->task); |
211 | } | 211 | } |
212 | } | 212 | } |
213 | 213 | ||
214 | /** | 214 | /** |
215 | * bdi_sync_writeback - start and wait for writeback | 215 | * bdi_sync_writeback - start and wait for writeback |
216 | * @bdi: the backing device to write from | 216 | * @bdi: the backing device to write from |
217 | * @sb: write inodes from this super_block | 217 | * @sb: write inodes from this super_block |
218 | * | 218 | * |
219 | * Description: | 219 | * Description: |
220 | * This does WB_SYNC_ALL data integrity writeback and waits for the | 220 | * This does WB_SYNC_ALL data integrity writeback and waits for the |
221 | * IO to complete. Callers must hold the sb s_umount semaphore for | 221 | * IO to complete. Callers must hold the sb s_umount semaphore for |
222 | * reading, to avoid having the super disappear before we are done. | 222 | * reading, to avoid having the super disappear before we are done. |
223 | */ | 223 | */ |
224 | static void bdi_sync_writeback(struct backing_dev_info *bdi, | 224 | static void bdi_sync_writeback(struct backing_dev_info *bdi, |
225 | struct super_block *sb) | 225 | struct super_block *sb) |
226 | { | 226 | { |
227 | struct wb_writeback_args args = { | 227 | struct wb_writeback_args args = { |
228 | .sb = sb, | 228 | .sb = sb, |
229 | .sync_mode = WB_SYNC_ALL, | 229 | .sync_mode = WB_SYNC_ALL, |
230 | .nr_pages = LONG_MAX, | 230 | .nr_pages = LONG_MAX, |
231 | .range_cyclic = 0, | 231 | .range_cyclic = 0, |
232 | }; | 232 | }; |
233 | struct bdi_work work; | 233 | struct bdi_work work; |
234 | 234 | ||
235 | bdi_work_init(&work, &args); | 235 | bdi_work_init(&work, &args); |
236 | work.state |= WS_ONSTACK; | 236 | work.state |= WS_ONSTACK; |
237 | 237 | ||
238 | bdi_queue_work(bdi, &work); | 238 | bdi_queue_work(bdi, &work); |
239 | bdi_wait_on_work_clear(&work); | 239 | bdi_wait_on_work_clear(&work); |
240 | } | 240 | } |
241 | 241 | ||
242 | /** | 242 | /** |
243 | * bdi_start_writeback - start writeback | 243 | * bdi_start_writeback - start writeback |
244 | * @bdi: the backing device to write from | 244 | * @bdi: the backing device to write from |
245 | * @nr_pages: the number of pages to write | 245 | * @nr_pages: the number of pages to write |
246 | * | 246 | * |
247 | * Description: | 247 | * Description: |
248 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only | 248 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only |
249 | * started when this function returns, we make no guarentees on | 249 | * started when this function returns, we make no guarentees on |
250 | * completion. Caller need not hold sb s_umount semaphore. | 250 | * completion. Caller need not hold sb s_umount semaphore. |
251 | * | 251 | * |
252 | */ | 252 | */ |
253 | void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) | 253 | void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) |
254 | { | 254 | { |
255 | struct wb_writeback_args args = { | 255 | struct wb_writeback_args args = { |
256 | .sync_mode = WB_SYNC_NONE, | 256 | .sync_mode = WB_SYNC_NONE, |
257 | .nr_pages = nr_pages, | 257 | .nr_pages = nr_pages, |
258 | .range_cyclic = 1, | 258 | .range_cyclic = 1, |
259 | }; | 259 | }; |
260 | 260 | ||
261 | /* | 261 | /* |
262 | * We treat @nr_pages=0 as the special case to do background writeback, | 262 | * We treat @nr_pages=0 as the special case to do background writeback, |
263 | * ie. to sync pages until the background dirty threshold is reached. | 263 | * ie. to sync pages until the background dirty threshold is reached. |
264 | */ | 264 | */ |
265 | if (!nr_pages) { | 265 | if (!nr_pages) { |
266 | args.nr_pages = LONG_MAX; | 266 | args.nr_pages = LONG_MAX; |
267 | args.for_background = 1; | 267 | args.for_background = 1; |
268 | } | 268 | } |
269 | 269 | ||
270 | bdi_alloc_queue_work(bdi, &args); | 270 | bdi_alloc_queue_work(bdi, &args); |
271 | } | 271 | } |
272 | 272 | ||
273 | /* | 273 | /* |
274 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the | 274 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the |
275 | * furthest end of its superblock's dirty-inode list. | 275 | * furthest end of its superblock's dirty-inode list. |
276 | * | 276 | * |
277 | * Before stamping the inode's ->dirtied_when, we check to see whether it is | 277 | * Before stamping the inode's ->dirtied_when, we check to see whether it is |
278 | * already the most-recently-dirtied inode on the b_dirty list. If that is | 278 | * already the most-recently-dirtied inode on the b_dirty list. If that is |
279 | * the case then the inode must have been redirtied while it was being written | 279 | * the case then the inode must have been redirtied while it was being written |
280 | * out and we don't reset its dirtied_when. | 280 | * out and we don't reset its dirtied_when. |
281 | */ | 281 | */ |
282 | static void redirty_tail(struct inode *inode) | 282 | static void redirty_tail(struct inode *inode) |
283 | { | 283 | { |
284 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | 284 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; |
285 | 285 | ||
286 | if (!list_empty(&wb->b_dirty)) { | 286 | if (!list_empty(&wb->b_dirty)) { |
287 | struct inode *tail; | 287 | struct inode *tail; |
288 | 288 | ||
289 | tail = list_entry(wb->b_dirty.next, struct inode, i_list); | 289 | tail = list_entry(wb->b_dirty.next, struct inode, i_list); |
290 | if (time_before(inode->dirtied_when, tail->dirtied_when)) | 290 | if (time_before(inode->dirtied_when, tail->dirtied_when)) |
291 | inode->dirtied_when = jiffies; | 291 | inode->dirtied_when = jiffies; |
292 | } | 292 | } |
293 | list_move(&inode->i_list, &wb->b_dirty); | 293 | list_move(&inode->i_list, &wb->b_dirty); |
294 | } | 294 | } |
295 | 295 | ||
296 | /* | 296 | /* |
297 | * requeue inode for re-scanning after bdi->b_io list is exhausted. | 297 | * requeue inode for re-scanning after bdi->b_io list is exhausted. |
298 | */ | 298 | */ |
299 | static void requeue_io(struct inode *inode) | 299 | static void requeue_io(struct inode *inode) |
300 | { | 300 | { |
301 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | 301 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; |
302 | 302 | ||
303 | list_move(&inode->i_list, &wb->b_more_io); | 303 | list_move(&inode->i_list, &wb->b_more_io); |
304 | } | 304 | } |
305 | 305 | ||
306 | static void inode_sync_complete(struct inode *inode) | 306 | static void inode_sync_complete(struct inode *inode) |
307 | { | 307 | { |
308 | /* | 308 | /* |
309 | * Prevent speculative execution through spin_unlock(&inode_lock); | 309 | * Prevent speculative execution through spin_unlock(&inode_lock); |
310 | */ | 310 | */ |
311 | smp_mb(); | 311 | smp_mb(); |
312 | wake_up_bit(&inode->i_state, __I_SYNC); | 312 | wake_up_bit(&inode->i_state, __I_SYNC); |
313 | } | 313 | } |
314 | 314 | ||
315 | static bool inode_dirtied_after(struct inode *inode, unsigned long t) | 315 | static bool inode_dirtied_after(struct inode *inode, unsigned long t) |
316 | { | 316 | { |
317 | bool ret = time_after(inode->dirtied_when, t); | 317 | bool ret = time_after(inode->dirtied_when, t); |
318 | #ifndef CONFIG_64BIT | 318 | #ifndef CONFIG_64BIT |
319 | /* | 319 | /* |
320 | * For inodes being constantly redirtied, dirtied_when can get stuck. | 320 | * For inodes being constantly redirtied, dirtied_when can get stuck. |
321 | * It _appears_ to be in the future, but is actually in distant past. | 321 | * It _appears_ to be in the future, but is actually in distant past. |
322 | * This test is necessary to prevent such wrapped-around relative times | 322 | * This test is necessary to prevent such wrapped-around relative times |
323 | * from permanently stopping the whole pdflush writeback. | 323 | * from permanently stopping the whole pdflush writeback. |
324 | */ | 324 | */ |
325 | ret = ret && time_before_eq(inode->dirtied_when, jiffies); | 325 | ret = ret && time_before_eq(inode->dirtied_when, jiffies); |
326 | #endif | 326 | #endif |
327 | return ret; | 327 | return ret; |
328 | } | 328 | } |
329 | 329 | ||
330 | /* | 330 | /* |
331 | * Move expired dirty inodes from @delaying_queue to @dispatch_queue. | 331 | * Move expired dirty inodes from @delaying_queue to @dispatch_queue. |
332 | */ | 332 | */ |
333 | static void move_expired_inodes(struct list_head *delaying_queue, | 333 | static void move_expired_inodes(struct list_head *delaying_queue, |
334 | struct list_head *dispatch_queue, | 334 | struct list_head *dispatch_queue, |
335 | unsigned long *older_than_this) | 335 | unsigned long *older_than_this) |
336 | { | 336 | { |
337 | while (!list_empty(delaying_queue)) { | 337 | while (!list_empty(delaying_queue)) { |
338 | struct inode *inode = list_entry(delaying_queue->prev, | 338 | struct inode *inode = list_entry(delaying_queue->prev, |
339 | struct inode, i_list); | 339 | struct inode, i_list); |
340 | if (older_than_this && | 340 | if (older_than_this && |
341 | inode_dirtied_after(inode, *older_than_this)) | 341 | inode_dirtied_after(inode, *older_than_this)) |
342 | break; | 342 | break; |
343 | list_move(&inode->i_list, dispatch_queue); | 343 | list_move(&inode->i_list, dispatch_queue); |
344 | } | 344 | } |
345 | } | 345 | } |
346 | 346 | ||
347 | /* | 347 | /* |
348 | * Queue all expired dirty inodes for io, eldest first. | 348 | * Queue all expired dirty inodes for io, eldest first. |
349 | */ | 349 | */ |
350 | static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) | 350 | static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) |
351 | { | 351 | { |
352 | list_splice_init(&wb->b_more_io, wb->b_io.prev); | 352 | list_splice_init(&wb->b_more_io, wb->b_io.prev); |
353 | move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); | 353 | move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); |
354 | } | 354 | } |
355 | 355 | ||
356 | static int write_inode(struct inode *inode, int sync) | 356 | static int write_inode(struct inode *inode, int sync) |
357 | { | 357 | { |
358 | if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) | 358 | if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) |
359 | return inode->i_sb->s_op->write_inode(inode, sync); | 359 | return inode->i_sb->s_op->write_inode(inode, sync); |
360 | return 0; | 360 | return 0; |
361 | } | 361 | } |
362 | 362 | ||
363 | /* | 363 | /* |
364 | * Wait for writeback on an inode to complete. | 364 | * Wait for writeback on an inode to complete. |
365 | */ | 365 | */ |
366 | static void inode_wait_for_writeback(struct inode *inode) | 366 | static void inode_wait_for_writeback(struct inode *inode) |
367 | { | 367 | { |
368 | DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); | 368 | DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); |
369 | wait_queue_head_t *wqh; | 369 | wait_queue_head_t *wqh; |
370 | 370 | ||
371 | wqh = bit_waitqueue(&inode->i_state, __I_SYNC); | 371 | wqh = bit_waitqueue(&inode->i_state, __I_SYNC); |
372 | do { | 372 | do { |
373 | spin_unlock(&inode_lock); | 373 | spin_unlock(&inode_lock); |
374 | __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); | 374 | __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); |
375 | spin_lock(&inode_lock); | 375 | spin_lock(&inode_lock); |
376 | } while (inode->i_state & I_SYNC); | 376 | } while (inode->i_state & I_SYNC); |
377 | } | 377 | } |
378 | 378 | ||
379 | /* | 379 | /* |
380 | * Write out an inode's dirty pages. Called under inode_lock. Either the | 380 | * Write out an inode's dirty pages. Called under inode_lock. Either the |
381 | * caller has ref on the inode (either via __iget or via syscall against an fd) | 381 | * caller has ref on the inode (either via __iget or via syscall against an fd) |
382 | * or the inode has I_WILL_FREE set (via generic_forget_inode) | 382 | * or the inode has I_WILL_FREE set (via generic_forget_inode) |
383 | * | 383 | * |
384 | * If `wait' is set, wait on the writeout. | 384 | * If `wait' is set, wait on the writeout. |
385 | * | 385 | * |
386 | * The whole writeout design is quite complex and fragile. We want to avoid | 386 | * The whole writeout design is quite complex and fragile. We want to avoid |
387 | * starvation of particular inodes when others are being redirtied, prevent | 387 | * starvation of particular inodes when others are being redirtied, prevent |
388 | * livelocks, etc. | 388 | * livelocks, etc. |
389 | * | 389 | * |
390 | * Called under inode_lock. | 390 | * Called under inode_lock. |
391 | */ | 391 | */ |
392 | static int | 392 | static int |
393 | writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | 393 | writeback_single_inode(struct inode *inode, struct writeback_control *wbc) |
394 | { | 394 | { |
395 | struct address_space *mapping = inode->i_mapping; | 395 | struct address_space *mapping = inode->i_mapping; |
396 | int wait = wbc->sync_mode == WB_SYNC_ALL; | 396 | int wait = wbc->sync_mode == WB_SYNC_ALL; |
397 | unsigned dirty; | 397 | unsigned dirty; |
398 | int ret; | 398 | int ret; |
399 | 399 | ||
400 | if (!atomic_read(&inode->i_count)) | 400 | if (!atomic_read(&inode->i_count)) |
401 | WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); | 401 | WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); |
402 | else | 402 | else |
403 | WARN_ON(inode->i_state & I_WILL_FREE); | 403 | WARN_ON(inode->i_state & I_WILL_FREE); |
404 | 404 | ||
405 | if (inode->i_state & I_SYNC) { | 405 | if (inode->i_state & I_SYNC) { |
406 | /* | 406 | /* |
407 | * If this inode is locked for writeback and we are not doing | 407 | * If this inode is locked for writeback and we are not doing |
408 | * writeback-for-data-integrity, move it to b_more_io so that | 408 | * writeback-for-data-integrity, move it to b_more_io so that |
409 | * writeback can proceed with the other inodes on s_io. | 409 | * writeback can proceed with the other inodes on s_io. |
410 | * | 410 | * |
411 | * We'll have another go at writing back this inode when we | 411 | * We'll have another go at writing back this inode when we |
412 | * completed a full scan of b_io. | 412 | * completed a full scan of b_io. |
413 | */ | 413 | */ |
414 | if (!wait) { | 414 | if (!wait) { |
415 | requeue_io(inode); | 415 | requeue_io(inode); |
416 | return 0; | 416 | return 0; |
417 | } | 417 | } |
418 | 418 | ||
419 | /* | 419 | /* |
420 | * It's a data-integrity sync. We must wait. | 420 | * It's a data-integrity sync. We must wait. |
421 | */ | 421 | */ |
422 | inode_wait_for_writeback(inode); | 422 | inode_wait_for_writeback(inode); |
423 | } | 423 | } |
424 | 424 | ||
425 | BUG_ON(inode->i_state & I_SYNC); | 425 | BUG_ON(inode->i_state & I_SYNC); |
426 | 426 | ||
427 | /* Set I_SYNC, reset I_DIRTY */ | 427 | /* Set I_SYNC, reset I_DIRTY */ |
428 | dirty = inode->i_state & I_DIRTY; | 428 | dirty = inode->i_state & I_DIRTY; |
429 | inode->i_state |= I_SYNC; | 429 | inode->i_state |= I_SYNC; |
430 | inode->i_state &= ~I_DIRTY; | 430 | inode->i_state &= ~I_DIRTY; |
431 | 431 | ||
432 | spin_unlock(&inode_lock); | 432 | spin_unlock(&inode_lock); |
433 | 433 | ||
434 | ret = do_writepages(mapping, wbc); | 434 | ret = do_writepages(mapping, wbc); |
435 | 435 | ||
436 | /* Don't write the inode if only I_DIRTY_PAGES was set */ | 436 | /* Don't write the inode if only I_DIRTY_PAGES was set */ |
437 | if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { | 437 | if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { |
438 | int err = write_inode(inode, wait); | 438 | int err = write_inode(inode, wait); |
439 | if (ret == 0) | 439 | if (ret == 0) |
440 | ret = err; | 440 | ret = err; |
441 | } | 441 | } |
442 | 442 | ||
443 | if (wait) { | 443 | if (wait) { |
444 | int err = filemap_fdatawait(mapping); | 444 | int err = filemap_fdatawait(mapping); |
445 | if (ret == 0) | 445 | if (ret == 0) |
446 | ret = err; | 446 | ret = err; |
447 | } | 447 | } |
448 | 448 | ||
449 | spin_lock(&inode_lock); | 449 | spin_lock(&inode_lock); |
450 | inode->i_state &= ~I_SYNC; | 450 | inode->i_state &= ~I_SYNC; |
451 | if (!(inode->i_state & (I_FREEING | I_CLEAR))) { | 451 | if (!(inode->i_state & (I_FREEING | I_CLEAR))) { |
452 | if (!(inode->i_state & I_DIRTY) && | 452 | if (inode->i_state & I_DIRTY) { |
453 | mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | ||
454 | /* | 453 | /* |
| | 454 | * Someone redirtied the inode while were writing back |
| | 455 | * the pages. |
| | 456 | */ |
| | 457 | redirty_tail(inode); |
| | 458 | } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { |
| | 459 | /* |
455 | * We didn't write back all the pages. nfs_writepages() | 460 | * We didn't write back all the pages. nfs_writepages() |
456 | * sometimes bales out without doing anything. Redirty | 461 | * sometimes bales out without doing anything. Redirty |
457 | * the inode; Move it from b_io onto b_more_io/b_dirty. | 462 | * the inode; Move it from b_io onto b_more_io/b_dirty. |
458 | */ | 463 | */ |
459 | /* | 464 | /* |
460 | * akpm: if the caller was the kupdate function we put | 465 | * akpm: if the caller was the kupdate function we put |
461 | * this inode at the head of b_dirty so it gets first | 466 | * this inode at the head of b_dirty so it gets first |
462 | * consideration. Otherwise, move it to the tail, for | 467 | * consideration. Otherwise, move it to the tail, for |
463 | * the reasons described there. I'm not really sure | 468 | * the reasons described there. I'm not really sure |
464 | * how much sense this makes. Presumably I had a good | 469 | * how much sense this makes. Presumably I had a good |
465 | * reasons for doing it this way, and I'd rather not | 470 | * reasons for doing it this way, and I'd rather not |
466 | * muck with it at present. | 471 | * muck with it at present. |
467 | */ | 472 | */ |
468 | if (wbc->for_kupdate) { | 473 | if (wbc->for_kupdate) { |
469 | /* | 474 | /* |
470 | * For the kupdate function we move the inode | 475 | * For the kupdate function we move the inode |
471 | * to b_more_io so it will get more writeout as | 476 | * to b_more_io so it will get more writeout as |
472 | * soon as the queue becomes uncongested. | 477 | * soon as the queue becomes uncongested. |
473 | */ | 478 | */ |
474 | inode->i_state |= I_DIRTY_PAGES; | 479 | inode->i_state |= I_DIRTY_PAGES; |
475 | if (wbc->nr_to_write <= 0) { | 480 | if (wbc->nr_to_write <= 0) { |
476 | /* | 481 | /* |
477 | * slice used up: queue for next turn | 482 | * slice used up: queue for next turn |
478 | */ | 483 | */ |
479 | requeue_io(inode); | 484 | requeue_io(inode); |
480 | } else { | 485 | } else { |
481 | /* | 486 | /* |
482 | * somehow blocked: retry later | 487 | * somehow blocked: retry later |
483 | */ | 488 | */ |
484 | redirty_tail(inode); | 489 | redirty_tail(inode); |
485 | } | 490 | } |
486 | } else { | 491 | } else { |
487 | /* | 492 | /* |
488 | * Otherwise fully redirty the inode so that | 493 | * Otherwise fully redirty the inode so that |
489 | * other inodes on this superblock will get some | 494 | * other inodes on this superblock will get some |
490 | * writeout. Otherwise heavy writing to one | 495 | * writeout. Otherwise heavy writing to one |
491 | * file would indefinitely suspend writeout of | 496 | * file would indefinitely suspend writeout of |
492 | * all the other files. | 497 | * all the other files. |
493 | */ | 498 | */ |
494 | inode->i_state |= I_DIRTY_PAGES; | 499 | inode->i_state |= I_DIRTY_PAGES; |
495 | redirty_tail(inode); | 500 | redirty_tail(inode); |
496 | } | 501 | } |
497 | } else if (inode->i_state & I_DIRTY) { | ||
498 | /* | ||
499 | * Someone redirtied the inode while were writing back | ||
500 | * the pages. | ||
501 | */ | ||
502 | redirty_tail(inode); | ||
503 | } else if (atomic_read(&inode->i_count)) { | 502 | } else if (atomic_read(&inode->i_count)) { |
504 | /* | 503 | /* |
505 | * The inode is clean, inuse | 504 | * The inode is clean, inuse |
506 | */ | 505 | */ |
507 | list_move(&inode->i_list, &inode_in_use); | 506 | list_move(&inode->i_list, &inode_in_use); |
508 | } else { | 507 | } else { |
509 | /* | 508 | /* |
510 | * The inode is clean, unused | 509 | * The inode is clean, unused |
511 | */ | 510 | */ |
512 | list_move(&inode->i_list, &inode_unused); | 511 | list_move(&inode->i_list, &inode_unused); |
513 | } | 512 | } |
514 | } | 513 | } |
515 | inode_sync_complete(inode); | 514 | inode_sync_complete(inode); |
516 | return ret; | 515 | return ret; |
517 | } | 516 | } |
518 | 517 | ||
519 | /* | 518 | /* |
520 | * For WB_SYNC_NONE writeback, the caller does not have the sb pinned | 519 | * For WB_SYNC_NONE writeback, the caller does not have the sb pinned |
521 | * before calling writeback. So make sure that we do pin it, so it doesn't | 520 | * before calling writeback. So make sure that we do pin it, so it doesn't |
522 | * go away while we are writing inodes from it. | 521 | * go away while we are writing inodes from it. |
523 | * | 522 | * |
524 | * Returns 0 if the super was successfully pinned (or pinning wasn't needed), | 523 | * Returns 0 if the super was successfully pinned (or pinning wasn't needed), |
525 | * 1 if we failed. | 524 | * 1 if we failed. |
526 | */ | 525 | */ |
527 | static int pin_sb_for_writeback(struct writeback_control *wbc, | 526 | static int pin_sb_for_writeback(struct writeback_control *wbc, |
528 | struct inode *inode) | 527 | struct inode *inode) |
529 | { | 528 | { |
530 | struct super_block *sb = inode->i_sb; | 529 | struct super_block *sb = inode->i_sb; |
531 | 530 | ||
532 | /* | 531 | /* |
533 | * Caller must already hold the ref for this | 532 | * Caller must already hold the ref for this |
534 | */ | 533 | */ |
535 | if (wbc->sync_mode == WB_SYNC_ALL) { | 534 | if (wbc->sync_mode == WB_SYNC_ALL) { |
536 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 535 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
537 | return 0; | 536 | return 0; |
538 | } | 537 | } |
539 | 538 | ||
540 | spin_lock(&sb_lock); | 539 | spin_lock(&sb_lock); |
541 | sb->s_count++; | 540 | sb->s_count++; |
542 | if (down_read_trylock(&sb->s_umount)) { | 541 | if (down_read_trylock(&sb->s_umount)) { |
543 | if (sb->s_root) { | 542 | if (sb->s_root) { |
544 | spin_unlock(&sb_lock); | 543 | spin_unlock(&sb_lock); |
545 | return 0; | 544 | return 0; |
546 | } | 545 | } |
547 | /* | 546 | /* |
548 | * umounted, drop rwsem again and fall through to failure | 547 | * umounted, drop rwsem again and fall through to failure |
549 | */ | 548 | */ |
550 | up_read(&sb->s_umount); | 549 | up_read(&sb->s_umount); |
551 | } | 550 | } |
552 | 551 | ||
553 | sb->s_count--; | 552 | sb->s_count--; |
554 | spin_unlock(&sb_lock); | 553 | spin_unlock(&sb_lock); |
555 | return 1; | 554 | return 1; |
556 | } | 555 | } |
557 | 556 | ||
558 | static void unpin_sb_for_writeback(struct writeback_control *wbc, | 557 | static void unpin_sb_for_writeback(struct writeback_control *wbc, |
559 | struct inode *inode) | 558 | struct inode *inode) |
560 | { | 559 | { |
561 | struct super_block *sb = inode->i_sb; | 560 | struct super_block *sb = inode->i_sb; |
562 | 561 | ||
563 | if (wbc->sync_mode == WB_SYNC_ALL) | 562 | if (wbc->sync_mode == WB_SYNC_ALL) |
564 | return; | 563 | return; |
565 | 564 | ||
566 | up_read(&sb->s_umount); | 565 | up_read(&sb->s_umount); |
567 | put_super(sb); | 566 | put_super(sb); |
568 | } | 567 | } |
569 | 568 | ||
570 | static void writeback_inodes_wb(struct bdi_writeback *wb, | 569 | static void writeback_inodes_wb(struct bdi_writeback *wb, |
571 | struct writeback_control *wbc) | 570 | struct writeback_control *wbc) |
572 | { | 571 | { |
573 | struct super_block *sb = wbc->sb; | 572 | struct super_block *sb = wbc->sb; |
574 | const int is_blkdev_sb = sb_is_blkdev_sb(sb); | 573 | const int is_blkdev_sb = sb_is_blkdev_sb(sb); |
575 | const unsigned long start = jiffies; /* livelock avoidance */ | 574 | const unsigned long start = jiffies; /* livelock avoidance */ |
576 | 575 | ||
577 | spin_lock(&inode_lock); | 576 | spin_lock(&inode_lock); |
578 | 577 | ||
579 | if (!wbc->for_kupdate || list_empty(&wb->b_io)) | 578 | if (!wbc->for_kupdate || list_empty(&wb->b_io)) |
580 | queue_io(wb, wbc->older_than_this); | 579 | queue_io(wb, wbc->older_than_this); |
581 | 580 | ||
582 | while (!list_empty(&wb->b_io)) { | 581 | while (!list_empty(&wb->b_io)) { |
583 | struct inode *inode = list_entry(wb->b_io.prev, | 582 | struct inode *inode = list_entry(wb->b_io.prev, |
584 | struct inode, i_list); | 583 | struct inode, i_list); |
585 | long pages_skipped; | 584 | long pages_skipped; |
586 | 585 | ||
587 | /* | 586 | /* |
588 | * super block given and doesn't match, skip this inode | 587 | * super block given and doesn't match, skip this inode |
589 | */ | 588 | */ |
590 | if (sb && sb != inode->i_sb) { | 589 | if (sb && sb != inode->i_sb) { |
591 | redirty_tail(inode); | 590 | redirty_tail(inode); |
592 | continue; | 591 | continue; |
593 | } | 592 | } |
594 | 593 | ||
595 | if (!bdi_cap_writeback_dirty(wb->bdi)) { | 594 | if (!bdi_cap_writeback_dirty(wb->bdi)) { |
596 | redirty_tail(inode); | 595 | redirty_tail(inode); |
597 | if (is_blkdev_sb) { | 596 | if (is_blkdev_sb) { |
598 | /* | 597 | /* |
599 | * Dirty memory-backed blockdev: the ramdisk | 598 | * Dirty memory-backed blockdev: the ramdisk |
600 | * driver does this. Skip just this inode | 599 | * driver does this. Skip just this inode |
601 | */ | 600 | */ |
602 | continue; | 601 | continue; |
603 | } | 602 | } |
604 | /* | 603 | /* |
605 | * Dirty memory-backed inode against a filesystem other | 604 | * Dirty memory-backed inode against a filesystem other |
606 | * than the kernel-internal bdev filesystem. Skip the | 605 | * than the kernel-internal bdev filesystem. Skip the |
607 | * entire superblock. | 606 | * entire superblock. |
608 | */ | 607 | */ |
609 | break; | 608 | break; |
610 | } | 609 | } |
611 | 610 | ||
612 | if (inode->i_state & (I_NEW | I_WILL_FREE)) { | 611 | if (inode->i_state & (I_NEW | I_WILL_FREE)) { |
613 | requeue_io(inode); | 612 | requeue_io(inode); |
614 | continue; | 613 | continue; |
615 | } | 614 | } |
616 | 615 | ||
617 | if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { | 616 | if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { |
618 | wbc->encountered_congestion = 1; | 617 | wbc->encountered_congestion = 1; |
619 | if (!is_blkdev_sb) | 618 | if (!is_blkdev_sb) |
620 | break; /* Skip a congested fs */ | 619 | break; /* Skip a congested fs */ |
621 | requeue_io(inode); | 620 | requeue_io(inode); |
622 | continue; /* Skip a congested blockdev */ | 621 | continue; /* Skip a congested blockdev */ |
623 | } | 622 | } |
624 | 623 | ||
625 | /* | 624 | /* |
626 | * Was this inode dirtied after sync_sb_inodes was called? | 625 | * Was this inode dirtied after sync_sb_inodes was called? |
627 | * This keeps sync from extra jobs and livelock. | 626 | * This keeps sync from extra jobs and livelock. |
628 | */ | 627 | */ |
629 | if (inode_dirtied_after(inode, start)) | 628 | if (inode_dirtied_after(inode, start)) |
630 | break; | 629 | break; |
631 | 630 | ||
632 | if (pin_sb_for_writeback(wbc, inode)) { | 631 | if (pin_sb_for_writeback(wbc, inode)) { |
633 | requeue_io(inode); | 632 | requeue_io(inode); |
634 | continue; | 633 | continue; |
635 | } | 634 | } |
636 | 635 | ||
637 | BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); | 636 | BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); |
638 | __iget(inode); | 637 | __iget(inode); |
639 | pages_skipped = wbc->pages_skipped; | 638 | pages_skipped = wbc->pages_skipped; |
640 | writeback_single_inode(inode, wbc); | 639 | writeback_single_inode(inode, wbc); |
641 | unpin_sb_for_writeback(wbc, inode); | 640 | unpin_sb_for_writeback(wbc, inode); |
642 | if (wbc->pages_skipped != pages_skipped) { | 641 | if (wbc->pages_skipped != pages_skipped) { |
643 | /* | 642 | /* |
644 | * writeback is not making progress due to locked | 643 | * writeback is not making progress due to locked |
645 | * buffers. Skip this inode for now. | 644 | * buffers. Skip this inode for now. |
646 | */ | 645 | */ |
647 | redirty_tail(inode); | 646 | redirty_tail(inode); |
648 | } | 647 | } |
649 | spin_unlock(&inode_lock); | 648 | spin_unlock(&inode_lock); |
650 | iput(inode); | 649 | iput(inode); |
651 | cond_resched(); | 650 | cond_resched(); |
652 | spin_lock(&inode_lock); | 651 | spin_lock(&inode_lock); |
653 | if (wbc->nr_to_write <= 0) { | 652 | if (wbc->nr_to_write <= 0) { |
654 | wbc->more_io = 1; | 653 | wbc->more_io = 1; |
655 | break; | 654 | break; |
656 | } | 655 | } |
657 | if (!list_empty(&wb->b_more_io)) | 656 | if (!list_empty(&wb->b_more_io)) |
658 | wbc->more_io = 1; | 657 | wbc->more_io = 1; |
659 | } | 658 | } |
660 | 659 | ||
661 | spin_unlock(&inode_lock); | 660 | spin_unlock(&inode_lock); |
662 | /* Leave any unwritten inodes on b_io */ | 661 | /* Leave any unwritten inodes on b_io */ |
663 | } | 662 | } |
664 | 663 | ||
665 | void writeback_inodes_wbc(struct writeback_control *wbc) | 664 | void writeback_inodes_wbc(struct writeback_control *wbc) |
666 | { | 665 | { |
667 | struct backing_dev_info *bdi = wbc->bdi; | 666 | struct backing_dev_info *bdi = wbc->bdi; |
668 | 667 | ||
669 | writeback_inodes_wb(&bdi->wb, wbc); | 668 | writeback_inodes_wb(&bdi->wb, wbc); |
670 | } | 669 | } |
671 | 670 | ||
672 | /* | 671 | /* |
673 | * The maximum number of pages to writeout in a single bdi flush/kupdate | 672 | * The maximum number of pages to writeout in a single bdi flush/kupdate |
674 | * operation. We do this so we don't hold I_SYNC against an inode for | 673 | * operation. We do this so we don't hold I_SYNC against an inode for |
675 | * enormous amounts of time, which would block a userspace task which has | 674 | * enormous amounts of time, which would block a userspace task which has |
676 | * been forced to throttle against that inode. Also, the code reevaluates | 675 | * been forced to throttle against that inode. Also, the code reevaluates |
677 | * the dirty each time it has written this many pages. | 676 | * the dirty each time it has written this many pages. |
678 | */ | 677 | */ |
679 | #define MAX_WRITEBACK_PAGES 1024 | 678 | #define MAX_WRITEBACK_PAGES 1024 |
680 | 679 | ||
681 | static inline bool over_bground_thresh(void) | 680 | static inline bool over_bground_thresh(void) |
682 | { | 681 | { |
683 | unsigned long background_thresh, dirty_thresh; | 682 | unsigned long background_thresh, dirty_thresh; |
684 | 683 | ||
685 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | 684 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
686 | 685 | ||
687 | return (global_page_state(NR_FILE_DIRTY) + | 686 | return (global_page_state(NR_FILE_DIRTY) + |
688 | global_page_state(NR_UNSTABLE_NFS) >= background_thresh); | 687 | global_page_state(NR_UNSTABLE_NFS) >= background_thresh); |
689 | } | 688 | } |
690 | 689 | ||
691 | /* | 690 | /* |
692 | * Explicit flushing or periodic writeback of "old" data. | 691 | * Explicit flushing or periodic writeback of "old" data. |
693 | * | 692 | * |
694 | * Define "old": the first time one of an inode's pages is dirtied, we mark the | 693 | * Define "old": the first time one of an inode's pages is dirtied, we mark the |
695 | * dirtying-time in the inode's address_space. So this periodic writeback code | 694 | * dirtying-time in the inode's address_space. So this periodic writeback code |
696 | * just walks the superblock inode list, writing back any inodes which are | 695 | * just walks the superblock inode list, writing back any inodes which are |
697 | * older than a specific point in time. | 696 | * older than a specific point in time. |
698 | * | 697 | * |
699 | * Try to run once per dirty_writeback_interval. But if a writeback event | 698 | * Try to run once per dirty_writeback_interval. But if a writeback event |
700 | * takes longer than a dirty_writeback_interval interval, then leave a | 699 | * takes longer than a dirty_writeback_interval interval, then leave a |
701 | * one-second gap. | 700 | * one-second gap. |
702 | * | 701 | * |
703 | * older_than_this takes precedence over nr_to_write. So we'll only write back | 702 | * older_than_this takes precedence over nr_to_write. So we'll only write back |
704 | * all dirty pages if they are all attached to "old" mappings. | 703 | * all dirty pages if they are all attached to "old" mappings. |
705 | */ | 704 | */ |
706 | static long wb_writeback(struct bdi_writeback *wb, | 705 | static long wb_writeback(struct bdi_writeback *wb, |
707 | struct wb_writeback_args *args) | 706 | struct wb_writeback_args *args) |
708 | { | 707 | { |
709 | struct writeback_control wbc = { | 708 | struct writeback_control wbc = { |
710 | .bdi = wb->bdi, | 709 | .bdi = wb->bdi, |
711 | .sb = args->sb, | 710 | .sb = args->sb, |
712 | .sync_mode = args->sync_mode, | 711 | .sync_mode = args->sync_mode, |
713 | .older_than_this = NULL, | 712 | .older_than_this = NULL, |
714 | .for_kupdate = args->for_kupdate, | 713 | .for_kupdate = args->for_kupdate, |
715 | .range_cyclic = args->range_cyclic, | 714 | .range_cyclic = args->range_cyclic, |
716 | }; | 715 | }; |
717 | unsigned long oldest_jif; | 716 | unsigned long oldest_jif; |
718 | long wrote = 0; | 717 | long wrote = 0; |
719 | struct inode *inode; | 718 | struct inode *inode; |
720 | 719 | ||
721 | if (wbc.for_kupdate) { | 720 | if (wbc.for_kupdate) { |
722 | wbc.older_than_this = &oldest_jif; | 721 | wbc.older_than_this = &oldest_jif; |
723 | oldest_jif = jiffies - | 722 | oldest_jif = jiffies - |
724 | msecs_to_jiffies(dirty_expire_interval * 10); | 723 | msecs_to_jiffies(dirty_expire_interval * 10); |
725 | } | 724 | } |
726 | if (!wbc.range_cyclic) { | 725 | if (!wbc.range_cyclic) { |
727 | wbc.range_start = 0; | 726 | wbc.range_start = 0; |
728 | wbc.range_end = LLONG_MAX; | 727 | wbc.range_end = LLONG_MAX; |
729 | } | 728 | } |
730 | 729 | ||
731 | for (;;) { | 730 | for (;;) { |
732 | /* | 731 | /* |
733 | * Stop writeback when nr_pages has been consumed | 732 | * Stop writeback when nr_pages has been consumed |
734 | */ | 733 | */ |
735 | if (args->nr_pages <= 0) | 734 | if (args->nr_pages <= 0) |
736 | break; | 735 | break; |
737 | 736 | ||
738 | /* | 737 | /* |
739 | * For background writeout, stop when we are below the | 738 | * For background writeout, stop when we are below the |
740 | * background dirty threshold | 739 | * background dirty threshold |
741 | */ | 740 | */ |
742 | if (args->for_background && !over_bground_thresh()) | 741 | if (args->for_background && !over_bground_thresh()) |
743 | break; | 742 | break; |
744 | 743 | ||
745 | wbc.more_io = 0; | 744 | wbc.more_io = 0; |
746 | wbc.encountered_congestion = 0; | 745 | wbc.encountered_congestion = 0; |
747 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 746 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; |
748 | wbc.pages_skipped = 0; | 747 | wbc.pages_skipped = 0; |
749 | writeback_inodes_wb(wb, &wbc); | 748 | writeback_inodes_wb(wb, &wbc); |
750 | args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 749 | args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; |
751 | wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 750 | wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; |
752 | 751 | ||
753 | /* | 752 | /* |
754 | * If we ran out of stuff to write, bail unless more_io got set | 753 | * If we ran out of stuff to write, bail unless more_io got set |
755 | */ | 754 | */ |
756 | if (wbc.nr_to_write > 0) { | 755 | if (wbc.nr_to_write > 0) { |
757 | if (wbc.more_io) { | 756 | if (wbc.more_io) { |
758 | if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) | 757 | if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) |
759 | continue; | 758 | continue; |
760 | /* | 759 | /* |
761 | * Nothing written. Wait for some inode to | 760 | * Nothing written. Wait for some inode to |
762 | * become available for writeback. Otherwise | 761 | * become available for writeback. Otherwise |
763 | * we'll just busyloop. | 762 | * we'll just busyloop. |
764 | */ | 763 | */ |
765 | spin_lock(&inode_lock); | 764 | spin_lock(&inode_lock); |
766 | if (!list_empty(&wb->b_more_io)) { | 765 | if (!list_empty(&wb->b_more_io)) { |
767 | inode = list_entry( | 766 | inode = list_entry( |
768 | wb->b_more_io.prev, | 767 | wb->b_more_io.prev, |
769 | struct inode, i_list); | 768 | struct inode, i_list); |
770 | inode_wait_for_writeback(inode); | 769 | inode_wait_for_writeback(inode); |
771 | } | 770 | } |
772 | spin_unlock(&inode_lock); | 771 | spin_unlock(&inode_lock); |
773 | continue; | 772 | continue; |
774 | } | 773 | } |
775 | break; | 774 | break; |
776 | } | 775 | } |
777 | } | 776 | } |
778 | 777 | ||
779 | return wrote; | 778 | return wrote; |
780 | } | 779 | } |
781 | 780 | ||
782 | /* | 781 | /* |
783 | * Return the next bdi_work struct that hasn't been processed by this | 782 | * Return the next bdi_work struct that hasn't been processed by this |
784 | * wb thread yet. ->seen is initially set for each thread that exists | 783 | * wb thread yet. ->seen is initially set for each thread that exists |
785 | * for this device, when a thread first notices a piece of work it | 784 | * for this device, when a thread first notices a piece of work it |
786 | * clears its bit. Depending on writeback type, the thread will notify | 785 | * clears its bit. Depending on writeback type, the thread will notify |
787 | * completion on either receiving the work (WB_SYNC_NONE) or after | 786 | * completion on either receiving the work (WB_SYNC_NONE) or after |
788 | * it is done (WB_SYNC_ALL). | 787 | * it is done (WB_SYNC_ALL). |
789 | */ | 788 | */ |
790 | static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, | 789 | static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, |
791 | struct bdi_writeback *wb) | 790 | struct bdi_writeback *wb) |
792 | { | 791 | { |
793 | struct bdi_work *work, *ret = NULL; | 792 | struct bdi_work *work, *ret = NULL; |
794 | 793 | ||
795 | rcu_read_lock(); | 794 | rcu_read_lock(); |
796 | 795 | ||
797 | list_for_each_entry_rcu(work, &bdi->work_list, list) { | 796 | list_for_each_entry_rcu(work, &bdi->work_list, list) { |
798 | if (!test_bit(wb->nr, &work->seen)) | 797 | if (!test_bit(wb->nr, &work->seen)) |
799 | continue; | 798 | continue; |
800 | clear_bit(wb->nr, &work->seen); | 799 | clear_bit(wb->nr, &work->seen); |
801 | 800 | ||
802 | ret = work; | 801 | ret = work; |
803 | break; | 802 | break; |
804 | } | 803 | } |
805 | 804 | ||
806 | rcu_read_unlock(); | 805 | rcu_read_unlock(); |
807 | return ret; | 806 | return ret; |
808 | } | 807 | } |
809 | 808 | ||
810 | static long wb_check_old_data_flush(struct bdi_writeback *wb) | 809 | static long wb_check_old_data_flush(struct bdi_writeback *wb) |
811 | { | 810 | { |
812 | unsigned long expired; | 811 | unsigned long expired; |
813 | long nr_pages; | 812 | long nr_pages; |
814 | 813 | ||
815 | expired = wb->last_old_flush + | 814 | expired = wb->last_old_flush + |
816 | msecs_to_jiffies(dirty_writeback_interval * 10); | 815 | msecs_to_jiffies(dirty_writeback_interval * 10); |
817 | if (time_before(jiffies, expired)) | 816 | if (time_before(jiffies, expired)) |
818 | return 0; | 817 | return 0; |
819 | 818 | ||
820 | wb->last_old_flush = jiffies; | 819 | wb->last_old_flush = jiffies; |
	nr_pages = global_page_state(NR_FILE_DIRTY) +
			global_page_state(NR_UNSTABLE_NFS) +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	if (nr_pages) {
		struct wb_writeback_args args = {
			.nr_pages = nr_pages,
			.sync_mode = WB_SYNC_NONE,
			.for_kupdate = 1,
			.range_cyclic = 1,
		};

		return wb_writeback(wb, &args);
	}

	return 0;
}

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct bdi_work *work;
	long wrote = 0;

	while ((work = get_next_work_item(bdi, wb)) != NULL) {
		struct wb_writeback_args args = work->args;

		/*
		 * Override sync mode, in case we must wait for completion
		 */
		if (force_wait)
			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;

		/*
		 * If this isn't a data integrity operation, just notify
		 * that we have seen this work and we are now starting it.
		 */
		if (args.sync_mode == WB_SYNC_NONE)
			wb_clear_pending(wb, work);

		wrote += wb_writeback(wb, &args);

		/*
		 * This is a data integrity writeback, so only do the
		 * notification when we have completed the work.
		 */
		if (args.sync_mode == WB_SYNC_ALL)
			wb_clear_pending(wb, work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);

	return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_task(struct bdi_writeback *wb)
{
	unsigned long last_active = jiffies;
	unsigned long wait_jiffies = -1UL;
	long pages_written;

	while (!kthread_should_stop()) {
		pages_written = wb_do_writeback(wb, 0);

		if (pages_written)
			last_active = jiffies;
		else if (wait_jiffies != -1UL) {
			unsigned long max_idle;

			/*
			 * Longest period of inactivity that we tolerate. If we
			 * see dirty data again later, the task will get
			 * recreated automatically.
			 */
			max_idle = max(5UL * 60 * HZ, wait_jiffies);
			if (time_after(jiffies, max_idle + last_active))
				break;
		}

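		/*
		 * Illustrative note (editor's addition, not part of the
		 * original file): dirty_writeback_interval is expressed in
		 * centiseconds (vm.dirty_writeback_centisecs), so multiplying
		 * by 10 below yields milliseconds.  With the default value of
		 * 500 the flusher wakes roughly every 5 seconds, and max_idle
		 * above works out to max(300s, 5s) = 5 minutes of tolerated
		 * idleness before the task exits.
		 */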
		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
		schedule_timeout_interruptible(wait_jiffies);
		try_to_freeze();
	}

	return 0;
}

/*
 * Schedule writeback for all backing devices. This does WB_SYNC_NONE
 * writeback, for integrity writeback see bdi_sync_writeback().
 */
static void bdi_writeback_all(struct super_block *sb, long nr_pages)
{
	struct wb_writeback_args args = {
		.sb = sb,
		.nr_pages = nr_pages,
		.sync_mode = WB_SYNC_NONE,
	};
	struct backing_dev_info *bdi;

	rcu_read_lock();

	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		if (!bdi_has_dirty_io(bdi))
			continue;

		bdi_alloc_queue_work(bdi, &args);
	}

	rcu_read_unlock();
}

/*
 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages)
{
	if (nr_pages == 0)
		nr_pages = global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
	bdi_writeback_all(NULL, nr_pages);
}
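
/*
 * Illustrative sketch (editor's addition, not part of this file): a caller
 * that wants roughly 4MB of dirty data pushed out would, assuming 4KB pages,
 * simply do
 *
 *	wakeup_flusher_threads(1024);
 *
 * while wakeup_flusher_threads(0) asks for every dirty file page plus any
 * unstable NFS pages to be queued for WB_SYNC_NONE writeback.
 */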

static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);
		if (dentry) {
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}
		printk(KERN_DEBUG
		       "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);
		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
	}
}

/**
 * __mark_inode_dirty - internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * This function *must* be atomic for the I_DIRTY_PAGES case -
 * set_page_dirty() is called under spinlock in several places.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
	 * dirty the inode itself
	 */
	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode);
	}

	/*
	 * make sure that changes are seen by all cpus before we test i_state
	 * -- mikulas
	 */
	smp_mb();

	/* avoid the locking if we can */
	if ((inode->i_state & flags) == flags)
		return;

	if (unlikely(block_dump))
		block_dump___mark_inode_dirty(inode);

	spin_lock(&inode_lock);
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode->i_state |= flags;

		/*
		 * If the inode is being synced, just update its dirty state.
		 * The unlocker will place the inode on the appropriate
		 * superblock list, based upon its state.
		 */
		if (inode->i_state & I_SYNC)
			goto out;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list. Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (hlist_unhashed(&inode->i_hash))
				goto out;
		}
		if (inode->i_state & (I_FREEING|I_CLEAR))
			goto out;

		/*
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
			struct backing_dev_info *bdi = wb->bdi;

			if (bdi_cap_writeback_dirty(bdi) &&
			    !test_bit(BDI_registered, &bdi->state)) {
				WARN_ON(1);
				printk(KERN_ERR "bdi-%s not registered\n",
						bdi->name);
			}

			inode->dirtied_when = jiffies;
			list_move(&inode->i_list, &wb->b_dirty);
		}
	}
out:
	spin_unlock(&inode_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
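
/*
 * Illustrative sketch (editor's addition, not part of this file): filesystems
 * normally reach __mark_inode_dirty() through the wrappers in <linux/fs.h>,
 * for example after updating an inode's timestamps:
 *
 *	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 *	mark_inode_dirty_sync(inode);		(passes I_DIRTY_SYNC)
 *
 * whereas the set_page_dirty() paths end up here with I_DIRTY_PAGES via
 * page->mapping->host, as described in the comment above.
 */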

/*
 * Write out a superblock's list of dirty inodes. A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If we're a pdflush thread, then implement pdflush collision avoidance
 * against the entire list.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched. For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * The inodes to be written are parked on bdi->b_io. They are moved back onto
 * bdi->b_dirty as they are selected for writing. This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void wait_sb_inodes(struct super_block *sb)
{
	struct inode *inode, *old_inode = NULL;

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	spin_lock(&inode_lock);

	/*
	 * Data integrity sync. Must wait for all pages under writeback,
	 * because there may have been pages dirtied before our sync
	 * call, but which had writeout started before we write it out.
	 * In which case, the inode may not be on the dirty list, but
	 * we still have to wait for that writeout.
	 */
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		struct address_space *mapping;

		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
			continue;
		mapping = inode->i_mapping;
		if (mapping->nrpages == 0)
			continue;
		__iget(inode);
		spin_unlock(&inode_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have
		 * been removed from s_inodes list while we dropped the
		 * inode_lock. We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it
		 * under inode_lock. So we keep the reference and iput
		 * it later.
		 */
		iput(old_inode);
		old_inode = inode;

		filemap_fdatawait(mapping);

		cond_resched();

		spin_lock(&inode_lock);
	}
	spin_unlock(&inode_lock);
	iput(old_inode);
}

/**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb(struct super_block *sb)
{
	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
	long nr_to_write;

	nr_to_write = nr_dirty + nr_unstable +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	bdi_writeback_all(sb, nr_to_write);
}
EXPORT_SYMBOL(writeback_inodes_sb);
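
/*
 * Illustrative sketch (editor's addition, not part of this file): a typical
 * caller is a filesystem that would otherwise return -ENOSPC while delayed
 * allocations are still pending; fs_running_low_on_space() below is a
 * hypothetical helper:
 *
 *	if (fs_running_low_on_space(sb))
 *		writeback_inodes_sb(sb);
 *
 * This only queues WB_SYNC_NONE work; use sync_inodes_sb() below when the
 * data must actually be on disk before continuing.
 */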

/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
	bdi_sync_writeback(sb->s_bdi, sb);
	wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);
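
/*
 * Illustrative sketch (editor's addition, not part of this file): the generic
 * sync path pairs the two exported helpers roughly like this:
 *
 *	if (wait)
 *		sync_inodes_sb(sb);		writes and waits (WB_SYNC_ALL)
 *	else
 *		writeback_inodes_sb(sb);	write-out only (WB_SYNC_NONE)
 */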

/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
	int ret;
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	if (!mapping_cap_writeback_dirty(inode->i_mapping))
		wbc.nr_to_write = 0;

	might_sleep();
	spin_lock(&inode_lock);
	ret = writeback_single_inode(inode, &wbc);
	spin_unlock(&inode_lock);
	if (sync)
		inode_sync_wait(inode);
	return ret;
}
EXPORT_SYMBOL(write_inode_now);
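
/*
 * Illustrative sketch (editor's addition, not part of this file): a caller
 * such as knfsd that must have the inode on stable storage before replying
 * would use the synchronous form:
 *
 *	err = write_inode_now(inode, 1);
 *
 * Passing sync == 0 merely starts WB_SYNC_NONE writeback and returns without
 * calling inode_sync_wait().
 */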

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk. It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
	int ret;

	spin_lock(&inode_lock);