/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/tracepoint.h>
#include "internal.h"

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct completion *done;	/* set if the caller waits */
};
const char *wb_reason_name[] = {
	[WB_REASON_BACKGROUND]		= "background",
	[WB_REASON_TRY_TO_FREE_PAGES]	= "try_to_free_pages",
	[WB_REASON_SYNC]		= "sync",
	[WB_REASON_PERIODIC]		= "periodic",
	[WB_REASON_LAPTOP_TIMER]	= "laptop_timer",
	[WB_REASON_FREE_MORE_MEM]	= "free_more_memory",
	[WB_REASON_FS_FREE_SPACE]	= "fs_free_space",
	[WB_REASON_FORKER_THREAD]	= "forker_thread"
};

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure so that the definition remains local to this
 * file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

/*
 * We don't actually have pdflush, but this one is exported through /proc...
 */
int nr_pdflush_threads;
/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
	return test_bit(BDI_writeback_running, &bdi->state);
}

static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (strcmp(sb->s_type->name, "bdev") == 0)
		return inode->i_mapping->backing_dev_info;

	return sb->s_bdi;
}

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_wb_list);
}
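
/*
 * Illustrative note (not part of the original file): list_entry() above is
 * just container_of(); given the list_head embedded in an inode, wb_inode()
 * recovers the owning inode, e.g.
 *
 *	struct inode *inode = wb_inode(wb->b_dirty.next);
 *
 * is equivalent to
 *
 *	container_of(wb->b_dirty.next, struct inode, i_wb_list);
 */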
/* Wake up the flusher thread, or the forker thread to fork it. Requires bdi->wb_lock. */
static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
	if (bdi->wb.task) {
		wake_up_process(bdi->wb.task);
	} else {
		/*
		 * The bdi thread isn't there, wake up the forker thread which
		 * will create and run it.
		 */
		wake_up_process(default_backing_dev_info.wb.task);
	}
}

static void bdi_queue_work(struct backing_dev_info *bdi,
			   struct wb_writeback_work *work)
{
	trace_writeback_queue(bdi, work);

	spin_lock_bh(&bdi->wb_lock);
	list_add_tail(&work->list, &bdi->work_list);
	if (!bdi->wb.task)
		trace_writeback_nothread(bdi, work);
	bdi_wakeup_flusher(bdi);
	spin_unlock_bh(&bdi->wb_lock);
}
static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
		      bool range_cyclic, enum wb_reason reason)
{
	struct wb_writeback_work *work;

	/*
	 * This is WB_SYNC_NONE writeback, so if allocation fails just
	 * wakeup the thread for old dirty data writeback
	 */
	work = kzalloc(sizeof(*work), GFP_ATOMIC);
	if (!work) {
		if (bdi->wb.task) {
			trace_writeback_nowork(bdi);
			wake_up_process(bdi->wb.task);
		}
		return;
	}

	work->sync_mode	= WB_SYNC_NONE;
	work->nr_pages	= nr_pages;
	work->range_cyclic = range_cyclic;
	work->reason	= reason;

	bdi_queue_work(bdi, work);
}

/**
 * bdi_start_writeback - start writeback
 * @bdi: the backing device to write from
 * @nr_pages: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Description:
 *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
 *   queued when this function returns; we make no guarantees on
 *   completion. Caller need not hold sb s_umount semaphore.
 *
 */
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
			enum wb_reason reason)
{
	__bdi_start_writeback(bdi, nr_pages, true, reason);
}
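
/*
 * Illustrative note (not part of the original file; based on the laptop-mode
 * timer in mm/page-writeback.c of the same era): a typical caller looks like
 *
 *	bdi_start_writeback(bdi, nr_pages, WB_REASON_LAPTOP_TIMER);
 *
 * which queues WB_SYNC_NONE work for the device and returns without waiting
 * for any IO to complete.
 */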
/**
 * bdi_start_background_writeback - start background writeback
 * @bdi: the backing device to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for the given BDI
 *   some IO is happening if we are over the background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void bdi_start_background_writeback(struct backing_dev_info *bdi)
{
	/*
	 * We just wake up the flusher thread. It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	trace_writeback_wake_background(bdi);
	spin_lock_bh(&bdi->wb_lock);
	bdi_wakeup_flusher(bdi);
	spin_unlock_bh(&bdi->wb_lock);
}

/*
 * Remove the inode from the writeback list it is on.
 */
void inode_wb_list_del(struct inode *inode)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);

	spin_lock(&bdi->wb.list_lock);
	list_del_init(&inode->i_wb_list);
	spin_unlock(&bdi->wb.list_lock);
}
/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = wb_inode(wb->b_dirty.next);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	list_move(&inode->i_wb_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	list_move(&inode->i_wb_list, &wb->b_more_io);
}
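
/*
 * Note on the two helpers above (editorial, not part of the original file):
 * redirty_tail() sends an inode back to b_dirty with a fresh timestamp, so
 * it waits for the next expiry pass, while requeue_io() parks it on
 * b_more_io, so it is retried as soon as the current b_io batch has been
 * exhausted.
 */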
static void inode_sync_complete(struct inode *inode)
{
	/*
	 * Prevent speculative execution through
	 * spin_unlock(&wb->list_lock);
	 */

	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from permanently stopping the whole bdi writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}
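
/*
 * Worked example (illustrative, assuming HZ=1000): on 32-bit kernels
 * jiffies wraps roughly every 49.7 days (2^32 milliseconds).  An inode
 * dirtied just before a wrap can make time_after(dirtied_when, t) report
 * "in the future" even though the dirtying is long past; the extra
 * time_before_eq(dirtied_when, jiffies) test above rejects that case.
 */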
/*
 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
			       struct list_head *dispatch_queue,
			       struct wb_writeback_work *work)
{
	LIST_HEAD(tmp);
	struct list_head *pos, *node;
	struct super_block *sb = NULL;
	struct inode *inode;
	int do_sb_sort = 0;
	int moved = 0;

	while (!list_empty(delaying_queue)) {
		inode = wb_inode(delaying_queue->prev);
		if (work->older_than_this &&
		    inode_dirtied_after(inode, *work->older_than_this))
			break;
		if (sb && sb != inode->i_sb)
			do_sb_sort = 1;
		sb = inode->i_sb;
		list_move(&inode->i_wb_list, &tmp);
		moved++;
	}

	/* just one sb in list, splice to dispatch_queue and we're done */
	if (!do_sb_sort) {
		list_splice(&tmp, dispatch_queue);
		goto out;
	}

	/* Move inodes from one superblock together */
	while (!list_empty(&tmp)) {
		sb = wb_inode(tmp.prev)->i_sb;
		list_for_each_prev_safe(pos, node, &tmp) {
			inode = wb_inode(pos);
			if (inode->i_sb == sb)
				list_move(&inode->i_wb_list, dispatch_queue);
		}
	}
out:
	return moved;
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
	int moved;

	assert_spin_locked(&wb->list_lock);
	list_splice_init(&wb->b_more_io, &wb->b_io);
	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
	trace_writeback_queue_io(wb, work, moved);
}
static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
		return inode->i_sb->s_op->write_inode(inode, wbc);
	return 0;
}

/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode,
				     struct bdi_writeback *wb)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		spin_unlock(&wb->list_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&wb->list_lock);
		spin_lock(&inode->i_lock);
	}
}
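
/*
 * Note (editorial, not part of the original file): inode_wait_for_writeback()
 * drops both wb->list_lock and inode->i_lock while it sleeps and retakes
 * them before rechecking I_SYNC, so callers must not assume any state
 * protected by those locks survived across the call.
 */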
/*
 * Write out an inode's dirty pages.  Called under wb->list_lock and
 * inode->i_lock.  Either the caller has an active reference on the inode or
 * the inode has I_WILL_FREE set.
 *
 * If `wbc->sync_mode' is WB_SYNC_ALL, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile.  We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
 */
static int
writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
		       struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	long nr_to_write = wbc->nr_to_write;
	unsigned dirty;
	int ret;

	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);

	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		/*
		 * If this inode is locked for writeback and we are not doing
		 * writeback-for-data-integrity, move it to b_more_io so that
		 * writeback can proceed with the other inodes on s_io.
		 *
		 * We'll have another go at writing back this inode when we
		 * completed a full scan of b_io.
		 */
		if (wbc->sync_mode != WB_SYNC_ALL) {
			requeue_io(inode, wb);
			trace_writeback_single_inode_requeue(inode, wbc,
							     nr_to_write);
			return 0;
		}

		/*
		 * It's a data-integrity sync.  We must wait.
		 */
		inode_wait_for_writeback(inode, wb);
	}

	BUG_ON(inode->i_state & I_SYNC);

	/* Set I_SYNC, reset I_DIRTY_PAGES */
	inode->i_state |= I_SYNC;
	inode->i_state &= ~I_DIRTY_PAGES;
	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);

	ret = do_writepages(mapping, wbc);

	/*
	 * Make sure to wait on the data before writing out the metadata.
	 * This is important for filesystems that modify metadata on data
	 * I/O completion.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	/*
	 * Some filesystems may redirty the inode during the writeback
	 * due to delalloc, clear dirty metadata flags right before
	 * write_inode()
	 */
	spin_lock(&inode->i_lock);
	dirty = inode->i_state & I_DIRTY;
	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
	spin_unlock(&inode->i_lock);
	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		int err = write_inode(inode, wbc);
		if (ret == 0)
			ret = err;
	}

	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	inode->i_state &= ~I_SYNC;
	if (!(inode->i_state & I_FREEING)) {
		/*
		 * Sync livelock prevention. Each inode is tagged and synced in
		 * one shot. If still dirty, it will be redirty_tail()'ed below.
		 * Update the dirty time to prevent enqueue and sync it again.
		 */
		if ((inode->i_state & I_DIRTY) &&
		    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
			inode->dirtied_when = jiffies;

		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
			/*
			 * We didn't write back all the pages.  nfs_writepages()
			 * sometimes bails out without doing anything.
			 */
			inode->i_state |= I_DIRTY_PAGES;
			if (wbc->nr_to_write <= 0) {
				/*
				 * slice used up: queue for next turn
				 */
				requeue_io(inode, wb);
			} else {
				/*
				 * Writeback blocked by something other than
				 * congestion. Delay the inode for some time to
				 * avoid spinning on the CPU (100% iowait)
				 * retrying writeback of the dirty page/inode
				 * that cannot be performed immediately.
				 */
				redirty_tail(inode, wb);
			}
		} else if (inode->i_state & I_DIRTY) {
			/*
			 * Filesystems can dirty the inode during writeback
			 * operations, such as delayed allocation during
			 * submission or metadata updates after data IO
			 * completion.
			 */
			redirty_tail(inode, wb);
		} else {
			/*
			 * The inode is clean.  At this point we either have
			 * a reference to the inode or it's on its way out.
			 * No need to add it back to the LRU.
			 */
			list_del_init(&inode->i_wb_list);
		}
	}
	inode_sync_complete(inode);
	trace_writeback_single_inode(inode, wbc, nr_to_write);
	return ret;
}
static long writeback_chunk_size(struct backing_dev_info *bdi,
				 struct wb_writeback_work *work)
{
	long pages;

	/*
	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
	 * here avoids calling into writeback_inodes_wb() more than once.
	 *
	 * The intended call sequence for WB_SYNC_ALL writeback is:
	 *
	 *      wb_writeback()
	 *          writeback_sb_inodes()       <== called only once
	 *              write_cache_pages()     <== called once for each inode
	 *                  (quickly) tag currently dirty pages
	 *                  (maybe slowly) sync all tagged pages
	 */
	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
		pages = LONG_MAX;
	else {
		pages = min(bdi->avg_write_bandwidth / 2,
			    global_dirty_limit / DIRTY_SCOPE);
		pages = min(pages, work->nr_pages);
		pages = round_down(pages + MIN_WRITEBACK_PAGES,
				   MIN_WRITEBACK_PAGES);
	}

	return pages;
}
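
/*
 * Worked example (illustrative numbers, assuming 4k pages where
 * MIN_WRITEBACK_PAGES works out to 1024): with avg_write_bandwidth at
 * 10000 pages/s and a larger global_dirty_limit / DIRTY_SCOPE quotient,
 * pages starts at 10000 / 2 = 5000; if work->nr_pages does not cap it
 * further, round_down(5000 + 1024, 1024) yields a chunk of 5120 pages.
 * The result is always a multiple of, and at least, MIN_WRITEBACK_PAGES.
 */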
/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * If @work->sb is set, then find and write all such
 * inodes. Otherwise write only ones which go sequentially
 * in reverse order.
 *
 * Return the number of pages and/or inodes written.
 */
static long writeback_sb_inodes(struct super_block *sb,
				struct bdi_writeback *wb,
				struct wb_writeback_work *work)
{
	struct writeback_control wbc = {
		.sync_mode		= work->sync_mode,
		.tagged_writepages	= work->tagged_writepages,
		.for_kupdate		= work->for_kupdate,
		.for_background		= work->for_background,
		.range_cyclic		= work->range_cyclic,
		.range_start		= 0,
		.range_end		= LLONG_MAX,
	};
	unsigned long start_time = jiffies;
	long write_chunk;
	long wrote = 0;  /* count both pages and inodes */

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);

		if (inode->i_sb != sb) {
			if (work->sb) {
				/*
				 * We only want to write back data for this
				 * superblock, move all inodes not belonging
				 * to it back onto the dirty list.
				 */
				redirty_tail(inode, wb);
				continue;
			}

			/*
			 * The inode belongs to a different superblock.
			 * Bounce back to the caller to unpin this and
			 * pin the next superblock.
			 */
			break;
		}

		/*
		 * Don't bother with new inodes or inodes being freed, the
		 * first kind does not need periodic writeout yet, and for the
		 * latter kind writeout is handled by the freer.
		 */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			redirty_tail(inode, wb);
			continue;
		}
		__iget(inode);
		write_chunk = writeback_chunk_size(wb->bdi, work);
		wbc.nr_to_write = write_chunk;
		wbc.pages_skipped = 0;

		writeback_single_inode(inode, wb, &wbc);

		work->nr_pages -= write_chunk - wbc.nr_to_write;
		wrote += write_chunk - wbc.nr_to_write;
		if (!(inode->i_state & I_DIRTY))
			wrote++;
		if (wbc.pages_skipped) {
			/*
			 * writeback is not making progress due to locked
			 * buffers.  Skip this inode for now.
			 */
			redirty_tail(inode, wb);
		}
		spin_unlock(&inode->i_lock);
		spin_unlock(&wb->list_lock);
		iput(inode);
		cond_resched();
		spin_lock(&wb->list_lock);
		/*
		 * bail out to wb_writeback() often enough to check
		 * background threshold and other termination conditions.
		 */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	return wrote;
}
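
/*
 * Note (editorial, not part of the original file): the
 * time_is_before_jiffies(start_time + HZ / 10) test above bounds each pass
 * to roughly 100ms once something was written, so wb_writeback() gets to
 * re-evaluate the background threshold and other termination conditions
 * reasonably often.
 */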
static long __writeback_inodes_wb(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	unsigned long start_time = jiffies;
	long wrote = 0;

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);
		struct super_block *sb = inode->i_sb;

		if (!grab_super_passive(sb)) {
			/*
			 * grab_super_passive() may fail consistently due to
			 * s_umount being grabbed by someone else. Don't use
			 * requeue_io() to avoid busy retrying the inode/sb.
			 */
			redirty_tail(inode, wb);
			continue;
		}
		wrote += writeback_sb_inodes(sb, wb, work);
		drop_super(sb);

		/* refer to the same tests at the end of writeback_sb_inodes */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	/* Leave any unwritten inodes on b_io */
	return wrote;
}
long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
				enum wb_reason reason)
{
	struct wb_writeback_work work = {
		.nr_pages	= nr_pages,
		.sync_mode	= WB_SYNC_NONE,
		.range_cyclic	= 1,
		.reason		= reason,
	};

	spin_lock(&wb->list_lock);
	if (list_empty(&wb->b_io))
		queue_io(wb, &work);
	__writeback_inodes_wb(wb, &work);
	spin_unlock(&wb->list_lock);

	return nr_pages - work.nr_pages;
}
static bool over_bground_thresh(struct backing_dev_info *bdi)
{
	unsigned long background_thresh, dirty_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	if (global_page_state(NR_FILE_DIRTY) +
	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
		return true;

	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
				bdi_dirty_limit(bdi, background_thresh))
		return true;

	return false;
}

/*
 * Called under wb->list_lock. If there are multiple wb per bdi,
 * only the flusher working on the first wb should do it.
 */
static void wb_update_bandwidth(struct bdi_writeback *wb,
				unsigned long start_time)
{
	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	unsigned long wb_start = jiffies;
	long nr_pages = work->nr_pages;
	unsigned long oldest_jif;
	struct inode *inode;
	long progress;

	oldest_jif = jiffies;
	work->older_than_this = &oldest_jif;

	spin_lock(&wb->list_lock);
	for (;;) {
		/*
		 * Stop writeback when nr_pages has been consumed
		 */
		if (work->nr_pages <= 0)
			break;

		/*
		 * Background writeout and kupdate-style writeback may
		 * run forever. Stop them if there is other work to do
		 * so that e.g. sync can proceed. They'll be restarted
		 * after the other works are all done.
		 */
		if ((work->for_background || work->for_kupdate) &&
		    !list_empty(&wb->bdi->work_list))
			break;

		/*
		 * For background writeout, stop when we are below the
		 * background dirty threshold
		 */
		if (work->for_background && !over_bground_thresh(wb->bdi))
			break;

		if (work->for_kupdate) {
			oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
			work->older_than_this = &oldest_jif;
		}

		trace_writeback_start(wb->bdi, work);
		if (list_empty(&wb->b_io))
			queue_io(wb, work);
		if (work->sb)
			progress = writeback_sb_inodes(work->sb, wb, work);
		else
			progress = __writeback_inodes_wb(wb, work);
		trace_writeback_written(wb->bdi, work);

		wb_update_bandwidth(wb, wb_start);

		/*
		 * Did we write something? Try for more
		 *
		 * Dirty inodes are moved to b_io for writeback in batches.
		 * The completion of the current batch does not necessarily
		 * mean the overall work is done. So we keep looping as long
		 * as we made some progress on cleaning pages or inodes.
		 */
		if (progress)
			continue;
		/*
		 * No more inodes for IO, bail
		 */
		if (list_empty(&wb->b_more_io))
			break;
		/*
		 * Nothing written. Wait for some inode to
		 * become available for writeback. Otherwise
		 * we'll just busyloop.
		 */
		if (!list_empty(&wb->b_more_io)) {
			trace_writeback_wait(wb->bdi, work);
			inode = wb_inode(wb->b_more_io.prev);
			spin_lock(&inode->i_lock);
			inode_wait_for_writeback(inode, wb);
			spin_unlock(&inode->i_lock);
		}
	}
	spin_unlock(&wb->list_lock);

	return nr_pages - work->nr_pages;
}

/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
static struct wb_writeback_work *
get_next_work_item(struct backing_dev_info *bdi)
{
	struct wb_writeback_work *work = NULL;

	spin_lock_bh(&bdi->wb_lock);
	if (!list_empty(&bdi->work_list)) {
		work = list_entry(bdi->work_list.next,
				  struct wb_writeback_work, list);
		list_del_init(&work->list);
	}
	spin_unlock_bh(&bdi->wb_lock);
	return work;
}
/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
	return global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS) +
		get_nr_dirty_inodes();
}

static long wb_check_background_flush(struct bdi_writeback *wb)
{
	if (over_bground_thresh(wb->bdi)) {

		struct wb_writeback_work work = {
			.nr_pages	= LONG_MAX,
			.sync_mode	= WB_SYNC_NONE,
			.for_background	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_BACKGROUND,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}
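
/*
 * Note (editorial, not part of the original file): dirty_writeback_interval
 * below is the vm.dirty_writeback_centisecs sysctl, i.e. it is in
 * centiseconds; multiplying by 10 converts it to milliseconds for
 * msecs_to_jiffies().  With the default of 500, the periodic flush runs
 * every 5 seconds.
 */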
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	/*
	 * When set to zero, disable periodic writeback
	 */
	if (!dirty_writeback_interval)
		return 0;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = get_nr_dirty_pages();

	if (nr_pages) {
		struct wb_writeback_work work = {
			.nr_pages	= nr_pages,
			.sync_mode	= WB_SYNC_NONE,
			.for_kupdate	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_PERIODIC,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct wb_writeback_work *work;
	long wrote = 0;

	set_bit(BDI_writeback_running, &wb->bdi->state);
	while ((work = get_next_work_item(bdi)) != NULL) {
		/*
		 * Override sync mode, in case we must wait for completion
		 * because this thread is exiting now.
		 */
		if (force_wait)
			work->sync_mode = WB_SYNC_ALL;

		trace_writeback_exec(bdi, work);

		wrote += wb_writeback(wb, work);

		/*
		 * Notify the caller of completion if this is a synchronous
		 * work item, otherwise just free it.
		 */
		if (work->done)
			complete(work->done);
		else
			kfree(work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);
	wrote += wb_check_background_flush(wb);
	clear_bit(BDI_writeback_running, &wb->bdi->state);

	return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_thread(void *data)
{
	struct bdi_writeback *wb = data;
	struct backing_dev_info *bdi = wb->bdi;
	long pages_written;

	current->flags |= PF_SWAPWRITE;
	set_freezable();
	wb->last_active = jiffies;

	/*
	 * Our parent may run at a different priority, just set us to normal
	 */
	set_user_nice(current, 0);

	trace_writeback_thread_start(bdi);

	while (!kthread_should_stop()) {
		/*
		 * Remove own delayed wake-up timer, since we are already awake
		 * and we'll take care of the periodic write-back.
		 */
		del_timer(&wb->wakeup_timer);

		pages_written = wb_do_writeback(wb, 0);

		trace_writeback_pages_written(pages_written);

		if (pages_written)
			wb->last_active = jiffies;

		set_current_state(TASK_INTERRUPTIBLE);
		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			continue;
		}

		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
		else {
			/*
			 * We have nothing to do, so can go sleep without any
			 * timeout and save power. When a work is queued or
			 * something is made dirty - we will be woken up.
			 */
			schedule();
		}

		try_to_freeze();
	}

	/* Flush any work that raced with us exiting */
	if (!list_empty(&bdi->work_list))
		wb_do_writeback(wb, 1);

	trace_writeback_thread_stop(bdi);
	return 0;
}

/*
 * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
{
	struct backing_dev_info *bdi;

	if (!nr_pages) {
		nr_pages = global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		if (!bdi_has_dirty_io(bdi))
			continue;
		__bdi_start_writeback(bdi, nr_pages, false, reason);
	}
	rcu_read_unlock();
}
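
/*
 * Example caller (illustrative, not part of the original file; sys_sync()
 * in fs/sync.c of this era):
 *
 *	wakeup_flusher_threads(0, WB_REASON_SYNC);
 *
 * i.e. nr_pages == 0 requests that every backing device write back all
 * currently dirty file pages.
 */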
static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);
		if (dentry) {
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}
		printk(KERN_DEBUG
		       "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);
		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
	}
}

/**
 * __mark_inode_dirty -	internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 *
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;
	struct backing_dev_info *bdi = NULL;

	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
	 * dirty the inode itself
	 */
	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode, flags);
	}

	/*
	 * make sure that changes are seen by all cpus before we test i_state
	 * -- mikulas
	 */
	smp_mb();

	/* avoid the locking if we can */
	if ((inode->i_state & flags) == flags)
		return;

	if (unlikely(block_dump))
		block_dump___mark_inode_dirty(inode);

	spin_lock(&inode->i_lock);
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode->i_state |= flags;

		/*
		 * If the inode is being synced, just update its dirty state.
		 * The unlocker will place the inode on the appropriate
		 * superblock list, based upon its state.
		 */
		if (inode->i_state & I_SYNC)
			goto out_unlock_inode;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list.  Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (inode_unhashed(inode))
				goto out_unlock_inode;
		}
		if (inode->i_state & I_FREEING)
			goto out_unlock_inode;

		/*
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			bool wakeup_bdi = false;
			bdi = inode_to_bdi(inode);

			if (bdi_cap_writeback_dirty(bdi)) {
				WARN(!test_bit(BDI_registered, &bdi->state),
				     "bdi-%s not registered\n", bdi->name);

				/*
				 * If this is the first dirty inode for this
				 * bdi, we have to wake-up the corresponding
				 * bdi thread to make sure background
				 * write-back happens later.
				 */
				if (!wb_has_dirty_io(&bdi->wb))
					wakeup_bdi = true;
			}

			spin_unlock(&inode->i_lock);
			spin_lock(&bdi->wb.list_lock);
			inode->dirtied_when = jiffies;
			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
			spin_unlock(&bdi->wb.list_lock);

			if (wakeup_bdi)
				bdi_wakeup_thread_delayed(bdi);
			return;
		}
	}
out_unlock_inode:
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);

/*
 * Write out a superblock's list of dirty inodes.  A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched.  For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * The inodes to be written are parked on bdi->b_io.  They are moved back onto
 * bdi->b_dirty as they are selected for writing.  This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void wait_sb_inodes(struct super_block *sb)
{
	struct inode *inode, *old_inode = NULL;

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	spin_lock(&inode_sb_list_lock);

	/*
	 * Data integrity sync. Must wait for all pages under writeback,
	 * because there may have been pages dirtied before our sync
	 * call, but which had writeout started before we write it out.
	 * In which case, the inode may not be on the dirty list, but
	 * we still have to wait for that writeout.
	 */
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;

		spin_lock(&inode->i_lock);
		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
		    (mapping->nrpages == 0)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_sb_list_lock);

		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * inode_sb_list_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * inode_sb_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;

		filemap_fdatawait(mapping);

		cond_resched();

		spin_lock(&inode_sb_list_lock);
	}
	spin_unlock(&inode_sb_list_lock);
	iput(old_inode);
}
/**
 * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb_nr(struct super_block *sb,
			    unsigned long nr,
			    enum wb_reason reason)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb			= sb,
		.sync_mode		= WB_SYNC_NONE,
		.tagged_writepages	= 1,
		.done			= &done,
		.nr_pages		= nr,
		.reason			= reason,
	};

	WARN_ON(!rwsem_is_locked(&sb->s_umount));
	bdi_queue_work(sb->s_bdi, &work);
	wait_for_completion(&done);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);

/**
 * writeback_inodes_sb	-	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * writeback_inodes_sb_if_idle	-	start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Invoke writeback_inodes_sb if no writeback is currently underway.
 * Returns 1 if writeback was started, 0 if not.
 */
int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
{
	if (!writeback_in_progress(sb->s_bdi)) {
		down_read(&sb->s_umount);
		writeback_inodes_sb(sb, reason);
		up_read(&sb->s_umount);
		return 1;
	} else
		return 0;
}
EXPORT_SYMBOL(writeback_inodes_sb_if_idle);

/**
 * writeback_inodes_sb_nr_if_idle	-	start writeback if none underway
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
 * Returns 1 if writeback was started, 0 if not.
 */
int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
				   unsigned long nr,
				   enum wb_reason reason)
{
	if (!writeback_in_progress(sb->s_bdi)) {
		down_read(&sb->s_umount);
		writeback_inodes_sb_nr(sb, nr, reason);
		up_read(&sb->s_umount);
		return 1;
	} else
		return 0;
}
EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);

/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.nr_pages	= LONG_MAX,
		.range_cyclic	= 0,
		.done		= &done,
		.reason		= WB_REASON_SYNC,
	};

	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	bdi_queue_work(sb->s_bdi, &work);
	wait_for_completion(&done);

	wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now	-	write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	int ret;
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	if (!mapping_cap_writeback_dirty(inode->i_mapping))
		wbc.nr_to_write = 0;

	might_sleep();
	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	ret = writeback_single_inode(inode, wb, &wbc);
	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);
	if (sync)
		inode_sync_wait(inode);
	return ret;
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk.  It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	int ret;

	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	ret = writeback_single_inode(inode, wb, wbc);
	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);
	return ret;
}
EXPORT_SYMBOL(sync_inode);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
int sync_inode_metadata(struct inode *inode, int wait)
{
	struct writeback_control wbc = {
		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
		.nr_to_write = 0, /* metadata-only */
	};

	return sync_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);
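
/*
 * Example caller (illustrative, not part of the original file;
 * generic_file_fsync() in fs/libfs.c of this era): simple filesystems
 * implement fsync by flushing and waiting on data pages and then calling
 *
 *	sync_inode_metadata(inode, 1);
 *
 * to write the inode itself synchronously.
 */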