Commit f01ef569cddb1a8627b1c6b3a134998ad1cf4b22
Exists in master and in 4 other branches
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
  mm: properly reflect task dirty limits in dirty_exceeded logic
  writeback: don't busy retry writeback on new/freeing inodes
  writeback: scale IO chunk size up to half device bandwidth
  writeback: trace global_dirty_state
  writeback: introduce max-pause and pass-good dirty limits
  writeback: introduce smoothed global dirty limit
  writeback: consolidate variable names in balance_dirty_pages()
  writeback: show bdi write bandwidth in debugfs
  writeback: bdi write bandwidth estimation
  writeback: account per-bdi accumulated written pages
  writeback: make writeback_control.nr_to_write straight
  writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
  writeback: trace event writeback_queue_io
  writeback: trace event writeback_single_inode
  writeback: remove .nonblocking and .encountered_congestion
  writeback: remove writeback_control.more_io
  writeback: skip balance_dirty_pages() for in-memory fs
  writeback: add bdi_dirty_limit() kernel-doc
  writeback: avoid extra sync work at enqueue time
  writeback: elevate queue_io() into wb_writeback()
  ...

Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c
Showing 15 changed files
fs/block_dev.c
... | ... | @@ -44,24 +44,28 @@ |
44 | 44 | { |
45 | 45 | return &BDEV_I(inode)->bdev; |
46 | 46 | } |
47 | - | |
48 | 47 | EXPORT_SYMBOL(I_BDEV); |
49 | 48 | |
50 | 49 | /* |
51 | - * move the inode from it's current bdi to the a new bdi. if the inode is dirty | |
52 | - * we need to move it onto the dirty list of @dst so that the inode is always | |
53 | - * on the right list. | |
50 | + * Move the inode from its current bdi to a new bdi. If the inode is dirty we | |
51 | + * need to move it onto the dirty list of @dst so that the inode is always on | |
52 | + * the right list. | |
54 | 53 | */ |
55 | 54 | static void bdev_inode_switch_bdi(struct inode *inode, |
56 | 55 | struct backing_dev_info *dst) |
57 | 56 | { |
58 | - spin_lock(&inode_wb_list_lock); | |
57 | + struct backing_dev_info *old = inode->i_data.backing_dev_info; | |
58 | + | |
59 | + if (unlikely(dst == old)) /* deadlock avoidance */ | |
60 | + return; | |
61 | + bdi_lock_two(&old->wb, &dst->wb); | |
59 | 62 | spin_lock(&inode->i_lock); |
60 | 63 | inode->i_data.backing_dev_info = dst; |
61 | 64 | if (inode->i_state & I_DIRTY) |
62 | 65 | list_move(&inode->i_wb_list, &dst->wb.b_dirty); |
63 | 66 | spin_unlock(&inode->i_lock); |
64 | - spin_unlock(&inode_wb_list_lock); | |
67 | + spin_unlock(&old->wb.list_lock); | |
68 | + spin_unlock(&dst->wb.list_lock); | |
65 | 69 | } |
66 | 70 | |
67 | 71 | static sector_t max_block(struct block_device *bdev) |
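
With the global inode_wb_list_lock split into per-bdi wb.list_lock, moving an inode between two bdis now needs both locks, and bdi_lock_two() (added in mm/backing-dev.c below) acquires them in address order; the dst == old early return above also matters, since taking the same spinlock twice would deadlock. A minimal userspace sketch of the same ordering idea, using pthreads rather than kernel spinlocks (names are illustrative, not kernel API):

#include <pthread.h>

/* Hedged sketch of the ordering idea behind bdi_lock_two(): take two locks
 * in a fixed (address) order so concurrent "old->dst" and "dst->old"
 * switches cannot deadlock. */
static void lock_two_ordered(pthread_mutex_t *a, pthread_mutex_t *b)
{
        /* Caller guarantees a != b -- compare the "dst == old" early
         * return in bdev_inode_switch_bdi() above. */
        if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        pthread_mutex_unlock(b);
}
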
fs/btrfs/extent_io.c
... | ... | @@ -2551,7 +2551,6 @@ |
2551 | 2551 | }; |
2552 | 2552 | struct writeback_control wbc_writepages = { |
2553 | 2553 | .sync_mode = wbc->sync_mode, |
2554 | - .older_than_this = NULL, | |
2555 | 2554 | .nr_to_write = 64, |
2556 | 2555 | .range_start = page_offset(page) + PAGE_CACHE_SIZE, |
2557 | 2556 | .range_end = (loff_t)-1, |
... | ... | @@ -2584,7 +2583,6 @@ |
2584 | 2583 | }; |
2585 | 2584 | struct writeback_control wbc_writepages = { |
2586 | 2585 | .sync_mode = mode, |
2587 | - .older_than_this = NULL, | |
2588 | 2586 | .nr_to_write = nr_pages * 2, |
2589 | 2587 | .range_start = start, |
2590 | 2588 | .range_end = end + 1, |
fs/ext4/inode.c
... | ... | @@ -2741,7 +2741,7 @@ |
2741 | 2741 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
2742 | 2742 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
2743 | 2743 | |
2744 | - if (wbc->sync_mode == WB_SYNC_ALL) | |
2744 | + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | |
2745 | 2745 | tag = PAGECACHE_TAG_TOWRITE; |
2746 | 2746 | else |
2747 | 2747 | tag = PAGECACHE_TAG_DIRTY; |
... | ... | @@ -2973,7 +2973,7 @@ |
2973 | 2973 | } |
2974 | 2974 | |
2975 | 2975 | retry: |
2976 | - if (wbc->sync_mode == WB_SYNC_ALL) | |
2976 | + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | |
2977 | 2977 | tag_pages_for_writeback(mapping, index, end); |
2978 | 2978 | |
2979 | 2979 | while (!ret && wbc->nr_to_write > 0) { |
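
The new tagged_writepages flag makes this WB_SYNC_NONE path use the same livelock avoidance as WB_SYNC_ALL: tag the pages that are dirty now in one quick pass (PAGECACHE_TAG_TOWRITE), then write only tagged pages, so pages dirtied during the sweep are left for the next round. A hedged userspace model of that two-phase pattern (an array stands in for the page-cache radix tree; this is not the kernel code):

#include <stdbool.h>
#include <stddef.h>

struct page_model {
        bool dirty;
        bool towrite;   /* stands in for PAGECACHE_TAG_TOWRITE */
};

/* Quick pass: remember which pages are dirty right now. */
static void tag_for_writeback(struct page_model *pages, size_t n)
{
        for (size_t i = 0; i < n; i++)
                pages[i].towrite = pages[i].dirty;
}

/* Possibly slow pass: write only the tagged pages.  Pages dirtied after
 * the tagging pass have dirty=true but towrite=false, so the sweep
 * terminates instead of chasing them forever. */
static size_t write_tagged(struct page_model *pages, size_t n)
{
        size_t wrote = 0;

        for (size_t i = 0; i < n; i++) {
                if (!pages[i].towrite)
                        continue;
                pages[i].towrite = false;
                pages[i].dirty = false;         /* "writeout" */
                wrote++;
        }
        return wrote;
}
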
fs/fs-writeback.c
... | ... | @@ -35,7 +35,9 @@ |
35 | 35 | struct wb_writeback_work { |
36 | 36 | long nr_pages; |
37 | 37 | struct super_block *sb; |
38 | + unsigned long *older_than_this; | |
38 | 39 | enum writeback_sync_modes sync_mode; |
40 | + unsigned int tagged_writepages:1; | |
39 | 41 | unsigned int for_kupdate:1; |
40 | 42 | unsigned int range_cyclic:1; |
41 | 43 | unsigned int for_background:1; |
42 | 44 | |
43 | 45 | |
... | ... | @@ -180,12 +182,13 @@ |
180 | 182 | */ |
181 | 183 | void inode_wb_list_del(struct inode *inode) |
182 | 184 | { |
183 | - spin_lock(&inode_wb_list_lock); | |
185 | + struct backing_dev_info *bdi = inode_to_bdi(inode); | |
186 | + | |
187 | + spin_lock(&bdi->wb.list_lock); | |
184 | 188 | list_del_init(&inode->i_wb_list); |
185 | - spin_unlock(&inode_wb_list_lock); | |
189 | + spin_unlock(&bdi->wb.list_lock); | |
186 | 190 | } |
187 | 191 | |
188 | - | |
189 | 192 | /* |
190 | 193 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the |
191 | 194 | * furthest end of its superblock's dirty-inode list. |
192 | 195 | |
... | ... | @@ -195,11 +198,9 @@ |
195 | 198 | * the case then the inode must have been redirtied while it was being written |
196 | 199 | * out and we don't reset its dirtied_when. |
197 | 200 | */ |
198 | -static void redirty_tail(struct inode *inode) | |
201 | +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) | |
199 | 202 | { |
200 | - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | |
201 | - | |
202 | - assert_spin_locked(&inode_wb_list_lock); | |
203 | + assert_spin_locked(&wb->list_lock); | |
203 | 204 | if (!list_empty(&wb->b_dirty)) { |
204 | 205 | struct inode *tail; |
205 | 206 | |
206 | 207 | |
... | ... | @@ -213,11 +214,9 @@ |
213 | 214 | /* |
214 | 215 | * requeue inode for re-scanning after bdi->b_io list is exhausted. |
215 | 216 | */ |
216 | -static void requeue_io(struct inode *inode) | |
217 | +static void requeue_io(struct inode *inode, struct bdi_writeback *wb) | |
217 | 218 | { |
218 | - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | |
219 | - | |
220 | - assert_spin_locked(&inode_wb_list_lock); | |
219 | + assert_spin_locked(&wb->list_lock); | |
221 | 220 | list_move(&inode->i_wb_list, &wb->b_more_io); |
222 | 221 | } |
223 | 222 | |
... | ... | @@ -225,7 +224,7 @@ |
225 | 224 | { |
226 | 225 | /* |
227 | 226 | * Prevent speculative execution through |
228 | - * spin_unlock(&inode_wb_list_lock); | |
227 | + * spin_unlock(&wb->list_lock); | |
229 | 228 | */ |
230 | 229 | |
231 | 230 | smp_mb(); |
232 | 231 | |
233 | 232 | |
... | ... | @@ -250,15 +249,16 @@ |
250 | 249 | /* |
251 | 250 | * Move expired dirty inodes from @delaying_queue to @dispatch_queue. |
252 | 251 | */ |
253 | -static void move_expired_inodes(struct list_head *delaying_queue, | |
252 | +static int move_expired_inodes(struct list_head *delaying_queue, | |
254 | 253 | struct list_head *dispatch_queue, |
255 | - unsigned long *older_than_this) | |
254 | + unsigned long *older_than_this) | |
256 | 255 | { |
257 | 256 | LIST_HEAD(tmp); |
258 | 257 | struct list_head *pos, *node; |
259 | 258 | struct super_block *sb = NULL; |
260 | 259 | struct inode *inode; |
261 | 260 | int do_sb_sort = 0; |
261 | + int moved = 0; | |
262 | 262 | |
263 | 263 | while (!list_empty(delaying_queue)) { |
264 | 264 | inode = wb_inode(delaying_queue->prev); |
265 | 265 | |
... | ... | @@ -269,12 +269,13 @@ |
269 | 269 | do_sb_sort = 1; |
270 | 270 | sb = inode->i_sb; |
271 | 271 | list_move(&inode->i_wb_list, &tmp); |
272 | + moved++; | |
272 | 273 | } |
273 | 274 | |
274 | 275 | /* just one sb in list, splice to dispatch_queue and we're done */ |
275 | 276 | if (!do_sb_sort) { |
276 | 277 | list_splice(&tmp, dispatch_queue); |
277 | - return; | |
278 | + goto out; | |
278 | 279 | } |
279 | 280 | |
280 | 281 | /* Move inodes from one superblock together */ |
... | ... | @@ -286,6 +287,8 @@ |
286 | 287 | list_move(&inode->i_wb_list, dispatch_queue); |
287 | 288 | } |
288 | 289 | } |
290 | +out: | |
291 | + return moved; | |
289 | 292 | } |
290 | 293 | |
291 | 294 | /* |
292 | 295 | |
... | ... | @@ -301,9 +304,11 @@ |
301 | 304 | */ |
302 | 305 | static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) |
303 | 306 | { |
304 | - assert_spin_locked(&inode_wb_list_lock); | |
307 | + int moved; | |
308 | + assert_spin_locked(&wb->list_lock); | |
305 | 309 | list_splice_init(&wb->b_more_io, &wb->b_io); |
306 | - move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); | |
310 | + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); | |
311 | + trace_writeback_queue_io(wb, older_than_this, moved); | |
307 | 312 | } |
308 | 313 | |
309 | 314 | static int write_inode(struct inode *inode, struct writeback_control *wbc) |
... | ... | @@ -316,7 +321,8 @@ |
316 | 321 | /* |
317 | 322 | * Wait for writeback on an inode to complete. |
318 | 323 | */ |
319 | -static void inode_wait_for_writeback(struct inode *inode) | |
324 | +static void inode_wait_for_writeback(struct inode *inode, | |
325 | + struct bdi_writeback *wb) | |
320 | 326 | { |
321 | 327 | DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); |
322 | 328 | wait_queue_head_t *wqh; |
323 | 329 | |
324 | 330 | |
... | ... | @@ -324,15 +330,15 @@ |
324 | 330 | wqh = bit_waitqueue(&inode->i_state, __I_SYNC); |
325 | 331 | while (inode->i_state & I_SYNC) { |
326 | 332 | spin_unlock(&inode->i_lock); |
327 | - spin_unlock(&inode_wb_list_lock); | |
333 | + spin_unlock(&wb->list_lock); | |
328 | 334 | __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); |
329 | - spin_lock(&inode_wb_list_lock); | |
335 | + spin_lock(&wb->list_lock); | |
330 | 336 | spin_lock(&inode->i_lock); |
331 | 337 | } |
332 | 338 | } |
333 | 339 | |
334 | 340 | /* |
335 | - * Write out an inode's dirty pages. Called under inode_wb_list_lock and | |
341 | + * Write out an inode's dirty pages. Called under wb->list_lock and | |
336 | 342 | * inode->i_lock. Either the caller has an active reference on the inode or |
337 | 343 | * the inode has I_WILL_FREE set. |
338 | 344 | * |
339 | 345 | |
340 | 346 | |
... | ... | @@ -343,13 +349,15 @@ |
343 | 349 | * livelocks, etc. |
344 | 350 | */ |
345 | 351 | static int |
346 | -writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |
352 | +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, | |
353 | + struct writeback_control *wbc) | |
347 | 354 | { |
348 | 355 | struct address_space *mapping = inode->i_mapping; |
356 | + long nr_to_write = wbc->nr_to_write; | |
349 | 357 | unsigned dirty; |
350 | 358 | int ret; |
351 | 359 | |
352 | - assert_spin_locked(&inode_wb_list_lock); | |
360 | + assert_spin_locked(&wb->list_lock); | |
353 | 361 | assert_spin_locked(&inode->i_lock); |
354 | 362 | |
355 | 363 | if (!atomic_read(&inode->i_count)) |
356 | 364 | |
... | ... | @@ -367,14 +375,16 @@ |
367 | 375 | * completed a full scan of b_io. |
368 | 376 | */ |
369 | 377 | if (wbc->sync_mode != WB_SYNC_ALL) { |
370 | - requeue_io(inode); | |
378 | + requeue_io(inode, wb); | |
379 | + trace_writeback_single_inode_requeue(inode, wbc, | |
380 | + nr_to_write); | |
371 | 381 | return 0; |
372 | 382 | } |
373 | 383 | |
374 | 384 | /* |
375 | 385 | * It's a data-integrity sync. We must wait. |
376 | 386 | */ |
377 | - inode_wait_for_writeback(inode); | |
387 | + inode_wait_for_writeback(inode, wb); | |
378 | 388 | } |
379 | 389 | |
380 | 390 | BUG_ON(inode->i_state & I_SYNC); |
... | ... | @@ -383,7 +393,7 @@ |
383 | 393 | inode->i_state |= I_SYNC; |
384 | 394 | inode->i_state &= ~I_DIRTY_PAGES; |
385 | 395 | spin_unlock(&inode->i_lock); |
386 | - spin_unlock(&inode_wb_list_lock); | |
396 | + spin_unlock(&wb->list_lock); | |
387 | 397 | |
388 | 398 | ret = do_writepages(mapping, wbc); |
389 | 399 | |
390 | 400 | |
... | ... | @@ -414,10 +424,19 @@ |
414 | 424 | ret = err; |
415 | 425 | } |
416 | 426 | |
417 | - spin_lock(&inode_wb_list_lock); | |
427 | + spin_lock(&wb->list_lock); | |
418 | 428 | spin_lock(&inode->i_lock); |
419 | 429 | inode->i_state &= ~I_SYNC; |
420 | 430 | if (!(inode->i_state & I_FREEING)) { |
431 | + /* | |
432 | + * Sync livelock prevention. Each inode is tagged and synced in | |
433 | + * one shot. If still dirty, it will be redirty_tail()'ed below. | |
434 | + * Update the dirty time to prevent enqueue and sync it again. | |
435 | + */ | |
436 | + if ((inode->i_state & I_DIRTY) && | |
437 | + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) | |
438 | + inode->dirtied_when = jiffies; | |
439 | + | |
421 | 440 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { |
422 | 441 | /* |
423 | 442 | * We didn't write back all the pages. nfs_writepages() |
... | ... | @@ -428,7 +447,7 @@ |
428 | 447 | /* |
429 | 448 | * slice used up: queue for next turn |
430 | 449 | */ |
431 | - requeue_io(inode); | |
450 | + requeue_io(inode, wb); | |
432 | 451 | } else { |
433 | 452 | /* |
434 | 453 | * Writeback blocked by something other than |
... | ... | @@ -437,7 +456,7 @@ |
437 | 456 | * retrying writeback of the dirty page/inode |
438 | 457 | * that cannot be performed immediately. |
439 | 458 | */ |
440 | - redirty_tail(inode); | |
459 | + redirty_tail(inode, wb); | |
441 | 460 | } |
442 | 461 | } else if (inode->i_state & I_DIRTY) { |
443 | 462 | /* |
... | ... | @@ -446,7 +465,7 @@ |
446 | 465 | * submission or metadata updates after data IO |
447 | 466 | * completion. |
448 | 467 | */ |
449 | - redirty_tail(inode); | |
468 | + redirty_tail(inode, wb); | |
450 | 469 | } else { |
451 | 470 | /* |
452 | 471 | * The inode is clean. At this point we either have |
453 | 472 | |
... | ... | @@ -457,9 +476,41 @@ |
457 | 476 | } |
458 | 477 | } |
459 | 478 | inode_sync_complete(inode); |
479 | + trace_writeback_single_inode(inode, wbc, nr_to_write); | |
460 | 480 | return ret; |
461 | 481 | } |
462 | 482 | |
483 | +static long writeback_chunk_size(struct backing_dev_info *bdi, | |
484 | + struct wb_writeback_work *work) | |
485 | +{ | |
486 | + long pages; | |
487 | + | |
488 | + /* | |
489 | + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty | |
490 | + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX | |
491 | + * here avoids calling into writeback_inodes_wb() more than once. | |
492 | + * | |
493 | + * The intended call sequence for WB_SYNC_ALL writeback is: | |
494 | + * | |
495 | + * wb_writeback() | |
496 | + * writeback_sb_inodes() <== called only once | |
497 | + * write_cache_pages() <== called once for each inode | |
498 | + * (quickly) tag currently dirty pages | |
499 | + * (maybe slowly) sync all tagged pages | |
500 | + */ | |
501 | + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) | |
502 | + pages = LONG_MAX; | |
503 | + else { | |
504 | + pages = min(bdi->avg_write_bandwidth / 2, | |
505 | + global_dirty_limit / DIRTY_SCOPE); | |
506 | + pages = min(pages, work->nr_pages); | |
507 | + pages = round_down(pages + MIN_WRITEBACK_PAGES, | |
508 | + MIN_WRITEBACK_PAGES); | |
509 | + } | |
510 | + | |
511 | + return pages; | |
512 | +} | |
513 | + | |
463 | 514 | /* |
464 | 515 | * Write a portion of b_io inodes which belong to @sb. |
465 | 516 | * |
466 | 517 | |
467 | 518 | |
468 | 519 | |
469 | 520 | |
470 | 521 | |
... | ... | @@ -467,24 +518,36 @@ |
467 | 518 | * inodes. Otherwise write only ones which go sequentially |
468 | 519 | * in reverse order. |
469 | 520 | * |
470 | - * Return 1, if the caller writeback routine should be | |
471 | - * interrupted. Otherwise return 0. | |
521 | + * Return the number of pages and/or inodes written. | |
472 | 522 | */ |
473 | -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, | |
474 | - struct writeback_control *wbc, bool only_this_sb) | |
523 | +static long writeback_sb_inodes(struct super_block *sb, | |
524 | + struct bdi_writeback *wb, | |
525 | + struct wb_writeback_work *work) | |
475 | 526 | { |
527 | + struct writeback_control wbc = { | |
528 | + .sync_mode = work->sync_mode, | |
529 | + .tagged_writepages = work->tagged_writepages, | |
530 | + .for_kupdate = work->for_kupdate, | |
531 | + .for_background = work->for_background, | |
532 | + .range_cyclic = work->range_cyclic, | |
533 | + .range_start = 0, | |
534 | + .range_end = LLONG_MAX, | |
535 | + }; | |
536 | + unsigned long start_time = jiffies; | |
537 | + long write_chunk; | |
538 | + long wrote = 0; /* count both pages and inodes */ | |
539 | + | |
476 | 540 | while (!list_empty(&wb->b_io)) { |
477 | - long pages_skipped; | |
478 | 541 | struct inode *inode = wb_inode(wb->b_io.prev); |
479 | 542 | |
480 | 543 | if (inode->i_sb != sb) { |
481 | - if (only_this_sb) { | |
544 | + if (work->sb) { | |
482 | 545 | /* |
483 | 546 | * We only want to write back data for this |
484 | 547 | * superblock, move all inodes not belonging |
485 | 548 | * to it back onto the dirty list. |
486 | 549 | */ |
487 | - redirty_tail(inode); | |
550 | + redirty_tail(inode, wb); | |
488 | 551 | continue; |
489 | 552 | } |
490 | 553 | |
... | ... | @@ -493,7 +556,7 @@ |
493 | 556 | * Bounce back to the caller to unpin this and |
494 | 557 | * pin the next superblock. |
495 | 558 | */ |
496 | - return 0; | |
559 | + break; | |
497 | 560 | } |
498 | 561 | |
499 | 562 | /* |
500 | 563 | |
501 | 564 | |
502 | 565 | |
503 | 566 | |
504 | 567 | |
505 | 568 | |
506 | 569 | |
507 | 570 | |
508 | 571 | |
509 | 572 | |
510 | 573 | |
511 | 574 | |
512 | 575 | |
513 | 576 | |
514 | 577 | |
515 | 578 | |
516 | 579 | |
517 | 580 | |
518 | 581 | |
519 | 582 | |
... | ... | @@ -504,96 +567,92 @@ |
504 | 567 | spin_lock(&inode->i_lock); |
505 | 568 | if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { |
506 | 569 | spin_unlock(&inode->i_lock); |
507 | - requeue_io(inode); | |
570 | + redirty_tail(inode, wb); | |
508 | 571 | continue; |
509 | 572 | } |
510 | - | |
511 | - /* | |
512 | - * Was this inode dirtied after sync_sb_inodes was called? | |
513 | - * This keeps sync from extra jobs and livelock. | |
514 | - */ | |
515 | - if (inode_dirtied_after(inode, wbc->wb_start)) { | |
516 | - spin_unlock(&inode->i_lock); | |
517 | - return 1; | |
518 | - } | |
519 | - | |
520 | 573 | __iget(inode); |
574 | + write_chunk = writeback_chunk_size(wb->bdi, work); | |
575 | + wbc.nr_to_write = write_chunk; | |
576 | + wbc.pages_skipped = 0; | |
521 | 577 | |
522 | - pages_skipped = wbc->pages_skipped; | |
523 | - writeback_single_inode(inode, wbc); | |
524 | - if (wbc->pages_skipped != pages_skipped) { | |
578 | + writeback_single_inode(inode, wb, &wbc); | |
579 | + | |
580 | + work->nr_pages -= write_chunk - wbc.nr_to_write; | |
581 | + wrote += write_chunk - wbc.nr_to_write; | |
582 | + if (!(inode->i_state & I_DIRTY)) | |
583 | + wrote++; | |
584 | + if (wbc.pages_skipped) { | |
525 | 585 | /* |
526 | 586 | * writeback is not making progress due to locked |
527 | 587 | * buffers. Skip this inode for now. |
528 | 588 | */ |
529 | - redirty_tail(inode); | |
589 | + redirty_tail(inode, wb); | |
530 | 590 | } |
531 | 591 | spin_unlock(&inode->i_lock); |
532 | - spin_unlock(&inode_wb_list_lock); | |
592 | + spin_unlock(&wb->list_lock); | |
533 | 593 | iput(inode); |
534 | 594 | cond_resched(); |
535 | - spin_lock(&inode_wb_list_lock); | |
536 | - if (wbc->nr_to_write <= 0) { | |
537 | - wbc->more_io = 1; | |
538 | - return 1; | |
595 | + spin_lock(&wb->list_lock); | |
596 | + /* | |
597 | + * bail out to wb_writeback() often enough to check | |
598 | + * background threshold and other termination conditions. | |
599 | + */ | |
600 | + if (wrote) { | |
601 | + if (time_is_before_jiffies(start_time + HZ / 10UL)) | |
602 | + break; | |
603 | + if (work->nr_pages <= 0) | |
604 | + break; | |
539 | 605 | } |
540 | - if (!list_empty(&wb->b_more_io)) | |
541 | - wbc->more_io = 1; | |
542 | 606 | } |
543 | - /* b_io is empty */ | |
544 | - return 1; | |
607 | + return wrote; | |
545 | 608 | } |
546 | 609 | |
547 | -void writeback_inodes_wb(struct bdi_writeback *wb, | |
548 | - struct writeback_control *wbc) | |
610 | +static long __writeback_inodes_wb(struct bdi_writeback *wb, | |
611 | + struct wb_writeback_work *work) | |
549 | 612 | { |
550 | - int ret = 0; | |
613 | + unsigned long start_time = jiffies; | |
614 | + long wrote = 0; | |
551 | 615 | |
552 | - if (!wbc->wb_start) | |
553 | - wbc->wb_start = jiffies; /* livelock avoidance */ | |
554 | - spin_lock(&inode_wb_list_lock); | |
555 | - if (!wbc->for_kupdate || list_empty(&wb->b_io)) | |
556 | - queue_io(wb, wbc->older_than_this); | |
557 | - | |
558 | 616 | while (!list_empty(&wb->b_io)) { |
559 | 617 | struct inode *inode = wb_inode(wb->b_io.prev); |
560 | 618 | struct super_block *sb = inode->i_sb; |
561 | 619 | |
562 | 620 | if (!grab_super_passive(sb)) { |
563 | - requeue_io(inode); | |
621 | + requeue_io(inode, wb); | |
564 | 622 | continue; |
565 | 623 | } |
566 | - ret = writeback_sb_inodes(sb, wb, wbc, false); | |
624 | + wrote += writeback_sb_inodes(sb, wb, work); | |
567 | 625 | drop_super(sb); |
568 | 626 | |
569 | - if (ret) | |
570 | - break; | |
627 | + /* refer to the same tests at the end of writeback_sb_inodes */ | |
628 | + if (wrote) { | |
629 | + if (time_is_before_jiffies(start_time + HZ / 10UL)) | |
630 | + break; | |
631 | + if (work->nr_pages <= 0) | |
632 | + break; | |
633 | + } | |
571 | 634 | } |
572 | - spin_unlock(&inode_wb_list_lock); | |
573 | 635 | /* Leave any unwritten inodes on b_io */ |
636 | + return wrote; | |
574 | 637 | } |
575 | 638 | |
576 | -static void __writeback_inodes_sb(struct super_block *sb, | |
577 | - struct bdi_writeback *wb, struct writeback_control *wbc) | |
639 | +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) | |
578 | 640 | { |
579 | - WARN_ON(!rwsem_is_locked(&sb->s_umount)); | |
641 | + struct wb_writeback_work work = { | |
642 | + .nr_pages = nr_pages, | |
643 | + .sync_mode = WB_SYNC_NONE, | |
644 | + .range_cyclic = 1, | |
645 | + }; | |
580 | 646 | |
581 | - spin_lock(&inode_wb_list_lock); | |
582 | - if (!wbc->for_kupdate || list_empty(&wb->b_io)) | |
583 | - queue_io(wb, wbc->older_than_this); | |
584 | - writeback_sb_inodes(sb, wb, wbc, true); | |
585 | - spin_unlock(&inode_wb_list_lock); | |
647 | + spin_lock(&wb->list_lock); | |
648 | + if (list_empty(&wb->b_io)) | |
649 | + queue_io(wb, NULL); | |
650 | + __writeback_inodes_wb(wb, &work); | |
651 | + spin_unlock(&wb->list_lock); | |
652 | + | |
653 | + return nr_pages - work.nr_pages; | |
586 | 654 | } |
587 | 655 | |
588 | -/* | |
589 | - * The maximum number of pages to writeout in a single bdi flush/kupdate | |
590 | - * operation. We do this so we don't hold I_SYNC against an inode for | |
591 | - * enormous amounts of time, which would block a userspace task which has | |
592 | - * been forced to throttle against that inode. Also, the code reevaluates | |
593 | - * the dirty each time it has written this many pages. | |
594 | - */ | |
595 | -#define MAX_WRITEBACK_PAGES 1024 | |
596 | - | |
597 | 656 | static inline bool over_bground_thresh(void) |
598 | 657 | { |
599 | 658 | unsigned long background_thresh, dirty_thresh; |
... | ... | @@ -605,6 +664,16 @@ |
605 | 664 | } |
606 | 665 | |
607 | 666 | /* |
667 | + * Called under wb->list_lock. If there are multiple wb per bdi, | |
668 | + * only the flusher working on the first wb should do it. | |
669 | + */ | |
670 | +static void wb_update_bandwidth(struct bdi_writeback *wb, | |
671 | + unsigned long start_time) | |
672 | +{ | |
673 | + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); | |
674 | +} | |
675 | + | |
676 | +/* | |
608 | 677 | * Explicit flushing or periodic writeback of "old" data. |
609 | 678 | * |
610 | 679 | * Define "old": the first time one of an inode's pages is dirtied, we mark the |
611 | 680 | |
612 | 681 | |
613 | 682 | |
614 | 683 | |
... | ... | @@ -622,47 +691,16 @@ |
622 | 691 | static long wb_writeback(struct bdi_writeback *wb, |
623 | 692 | struct wb_writeback_work *work) |
624 | 693 | { |
625 | - struct writeback_control wbc = { | |
626 | - .sync_mode = work->sync_mode, | |
627 | - .older_than_this = NULL, | |
628 | - .for_kupdate = work->for_kupdate, | |
629 | - .for_background = work->for_background, | |
630 | - .range_cyclic = work->range_cyclic, | |
631 | - }; | |
694 | + unsigned long wb_start = jiffies; | |
695 | + long nr_pages = work->nr_pages; | |
632 | 696 | unsigned long oldest_jif; |
633 | - long wrote = 0; | |
634 | - long write_chunk; | |
635 | 697 | struct inode *inode; |
698 | + long progress; | |
636 | 699 | |
637 | - if (wbc.for_kupdate) { | |
638 | - wbc.older_than_this = &oldest_jif; | |
639 | - oldest_jif = jiffies - | |
640 | - msecs_to_jiffies(dirty_expire_interval * 10); | |
641 | - } | |
642 | - if (!wbc.range_cyclic) { | |
643 | - wbc.range_start = 0; | |
644 | - wbc.range_end = LLONG_MAX; | |
645 | - } | |
700 | + oldest_jif = jiffies; | |
701 | + work->older_than_this = &oldest_jif; | |
646 | 702 | |
647 | - /* | |
648 | - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty | |
649 | - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX | |
650 | - * here avoids calling into writeback_inodes_wb() more than once. | |
651 | - * | |
652 | - * The intended call sequence for WB_SYNC_ALL writeback is: | |
653 | - * | |
654 | - * wb_writeback() | |
655 | - * __writeback_inodes_sb() <== called only once | |
656 | - * write_cache_pages() <== called once for each inode | |
657 | - * (quickly) tag currently dirty pages | |
658 | - * (maybe slowly) sync all tagged pages | |
659 | - */ | |
660 | - if (wbc.sync_mode == WB_SYNC_NONE) | |
661 | - write_chunk = MAX_WRITEBACK_PAGES; | |
662 | - else | |
663 | - write_chunk = LONG_MAX; | |
664 | - | |
665 | - wbc.wb_start = jiffies; /* livelock avoidance */ | |
703 | + spin_lock(&wb->list_lock); | |
666 | 704 | for (;;) { |
667 | 705 | /* |
668 | 706 | * Stop writeback when nr_pages has been consumed |
669 | 707 | |
670 | 708 | |
671 | 709 | |
672 | 710 | |
673 | 711 | |
674 | 712 | |
675 | 713 | |
676 | 714 | |
677 | 715 | |
678 | 716 | |
679 | 717 | |
680 | 718 | |
681 | 719 | |
682 | 720 | |
683 | 721 | |
684 | 722 | |
... | ... | @@ -687,52 +725,54 @@ |
687 | 725 | if (work->for_background && !over_bground_thresh()) |
688 | 726 | break; |
689 | 727 | |
690 | - wbc.more_io = 0; | |
691 | - wbc.nr_to_write = write_chunk; | |
692 | - wbc.pages_skipped = 0; | |
728 | + if (work->for_kupdate) { | |
729 | + oldest_jif = jiffies - | |
730 | + msecs_to_jiffies(dirty_expire_interval * 10); | |
731 | + work->older_than_this = &oldest_jif; | |
732 | + } | |
693 | 733 | |
694 | - trace_wbc_writeback_start(&wbc, wb->bdi); | |
734 | + trace_writeback_start(wb->bdi, work); | |
735 | + if (list_empty(&wb->b_io)) | |
736 | + queue_io(wb, work->older_than_this); | |
695 | 737 | if (work->sb) |
696 | - __writeback_inodes_sb(work->sb, wb, &wbc); | |
738 | + progress = writeback_sb_inodes(work->sb, wb, work); | |
697 | 739 | else |
698 | - writeback_inodes_wb(wb, &wbc); | |
699 | - trace_wbc_writeback_written(&wbc, wb->bdi); | |
740 | + progress = __writeback_inodes_wb(wb, work); | |
741 | + trace_writeback_written(wb->bdi, work); | |
700 | 742 | |
701 | - work->nr_pages -= write_chunk - wbc.nr_to_write; | |
702 | - wrote += write_chunk - wbc.nr_to_write; | |
743 | + wb_update_bandwidth(wb, wb_start); | |
703 | 744 | |
704 | 745 | /* |
705 | - * If we consumed everything, see if we have more | |
746 | + * Did we write something? Try for more | |
747 | + * | |
748 | + * Dirty inodes are moved to b_io for writeback in batches. | |
749 | + * The completion of the current batch does not necessarily | |
750 | + * mean the overall work is done. So we keep looping as long | |
751 | + * as made some progress on cleaning pages or inodes. | |
706 | 752 | */ |
707 | - if (wbc.nr_to_write <= 0) | |
753 | + if (progress) | |
708 | 754 | continue; |
709 | 755 | /* |
710 | - * Didn't write everything and we don't have more IO, bail | |
756 | + * No more inodes for IO, bail | |
711 | 757 | */ |
712 | - if (!wbc.more_io) | |
758 | + if (list_empty(&wb->b_more_io)) | |
713 | 759 | break; |
714 | 760 | /* |
715 | - * Did we write something? Try for more | |
716 | - */ | |
717 | - if (wbc.nr_to_write < write_chunk) | |
718 | - continue; | |
719 | - /* | |
720 | 761 | * Nothing written. Wait for some inode to |
721 | 762 | * become available for writeback. Otherwise |
722 | 763 | * we'll just busyloop. |
723 | 764 | */ |
724 | - spin_lock(&inode_wb_list_lock); | |
725 | 765 | if (!list_empty(&wb->b_more_io)) { |
766 | + trace_writeback_wait(wb->bdi, work); | |
726 | 767 | inode = wb_inode(wb->b_more_io.prev); |
727 | - trace_wbc_writeback_wait(&wbc, wb->bdi); | |
728 | 768 | spin_lock(&inode->i_lock); |
729 | - inode_wait_for_writeback(inode); | |
769 | + inode_wait_for_writeback(inode, wb); | |
730 | 770 | spin_unlock(&inode->i_lock); |
731 | 771 | } |
732 | - spin_unlock(&inode_wb_list_lock); | |
733 | 772 | } |
773 | + spin_unlock(&wb->list_lock); | |
734 | 774 | |
735 | - return wrote; | |
775 | + return nr_pages - work->nr_pages; | |
736 | 776 | } |
737 | 777 | |
738 | 778 | /* |
739 | 779 | |
... | ... | @@ -1063,10 +1103,10 @@ |
1063 | 1103 | } |
1064 | 1104 | |
1065 | 1105 | spin_unlock(&inode->i_lock); |
1066 | - spin_lock(&inode_wb_list_lock); | |
1106 | + spin_lock(&bdi->wb.list_lock); | |
1067 | 1107 | inode->dirtied_when = jiffies; |
1068 | 1108 | list_move(&inode->i_wb_list, &bdi->wb.b_dirty); |
1069 | - spin_unlock(&inode_wb_list_lock); | |
1109 | + spin_unlock(&bdi->wb.list_lock); | |
1070 | 1110 | |
1071 | 1111 | if (wakeup_bdi) |
1072 | 1112 | bdi_wakeup_thread_delayed(bdi); |
... | ... | @@ -1162,10 +1202,11 @@ |
1162 | 1202 | { |
1163 | 1203 | DECLARE_COMPLETION_ONSTACK(done); |
1164 | 1204 | struct wb_writeback_work work = { |
1165 | - .sb = sb, | |
1166 | - .sync_mode = WB_SYNC_NONE, | |
1167 | - .done = &done, | |
1168 | - .nr_pages = nr, | |
1205 | + .sb = sb, | |
1206 | + .sync_mode = WB_SYNC_NONE, | |
1207 | + .tagged_writepages = 1, | |
1208 | + .done = &done, | |
1209 | + .nr_pages = nr, | |
1169 | 1210 | }; |
1170 | 1211 | |
1171 | 1212 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
... | ... | @@ -1267,6 +1308,7 @@ |
1267 | 1308 | */ |
1268 | 1309 | int write_inode_now(struct inode *inode, int sync) |
1269 | 1310 | { |
1311 | + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | |
1270 | 1312 | int ret; |
1271 | 1313 | struct writeback_control wbc = { |
1272 | 1314 | .nr_to_write = LONG_MAX, |
1273 | 1315 | |
1274 | 1316 | |
... | ... | @@ -1279,11 +1321,11 @@ |
1279 | 1321 | wbc.nr_to_write = 0; |
1280 | 1322 | |
1281 | 1323 | might_sleep(); |
1282 | - spin_lock(&inode_wb_list_lock); | |
1324 | + spin_lock(&wb->list_lock); | |
1283 | 1325 | spin_lock(&inode->i_lock); |
1284 | - ret = writeback_single_inode(inode, &wbc); | |
1326 | + ret = writeback_single_inode(inode, wb, &wbc); | |
1285 | 1327 | spin_unlock(&inode->i_lock); |
1286 | - spin_unlock(&inode_wb_list_lock); | |
1328 | + spin_unlock(&wb->list_lock); | |
1287 | 1329 | if (sync) |
1288 | 1330 | inode_sync_wait(inode); |
1289 | 1331 | return ret; |
1290 | 1332 | |
1291 | 1333 | |
1292 | 1334 | |
... | ... | @@ -1303,13 +1345,14 @@ |
1303 | 1345 | */ |
1304 | 1346 | int sync_inode(struct inode *inode, struct writeback_control *wbc) |
1305 | 1347 | { |
1348 | + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | |
1306 | 1349 | int ret; |
1307 | 1350 | |
1308 | - spin_lock(&inode_wb_list_lock); | |
1351 | + spin_lock(&wb->list_lock); | |
1309 | 1352 | spin_lock(&inode->i_lock); |
1310 | - ret = writeback_single_inode(inode, wbc); | |
1353 | + ret = writeback_single_inode(inode, wb, wbc); | |
1311 | 1354 | spin_unlock(&inode->i_lock); |
1312 | - spin_unlock(&inode_wb_list_lock); | |
1355 | + spin_unlock(&wb->list_lock); | |
1313 | 1356 | return ret; |
1314 | 1357 | } |
1315 | 1358 | EXPORT_SYMBOL(sync_inode); |
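
writeback_chunk_size() replaces the fixed MAX_WRITEBACK_PAGES with a per-work chunk: half the bdi's estimated write bandwidth, capped by global_dirty_limit/DIRTY_SCOPE and by the remaining work, then rounded to a multiple of MIN_WRITEBACK_PAGES. A hedged standalone sketch of that arithmetic; the constants mirror the patch, the input numbers are made up, and 4 KB pages are assumed:

#include <stdio.h>

#define DIRTY_SCOPE             8
#define MIN_WRITEBACK_PAGES     1024UL          /* 4 MB with 4 KB pages */

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long round_down_ul(unsigned long x, unsigned long m) { return x - (x % m); }

int main(void)
{
        unsigned long avg_write_bandwidth = 25600;   /* pages/s, ~100 MB/s (illustrative) */
        unsigned long global_dirty_limit  = 50000;   /* pages (illustrative) */
        unsigned long nr_pages            = 100000;  /* remaining work (illustrative) */

        unsigned long pages = min_ul(avg_write_bandwidth / 2,
                                     global_dirty_limit / DIRTY_SCOPE);
        pages = min_ul(pages, nr_pages);
        pages = round_down_ul(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES);

        /* min(12800, 6250) = 6250; +1024 = 7274; rounded down -> 7168 pages (~28 MB) */
        printf("chunk = %lu pages (~%lu MB)\n", pages, pages >> 8);
        return 0;
}
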
fs/inode.c
... | ... | @@ -37,7 +37,7 @@ |
37 | 37 | * inode->i_sb->s_inode_lru, inode->i_lru |
38 | 38 | * inode_sb_list_lock protects: |
39 | 39 | * sb->s_inodes, inode->i_sb_list |
40 | - * inode_wb_list_lock protects: | |
40 | + * bdi->wb.list_lock protects: | |
41 | 41 | * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list |
42 | 42 | * inode_hash_lock protects: |
43 | 43 | * inode_hashtable, inode->i_hash |
... | ... | @@ -48,7 +48,7 @@ |
48 | 48 | * inode->i_lock |
49 | 49 | * inode->i_sb->s_inode_lru_lock |
50 | 50 | * |
51 | - * inode_wb_list_lock | |
51 | + * bdi->wb.list_lock | |
52 | 52 | * inode->i_lock |
53 | 53 | * |
54 | 54 | * inode_hash_lock |
... | ... | @@ -65,7 +65,6 @@ |
65 | 65 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); |
66 | 66 | |
67 | 67 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); |
68 | -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); | |
69 | 68 | |
70 | 69 | /* |
71 | 70 | * Empty aops. Can be used for the cases where the user does not |
fs/nfs/write.c
... | ... | @@ -1566,8 +1566,7 @@ |
1566 | 1566 | int status; |
1567 | 1567 | bool sync = true; |
1568 | 1568 | |
1569 | - if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || | |
1570 | - wbc->for_background) | |
1569 | + if (wbc->sync_mode == WB_SYNC_NONE) | |
1571 | 1570 | sync = false; |
1572 | 1571 | |
1573 | 1572 | status = pnfs_layoutcommit_inode(inode, sync); |
include/linux/backing-dev.h
... | ... | @@ -40,6 +40,7 @@ |
40 | 40 | enum bdi_stat_item { |
41 | 41 | BDI_RECLAIMABLE, |
42 | 42 | BDI_WRITEBACK, |
43 | + BDI_WRITTEN, | |
43 | 44 | NR_BDI_STAT_ITEMS |
44 | 45 | }; |
45 | 46 | |
... | ... | @@ -57,6 +58,7 @@ |
57 | 58 | struct list_head b_dirty; /* dirty inodes */ |
58 | 59 | struct list_head b_io; /* parked for writeback */ |
59 | 60 | struct list_head b_more_io; /* parked for more writeback */ |
61 | + spinlock_t list_lock; /* protects the b_* lists */ | |
60 | 62 | }; |
61 | 63 | |
62 | 64 | struct backing_dev_info { |
... | ... | @@ -71,6 +73,11 @@ |
71 | 73 | |
72 | 74 | struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; |
73 | 75 | |
76 | + unsigned long bw_time_stamp; /* last time write bw is updated */ | |
77 | + unsigned long written_stamp; /* pages written at bw_time_stamp */ | |
78 | + unsigned long write_bandwidth; /* the estimated write bandwidth */ | |
79 | + unsigned long avg_write_bandwidth; /* further smoothed write bw */ | |
80 | + | |
74 | 81 | struct prop_local_percpu completions; |
75 | 82 | int dirty_exceeded; |
76 | 83 | |
... | ... | @@ -106,6 +113,7 @@ |
106 | 113 | int bdi_has_dirty_io(struct backing_dev_info *bdi); |
107 | 114 | void bdi_arm_supers_timer(void); |
108 | 115 | void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); |
116 | +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); | |
109 | 117 | |
110 | 118 | extern spinlock_t bdi_lock; |
111 | 119 | extern struct list_head bdi_list; |
include/linux/writeback.h
... | ... | @@ -7,10 +7,40 @@ |
7 | 7 | #include <linux/sched.h> |
8 | 8 | #include <linux/fs.h> |
9 | 9 | |
10 | -struct backing_dev_info; | |
10 | +/* | |
11 | + * The 1/4 region under the global dirty thresh is for smooth dirty throttling: | |
12 | + * | |
13 | + * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) | |
14 | + * | |
15 | + * The 1/16 region above the global dirty limit will be put to maximum pauses: | |
16 | + * | |
17 | + * (limit, limit + limit/DIRTY_MAXPAUSE_AREA) | |
18 | + * | |
19 | + * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put | |
20 | + * to loops: | |
21 | + * | |
22 | + * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA) | |
23 | + * | |
24 | + * Further beyond, all dirtier tasks will enter a loop waiting (possibly long | |
25 | + * time) for the dirty pages to drop, unless written enough pages. | |
26 | + * | |
27 | + * The global dirty threshold is normally equal to the global dirty limit, | |
28 | + * except when the system suddenly allocates a lot of anonymous memory and | |
29 | + * knocks down the global dirty threshold quickly, in which case the global | |
30 | + * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. | |
31 | + */ | |
32 | +#define DIRTY_SCOPE 8 | |
33 | +#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) | |
34 | +#define DIRTY_MAXPAUSE_AREA 16 | |
35 | +#define DIRTY_PASSGOOD_AREA 8 | |
11 | 36 | |
12 | -extern spinlock_t inode_wb_list_lock; | |
37 | +/* | |
38 | + * 4MB minimal write chunk size | |
39 | + */ | |
40 | +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) | |
13 | 41 | |
42 | +struct backing_dev_info; | |
43 | + | |
14 | 44 | /* |
15 | 45 | * fs/fs-writeback.c |
16 | 46 | */ |
... | ... | @@ -26,11 +56,6 @@ |
26 | 56 | */ |
27 | 57 | struct writeback_control { |
28 | 58 | enum writeback_sync_modes sync_mode; |
29 | - unsigned long *older_than_this; /* If !NULL, only write back inodes | |
30 | - older than this */ | |
31 | - unsigned long wb_start; /* Time writeback_inodes_wb was | |
32 | - called. This is needed to avoid | |
33 | - extra jobs and livelock */ | |
34 | 59 | long nr_to_write; /* Write this many pages, and decrement |
35 | 60 | this for each page written */ |
36 | 61 | long pages_skipped; /* Pages which were not written */ |
37 | 62 | |
38 | 63 | |
... | ... | @@ -43,13 +68,11 @@ |
43 | 68 | loff_t range_start; |
44 | 69 | loff_t range_end; |
45 | 70 | |
46 | - unsigned nonblocking:1; /* Don't get stuck on request queues */ | |
47 | - unsigned encountered_congestion:1; /* An output: a queue is full */ | |
48 | 71 | unsigned for_kupdate:1; /* A kupdate writeback */ |
49 | 72 | unsigned for_background:1; /* A background writeback */ |
73 | + unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ | |
50 | 74 | unsigned for_reclaim:1; /* Invoked from the page allocator */ |
51 | 75 | unsigned range_cyclic:1; /* range_start is cyclic */ |
52 | - unsigned more_io:1; /* more io to be dispatched */ | |
53 | 76 | }; |
54 | 77 | |
55 | 78 | /* |
... | ... | @@ -62,8 +85,7 @@ |
62 | 85 | int writeback_inodes_sb_if_idle(struct super_block *); |
63 | 86 | int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); |
64 | 87 | void sync_inodes_sb(struct super_block *); |
65 | -void writeback_inodes_wb(struct bdi_writeback *wb, | |
66 | - struct writeback_control *wbc); | |
88 | +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); | |
67 | 89 | long wb_do_writeback(struct bdi_writeback *wb, int force_wait); |
68 | 90 | void wakeup_flusher_threads(long nr_pages); |
69 | 91 | |
... | ... | @@ -94,6 +116,8 @@ |
94 | 116 | #endif |
95 | 117 | void throttle_vm_writeout(gfp_t gfp_mask); |
96 | 118 | |
119 | +extern unsigned long global_dirty_limit; | |
120 | + | |
97 | 121 | /* These are exported to sysctl. */ |
98 | 122 | extern int dirty_background_ratio; |
99 | 123 | extern unsigned long dirty_background_bytes; |
... | ... | @@ -127,6 +151,13 @@ |
127 | 151 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); |
128 | 152 | unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, |
129 | 153 | unsigned long dirty); |
154 | + | |
155 | +void __bdi_update_bandwidth(struct backing_dev_info *bdi, | |
156 | + unsigned long thresh, | |
157 | + unsigned long dirty, | |
158 | + unsigned long bdi_thresh, | |
159 | + unsigned long bdi_dirty, | |
160 | + unsigned long start_time); | |
130 | 161 | |
131 | 162 | void page_writeback_init(void); |
132 | 163 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
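
The region comment above can be made concrete with numbers. A hedged illustration, assuming 4 KB pages (PAGE_CACHE_SHIFT = 12) and an example threshold of 100000 pages; the constants are from the header, the threshold is made up:

#include <stdio.h>

#define DIRTY_SCOPE          8
#define DIRTY_FULL_SCOPE     (DIRTY_SCOPE / 2)                      /* 4 -> the 1/4 region */
#define DIRTY_MAXPAUSE_AREA  16
#define DIRTY_PASSGOOD_AREA  8
#define PAGE_CACHE_SHIFT     12                                     /* assumption: 4 KB pages */
#define MIN_WRITEBACK_PAGES  (4096UL >> (PAGE_CACHE_SHIFT - 10))    /* 1024 pages = 4 MB */

int main(void)
{
        unsigned long thresh = 100000;  /* example global dirty threshold, in pages */
        unsigned long limit  = thresh;  /* normally equal, per the comment above */

        printf("throttling region: (%lu, %lu)\n",
               thresh - thresh / DIRTY_FULL_SCOPE, thresh);         /* (75000, 100000) */
        printf("max-pause region:  (%lu, %lu)\n",
               limit, limit + limit / DIRTY_MAXPAUSE_AREA);         /* (100000, 106250) */
        printf("pass-good region:  (%lu, %lu)\n",
               limit + limit / DIRTY_MAXPAUSE_AREA,
               limit + limit / DIRTY_PASSGOOD_AREA);                /* (106250, 112500) */
        printf("MIN_WRITEBACK_PAGES = %lu\n", MIN_WRITEBACK_PAGES); /* 1024 */
        return 0;
}
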
include/trace/events/btrfs.h
... | ... | @@ -284,7 +284,6 @@ |
284 | 284 | __field( long, pages_skipped ) |
285 | 285 | __field( loff_t, range_start ) |
286 | 286 | __field( loff_t, range_end ) |
287 | - __field( char, nonblocking ) | |
288 | 287 | __field( char, for_kupdate ) |
289 | 288 | __field( char, for_reclaim ) |
290 | 289 | __field( char, range_cyclic ) |
... | ... | @@ -299,7 +298,6 @@ |
299 | 298 | __entry->pages_skipped = wbc->pages_skipped; |
300 | 299 | __entry->range_start = wbc->range_start; |
301 | 300 | __entry->range_end = wbc->range_end; |
302 | - __entry->nonblocking = wbc->nonblocking; | |
303 | 301 | __entry->for_kupdate = wbc->for_kupdate; |
304 | 302 | __entry->for_reclaim = wbc->for_reclaim; |
305 | 303 | __entry->range_cyclic = wbc->range_cyclic; |
306 | 304 | |
... | ... | @@ -310,13 +308,13 @@ |
310 | 308 | |
311 | 309 | TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " |
312 | 310 | "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " |
313 | - "range_end = %llu, nonblocking = %d, for_kupdate = %d, " | |
311 | + "range_end = %llu, for_kupdate = %d, " | |
314 | 312 | "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", |
315 | 313 | show_root_type(__entry->root_objectid), |
316 | 314 | (unsigned long)__entry->ino, __entry->index, |
317 | 315 | __entry->nr_to_write, __entry->pages_skipped, |
318 | 316 | __entry->range_start, __entry->range_end, |
319 | - __entry->nonblocking, __entry->for_kupdate, | |
317 | + __entry->for_kupdate, | |
320 | 318 | __entry->for_reclaim, __entry->range_cyclic, |
321 | 319 | (unsigned long)__entry->writeback_index) |
322 | 320 | ); |
include/trace/events/ext4.h
... | ... | @@ -380,7 +380,6 @@ |
380 | 380 | __field( int, pages_written ) |
381 | 381 | __field( long, pages_skipped ) |
382 | 382 | __field( int, sync_mode ) |
383 | - __field( char, more_io ) | |
384 | 383 | __field( pgoff_t, writeback_index ) |
385 | 384 | ), |
386 | 385 | |
387 | 386 | |
388 | 387 | |
... | ... | @@ -391,16 +390,15 @@ |
391 | 390 | __entry->pages_written = pages_written; |
392 | 391 | __entry->pages_skipped = wbc->pages_skipped; |
393 | 392 | __entry->sync_mode = wbc->sync_mode; |
394 | - __entry->more_io = wbc->more_io; | |
395 | 393 | __entry->writeback_index = inode->i_mapping->writeback_index; |
396 | 394 | ), |
397 | 395 | |
398 | 396 | TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " |
399 | - " more_io %d sync_mode %d writeback_index %lu", | |
397 | + "sync_mode %d writeback_index %lu", | |
400 | 398 | MAJOR(__entry->dev), MINOR(__entry->dev), |
401 | 399 | (unsigned long) __entry->ino, __entry->ret, |
402 | 400 | __entry->pages_written, __entry->pages_skipped, |
403 | - __entry->more_io, __entry->sync_mode, | |
401 | + __entry->sync_mode, | |
404 | 402 | (unsigned long) __entry->writeback_index) |
405 | 403 | ); |
406 | 404 |
include/trace/events/writeback.h
... | ... | @@ -8,6 +8,19 @@ |
8 | 8 | #include <linux/device.h> |
9 | 9 | #include <linux/writeback.h> |
10 | 10 | |
11 | +#define show_inode_state(state) \ | |
12 | + __print_flags(state, "|", \ | |
13 | + {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ | |
14 | + {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ | |
15 | + {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ | |
16 | + {I_NEW, "I_NEW"}, \ | |
17 | + {I_WILL_FREE, "I_WILL_FREE"}, \ | |
18 | + {I_FREEING, "I_FREEING"}, \ | |
19 | + {I_CLEAR, "I_CLEAR"}, \ | |
20 | + {I_SYNC, "I_SYNC"}, \ | |
21 | + {I_REFERENCED, "I_REFERENCED"} \ | |
22 | + ) | |
23 | + | |
11 | 24 | struct wb_writeback_work; |
12 | 25 | |
13 | 26 | DECLARE_EVENT_CLASS(writeback_work_class, |
... | ... | @@ -49,6 +62,9 @@ |
49 | 62 | DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); |
50 | 63 | DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); |
51 | 64 | DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); |
65 | +DEFINE_WRITEBACK_WORK_EVENT(writeback_start); | |
66 | +DEFINE_WRITEBACK_WORK_EVENT(writeback_written); | |
67 | +DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); | |
52 | 68 | |
53 | 69 | TRACE_EVENT(writeback_pages_written, |
54 | 70 | TP_PROTO(long pages_written), |
55 | 71 | |
... | ... | @@ -88,7 +104,31 @@ |
88 | 104 | DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); |
89 | 105 | DEFINE_WRITEBACK_EVENT(writeback_thread_start); |
90 | 106 | DEFINE_WRITEBACK_EVENT(writeback_thread_stop); |
107 | +DEFINE_WRITEBACK_EVENT(balance_dirty_start); | |
108 | +DEFINE_WRITEBACK_EVENT(balance_dirty_wait); | |
91 | 109 | |
110 | +TRACE_EVENT(balance_dirty_written, | |
111 | + | |
112 | + TP_PROTO(struct backing_dev_info *bdi, int written), | |
113 | + | |
114 | + TP_ARGS(bdi, written), | |
115 | + | |
116 | + TP_STRUCT__entry( | |
117 | + __array(char, name, 32) | |
118 | + __field(int, written) | |
119 | + ), | |
120 | + | |
121 | + TP_fast_assign( | |
122 | + strncpy(__entry->name, dev_name(bdi->dev), 32); | |
123 | + __entry->written = written; | |
124 | + ), | |
125 | + | |
126 | + TP_printk("bdi %s written %d", | |
127 | + __entry->name, | |
128 | + __entry->written | |
129 | + ) | |
130 | +); | |
131 | + | |
92 | 132 | DECLARE_EVENT_CLASS(wbc_class, |
93 | 133 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), |
94 | 134 | TP_ARGS(wbc, bdi), |
... | ... | @@ -101,8 +141,6 @@ |
101 | 141 | __field(int, for_background) |
102 | 142 | __field(int, for_reclaim) |
103 | 143 | __field(int, range_cyclic) |
104 | - __field(int, more_io) | |
105 | - __field(unsigned long, older_than_this) | |
106 | 144 | __field(long, range_start) |
107 | 145 | __field(long, range_end) |
108 | 146 | ), |
109 | 147 | |
... | ... | @@ -116,15 +154,12 @@ |
116 | 154 | __entry->for_background = wbc->for_background; |
117 | 155 | __entry->for_reclaim = wbc->for_reclaim; |
118 | 156 | __entry->range_cyclic = wbc->range_cyclic; |
119 | - __entry->more_io = wbc->more_io; | |
120 | - __entry->older_than_this = wbc->older_than_this ? | |
121 | - *wbc->older_than_this : 0; | |
122 | 157 | __entry->range_start = (long)wbc->range_start; |
123 | 158 | __entry->range_end = (long)wbc->range_end; |
124 | 159 | ), |
125 | 160 | |
126 | 161 | TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " |
127 | - "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " | |
162 | + "bgrd=%d reclm=%d cyclic=%d " | |
128 | 163 | "start=0x%lx end=0x%lx", |
129 | 164 | __entry->name, |
130 | 165 | __entry->nr_to_write, |
... | ... | @@ -134,8 +169,6 @@ |
134 | 169 | __entry->for_background, |
135 | 170 | __entry->for_reclaim, |
136 | 171 | __entry->range_cyclic, |
137 | - __entry->more_io, | |
138 | - __entry->older_than_this, | |
139 | 172 | __entry->range_start, |
140 | 173 | __entry->range_end) |
141 | 174 | ) |
142 | 175 | |
... | ... | @@ -144,14 +177,79 @@ |
144 | 177 | DEFINE_EVENT(wbc_class, name, \ |
145 | 178 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ |
146 | 179 | TP_ARGS(wbc, bdi)) |
147 | -DEFINE_WBC_EVENT(wbc_writeback_start); | |
148 | -DEFINE_WBC_EVENT(wbc_writeback_written); | |
149 | -DEFINE_WBC_EVENT(wbc_writeback_wait); | |
150 | -DEFINE_WBC_EVENT(wbc_balance_dirty_start); | |
151 | -DEFINE_WBC_EVENT(wbc_balance_dirty_written); | |
152 | -DEFINE_WBC_EVENT(wbc_balance_dirty_wait); | |
153 | 180 | DEFINE_WBC_EVENT(wbc_writepage); |
154 | 181 | |
182 | +TRACE_EVENT(writeback_queue_io, | |
183 | + TP_PROTO(struct bdi_writeback *wb, | |
184 | + unsigned long *older_than_this, | |
185 | + int moved), | |
186 | + TP_ARGS(wb, older_than_this, moved), | |
187 | + TP_STRUCT__entry( | |
188 | + __array(char, name, 32) | |
189 | + __field(unsigned long, older) | |
190 | + __field(long, age) | |
191 | + __field(int, moved) | |
192 | + ), | |
193 | + TP_fast_assign( | |
194 | + strncpy(__entry->name, dev_name(wb->bdi->dev), 32); | |
195 | + __entry->older = older_than_this ? *older_than_this : 0; | |
196 | + __entry->age = older_than_this ? | |
197 | + (jiffies - *older_than_this) * 1000 / HZ : -1; | |
198 | + __entry->moved = moved; | |
199 | + ), | |
200 | + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d", | |
201 | + __entry->name, | |
202 | + __entry->older, /* older_than_this in jiffies */ | |
203 | + __entry->age, /* older_than_this in relative milliseconds */ | |
204 | + __entry->moved) | |
205 | +); | |
206 | + | |
207 | +TRACE_EVENT(global_dirty_state, | |
208 | + | |
209 | + TP_PROTO(unsigned long background_thresh, | |
210 | + unsigned long dirty_thresh | |
211 | + ), | |
212 | + | |
213 | + TP_ARGS(background_thresh, | |
214 | + dirty_thresh | |
215 | + ), | |
216 | + | |
217 | + TP_STRUCT__entry( | |
218 | + __field(unsigned long, nr_dirty) | |
219 | + __field(unsigned long, nr_writeback) | |
220 | + __field(unsigned long, nr_unstable) | |
221 | + __field(unsigned long, background_thresh) | |
222 | + __field(unsigned long, dirty_thresh) | |
223 | + __field(unsigned long, dirty_limit) | |
224 | + __field(unsigned long, nr_dirtied) | |
225 | + __field(unsigned long, nr_written) | |
226 | + ), | |
227 | + | |
228 | + TP_fast_assign( | |
229 | + __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); | |
230 | + __entry->nr_writeback = global_page_state(NR_WRITEBACK); | |
231 | + __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); | |
232 | + __entry->nr_dirtied = global_page_state(NR_DIRTIED); | |
233 | + __entry->nr_written = global_page_state(NR_WRITTEN); | |
234 | + __entry->background_thresh = background_thresh; | |
235 | + __entry->dirty_thresh = dirty_thresh; | |
236 | + __entry->dirty_limit = global_dirty_limit; | |
237 | + ), | |
238 | + | |
239 | + TP_printk("dirty=%lu writeback=%lu unstable=%lu " | |
240 | + "bg_thresh=%lu thresh=%lu limit=%lu " | |
241 | + "dirtied=%lu written=%lu", | |
242 | + __entry->nr_dirty, | |
243 | + __entry->nr_writeback, | |
244 | + __entry->nr_unstable, | |
245 | + __entry->background_thresh, | |
246 | + __entry->dirty_thresh, | |
247 | + __entry->dirty_limit, | |
248 | + __entry->nr_dirtied, | |
249 | + __entry->nr_written | |
250 | + ) | |
251 | +); | |
252 | + | |
155 | 253 | DECLARE_EVENT_CLASS(writeback_congest_waited_template, |
156 | 254 | |
157 | 255 | TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), |
... | ... | @@ -185,6 +283,63 @@ |
185 | 283 | TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), |
186 | 284 | |
187 | 285 | TP_ARGS(usec_timeout, usec_delayed) |
286 | +); | |
287 | + | |
288 | +DECLARE_EVENT_CLASS(writeback_single_inode_template, | |
289 | + | |
290 | + TP_PROTO(struct inode *inode, | |
291 | + struct writeback_control *wbc, | |
292 | + unsigned long nr_to_write | |
293 | + ), | |
294 | + | |
295 | + TP_ARGS(inode, wbc, nr_to_write), | |
296 | + | |
297 | + TP_STRUCT__entry( | |
298 | + __array(char, name, 32) | |
299 | + __field(unsigned long, ino) | |
300 | + __field(unsigned long, state) | |
301 | + __field(unsigned long, age) | |
302 | + __field(unsigned long, writeback_index) | |
303 | + __field(long, nr_to_write) | |
304 | + __field(unsigned long, wrote) | |
305 | + ), | |
306 | + | |
307 | + TP_fast_assign( | |
308 | + strncpy(__entry->name, | |
309 | + dev_name(inode->i_mapping->backing_dev_info->dev), 32); | |
310 | + __entry->ino = inode->i_ino; | |
311 | + __entry->state = inode->i_state; | |
312 | + __entry->age = (jiffies - inode->dirtied_when) * | |
313 | + 1000 / HZ; | |
314 | + __entry->writeback_index = inode->i_mapping->writeback_index; | |
315 | + __entry->nr_to_write = nr_to_write; | |
316 | + __entry->wrote = nr_to_write - wbc->nr_to_write; | |
317 | + ), | |
318 | + | |
319 | + TP_printk("bdi %s: ino=%lu state=%s age=%lu " | |
320 | + "index=%lu to_write=%ld wrote=%lu", | |
321 | + __entry->name, | |
322 | + __entry->ino, | |
323 | + show_inode_state(__entry->state), | |
324 | + __entry->age, | |
325 | + __entry->writeback_index, | |
326 | + __entry->nr_to_write, | |
327 | + __entry->wrote | |
328 | + ) | |
329 | +); | |
330 | + | |
331 | +DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue, | |
332 | + TP_PROTO(struct inode *inode, | |
333 | + struct writeback_control *wbc, | |
334 | + unsigned long nr_to_write), | |
335 | + TP_ARGS(inode, wbc, nr_to_write) | |
336 | +); | |
337 | + | |
338 | +DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, | |
339 | + TP_PROTO(struct inode *inode, | |
340 | + struct writeback_control *wbc, | |
341 | + unsigned long nr_to_write), | |
342 | + TP_ARGS(inode, wbc, nr_to_write) | |
188 | 343 | ); |
189 | 344 | |
190 | 345 | #endif /* _TRACE_WRITEBACK_H */ |
mm/backing-dev.c
... | ... | @@ -45,6 +45,17 @@ |
45 | 45 | static int bdi_sync_supers(void *); |
46 | 46 | static void sync_supers_timer_fn(unsigned long); |
47 | 47 | |
48 | +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | |
49 | +{ | |
50 | + if (wb1 < wb2) { | |
51 | + spin_lock(&wb1->list_lock); | |
52 | + spin_lock_nested(&wb2->list_lock, 1); | |
53 | + } else { | |
54 | + spin_lock(&wb2->list_lock); | |
55 | + spin_lock_nested(&wb1->list_lock, 1); | |
56 | + } | |
57 | +} | |
58 | + | |
48 | 59 | #ifdef CONFIG_DEBUG_FS |
49 | 60 | #include <linux/debugfs.h> |
50 | 61 | #include <linux/seq_file.h> |
51 | 62 | |
52 | 63 | |
53 | 64 | |
... | ... | @@ -67,34 +78,42 @@ |
67 | 78 | struct inode *inode; |
68 | 79 | |
69 | 80 | nr_dirty = nr_io = nr_more_io = 0; |
70 | - spin_lock(&inode_wb_list_lock); | |
81 | + spin_lock(&wb->list_lock); | |
71 | 82 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
72 | 83 | nr_dirty++; |
73 | 84 | list_for_each_entry(inode, &wb->b_io, i_wb_list) |
74 | 85 | nr_io++; |
75 | 86 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
76 | 87 | nr_more_io++; |
77 | - spin_unlock(&inode_wb_list_lock); | |
88 | + spin_unlock(&wb->list_lock); | |
78 | 89 | |
79 | 90 | global_dirty_limits(&background_thresh, &dirty_thresh); |
80 | 91 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
81 | 92 | |
82 | 93 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
83 | 94 | seq_printf(m, |
84 | - "BdiWriteback: %8lu kB\n" | |
85 | - "BdiReclaimable: %8lu kB\n" | |
86 | - "BdiDirtyThresh: %8lu kB\n" | |
87 | - "DirtyThresh: %8lu kB\n" | |
88 | - "BackgroundThresh: %8lu kB\n" | |
89 | - "b_dirty: %8lu\n" | |
90 | - "b_io: %8lu\n" | |
91 | - "b_more_io: %8lu\n" | |
92 | - "bdi_list: %8u\n" | |
93 | - "state: %8lx\n", | |
95 | + "BdiWriteback: %10lu kB\n" | |
96 | + "BdiReclaimable: %10lu kB\n" | |
97 | + "BdiDirtyThresh: %10lu kB\n" | |
98 | + "DirtyThresh: %10lu kB\n" | |
99 | + "BackgroundThresh: %10lu kB\n" | |
100 | + "BdiWritten: %10lu kB\n" | |
101 | + "BdiWriteBandwidth: %10lu kBps\n" | |
102 | + "b_dirty: %10lu\n" | |
103 | + "b_io: %10lu\n" | |
104 | + "b_more_io: %10lu\n" | |
105 | + "bdi_list: %10u\n" | |
106 | + "state: %10lx\n", | |
94 | 107 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
95 | 108 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), |
96 | - K(bdi_thresh), K(dirty_thresh), | |
97 | - K(background_thresh), nr_dirty, nr_io, nr_more_io, | |
109 | + K(bdi_thresh), | |
110 | + K(dirty_thresh), | |
111 | + K(background_thresh), | |
112 | + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), | |
113 | + (unsigned long) K(bdi->write_bandwidth), | |
114 | + nr_dirty, | |
115 | + nr_io, | |
116 | + nr_more_io, | |
98 | 117 | !list_empty(&bdi->bdi_list), bdi->state); |
99 | 118 | #undef K |
100 | 119 | |
... | ... | @@ -249,18 +268,6 @@ |
249 | 268 | return wb_has_dirty_io(&bdi->wb); |
250 | 269 | } |
251 | 270 | |
252 | -static void bdi_flush_io(struct backing_dev_info *bdi) | |
253 | -{ | |
254 | - struct writeback_control wbc = { | |
255 | - .sync_mode = WB_SYNC_NONE, | |
256 | - .older_than_this = NULL, | |
257 | - .range_cyclic = 1, | |
258 | - .nr_to_write = 1024, | |
259 | - }; | |
260 | - | |
261 | - writeback_inodes_wb(&bdi->wb, &wbc); | |
262 | -} | |
263 | - | |
264 | 271 | /* |
265 | 272 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() |
266 | 273 | * or we risk deadlocking on ->s_umount. The longer term solution would be |
267 | 274 | |
... | ... | @@ -446,9 +453,10 @@ |
446 | 453 | if (IS_ERR(task)) { |
447 | 454 | /* |
448 | 455 | * If thread creation fails, force writeout of |
449 | - * the bdi from the thread. | |
456 | + * the bdi from the thread. Hopefully 1024 is | |
457 | + * large enough for efficient IO. | |
450 | 458 | */ |
451 | - bdi_flush_io(bdi); | |
459 | + writeback_inodes_wb(&bdi->wb, 1024); | |
452 | 460 | } else { |
453 | 461 | /* |
454 | 462 | * The spinlock makes sure we do not lose |
455 | 463 | |
... | ... | @@ -629,9 +637,15 @@ |
629 | 637 | INIT_LIST_HEAD(&wb->b_dirty); |
630 | 638 | INIT_LIST_HEAD(&wb->b_io); |
631 | 639 | INIT_LIST_HEAD(&wb->b_more_io); |
640 | + spin_lock_init(&wb->list_lock); | |
632 | 641 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); |
633 | 642 | } |
634 | 643 | |
644 | +/* | |
645 | + * Initial write bandwidth: 100 MB/s | |
646 | + */ | |
647 | +#define INIT_BW (100 << (20 - PAGE_SHIFT)) | |
648 | + | |
635 | 649 | int bdi_init(struct backing_dev_info *bdi) |
636 | 650 | { |
637 | 651 | int i, err; |
... | ... | @@ -654,6 +668,13 @@ |
654 | 668 | } |
655 | 669 | |
656 | 670 | bdi->dirty_exceeded = 0; |
671 | + | |
672 | + bdi->bw_time_stamp = jiffies; | |
673 | + bdi->written_stamp = 0; | |
674 | + | |
675 | + bdi->write_bandwidth = INIT_BW; | |
676 | + bdi->avg_write_bandwidth = INIT_BW; | |
677 | + | |
657 | 678 | err = prop_local_init_percpu(&bdi->completions); |
658 | 679 | |
659 | 680 | if (err) { |
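For reference, INIT_BW is in pages per second; with the common 4 KiB page size (PAGE_SHIFT = 12) the arithmetic, shown only for illustration, is:

	/* 100 MB/s expressed in pages/s, assuming 4 KiB pages */
	100 << (20 - 12) == 100 * 256 == 25600 pages/s   /* == 102400 kB/s */

so a freshly initialised bdi is assumed to write about 100 MB/s until real samples replace the estimate.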
... | ... | @@ -677,11 +698,12 @@ |
677 | 698 | if (bdi_has_dirty_io(bdi)) { |
678 | 699 | struct bdi_writeback *dst = &default_backing_dev_info.wb; |
679 | 700 | |
680 | - spin_lock(&inode_wb_list_lock); | |
701 | + bdi_lock_two(&bdi->wb, dst); | |
681 | 702 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); |
682 | 703 | list_splice(&bdi->wb.b_io, &dst->b_io); |
683 | 704 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); |
684 | - spin_unlock(&inode_wb_list_lock); | |
705 | + spin_unlock(&bdi->wb.list_lock); | |
706 | + spin_unlock(&dst->list_lock); | |
685 | 707 | } |
686 | 708 | |
687 | 709 | bdi_unregister(bdi); |
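bdi_lock_two() stands in for the removed global inode_wb_list_lock wherever two per-bdi lists must be held at once (here and in bdev_inode_switch_bdi()). Its body is not in this diff; presumably it takes the two wb->list_lock spinlocks in a fixed pointer order so that concurrent callers cannot deadlock. A minimal sketch under that assumption, not the verbatim implementation:

	void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
	{
		if (wb1 < wb2) {
			spin_lock(&wb1->list_lock);
			spin_lock_nested(&wb2->list_lock, 1);
		} else {
			spin_lock(&wb2->list_lock);
			spin_lock_nested(&wb1->list_lock, 1);
		}
	}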
mm/filemap.c
... | ... | @@ -78,7 +78,7 @@ |
78 | 78 | * ->i_mutex (generic_file_buffered_write) |
79 | 79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) |
80 | 80 | * |
81 | - * inode_wb_list_lock | |
81 | + * bdi->wb.list_lock | |
82 | 82 | * sb_lock (fs/fs-writeback.c) |
83 | 83 | * ->mapping->tree_lock (__sync_single_inode) |
84 | 84 | * |
... | ... | @@ -96,9 +96,9 @@ |
96 | 96 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) |
97 | 97 | * ->private_lock (page_remove_rmap->set_page_dirty) |
98 | 98 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
99 | - * inode_wb_list_lock (page_remove_rmap->set_page_dirty) | |
99 | + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) | |
100 | 100 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) |
101 | - * inode_wb_list_lock (zap_pte_range->set_page_dirty) | |
101 | + * bdi.wb->list_lock (zap_pte_range->set_page_dirty) | |
102 | 102 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
103 | 103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
104 | 104 | * |
mm/page-writeback.c
... | ... | @@ -37,6 +37,16 @@ |
37 | 37 | #include <trace/events/writeback.h> |
38 | 38 | |
39 | 39 | /* |
40 | + * Sleep at most 200ms at a time in balance_dirty_pages(). | |
41 | + */ | |
42 | +#define MAX_PAUSE max(HZ/5, 1) | |
43 | + | |
44 | +/* | |
45 | + * Estimate write bandwidth at 200ms intervals. | |
46 | + */ | |
47 | +#define BANDWIDTH_INTERVAL max(HZ/5, 1) | |
48 | + | |
49 | +/* | |
40 | 50 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited |
41 | 51 | * will look to see if it needs to force writeback or throttling. |
42 | 52 | */ |
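Both constants work out to 200 ms expressed in jiffies, clamped to at least one tick; for example:

	HZ = 1000:  MAX_PAUSE == BANDWIDTH_INTERVAL == max(1000/5, 1) == 200 jiffies
	HZ = 100:   MAX_PAUSE == BANDWIDTH_INTERVAL == max(100/5, 1)  ==  20 jiffies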
... | ... | @@ -111,6 +121,7 @@ |
111 | 121 | |
112 | 122 | /* End of sysctl-exported parameters */ |
113 | 123 | |
124 | +unsigned long global_dirty_limit; | |
114 | 125 | |
115 | 126 | /* |
116 | 127 | * Scale the writeback cache size proportional to the relative writeout speeds. |
... | ... | @@ -219,6 +230,7 @@ |
219 | 230 | */ |
220 | 231 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) |
221 | 232 | { |
233 | + __inc_bdi_stat(bdi, BDI_WRITTEN); | |
222 | 234 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, |
223 | 235 | bdi->max_prop_frac); |
224 | 236 | } |
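BDI_WRITTEN is bumped next to the completion proportions, so it counts every page that completes writeback; __bdi_update_bandwidth() further down reads it back to estimate throughput. For orientation only, the increment presumably reaches this helper via the writeback-completion path, along these lines (wrapper name and call chain assumed from mainline, not part of this hunk):

	/* e.g. from test_clear_page_writeback() when a page leaves writeback */
	void bdi_writeout_inc(struct backing_dev_info *bdi)
	{
		unsigned long flags;

		local_irq_save(flags);
		__bdi_writeout_inc(bdi);	/* now also bumps BDI_WRITTEN */
		local_irq_restore(flags);
	}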
... | ... | @@ -244,13 +256,8 @@ |
244 | 256 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, |
245 | 257 | long *numerator, long *denominator) |
246 | 258 | { |
247 | - if (bdi_cap_writeback_dirty(bdi)) { | |
248 | - prop_fraction_percpu(&vm_completions, &bdi->completions, | |
259 | + prop_fraction_percpu(&vm_completions, &bdi->completions, | |
249 | 260 | numerator, denominator); |
250 | - } else { | |
251 | - *numerator = 0; | |
252 | - *denominator = 1; | |
253 | - } | |
254 | 261 | } |
255 | 262 | |
256 | 263 | static inline void task_dirties_fraction(struct task_struct *tsk, |
... | ... | @@ -274,12 +281,13 @@ |
274 | 281 | * effectively curb the growth of dirty pages. Light dirtiers with high enough |
275 | 282 | * dirty threshold may never get throttled. |
276 | 283 | */ |
284 | +#define TASK_LIMIT_FRACTION 8 | |
277 | 285 | static unsigned long task_dirty_limit(struct task_struct *tsk, |
278 | 286 | unsigned long bdi_dirty) |
279 | 287 | { |
280 | 288 | long numerator, denominator; |
281 | 289 | unsigned long dirty = bdi_dirty; |
282 | - u64 inv = dirty >> 3; | |
290 | + u64 inv = dirty / TASK_LIMIT_FRACTION; | |
283 | 291 | |
284 | 292 | task_dirties_fraction(tsk, &numerator, &denominator); |
285 | 293 | inv *= numerator; |
... | ... | @@ -290,6 +298,12 @@ |
290 | 298 | return max(dirty, bdi_dirty/2); |
291 | 299 | } |
292 | 300 | |
301 | +/* Minimum limit for any task */ | |
302 | +static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) | |
303 | +{ | |
304 | + return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION; | |
305 | +} | |
306 | + | |
293 | 307 | /* |
294 | 308 | * |
295 | 309 | */ |
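A quick illustration of the two limits with TASK_LIMIT_FRACTION = 8. Assume bdi_dirty = 80000 pages and a task that did essentially all the recent dirtying (numerator/denominator close to 1); the numbers are hypothetical:

	inv   = 80000 / 8         = 10000
	dirty = 80000 - 10000     = 70000   /* task_dirty_limit() for the heavy dirtier */
	floor = 80000 - 80000 / 8 = 70000   /* task_min_dirty_limit(): lowest possible limit */

A task that dirtied nothing keeps the full 80000, so every per-task threshold lies in [7/8 * bdi_dirty, bdi_dirty].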
... | ... | @@ -397,6 +411,11 @@ |
397 | 411 | return x + 1; /* Ensure that we never return 0 */ |
398 | 412 | } |
399 | 413 | |
414 | +static unsigned long hard_dirty_limit(unsigned long thresh) | |
415 | +{ | |
416 | + return max(thresh, global_dirty_limit); | |
417 | +} | |
418 | + | |
400 | 419 | /* |
401 | 420 | * global_dirty_limits - background-writeback and dirty-throttling thresholds |
402 | 421 | * |
... | ... | @@ -435,12 +454,20 @@ |
435 | 454 | } |
436 | 455 | *pbackground = background; |
437 | 456 | *pdirty = dirty; |
457 | + trace_global_dirty_state(background, dirty); | |
438 | 458 | } |
439 | 459 | |
440 | -/* | |
460 | +/** | |
441 | 461 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold |
462 | + * @bdi: the backing_dev_info to query | |
463 | + * @dirty: global dirty limit in pages | |
442 | 464 | * |
443 | - * Allocate high/low dirty limits to fast/slow devices, in order to prevent | |
465 | + * Returns @bdi's dirty limit in pages. The term "dirty" in the context of | |
466 | + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. | |
467 | + * And the "limit" in the name is not seriously taken as hard limit in | |
468 | + * balance_dirty_pages(). | |
469 | + * | |
470 | + * It allocates high/low dirty limits to fast/slow devices, in order to prevent | |
444 | 471 | * - starving fast devices |
445 | 472 | * - piling up dirty pages (that will take long time to sync) on slow devices |
446 | 473 | * |
... | ... | @@ -468,7 +495,154 @@ |
468 | 495 | return bdi_dirty; |
469 | 496 | } |
470 | 497 | |
498 | +static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, | |
499 | + unsigned long elapsed, | |
500 | + unsigned long written) | |
501 | +{ | |
502 | + const unsigned long period = roundup_pow_of_two(3 * HZ); | |
503 | + unsigned long avg = bdi->avg_write_bandwidth; | |
504 | + unsigned long old = bdi->write_bandwidth; | |
505 | + u64 bw; | |
506 | + | |
507 | + /* | |
508 | + * bw = written * HZ / elapsed | |
509 | + * | |
510 | + * bw * elapsed + write_bandwidth * (period - elapsed) | |
511 | + * write_bandwidth = --------------------------------------------------- | |
512 | + * period | |
513 | + */ | |
514 | + bw = written - bdi->written_stamp; | |
515 | + bw *= HZ; | |
516 | + if (unlikely(elapsed > period)) { | |
517 | + do_div(bw, elapsed); | |
518 | + avg = bw; | |
519 | + goto out; | |
520 | + } | |
521 | + bw += (u64)bdi->write_bandwidth * (period - elapsed); | |
522 | + bw >>= ilog2(period); | |
523 | + | |
524 | + /* | |
525 | + * one more level of smoothing, for filtering out sudden spikes | |
526 | + */ | |
527 | + if (avg > old && old >= (unsigned long)bw) | |
528 | + avg -= (avg - old) >> 3; | |
529 | + | |
530 | + if (avg < old && old <= (unsigned long)bw) | |
531 | + avg += (old - avg) >> 3; | |
532 | + | |
533 | +out: | |
534 | + bdi->write_bandwidth = bw; | |
535 | + bdi->avg_write_bandwidth = avg; | |
536 | +} | |
537 | + | |
471 | 538 | /* |
539 | + * The global dirtyable memory and dirty threshold could be suddenly knocked | |
540 | + * down by a large amount (eg. on the startup of KVM in a swapless system). | |
541 | + * This may throw the system into deep dirty exceeded state and throttle | |
542 | + * heavy/light dirtiers alike. To retain good responsiveness, maintain | |
543 | + * global_dirty_limit for tracking slowly down to the knocked down dirty | |
544 | + * threshold. | |
545 | + */ | |
546 | +static void update_dirty_limit(unsigned long thresh, unsigned long dirty) | |
547 | +{ | |
548 | + unsigned long limit = global_dirty_limit; | |
549 | + | |
550 | + /* | |
551 | + * Follow up in one step. | |
552 | + */ | |
553 | + if (limit < thresh) { | |
554 | + limit = thresh; | |
555 | + goto update; | |
556 | + } | |
557 | + | |
558 | + /* | |
559 | + * Follow down slowly. Use the higher one as the target, because thresh | |
560 | + * may drop below dirty. This is exactly the reason to introduce | |
561 | + * global_dirty_limit which is guaranteed to lie above the dirty pages. | |
562 | + */ | |
563 | + thresh = max(thresh, dirty); | |
564 | + if (limit > thresh) { | |
565 | + limit -= (limit - thresh) >> 5; | |
566 | + goto update; | |
567 | + } | |
568 | + return; | |
569 | +update: | |
570 | + global_dirty_limit = limit; | |
571 | +} | |
572 | + | |
573 | +static void global_update_bandwidth(unsigned long thresh, | |
574 | + unsigned long dirty, | |
575 | + unsigned long now) | |
576 | +{ | |
577 | + static DEFINE_SPINLOCK(dirty_lock); | |
578 | + static unsigned long update_time; | |
579 | + | |
580 | + /* | |
581 | + * check locklessly first to optimize away locking for the most time | |
582 | + */ | |
583 | + if (time_before(now, update_time + BANDWIDTH_INTERVAL)) | |
584 | + return; | |
585 | + | |
586 | + spin_lock(&dirty_lock); | |
587 | + if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { | |
588 | + update_dirty_limit(thresh, dirty); | |
589 | + update_time = now; | |
590 | + } | |
591 | + spin_unlock(&dirty_lock); | |
592 | +} | |
593 | + | |
594 | +void __bdi_update_bandwidth(struct backing_dev_info *bdi, | |
595 | + unsigned long thresh, | |
596 | + unsigned long dirty, | |
597 | + unsigned long bdi_thresh, | |
598 | + unsigned long bdi_dirty, | |
599 | + unsigned long start_time) | |
600 | +{ | |
601 | + unsigned long now = jiffies; | |
602 | + unsigned long elapsed = now - bdi->bw_time_stamp; | |
603 | + unsigned long written; | |
604 | + | |
605 | + /* | |
606 | + * rate-limit, only update once every 200ms. | |
607 | + */ | |
608 | + if (elapsed < BANDWIDTH_INTERVAL) | |
609 | + return; | |
610 | + | |
611 | + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); | |
612 | + | |
613 | + /* | |
614 | + * Skip quiet periods when disk bandwidth is under-utilized. | |
615 | + * (at least 1s idle time between two flusher runs) | |
616 | + */ | |
617 | + if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) | |
618 | + goto snapshot; | |
619 | + | |
620 | + if (thresh) | |
621 | + global_update_bandwidth(thresh, dirty, now); | |
622 | + | |
623 | + bdi_update_write_bandwidth(bdi, elapsed, written); | |
624 | + | |
625 | +snapshot: | |
626 | + bdi->written_stamp = written; | |
627 | + bdi->bw_time_stamp = now; | |
628 | +} | |
629 | + | |
630 | +static void bdi_update_bandwidth(struct backing_dev_info *bdi, | |
631 | + unsigned long thresh, | |
632 | + unsigned long dirty, | |
633 | + unsigned long bdi_thresh, | |
634 | + unsigned long bdi_dirty, | |
635 | + unsigned long start_time) | |
636 | +{ | |
637 | + if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) | |
638 | + return; | |
639 | + spin_lock(&bdi->wb.list_lock); | |
640 | + __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, | |
641 | + start_time); | |
642 | + spin_unlock(&bdi->wb.list_lock); | |
643 | +} | |
644 | + | |
645 | +/* | |
472 | 646 | * balance_dirty_pages() must be called by processes which are generating dirty |
473 | 647 | * data. It looks at the number of dirty pages in the machine and will force |
474 | 648 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. |
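To make the bandwidth smoothing concrete, one illustrative update step (all numbers hypothetical; HZ = 1000, so period = roundup_pow_of_two(3 * 1000) = 4096 jiffies):

	elapsed = 200 jiffies, written - written_stamp = 5000 pages
	bw   = 5000 * 1000            /* written * HZ                  =   5,000,000 */
	bw  += 25600 * (4096 - 200)   /* + old bandwidth * (period-el) = 104,737,600 */
	bw >>= ilog2(4096)            /* / period                     ~=  25571 pages/s */

A single 200 ms sample at ~25000 pages/s barely moves a previous estimate of 25600 pages/s (~100 MB/s with 4 KiB pages), which is the point of weighting the sample by elapsed/period.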
... | ... | @@ -478,27 +652,25 @@ |
478 | 652 | static void balance_dirty_pages(struct address_space *mapping, |
479 | 653 | unsigned long write_chunk) |
480 | 654 | { |
481 | - long nr_reclaimable, bdi_nr_reclaimable; | |
482 | - long nr_writeback, bdi_nr_writeback; | |
655 | + unsigned long nr_reclaimable, bdi_nr_reclaimable; | |
656 | + unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ | |
657 | + unsigned long bdi_dirty; | |
483 | 658 | unsigned long background_thresh; |
484 | 659 | unsigned long dirty_thresh; |
485 | 660 | unsigned long bdi_thresh; |
661 | + unsigned long task_bdi_thresh; | |
662 | + unsigned long min_task_bdi_thresh; | |
486 | 663 | unsigned long pages_written = 0; |
487 | 664 | unsigned long pause = 1; |
488 | 665 | bool dirty_exceeded = false; |
666 | + bool clear_dirty_exceeded = true; | |
489 | 667 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
668 | + unsigned long start_time = jiffies; | |
490 | 669 | |
491 | 670 | for (;;) { |
492 | - struct writeback_control wbc = { | |
493 | - .sync_mode = WB_SYNC_NONE, | |
494 | - .older_than_this = NULL, | |
495 | - .nr_to_write = write_chunk, | |
496 | - .range_cyclic = 1, | |
497 | - }; | |
498 | - | |
499 | 671 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
500 | 672 | global_page_state(NR_UNSTABLE_NFS); |
501 | - nr_writeback = global_page_state(NR_WRITEBACK); | |
673 | + nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); | |
502 | 674 | |
503 | 675 | global_dirty_limits(&background_thresh, &dirty_thresh); |
504 | 676 | |
... | ... | @@ -507,12 +679,12 @@ |
507 | 679 | * catch-up. This avoids (excessively) small writeouts |
508 | 680 | * when the bdi limits are ramping up. |
509 | 681 | */ |
510 | - if (nr_reclaimable + nr_writeback <= | |
511 | - (background_thresh + dirty_thresh) / 2) | |
682 | + if (nr_dirty <= (background_thresh + dirty_thresh) / 2) | |
512 | 683 | break; |
513 | 684 | |
514 | 685 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
515 | - bdi_thresh = task_dirty_limit(current, bdi_thresh); | |
686 | + min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh); | |
687 | + task_bdi_thresh = task_dirty_limit(current, bdi_thresh); | |
516 | 688 | |
517 | 689 | /* |
518 | 690 | * In order to avoid the stacked BDI deadlock we need |
... | ... | @@ -524,12 +696,14 @@ |
524 | 696 | * actually dirty; with m+n sitting in the percpu |
525 | 697 | * deltas. |
526 | 698 | */ |
527 | - if (bdi_thresh < 2*bdi_stat_error(bdi)) { | |
699 | + if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { | |
528 | 700 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); |
529 | - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | |
701 | + bdi_dirty = bdi_nr_reclaimable + | |
702 | + bdi_stat_sum(bdi, BDI_WRITEBACK); | |
530 | 703 | } else { |
531 | 704 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
532 | - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | |
705 | + bdi_dirty = bdi_nr_reclaimable + | |
706 | + bdi_stat(bdi, BDI_WRITEBACK); | |
533 | 707 | } |
534 | 708 | |
535 | 709 | /* |
... | ... | @@ -538,9 +712,10 @@ |
538 | 712 | * bdi or process from holding back light ones; The latter is |
539 | 713 | * the last resort safeguard. |
540 | 714 | */ |
541 | - dirty_exceeded = | |
542 | - (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) | |
543 | - || (nr_reclaimable + nr_writeback > dirty_thresh); | |
715 | + dirty_exceeded = (bdi_dirty > task_bdi_thresh) || | |
716 | + (nr_dirty > dirty_thresh); | |
717 | + clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && | |
718 | + (nr_dirty <= dirty_thresh); | |
544 | 719 | |
545 | 720 | if (!dirty_exceeded) |
546 | 721 | break; |
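The flag is set against task_bdi_thresh but, via clear_dirty_exceeded tested after the loop, only cleared against the lower min_task_bdi_thresh, which gives bdi->dirty_exceeded a hysteresis band of bdi_thresh/8. With the hypothetical bdi_thresh = 80000 pages from the task-limit example above:

	task_bdi_thresh     somewhere in [70000, 80000]   /* depends on the task's dirty share */
	min_task_bdi_thresh = 80000 - 80000/8 = 70000

so the exceeded state is entered as soon as some throttled task's limit is crossed, and left only once even the most heavily penalised task would be back under its limit.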
... | ... | @@ -548,6 +723,9 @@ |
548 | 723 | if (!bdi->dirty_exceeded) |
549 | 724 | bdi->dirty_exceeded = 1; |
550 | 725 | |
726 | + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, | |
727 | + bdi_thresh, bdi_dirty, start_time); | |
728 | + | |
551 | 729 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. |
552 | 730 | * Unstable writes are a feature of certain networked |
553 | 731 | * filesystems (i.e. NFS) in which data may have been |
... | ... | @@ -557,19 +735,42 @@ |
557 | 735 | * threshold otherwise wait until the disk writes catch |
558 | 736 | * up. |
559 | 737 | */ |
560 | - trace_wbc_balance_dirty_start(&wbc, bdi); | |
561 | - if (bdi_nr_reclaimable > bdi_thresh) { | |
562 | - writeback_inodes_wb(&bdi->wb, &wbc); | |
563 | - pages_written += write_chunk - wbc.nr_to_write; | |
564 | - trace_wbc_balance_dirty_written(&wbc, bdi); | |
738 | + trace_balance_dirty_start(bdi); | |
739 | + if (bdi_nr_reclaimable > task_bdi_thresh) { | |
740 | + pages_written += writeback_inodes_wb(&bdi->wb, | |
741 | + write_chunk); | |
742 | + trace_balance_dirty_written(bdi, pages_written); | |
565 | 743 | if (pages_written >= write_chunk) |
566 | 744 | break; /* We've done our duty */ |
567 | 745 | } |
568 | - trace_wbc_balance_dirty_wait(&wbc, bdi); | |
569 | 746 | __set_current_state(TASK_UNINTERRUPTIBLE); |
570 | 747 | io_schedule_timeout(pause); |
748 | + trace_balance_dirty_wait(bdi); | |
571 | 749 | |
750 | + dirty_thresh = hard_dirty_limit(dirty_thresh); | |
572 | 751 | /* |
752 | + * max-pause area. If dirty exceeded but still within this | |
753 | + * area, no need to sleep for more than 200ms: (a) 8 pages per | |
754 | + * 200ms is typically more than enough to curb heavy dirtiers; | |
755 | + * (b) the pause time limit makes the dirtiers more responsive. | |
756 | + */ | |
757 | + if (nr_dirty < dirty_thresh + | |
758 | + dirty_thresh / DIRTY_MAXPAUSE_AREA && | |
759 | + time_after(jiffies, start_time + MAX_PAUSE)) | |
760 | + break; | |
761 | + /* | |
762 | + * pass-good area. When some bdi gets blocked (eg. NFS server | |
763 | + * not responding), or write bandwidth dropped dramatically due | |
764 | + * to concurrent reads, or dirty threshold suddenly dropped and | |
765 | + * the dirty pages cannot be brought down anytime soon (eg. on | |
766 | + * slow USB stick), at least let go of the good bdi's. | |
767 | + */ | |
768 | + if (nr_dirty < dirty_thresh + | |
769 | + dirty_thresh / DIRTY_PASSGOOD_AREA && | |
770 | + bdi_dirty < bdi_thresh) | |
771 | + break; | |
772 | + | |
773 | + /* | |
573 | 774 | * Increase the delay for each loop, up to our previous |
574 | 775 | * default of taking a 100ms nap. |
575 | 776 | */ |
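The two escape hatches added ahead of the pause-doubling above are both measured against hard_dirty_limit(dirty_thresh) = max(thresh, global_dirty_limit), so a sudden drop of the threshold does not instantly trap every dirtier. Restated for clarity only (DIRTY_MAXPAUSE_AREA and DIRTY_PASSGOOD_AREA are defined elsewhere in this series and are not shown in this hunk):

	limit = max(dirty_thresh, global_dirty_limit)
	max-pause: nr_dirty < limit + limit/DIRTY_MAXPAUSE_AREA, and more than MAX_PAUSE elapsed  -> break
	pass-good: nr_dirty < limit + limit/DIRTY_PASSGOOD_AREA, and bdi_dirty < bdi_thresh       -> break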
... | ... | @@ -578,7 +779,8 @@ |
578 | 779 | pause = HZ / 10; |
579 | 780 | } |
580 | 781 | |
581 | - if (!dirty_exceeded && bdi->dirty_exceeded) | |
782 | + /* Clear dirty_exceeded flag only when no task can exceed the limit */ | |
783 | + if (clear_dirty_exceeded && bdi->dirty_exceeded) | |
582 | 784 | bdi->dirty_exceeded = 0; |
583 | 785 | |
584 | 786 | if (writeback_in_progress(bdi)) |
... | ... | @@ -626,9 +828,13 @@ |
626 | 828 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
627 | 829 | unsigned long nr_pages_dirtied) |
628 | 830 | { |
831 | + struct backing_dev_info *bdi = mapping->backing_dev_info; | |
629 | 832 | unsigned long ratelimit; |
630 | 833 | unsigned long *p; |
631 | 834 | |
835 | + if (!bdi_cap_account_dirty(bdi)) | |
836 | + return; | |
837 | + | |
632 | 838 | ratelimit = ratelimit_pages; |
633 | 839 | if (mapping->backing_dev_info->dirty_exceeded) |
634 | 840 | ratelimit = 8; |
... | ... | @@ -892,12 +1098,12 @@ |
892 | 1098 | range_whole = 1; |
893 | 1099 | cycled = 1; /* ignore range_cyclic tests */ |
894 | 1100 | } |
895 | - if (wbc->sync_mode == WB_SYNC_ALL) | |
1101 | + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | |
896 | 1102 | tag = PAGECACHE_TAG_TOWRITE; |
897 | 1103 | else |
898 | 1104 | tag = PAGECACHE_TAG_DIRTY; |
899 | 1105 | retry: |
900 | - if (wbc->sync_mode == WB_SYNC_ALL) | |
1106 | + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | |
901 | 1107 | tag_pages_for_writeback(mapping, index, end); |
902 | 1108 | done_index = index; |
903 | 1109 | while (!done && (index <= end)) { |
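wbc->tagged_writepages is new in this series: WB_SYNC_NONE writeback can now use the PAGECACHE_TAG_TOWRITE tag-and-sweep scheme, which avoids livelocking on pages dirtied during the sweep without taking on the blocking semantics of WB_SYNC_ALL. A hedged sketch of a caller opting in; the field name comes from this hunk, the surrounding values are illustrative:

	struct writeback_control wbc = {
		.sync_mode         = WB_SYNC_NONE,
		.tagged_writepages = 1,	/* tag once up front, then only write tagged pages */
		.nr_to_write       = LONG_MAX,
		.range_start       = 0,
		.range_end         = LLONG_MAX,
	};

	generic_writepages(mapping, &wbc);	/* ends up in write_cache_pages() above */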
mm/rmap.c
... | ... | @@ -31,11 +31,11 @@ |
31 | 31 | * mmlist_lock (in mmput, drain_mmlist and others) |
32 | 32 | * mapping->private_lock (in __set_page_dirty_buffers) |
33 | 33 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
34 | - * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) | |
34 | + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) | |
35 | 35 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
36 | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | - * within inode_wb_list_lock in __sync_single_inode) | |
38 | + * within bdi.wb->list_lock in __sync_single_inode) | |
39 | 39 | * |
40 | 40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | 41 | * ->tasklist_lock |