Commit f01ef569cddb1a8627b1c6b3a134998ad1cf4b22

Authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
  mm: properly reflect task dirty limits in dirty_exceeded logic
  writeback: don't busy retry writeback on new/freeing inodes
  writeback: scale IO chunk size up to half device bandwidth
  writeback: trace global_dirty_state
  writeback: introduce max-pause and pass-good dirty limits
  writeback: introduce smoothed global dirty limit
  writeback: consolidate variable names in balance_dirty_pages()
  writeback: show bdi write bandwidth in debugfs
  writeback: bdi write bandwidth estimation
  writeback: account per-bdi accumulated written pages
  writeback: make writeback_control.nr_to_write straight
  writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
  writeback: trace event writeback_queue_io
  writeback: trace event writeback_single_inode
  writeback: remove .nonblocking and .encountered_congestion
  writeback: remove writeback_control.more_io
  writeback: skip balance_dirty_pages() for in-memory fs
  writeback: add bdi_dirty_limit() kernel-doc
  writeback: avoid extra sync work at enqueue time
  writeback: elevate queue_io() into wb_writeback()
  ...

Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c
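
Note: the heart of the new per-bdi bandwidth estimation (the "bdi write bandwidth
estimation" and "scale IO chunk size up to half device bandwidth" commits above,
visible in the mm/page-writeback.c hunks below) is a period-weighted running
average. The following is a rough, user-space-only sketch of that arithmetic;
the 3*HZ period and the blending formula follow the patch, while the function
name, the HZ value and the sample numbers are made up for illustration.

/*
 * Hypothetical stand-alone model of the blending step in
 * bdi_update_write_bandwidth(): mix the bandwidth observed over the last
 * `elapsed` ticks into the previous estimate, weighted by how much of the
 * ~3 second period that interval covers.
 */
#include <stdio.h>

#define HZ 1000UL

static unsigned long update_bw(unsigned long old_bw,	/* pages per second */
			       unsigned long written,	/* pages written in this interval */
			       unsigned long elapsed)	/* interval length in ticks */
{
	const unsigned long period = 4096;	/* roundup_pow_of_two(3 * HZ) */
	unsigned long long bw = (unsigned long long)written * HZ;

	if (elapsed >= period)			/* sample covers the whole period */
		return (unsigned long)(bw / elapsed);
	bw += (unsigned long long)old_bw * (period - elapsed);
	return (unsigned long)(bw / period);	/* the kernel uses bw >>= ilog2(period) */
}

int main(void)
{
	/* start at ~100 MB/s (25600 4k pages/s), feed one 200 ms sample at ~50 MB/s */
	unsigned long bw = 25600;

	bw = update_bw(bw, 2560, HZ / 5);
	printf("new estimate: %lu pages/s\n", bw);	/* nudged slightly toward 12800 */
	return 0;
}

The second smoothing pass in the patch (avg_write_bandwidth creeping 1/8 of the
way toward the new estimate) and the global_dirty_limit tracking are left out of
the sketch.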

Showing 15 changed files

fs/block_dev.c
... ... @@ -44,24 +44,28 @@
44 44 {
45 45 return &BDEV_I(inode)->bdev;
46 46 }
47   -
48 47 EXPORT_SYMBOL(I_BDEV);
49 48  
50 49 /*
51   - * move the inode from it's current bdi to the a new bdi. if the inode is dirty
52   - * we need to move it onto the dirty list of @dst so that the inode is always
53   - * on the right list.
  50 + * Move the inode from its current bdi to a new bdi. If the inode is dirty we
  51 + * need to move it onto the dirty list of @dst so that the inode is always on
  52 + * the right list.
54 53 */
55 54 static void bdev_inode_switch_bdi(struct inode *inode,
56 55 struct backing_dev_info *dst)
57 56 {
58   - spin_lock(&inode_wb_list_lock);
  57 + struct backing_dev_info *old = inode->i_data.backing_dev_info;
  58 +
  59 + if (unlikely(dst == old)) /* deadlock avoidance */
  60 + return;
  61 + bdi_lock_two(&old->wb, &dst->wb);
59 62 spin_lock(&inode->i_lock);
60 63 inode->i_data.backing_dev_info = dst;
61 64 if (inode->i_state & I_DIRTY)
62 65 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
63 66 spin_unlock(&inode->i_lock);
64   - spin_unlock(&inode_wb_list_lock);
  67 + spin_unlock(&old->wb.list_lock);
  68 + spin_unlock(&dst->wb.list_lock);
65 69 }
66 70  
67 71 static sector_t max_block(struct block_device *bdev)
fs/btrfs/extent_io.c
... ... @@ -2551,7 +2551,6 @@
2551 2551 };
2552 2552 struct writeback_control wbc_writepages = {
2553 2553 .sync_mode = wbc->sync_mode,
2554   - .older_than_this = NULL,
2555 2554 .nr_to_write = 64,
2556 2555 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2557 2556 .range_end = (loff_t)-1,
... ... @@ -2584,7 +2583,6 @@
2584 2583 };
2585 2584 struct writeback_control wbc_writepages = {
2586 2585 .sync_mode = mode,
2587   - .older_than_this = NULL,
2588 2586 .nr_to_write = nr_pages * 2,
2589 2587 .range_start = start,
2590 2588 .range_end = end + 1,
... ... @@ -2741,7 +2741,7 @@
2741 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2742 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2743 2743  
2744   - if (wbc->sync_mode == WB_SYNC_ALL)
  2744 + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2745 2745 tag = PAGECACHE_TAG_TOWRITE;
2746 2746 else
2747 2747 tag = PAGECACHE_TAG_DIRTY;
... ... @@ -2973,7 +2973,7 @@
2973 2973 }
2974 2974  
2975 2975 retry:
2976   - if (wbc->sync_mode == WB_SYNC_ALL)
  2976 + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2977 2977 tag_pages_for_writeback(mapping, index, end);
2978 2978  
2979 2979 while (!ret && wbc->nr_to_write > 0) {
fs/fs-writeback.c
... ... @@ -35,7 +35,9 @@
35 35 struct wb_writeback_work {
36 36 long nr_pages;
37 37 struct super_block *sb;
  38 + unsigned long *older_than_this;
38 39 enum writeback_sync_modes sync_mode;
  40 + unsigned int tagged_writepages:1;
39 41 unsigned int for_kupdate:1;
40 42 unsigned int range_cyclic:1;
41 43 unsigned int for_background:1;
... ... @@ -180,12 +182,13 @@
180 182 */
181 183 void inode_wb_list_del(struct inode *inode)
182 184 {
183   - spin_lock(&inode_wb_list_lock);
  185 + struct backing_dev_info *bdi = inode_to_bdi(inode);
  186 +
  187 + spin_lock(&bdi->wb.list_lock);
184 188 list_del_init(&inode->i_wb_list);
185   - spin_unlock(&inode_wb_list_lock);
  189 + spin_unlock(&bdi->wb.list_lock);
186 190 }
187 191  
188   -
189 192 /*
190 193 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
191 194 * furthest end of its superblock's dirty-inode list.
... ... @@ -195,11 +198,9 @@
195 198 * the case then the inode must have been redirtied while it was being written
196 199 * out and we don't reset its dirtied_when.
197 200 */
198   -static void redirty_tail(struct inode *inode)
  201 +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
199 202 {
200   - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
201   -
202   - assert_spin_locked(&inode_wb_list_lock);
  203 + assert_spin_locked(&wb->list_lock);
203 204 if (!list_empty(&wb->b_dirty)) {
204 205 struct inode *tail;
205 206  
... ... @@ -213,11 +214,9 @@
213 214 /*
214 215 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 216 */
216   -static void requeue_io(struct inode *inode)
  217 +static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
217 218 {
218   - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
219   -
220   - assert_spin_locked(&inode_wb_list_lock);
  219 + assert_spin_locked(&wb->list_lock);
221 220 list_move(&inode->i_wb_list, &wb->b_more_io);
222 221 }
223 222  
... ... @@ -225,7 +224,7 @@
225 224 {
226 225 /*
227 226 * Prevent speculative execution through
228   - * spin_unlock(&inode_wb_list_lock);
  227 + * spin_unlock(&wb->list_lock);
229 228 */
230 229  
231 230 smp_mb();
... ... @@ -250,15 +249,16 @@
250 249 /*
251 250 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
252 251 */
253   -static void move_expired_inodes(struct list_head *delaying_queue,
  252 +static int move_expired_inodes(struct list_head *delaying_queue,
254 253 struct list_head *dispatch_queue,
255   - unsigned long *older_than_this)
  254 + unsigned long *older_than_this)
256 255 {
257 256 LIST_HEAD(tmp);
258 257 struct list_head *pos, *node;
259 258 struct super_block *sb = NULL;
260 259 struct inode *inode;
261 260 int do_sb_sort = 0;
  261 + int moved = 0;
262 262  
263 263 while (!list_empty(delaying_queue)) {
264 264 inode = wb_inode(delaying_queue->prev);
... ... @@ -269,12 +269,13 @@
269 269 do_sb_sort = 1;
270 270 sb = inode->i_sb;
271 271 list_move(&inode->i_wb_list, &tmp);
  272 + moved++;
272 273 }
273 274  
274 275 /* just one sb in list, splice to dispatch_queue and we're done */
275 276 if (!do_sb_sort) {
276 277 list_splice(&tmp, dispatch_queue);
277   - return;
  278 + goto out;
278 279 }
279 280  
280 281 /* Move inodes from one superblock together */
... ... @@ -286,6 +287,8 @@
286 287 list_move(&inode->i_wb_list, dispatch_queue);
287 288 }
288 289 }
  290 +out:
  291 + return moved;
289 292 }
290 293  
291 294 /*
... ... @@ -301,9 +304,11 @@
301 304 */
302 305 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
303 306 {
304   - assert_spin_locked(&inode_wb_list_lock);
  307 + int moved;
  308 + assert_spin_locked(&wb->list_lock);
305 309 list_splice_init(&wb->b_more_io, &wb->b_io);
306   - move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
  310 + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
  311 + trace_writeback_queue_io(wb, older_than_this, moved);
307 312 }
308 313  
309 314 static int write_inode(struct inode *inode, struct writeback_control *wbc)
... ... @@ -316,7 +321,8 @@
316 321 /*
317 322 * Wait for writeback on an inode to complete.
318 323 */
319   -static void inode_wait_for_writeback(struct inode *inode)
  324 +static void inode_wait_for_writeback(struct inode *inode,
  325 + struct bdi_writeback *wb)
320 326 {
321 327 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
322 328 wait_queue_head_t *wqh;
... ... @@ -324,15 +330,15 @@
324 330 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
325 331 while (inode->i_state & I_SYNC) {
326 332 spin_unlock(&inode->i_lock);
327   - spin_unlock(&inode_wb_list_lock);
  333 + spin_unlock(&wb->list_lock);
328 334 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
329   - spin_lock(&inode_wb_list_lock);
  335 + spin_lock(&wb->list_lock);
330 336 spin_lock(&inode->i_lock);
331 337 }
332 338 }
333 339  
334 340 /*
335   - * Write out an inode's dirty pages. Called under inode_wb_list_lock and
  341 + * Write out an inode's dirty pages. Called under wb->list_lock and
336 342 * inode->i_lock. Either the caller has an active reference on the inode or
337 343 * the inode has I_WILL_FREE set.
338 344 *
... ... @@ -343,13 +349,15 @@
343 349 * livelocks, etc.
344 350 */
345 351 static int
346   -writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  352 +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
  353 + struct writeback_control *wbc)
347 354 {
348 355 struct address_space *mapping = inode->i_mapping;
  356 + long nr_to_write = wbc->nr_to_write;
349 357 unsigned dirty;
350 358 int ret;
351 359  
352   - assert_spin_locked(&inode_wb_list_lock);
  360 + assert_spin_locked(&wb->list_lock);
353 361 assert_spin_locked(&inode->i_lock);
354 362  
355 363 if (!atomic_read(&inode->i_count))
... ... @@ -367,14 +375,16 @@
367 375 * completed a full scan of b_io.
368 376 */
369 377 if (wbc->sync_mode != WB_SYNC_ALL) {
370   - requeue_io(inode);
  378 + requeue_io(inode, wb);
  379 + trace_writeback_single_inode_requeue(inode, wbc,
  380 + nr_to_write);
371 381 return 0;
372 382 }
373 383  
374 384 /*
375 385 * It's a data-integrity sync. We must wait.
376 386 */
377   - inode_wait_for_writeback(inode);
  387 + inode_wait_for_writeback(inode, wb);
378 388 }
379 389  
380 390 BUG_ON(inode->i_state & I_SYNC);
... ... @@ -383,7 +393,7 @@
383 393 inode->i_state |= I_SYNC;
384 394 inode->i_state &= ~I_DIRTY_PAGES;
385 395 spin_unlock(&inode->i_lock);
386   - spin_unlock(&inode_wb_list_lock);
  396 + spin_unlock(&wb->list_lock);
387 397  
388 398 ret = do_writepages(mapping, wbc);
389 399  
... ... @@ -414,10 +424,19 @@
414 424 ret = err;
415 425 }
416 426  
417   - spin_lock(&inode_wb_list_lock);
  427 + spin_lock(&wb->list_lock);
418 428 spin_lock(&inode->i_lock);
419 429 inode->i_state &= ~I_SYNC;
420 430 if (!(inode->i_state & I_FREEING)) {
  431 + /*
  432 + * Sync livelock prevention. Each inode is tagged and synced in
  433 + * one shot. If still dirty, it will be redirty_tail()'ed below.
  434 + * Update the dirty time to prevent enqueue and sync it again.
  435 + */
  436 + if ((inode->i_state & I_DIRTY) &&
  437 + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
  438 + inode->dirtied_when = jiffies;
  439 +
421 440 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
422 441 /*
423 442 * We didn't write back all the pages. nfs_writepages()
... ... @@ -428,7 +447,7 @@
428 447 /*
429 448 * slice used up: queue for next turn
430 449 */
431   - requeue_io(inode);
  450 + requeue_io(inode, wb);
432 451 } else {
433 452 /*
434 453 * Writeback blocked by something other than
... ... @@ -437,7 +456,7 @@
437 456 * retrying writeback of the dirty page/inode
438 457 * that cannot be performed immediately.
439 458 */
440   - redirty_tail(inode);
  459 + redirty_tail(inode, wb);
441 460 }
442 461 } else if (inode->i_state & I_DIRTY) {
443 462 /*
... ... @@ -446,7 +465,7 @@
446 465 * submission or metadata updates after data IO
447 466 * completion.
448 467 */
449   - redirty_tail(inode);
  468 + redirty_tail(inode, wb);
450 469 } else {
451 470 /*
452 471 * The inode is clean. At this point we either have
... ... @@ -457,9 +476,41 @@
457 476 }
458 477 }
459 478 inode_sync_complete(inode);
  479 + trace_writeback_single_inode(inode, wbc, nr_to_write);
460 480 return ret;
461 481 }
462 482  
  483 +static long writeback_chunk_size(struct backing_dev_info *bdi,
  484 + struct wb_writeback_work *work)
  485 +{
  486 + long pages;
  487 +
  488 + /*
  489 + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
  490 + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
  491 + * here avoids calling into writeback_inodes_wb() more than once.
  492 + *
  493 + * The intended call sequence for WB_SYNC_ALL writeback is:
  494 + *
  495 + * wb_writeback()
  496 + * writeback_sb_inodes() <== called only once
  497 + * write_cache_pages() <== called once for each inode
  498 + * (quickly) tag currently dirty pages
  499 + * (maybe slowly) sync all tagged pages
  500 + */
  501 + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
  502 + pages = LONG_MAX;
  503 + else {
  504 + pages = min(bdi->avg_write_bandwidth / 2,
  505 + global_dirty_limit / DIRTY_SCOPE);
  506 + pages = min(pages, work->nr_pages);
  507 + pages = round_down(pages + MIN_WRITEBACK_PAGES,
  508 + MIN_WRITEBACK_PAGES);
  509 + }
  510 +
  511 + return pages;
  512 +}
  513 +
463 514 /*
464 515 * Write a portion of b_io inodes which belong to @sb.
465 516 *
... ... @@ -467,24 +518,36 @@
467 518 * inodes. Otherwise write only ones which go sequentially
468 519 * in reverse order.
469 520 *
470   - * Return 1, if the caller writeback routine should be
471   - * interrupted. Otherwise return 0.
  521 + * Return the number of pages and/or inodes written.
472 522 */
473   -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
474   - struct writeback_control *wbc, bool only_this_sb)
  523 +static long writeback_sb_inodes(struct super_block *sb,
  524 + struct bdi_writeback *wb,
  525 + struct wb_writeback_work *work)
475 526 {
  527 + struct writeback_control wbc = {
  528 + .sync_mode = work->sync_mode,
  529 + .tagged_writepages = work->tagged_writepages,
  530 + .for_kupdate = work->for_kupdate,
  531 + .for_background = work->for_background,
  532 + .range_cyclic = work->range_cyclic,
  533 + .range_start = 0,
  534 + .range_end = LLONG_MAX,
  535 + };
  536 + unsigned long start_time = jiffies;
  537 + long write_chunk;
  538 + long wrote = 0; /* count both pages and inodes */
  539 +
476 540 while (!list_empty(&wb->b_io)) {
477   - long pages_skipped;
478 541 struct inode *inode = wb_inode(wb->b_io.prev);
479 542  
480 543 if (inode->i_sb != sb) {
481   - if (only_this_sb) {
  544 + if (work->sb) {
482 545 /*
483 546 * We only want to write back data for this
484 547 * superblock, move all inodes not belonging
485 548 * to it back onto the dirty list.
486 549 */
487   - redirty_tail(inode);
  550 + redirty_tail(inode, wb);
488 551 continue;
489 552 }
490 553  
... ... @@ -493,7 +556,7 @@
493 556 * Bounce back to the caller to unpin this and
494 557 * pin the next superblock.
495 558 */
496   - return 0;
  559 + break;
497 560 }
498 561  
499 562 /*
504 567 spin_lock(&inode->i_lock);
505 568 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
506 569 spin_unlock(&inode->i_lock);
507   - requeue_io(inode);
  570 + redirty_tail(inode, wb);
508 571 continue;
509 572 }
510   -
511   - /*
512   - * Was this inode dirtied after sync_sb_inodes was called?
513   - * This keeps sync from extra jobs and livelock.
514   - */
515   - if (inode_dirtied_after(inode, wbc->wb_start)) {
516   - spin_unlock(&inode->i_lock);
517   - return 1;
518   - }
519   -
520 573 __iget(inode);
  574 + write_chunk = writeback_chunk_size(wb->bdi, work);
  575 + wbc.nr_to_write = write_chunk;
  576 + wbc.pages_skipped = 0;
521 577  
522   - pages_skipped = wbc->pages_skipped;
523   - writeback_single_inode(inode, wbc);
524   - if (wbc->pages_skipped != pages_skipped) {
  578 + writeback_single_inode(inode, wb, &wbc);
  579 +
  580 + work->nr_pages -= write_chunk - wbc.nr_to_write;
  581 + wrote += write_chunk - wbc.nr_to_write;
  582 + if (!(inode->i_state & I_DIRTY))
  583 + wrote++;
  584 + if (wbc.pages_skipped) {
525 585 /*
526 586 * writeback is not making progress due to locked
527 587 * buffers. Skip this inode for now.
528 588 */
529   - redirty_tail(inode);
  589 + redirty_tail(inode, wb);
530 590 }
531 591 spin_unlock(&inode->i_lock);
532   - spin_unlock(&inode_wb_list_lock);
  592 + spin_unlock(&wb->list_lock);
533 593 iput(inode);
534 594 cond_resched();
535   - spin_lock(&inode_wb_list_lock);
536   - if (wbc->nr_to_write <= 0) {
537   - wbc->more_io = 1;
538   - return 1;
  595 + spin_lock(&wb->list_lock);
  596 + /*
  597 + * bail out to wb_writeback() often enough to check
  598 + * background threshold and other termination conditions.
  599 + */
  600 + if (wrote) {
  601 + if (time_is_before_jiffies(start_time + HZ / 10UL))
  602 + break;
  603 + if (work->nr_pages <= 0)
  604 + break;
539 605 }
540   - if (!list_empty(&wb->b_more_io))
541   - wbc->more_io = 1;
542 606 }
543   - /* b_io is empty */
544   - return 1;
  607 + return wrote;
545 608 }
546 609  
547   -void writeback_inodes_wb(struct bdi_writeback *wb,
548   - struct writeback_control *wbc)
  610 +static long __writeback_inodes_wb(struct bdi_writeback *wb,
  611 + struct wb_writeback_work *work)
549 612 {
550   - int ret = 0;
  613 + unsigned long start_time = jiffies;
  614 + long wrote = 0;
551 615  
552   - if (!wbc->wb_start)
553   - wbc->wb_start = jiffies; /* livelock avoidance */
554   - spin_lock(&inode_wb_list_lock);
555   - if (!wbc->for_kupdate || list_empty(&wb->b_io))
556   - queue_io(wb, wbc->older_than_this);
557   -
558 616 while (!list_empty(&wb->b_io)) {
559 617 struct inode *inode = wb_inode(wb->b_io.prev);
560 618 struct super_block *sb = inode->i_sb;
561 619  
562 620 if (!grab_super_passive(sb)) {
563   - requeue_io(inode);
  621 + requeue_io(inode, wb);
564 622 continue;
565 623 }
566   - ret = writeback_sb_inodes(sb, wb, wbc, false);
  624 + wrote += writeback_sb_inodes(sb, wb, work);
567 625 drop_super(sb);
568 626  
569   - if (ret)
570   - break;
  627 + /* refer to the same tests at the end of writeback_sb_inodes */
  628 + if (wrote) {
  629 + if (time_is_before_jiffies(start_time + HZ / 10UL))
  630 + break;
  631 + if (work->nr_pages <= 0)
  632 + break;
  633 + }
571 634 }
572   - spin_unlock(&inode_wb_list_lock);
573 635 /* Leave any unwritten inodes on b_io */
  636 + return wrote;
574 637 }
575 638  
576   -static void __writeback_inodes_sb(struct super_block *sb,
577   - struct bdi_writeback *wb, struct writeback_control *wbc)
  639 +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
578 640 {
579   - WARN_ON(!rwsem_is_locked(&sb->s_umount));
  641 + struct wb_writeback_work work = {
  642 + .nr_pages = nr_pages,
  643 + .sync_mode = WB_SYNC_NONE,
  644 + .range_cyclic = 1,
  645 + };
580 646  
581   - spin_lock(&inode_wb_list_lock);
582   - if (!wbc->for_kupdate || list_empty(&wb->b_io))
583   - queue_io(wb, wbc->older_than_this);
584   - writeback_sb_inodes(sb, wb, wbc, true);
585   - spin_unlock(&inode_wb_list_lock);
  647 + spin_lock(&wb->list_lock);
  648 + if (list_empty(&wb->b_io))
  649 + queue_io(wb, NULL);
  650 + __writeback_inodes_wb(wb, &work);
  651 + spin_unlock(&wb->list_lock);
  652 +
  653 + return nr_pages - work.nr_pages;
586 654 }
587 655  
588   -/*
589   - * The maximum number of pages to writeout in a single bdi flush/kupdate
590   - * operation. We do this so we don't hold I_SYNC against an inode for
591   - * enormous amounts of time, which would block a userspace task which has
592   - * been forced to throttle against that inode. Also, the code reevaluates
593   - * the dirty each time it has written this many pages.
594   - */
595   -#define MAX_WRITEBACK_PAGES 1024
596   -
597 656 static inline bool over_bground_thresh(void)
598 657 {
599 658 unsigned long background_thresh, dirty_thresh;
... ... @@ -605,6 +664,16 @@
605 664 }
606 665  
607 666 /*
  667 + * Called under wb->list_lock. If there are multiple wb per bdi,
  668 + * only the flusher working on the first wb should do it.
  669 + */
  670 +static void wb_update_bandwidth(struct bdi_writeback *wb,
  671 + unsigned long start_time)
  672 +{
  673 + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
  674 +}
  675 +
  676 +/*
608 677 * Explicit flushing or periodic writeback of "old" data.
609 678 *
610 679 * Define "old": the first time one of an inode's pages is dirtied, we mark the
... ... @@ -622,47 +691,16 @@
622 691 static long wb_writeback(struct bdi_writeback *wb,
623 692 struct wb_writeback_work *work)
624 693 {
625   - struct writeback_control wbc = {
626   - .sync_mode = work->sync_mode,
627   - .older_than_this = NULL,
628   - .for_kupdate = work->for_kupdate,
629   - .for_background = work->for_background,
630   - .range_cyclic = work->range_cyclic,
631   - };
  694 + unsigned long wb_start = jiffies;
  695 + long nr_pages = work->nr_pages;
632 696 unsigned long oldest_jif;
633   - long wrote = 0;
634   - long write_chunk;
635 697 struct inode *inode;
  698 + long progress;
636 699  
637   - if (wbc.for_kupdate) {
638   - wbc.older_than_this = &oldest_jif;
639   - oldest_jif = jiffies -
640   - msecs_to_jiffies(dirty_expire_interval * 10);
641   - }
642   - if (!wbc.range_cyclic) {
643   - wbc.range_start = 0;
644   - wbc.range_end = LLONG_MAX;
645   - }
  700 + oldest_jif = jiffies;
  701 + work->older_than_this = &oldest_jif;
646 702  
647   - /*
648   - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
649   - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
650   - * here avoids calling into writeback_inodes_wb() more than once.
651   - *
652   - * The intended call sequence for WB_SYNC_ALL writeback is:
653   - *
654   - * wb_writeback()
655   - * __writeback_inodes_sb() <== called only once
656   - * write_cache_pages() <== called once for each inode
657   - * (quickly) tag currently dirty pages
658   - * (maybe slowly) sync all tagged pages
659   - */
660   - if (wbc.sync_mode == WB_SYNC_NONE)
661   - write_chunk = MAX_WRITEBACK_PAGES;
662   - else
663   - write_chunk = LONG_MAX;
664   -
665   - wbc.wb_start = jiffies; /* livelock avoidance */
  703 + spin_lock(&wb->list_lock);
666 704 for (;;) {
667 705 /*
668 706 * Stop writeback when nr_pages has been consumed
... ... @@ -687,52 +725,54 @@
687 725 if (work->for_background && !over_bground_thresh())
688 726 break;
689 727  
690   - wbc.more_io = 0;
691   - wbc.nr_to_write = write_chunk;
692   - wbc.pages_skipped = 0;
  728 + if (work->for_kupdate) {
  729 + oldest_jif = jiffies -
  730 + msecs_to_jiffies(dirty_expire_interval * 10);
  731 + work->older_than_this = &oldest_jif;
  732 + }
693 733  
694   - trace_wbc_writeback_start(&wbc, wb->bdi);
  734 + trace_writeback_start(wb->bdi, work);
  735 + if (list_empty(&wb->b_io))
  736 + queue_io(wb, work->older_than_this);
695 737 if (work->sb)
696   - __writeback_inodes_sb(work->sb, wb, &wbc);
  738 + progress = writeback_sb_inodes(work->sb, wb, work);
697 739 else
698   - writeback_inodes_wb(wb, &wbc);
699   - trace_wbc_writeback_written(&wbc, wb->bdi);
  740 + progress = __writeback_inodes_wb(wb, work);
  741 + trace_writeback_written(wb->bdi, work);
700 742  
701   - work->nr_pages -= write_chunk - wbc.nr_to_write;
702   - wrote += write_chunk - wbc.nr_to_write;
  743 + wb_update_bandwidth(wb, wb_start);
703 744  
704 745 /*
705   - * If we consumed everything, see if we have more
  746 + * Did we write something? Try for more
  747 + *
  748 + * Dirty inodes are moved to b_io for writeback in batches.
  749 + * The completion of the current batch does not necessarily
  750 + * mean the overall work is done. So we keep looping as long
  751 + * as made some progress on cleaning pages or inodes.
706 752 */
707   - if (wbc.nr_to_write <= 0)
  753 + if (progress)
708 754 continue;
709 755 /*
710   - * Didn't write everything and we don't have more IO, bail
  756 + * No more inodes for IO, bail
711 757 */
712   - if (!wbc.more_io)
  758 + if (list_empty(&wb->b_more_io))
713 759 break;
714 760 /*
715   - * Did we write something? Try for more
716   - */
717   - if (wbc.nr_to_write < write_chunk)
718   - continue;
719   - /*
720 761 * Nothing written. Wait for some inode to
721 762 * become available for writeback. Otherwise
722 763 * we'll just busyloop.
723 764 */
724   - spin_lock(&inode_wb_list_lock);
725 765 if (!list_empty(&wb->b_more_io)) {
  766 + trace_writeback_wait(wb->bdi, work);
726 767 inode = wb_inode(wb->b_more_io.prev);
727   - trace_wbc_writeback_wait(&wbc, wb->bdi);
728 768 spin_lock(&inode->i_lock);
729   - inode_wait_for_writeback(inode);
  769 + inode_wait_for_writeback(inode, wb);
730 770 spin_unlock(&inode->i_lock);
731 771 }
732   - spin_unlock(&inode_wb_list_lock);
733 772 }
  773 + spin_unlock(&wb->list_lock);
734 774  
735   - return wrote;
  775 + return nr_pages - work->nr_pages;
736 776 }
737 777  
738 778 /*
... ... @@ -1063,10 +1103,10 @@
1063 1103 }
1064 1104  
1065 1105 spin_unlock(&inode->i_lock);
1066   - spin_lock(&inode_wb_list_lock);
  1106 + spin_lock(&bdi->wb.list_lock);
1067 1107 inode->dirtied_when = jiffies;
1068 1108 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1069   - spin_unlock(&inode_wb_list_lock);
  1109 + spin_unlock(&bdi->wb.list_lock);
1070 1110  
1071 1111 if (wakeup_bdi)
1072 1112 bdi_wakeup_thread_delayed(bdi);
... ... @@ -1162,10 +1202,11 @@
1162 1202 {
1163 1203 DECLARE_COMPLETION_ONSTACK(done);
1164 1204 struct wb_writeback_work work = {
1165   - .sb = sb,
1166   - .sync_mode = WB_SYNC_NONE,
1167   - .done = &done,
1168   - .nr_pages = nr,
  1205 + .sb = sb,
  1206 + .sync_mode = WB_SYNC_NONE,
  1207 + .tagged_writepages = 1,
  1208 + .done = &done,
  1209 + .nr_pages = nr,
1169 1210 };
1170 1211  
1171 1212 WARN_ON(!rwsem_is_locked(&sb->s_umount));
... ... @@ -1267,6 +1308,7 @@
1267 1308 */
1268 1309 int write_inode_now(struct inode *inode, int sync)
1269 1310 {
  1311 + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1270 1312 int ret;
1271 1313 struct writeback_control wbc = {
1272 1314 .nr_to_write = LONG_MAX,
... ... @@ -1279,11 +1321,11 @@
1279 1321 wbc.nr_to_write = 0;
1280 1322  
1281 1323 might_sleep();
1282   - spin_lock(&inode_wb_list_lock);
  1324 + spin_lock(&wb->list_lock);
1283 1325 spin_lock(&inode->i_lock);
1284   - ret = writeback_single_inode(inode, &wbc);
  1326 + ret = writeback_single_inode(inode, wb, &wbc);
1285 1327 spin_unlock(&inode->i_lock);
1286   - spin_unlock(&inode_wb_list_lock);
  1328 + spin_unlock(&wb->list_lock);
1287 1329 if (sync)
1288 1330 inode_sync_wait(inode);
1289 1331 return ret;
... ... @@ -1303,13 +1345,14 @@
1303 1345 */
1304 1346 int sync_inode(struct inode *inode, struct writeback_control *wbc)
1305 1347 {
  1348 + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1306 1349 int ret;
1307 1350  
1308   - spin_lock(&inode_wb_list_lock);
  1351 + spin_lock(&wb->list_lock);
1309 1352 spin_lock(&inode->i_lock);
1310   - ret = writeback_single_inode(inode, wbc);
  1353 + ret = writeback_single_inode(inode, wb, wbc);
1311 1354 spin_unlock(&inode->i_lock);
1312   - spin_unlock(&inode_wb_list_lock);
  1355 + spin_unlock(&wb->list_lock);
1313 1356 return ret;
1314 1357 }
1315 1358 EXPORT_SYMBOL(sync_inode);
fs/inode.c
... ... @@ -37,7 +37,7 @@
37 37 * inode->i_sb->s_inode_lru, inode->i_lru
38 38 * inode_sb_list_lock protects:
39 39 * sb->s_inodes, inode->i_sb_list
40   - * inode_wb_list_lock protects:
  40 + * bdi->wb.list_lock protects:
41 41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 42 * inode_hash_lock protects:
43 43 * inode_hashtable, inode->i_hash
... ... @@ -48,7 +48,7 @@
48 48 * inode->i_lock
49 49 * inode->i_sb->s_inode_lru_lock
50 50 *
51   - * inode_wb_list_lock
  51 + * bdi->wb.list_lock
52 52 * inode->i_lock
53 53 *
54 54 * inode_hash_lock
... ... @@ -65,7 +65,6 @@
65 65 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
66 66  
67 67 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
68   -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
69 68  
70 69 /*
71 70 * Empty aops. Can be used for the cases where the user does not
fs/nfs/write.c
... ... @@ -1566,8 +1566,7 @@
1566 1566 int status;
1567 1567 bool sync = true;
1568 1568  
1569   - if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
1570   - wbc->for_background)
  1569 + if (wbc->sync_mode == WB_SYNC_NONE)
1571 1570 sync = false;
1572 1571  
1573 1572 status = pnfs_layoutcommit_inode(inode, sync);
include/linux/backing-dev.h
... ... @@ -40,6 +40,7 @@
40 40 enum bdi_stat_item {
41 41 BDI_RECLAIMABLE,
42 42 BDI_WRITEBACK,
  43 + BDI_WRITTEN,
43 44 NR_BDI_STAT_ITEMS
44 45 };
45 46  
... ... @@ -57,6 +58,7 @@
57 58 struct list_head b_dirty; /* dirty inodes */
58 59 struct list_head b_io; /* parked for writeback */
59 60 struct list_head b_more_io; /* parked for more writeback */
  61 + spinlock_t list_lock; /* protects the b_* lists */
60 62 };
61 63  
62 64 struct backing_dev_info {
... ... @@ -71,6 +73,11 @@
71 73  
72 74 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
73 75  
  76 + unsigned long bw_time_stamp; /* last time write bw is updated */
  77 + unsigned long written_stamp; /* pages written at bw_time_stamp */
  78 + unsigned long write_bandwidth; /* the estimated write bandwidth */
  79 + unsigned long avg_write_bandwidth; /* further smoothed write bw */
  80 +
74 81 struct prop_local_percpu completions;
75 82 int dirty_exceeded;
76 83  
... ... @@ -106,6 +113,7 @@
106 113 int bdi_has_dirty_io(struct backing_dev_info *bdi);
107 114 void bdi_arm_supers_timer(void);
108 115 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
  116 +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
109 117  
110 118 extern spinlock_t bdi_lock;
111 119 extern struct list_head bdi_list;
include/linux/writeback.h
... ... @@ -7,10 +7,40 @@
7 7 #include <linux/sched.h>
8 8 #include <linux/fs.h>
9 9  
10   -struct backing_dev_info;
  10 +/*
  11 + * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
  12 + *
  13 + * (thresh - thresh/DIRTY_FULL_SCOPE, thresh)
  14 + *
  15 + * The 1/16 region above the global dirty limit will be put to maximum pauses:
  16 + *
  17 + * (limit, limit + limit/DIRTY_MAXPAUSE_AREA)
  18 + *
  19 + * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put
  20 + * to loops:
  21 + *
  22 + * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA)
  23 + *
  24 + * Further beyond, all dirtier tasks will enter a loop waiting (possibly long
  25 + * time) for the dirty pages to drop, unless written enough pages.
  26 + *
  27 + * The global dirty threshold is normally equal to the global dirty limit,
  28 + * except when the system suddenly allocates a lot of anonymous memory and
  29 + * knocks down the global dirty threshold quickly, in which case the global
  30 + * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
  31 + */
  32 +#define DIRTY_SCOPE 8
  33 +#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2)
  34 +#define DIRTY_MAXPAUSE_AREA 16
  35 +#define DIRTY_PASSGOOD_AREA 8
11 36  
12   -extern spinlock_t inode_wb_list_lock;
  37 +/*
  38 + * 4MB minimal write chunk size
  39 + */
  40 +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
13 41  
  42 +struct backing_dev_info;
  43 +
14 44 /*
15 45 * fs/fs-writeback.c
16 46 */
... ... @@ -26,11 +56,6 @@
26 56 */
27 57 struct writeback_control {
28 58 enum writeback_sync_modes sync_mode;
29   - unsigned long *older_than_this; /* If !NULL, only write back inodes
30   - older than this */
31   - unsigned long wb_start; /* Time writeback_inodes_wb was
32   - called. This is needed to avoid
33   - extra jobs and livelock */
34 59 long nr_to_write; /* Write this many pages, and decrement
35 60 this for each page written */
36 61 long pages_skipped; /* Pages which were not written */
... ... @@ -43,13 +68,11 @@
43 68 loff_t range_start;
44 69 loff_t range_end;
45 70  
46   - unsigned nonblocking:1; /* Don't get stuck on request queues */
47   - unsigned encountered_congestion:1; /* An output: a queue is full */
48 71 unsigned for_kupdate:1; /* A kupdate writeback */
49 72 unsigned for_background:1; /* A background writeback */
  73 + unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */
50 74 unsigned for_reclaim:1; /* Invoked from the page allocator */
51 75 unsigned range_cyclic:1; /* range_start is cyclic */
52   - unsigned more_io:1; /* more io to be dispatched */
53 76 };
54 77  
55 78 /*
... ... @@ -62,8 +85,7 @@
62 85 int writeback_inodes_sb_if_idle(struct super_block *);
63 86 int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
64 87 void sync_inodes_sb(struct super_block *);
65   -void writeback_inodes_wb(struct bdi_writeback *wb,
66   - struct writeback_control *wbc);
  88 +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages);
67 89 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
68 90 void wakeup_flusher_threads(long nr_pages);
69 91  
... ... @@ -94,6 +116,8 @@
94 116 #endif
95 117 void throttle_vm_writeout(gfp_t gfp_mask);
96 118  
  119 +extern unsigned long global_dirty_limit;
  120 +
97 121 /* These are exported to sysctl. */
98 122 extern int dirty_background_ratio;
99 123 extern unsigned long dirty_background_bytes;
... ... @@ -127,6 +151,13 @@
127 151 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
128 152 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
129 153 unsigned long dirty);
  154 +
  155 +void __bdi_update_bandwidth(struct backing_dev_info *bdi,
  156 + unsigned long thresh,
  157 + unsigned long dirty,
  158 + unsigned long bdi_thresh,
  159 + unsigned long bdi_dirty,
  160 + unsigned long start_time);
130 161  
131 162 void page_writeback_init(void);
132 163 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
include/trace/events/btrfs.h
... ... @@ -284,7 +284,6 @@
284 284 __field( long, pages_skipped )
285 285 __field( loff_t, range_start )
286 286 __field( loff_t, range_end )
287   - __field( char, nonblocking )
288 287 __field( char, for_kupdate )
289 288 __field( char, for_reclaim )
290 289 __field( char, range_cyclic )
... ... @@ -299,7 +298,6 @@
299 298 __entry->pages_skipped = wbc->pages_skipped;
300 299 __entry->range_start = wbc->range_start;
301 300 __entry->range_end = wbc->range_end;
302   - __entry->nonblocking = wbc->nonblocking;
303 301 __entry->for_kupdate = wbc->for_kupdate;
304 302 __entry->for_reclaim = wbc->for_reclaim;
305 303 __entry->range_cyclic = wbc->range_cyclic;
... ... @@ -310,13 +308,13 @@
310 308  
311 309 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, "
312 310 "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, "
313   - "range_end = %llu, nonblocking = %d, for_kupdate = %d, "
  311 + "range_end = %llu, for_kupdate = %d, "
314 312 "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu",
315 313 show_root_type(__entry->root_objectid),
316 314 (unsigned long)__entry->ino, __entry->index,
317 315 __entry->nr_to_write, __entry->pages_skipped,
318 316 __entry->range_start, __entry->range_end,
319   - __entry->nonblocking, __entry->for_kupdate,
  317 + __entry->for_kupdate,
320 318 __entry->for_reclaim, __entry->range_cyclic,
321 319 (unsigned long)__entry->writeback_index)
322 320 );
include/trace/events/ext4.h
... ... @@ -380,7 +380,6 @@
380 380 __field( int, pages_written )
381 381 __field( long, pages_skipped )
382 382 __field( int, sync_mode )
383   - __field( char, more_io )
384 383 __field( pgoff_t, writeback_index )
385 384 ),
386 385  
... ... @@ -391,16 +390,15 @@
391 390 __entry->pages_written = pages_written;
392 391 __entry->pages_skipped = wbc->pages_skipped;
393 392 __entry->sync_mode = wbc->sync_mode;
394   - __entry->more_io = wbc->more_io;
395 393 __entry->writeback_index = inode->i_mapping->writeback_index;
396 394 ),
397 395  
398 396 TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
399   - " more_io %d sync_mode %d writeback_index %lu",
  397 + "sync_mode %d writeback_index %lu",
400 398 MAJOR(__entry->dev), MINOR(__entry->dev),
401 399 (unsigned long) __entry->ino, __entry->ret,
402 400 __entry->pages_written, __entry->pages_skipped,
403   - __entry->more_io, __entry->sync_mode,
  401 + __entry->sync_mode,
404 402 (unsigned long) __entry->writeback_index)
405 403 );
406 404  
include/trace/events/writeback.h
... ... @@ -8,6 +8,19 @@
8 8 #include <linux/device.h>
9 9 #include <linux/writeback.h>
10 10  
  11 +#define show_inode_state(state) \
  12 + __print_flags(state, "|", \
  13 + {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \
  14 + {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \
  15 + {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \
  16 + {I_NEW, "I_NEW"}, \
  17 + {I_WILL_FREE, "I_WILL_FREE"}, \
  18 + {I_FREEING, "I_FREEING"}, \
  19 + {I_CLEAR, "I_CLEAR"}, \
  20 + {I_SYNC, "I_SYNC"}, \
  21 + {I_REFERENCED, "I_REFERENCED"} \
  22 + )
  23 +
11 24 struct wb_writeback_work;
12 25  
13 26 DECLARE_EVENT_CLASS(writeback_work_class,
... ... @@ -49,6 +62,9 @@
49 62 DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
50 63 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
51 64 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
  65 +DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
  66 +DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
  67 +DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);
52 68  
53 69 TRACE_EVENT(writeback_pages_written,
54 70 TP_PROTO(long pages_written),
... ... @@ -88,7 +104,31 @@
88 104 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
89 105 DEFINE_WRITEBACK_EVENT(writeback_thread_start);
90 106 DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
  107 +DEFINE_WRITEBACK_EVENT(balance_dirty_start);
  108 +DEFINE_WRITEBACK_EVENT(balance_dirty_wait);
91 109  
  110 +TRACE_EVENT(balance_dirty_written,
  111 +
  112 + TP_PROTO(struct backing_dev_info *bdi, int written),
  113 +
  114 + TP_ARGS(bdi, written),
  115 +
  116 + TP_STRUCT__entry(
  117 + __array(char, name, 32)
  118 + __field(int, written)
  119 + ),
  120 +
  121 + TP_fast_assign(
  122 + strncpy(__entry->name, dev_name(bdi->dev), 32);
  123 + __entry->written = written;
  124 + ),
  125 +
  126 + TP_printk("bdi %s written %d",
  127 + __entry->name,
  128 + __entry->written
  129 + )
  130 +);
  131 +
92 132 DECLARE_EVENT_CLASS(wbc_class,
93 133 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
94 134 TP_ARGS(wbc, bdi),
... ... @@ -101,8 +141,6 @@
101 141 __field(int, for_background)
102 142 __field(int, for_reclaim)
103 143 __field(int, range_cyclic)
104   - __field(int, more_io)
105   - __field(unsigned long, older_than_this)
106 144 __field(long, range_start)
107 145 __field(long, range_end)
108 146 ),
... ... @@ -116,15 +154,12 @@
116 154 __entry->for_background = wbc->for_background;
117 155 __entry->for_reclaim = wbc->for_reclaim;
118 156 __entry->range_cyclic = wbc->range_cyclic;
119   - __entry->more_io = wbc->more_io;
120   - __entry->older_than_this = wbc->older_than_this ?
121   - *wbc->older_than_this : 0;
122 157 __entry->range_start = (long)wbc->range_start;
123 158 __entry->range_end = (long)wbc->range_end;
124 159 ),
125 160  
126 161 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
127   - "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
  162 + "bgrd=%d reclm=%d cyclic=%d "
128 163 "start=0x%lx end=0x%lx",
129 164 __entry->name,
130 165 __entry->nr_to_write,
... ... @@ -134,8 +169,6 @@
134 169 __entry->for_background,
135 170 __entry->for_reclaim,
136 171 __entry->range_cyclic,
137   - __entry->more_io,
138   - __entry->older_than_this,
139 172 __entry->range_start,
140 173 __entry->range_end)
141 174 )
... ... @@ -144,14 +177,79 @@
144 177 DEFINE_EVENT(wbc_class, name, \
145 178 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
146 179 TP_ARGS(wbc, bdi))
147   -DEFINE_WBC_EVENT(wbc_writeback_start);
148   -DEFINE_WBC_EVENT(wbc_writeback_written);
149   -DEFINE_WBC_EVENT(wbc_writeback_wait);
150   -DEFINE_WBC_EVENT(wbc_balance_dirty_start);
151   -DEFINE_WBC_EVENT(wbc_balance_dirty_written);
152   -DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
153 180 DEFINE_WBC_EVENT(wbc_writepage);
154 181  
  182 +TRACE_EVENT(writeback_queue_io,
  183 + TP_PROTO(struct bdi_writeback *wb,
  184 + unsigned long *older_than_this,
  185 + int moved),
  186 + TP_ARGS(wb, older_than_this, moved),
  187 + TP_STRUCT__entry(
  188 + __array(char, name, 32)
  189 + __field(unsigned long, older)
  190 + __field(long, age)
  191 + __field(int, moved)
  192 + ),
  193 + TP_fast_assign(
  194 + strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
  195 + __entry->older = older_than_this ? *older_than_this : 0;
  196 + __entry->age = older_than_this ?
  197 + (jiffies - *older_than_this) * 1000 / HZ : -1;
  198 + __entry->moved = moved;
  199 + ),
  200 + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d",
  201 + __entry->name,
  202 + __entry->older, /* older_than_this in jiffies */
  203 + __entry->age, /* older_than_this in relative milliseconds */
  204 + __entry->moved)
  205 +);
  206 +
  207 +TRACE_EVENT(global_dirty_state,
  208 +
  209 + TP_PROTO(unsigned long background_thresh,
  210 + unsigned long dirty_thresh
  211 + ),
  212 +
  213 + TP_ARGS(background_thresh,
  214 + dirty_thresh
  215 + ),
  216 +
  217 + TP_STRUCT__entry(
  218 + __field(unsigned long, nr_dirty)
  219 + __field(unsigned long, nr_writeback)
  220 + __field(unsigned long, nr_unstable)
  221 + __field(unsigned long, background_thresh)
  222 + __field(unsigned long, dirty_thresh)
  223 + __field(unsigned long, dirty_limit)
  224 + __field(unsigned long, nr_dirtied)
  225 + __field(unsigned long, nr_written)
  226 + ),
  227 +
  228 + TP_fast_assign(
  229 + __entry->nr_dirty = global_page_state(NR_FILE_DIRTY);
  230 + __entry->nr_writeback = global_page_state(NR_WRITEBACK);
  231 + __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
  232 + __entry->nr_dirtied = global_page_state(NR_DIRTIED);
  233 + __entry->nr_written = global_page_state(NR_WRITTEN);
  234 + __entry->background_thresh = background_thresh;
  235 + __entry->dirty_thresh = dirty_thresh;
  236 + __entry->dirty_limit = global_dirty_limit;
  237 + ),
  238 +
  239 + TP_printk("dirty=%lu writeback=%lu unstable=%lu "
  240 + "bg_thresh=%lu thresh=%lu limit=%lu "
  241 + "dirtied=%lu written=%lu",
  242 + __entry->nr_dirty,
  243 + __entry->nr_writeback,
  244 + __entry->nr_unstable,
  245 + __entry->background_thresh,
  246 + __entry->dirty_thresh,
  247 + __entry->dirty_limit,
  248 + __entry->nr_dirtied,
  249 + __entry->nr_written
  250 + )
  251 +);
  252 +
155 253 DECLARE_EVENT_CLASS(writeback_congest_waited_template,
156 254  
157 255 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
... ... @@ -185,6 +283,63 @@
185 283 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
186 284  
187 285 TP_ARGS(usec_timeout, usec_delayed)
  286 +);
  287 +
  288 +DECLARE_EVENT_CLASS(writeback_single_inode_template,
  289 +
  290 + TP_PROTO(struct inode *inode,
  291 + struct writeback_control *wbc,
  292 + unsigned long nr_to_write
  293 + ),
  294 +
  295 + TP_ARGS(inode, wbc, nr_to_write),
  296 +
  297 + TP_STRUCT__entry(
  298 + __array(char, name, 32)
  299 + __field(unsigned long, ino)
  300 + __field(unsigned long, state)
  301 + __field(unsigned long, age)
  302 + __field(unsigned long, writeback_index)
  303 + __field(long, nr_to_write)
  304 + __field(unsigned long, wrote)
  305 + ),
  306 +
  307 + TP_fast_assign(
  308 + strncpy(__entry->name,
  309 + dev_name(inode->i_mapping->backing_dev_info->dev), 32);
  310 + __entry->ino = inode->i_ino;
  311 + __entry->state = inode->i_state;
  312 + __entry->age = (jiffies - inode->dirtied_when) *
  313 + 1000 / HZ;
  314 + __entry->writeback_index = inode->i_mapping->writeback_index;
  315 + __entry->nr_to_write = nr_to_write;
  316 + __entry->wrote = nr_to_write - wbc->nr_to_write;
  317 + ),
  318 +
  319 + TP_printk("bdi %s: ino=%lu state=%s age=%lu "
  320 + "index=%lu to_write=%ld wrote=%lu",
  321 + __entry->name,
  322 + __entry->ino,
  323 + show_inode_state(__entry->state),
  324 + __entry->age,
  325 + __entry->writeback_index,
  326 + __entry->nr_to_write,
  327 + __entry->wrote
  328 + )
  329 +);
  330 +
  331 +DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue,
  332 + TP_PROTO(struct inode *inode,
  333 + struct writeback_control *wbc,
  334 + unsigned long nr_to_write),
  335 + TP_ARGS(inode, wbc, nr_to_write)
  336 +);
  337 +
  338 +DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
  339 + TP_PROTO(struct inode *inode,
  340 + struct writeback_control *wbc,
  341 + unsigned long nr_to_write),
  342 + TP_ARGS(inode, wbc, nr_to_write)
188 343 );
189 344  
190 345 #endif /* _TRACE_WRITEBACK_H */
mm/backing-dev.c
... ... @@ -45,6 +45,17 @@
45 45 static int bdi_sync_supers(void *);
46 46 static void sync_supers_timer_fn(unsigned long);
47 47  
  48 +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
  49 +{
  50 + if (wb1 < wb2) {
  51 + spin_lock(&wb1->list_lock);
  52 + spin_lock_nested(&wb2->list_lock, 1);
  53 + } else {
  54 + spin_lock(&wb2->list_lock);
  55 + spin_lock_nested(&wb1->list_lock, 1);
  56 + }
  57 +}
  58 +
48 59 #ifdef CONFIG_DEBUG_FS
49 60 #include <linux/debugfs.h>
50 61 #include <linux/seq_file.h>
... ... @@ -67,34 +78,42 @@
67 78 struct inode *inode;
68 79  
69 80 nr_dirty = nr_io = nr_more_io = 0;
70   - spin_lock(&inode_wb_list_lock);
  81 + spin_lock(&wb->list_lock);
71 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 83 nr_dirty++;
73 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 85 nr_io++;
75 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 87 nr_more_io++;
77   - spin_unlock(&inode_wb_list_lock);
  88 + spin_unlock(&wb->list_lock);
78 89  
79 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92  
82 93 #define K(x) ((x) << (PAGE_SHIFT - 10))
83 94 seq_printf(m,
84   - "BdiWriteback: %8lu kB\n"
85   - "BdiReclaimable: %8lu kB\n"
86   - "BdiDirtyThresh: %8lu kB\n"
87   - "DirtyThresh: %8lu kB\n"
88   - "BackgroundThresh: %8lu kB\n"
89   - "b_dirty: %8lu\n"
90   - "b_io: %8lu\n"
91   - "b_more_io: %8lu\n"
92   - "bdi_list: %8u\n"
93   - "state: %8lx\n",
  95 + "BdiWriteback: %10lu kB\n"
  96 + "BdiReclaimable: %10lu kB\n"
  97 + "BdiDirtyThresh: %10lu kB\n"
  98 + "DirtyThresh: %10lu kB\n"
  99 + "BackgroundThresh: %10lu kB\n"
  100 + "BdiWritten: %10lu kB\n"
  101 + "BdiWriteBandwidth: %10lu kBps\n"
  102 + "b_dirty: %10lu\n"
  103 + "b_io: %10lu\n"
  104 + "b_more_io: %10lu\n"
  105 + "bdi_list: %10u\n"
  106 + "state: %10lx\n",
94 107 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 108 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96   - K(bdi_thresh), K(dirty_thresh),
97   - K(background_thresh), nr_dirty, nr_io, nr_more_io,
  109 + K(bdi_thresh),
  110 + K(dirty_thresh),
  111 + K(background_thresh),
  112 + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
  113 + (unsigned long) K(bdi->write_bandwidth),
  114 + nr_dirty,
  115 + nr_io,
  116 + nr_more_io,
98 117 !list_empty(&bdi->bdi_list), bdi->state);
99 118 #undef K
100 119  
... ... @@ -249,18 +268,6 @@
249 268 return wb_has_dirty_io(&bdi->wb);
250 269 }
251 270  
252   -static void bdi_flush_io(struct backing_dev_info *bdi)
253   -{
254   - struct writeback_control wbc = {
255   - .sync_mode = WB_SYNC_NONE,
256   - .older_than_this = NULL,
257   - .range_cyclic = 1,
258   - .nr_to_write = 1024,
259   - };
260   -
261   - writeback_inodes_wb(&bdi->wb, &wbc);
262   -}
263   -
264 271 /*
265 272 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 273 * or we risk deadlocking on ->s_umount. The longer term solution would be
... ... @@ -446,9 +453,10 @@
446 453 if (IS_ERR(task)) {
447 454 /*
448 455 * If thread creation fails, force writeout of
449   - * the bdi from the thread.
  456 + * the bdi from the thread. Hopefully 1024 is
  457 + * large enough for efficient IO.
450 458 */
451   - bdi_flush_io(bdi);
  459 + writeback_inodes_wb(&bdi->wb, 1024);
452 460 } else {
453 461 /*
454 462 * The spinlock makes sure we do not lose
... ... @@ -629,9 +637,15 @@
629 637 INIT_LIST_HEAD(&wb->b_dirty);
630 638 INIT_LIST_HEAD(&wb->b_io);
631 639 INIT_LIST_HEAD(&wb->b_more_io);
  640 + spin_lock_init(&wb->list_lock);
632 641 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
633 642 }
634 643  
  644 +/*
  645 + * Initial write bandwidth: 100 MB/s
  646 + */
  647 +#define INIT_BW (100 << (20 - PAGE_SHIFT))
  648 +
635 649 int bdi_init(struct backing_dev_info *bdi)
636 650 {
637 651 int i, err;
... ... @@ -654,6 +668,13 @@
654 668 }
655 669  
656 670 bdi->dirty_exceeded = 0;
  671 +
  672 + bdi->bw_time_stamp = jiffies;
  673 + bdi->written_stamp = 0;
  674 +
  675 + bdi->write_bandwidth = INIT_BW;
  676 + bdi->avg_write_bandwidth = INIT_BW;
  677 +
657 678 err = prop_local_init_percpu(&bdi->completions);
658 679  
659 680 if (err) {
... ... @@ -677,11 +698,12 @@
677 698 if (bdi_has_dirty_io(bdi)) {
678 699 struct bdi_writeback *dst = &default_backing_dev_info.wb;
679 700  
680   - spin_lock(&inode_wb_list_lock);
  701 + bdi_lock_two(&bdi->wb, dst);
681 702 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
682 703 list_splice(&bdi->wb.b_io, &dst->b_io);
683 704 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
684   - spin_unlock(&inode_wb_list_lock);
  705 + spin_unlock(&bdi->wb.list_lock);
  706 + spin_unlock(&dst->list_lock);
685 707 }
686 708  
687 709 bdi_unregister(bdi);
mm/filemap.c
... ... @@ -78,7 +78,7 @@
78 78 * ->i_mutex (generic_file_buffered_write)
79 79 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 80 *
81   - * inode_wb_list_lock
  81 + * bdi->wb.list_lock
82 82 * sb_lock (fs/fs-writeback.c)
83 83 * ->mapping->tree_lock (__sync_single_inode)
84 84 *
... ... @@ -96,9 +96,9 @@
96 96 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
97 97 * ->private_lock (page_remove_rmap->set_page_dirty)
98 98 * ->tree_lock (page_remove_rmap->set_page_dirty)
99   - * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
  99 + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
100 100 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
101   - * inode_wb_list_lock (zap_pte_range->set_page_dirty)
  101 + * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
102 102 * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 104 *
mm/page-writeback.c
... ... @@ -37,6 +37,16 @@
37 37 #include <trace/events/writeback.h>
38 38  
39 39 /*
  40 + * Sleep at most 200ms at a time in balance_dirty_pages().
  41 + */
  42 +#define MAX_PAUSE max(HZ/5, 1)
  43 +
  44 +/*
  45 + * Estimate write bandwidth at 200ms intervals.
  46 + */
  47 +#define BANDWIDTH_INTERVAL max(HZ/5, 1)
  48 +
  49 +/*
40 50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
41 51 * will look to see if it needs to force writeback or throttling.
42 52 */
... ... @@ -111,6 +121,7 @@
111 121  
112 122 /* End of sysctl-exported parameters */
113 123  
  124 +unsigned long global_dirty_limit;
114 125  
115 126 /*
116 127 * Scale the writeback cache size proportional to the relative writeout speeds.
... ... @@ -219,6 +230,7 @@
219 230 */
220 231 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221 232 {
  233 + __inc_bdi_stat(bdi, BDI_WRITTEN);
222 234 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 235 bdi->max_prop_frac);
224 236 }
... ... @@ -244,13 +256,8 @@
244 256 static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 257 long *numerator, long *denominator)
246 258 {
247   - if (bdi_cap_writeback_dirty(bdi)) {
248   - prop_fraction_percpu(&vm_completions, &bdi->completions,
  259 + prop_fraction_percpu(&vm_completions, &bdi->completions,
249 260 numerator, denominator);
250   - } else {
251   - *numerator = 0;
252   - *denominator = 1;
253   - }
254 261 }
255 262  
256 263 static inline void task_dirties_fraction(struct task_struct *tsk,
... ... @@ -274,12 +281,13 @@
274 281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 282 * dirty threshold may never get throttled.
276 283 */
  284 +#define TASK_LIMIT_FRACTION 8
277 285 static unsigned long task_dirty_limit(struct task_struct *tsk,
278 286 unsigned long bdi_dirty)
279 287 {
280 288 long numerator, denominator;
281 289 unsigned long dirty = bdi_dirty;
282   - u64 inv = dirty >> 3;
  290 + u64 inv = dirty / TASK_LIMIT_FRACTION;
283 291  
284 292 task_dirties_fraction(tsk, &numerator, &denominator);
285 293 inv *= numerator;
... ... @@ -290,6 +298,12 @@
290 298 return max(dirty, bdi_dirty/2);
291 299 }
292 300  
  301 +/* Minimum limit for any task */
  302 +static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
  303 +{
  304 + return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
  305 +}
  306 +
293 307 /*
294 308 *
295 309 */
... ... @@ -397,6 +411,11 @@
397 411 return x + 1; /* Ensure that we never return 0 */
398 412 }
399 413  
  414 +static unsigned long hard_dirty_limit(unsigned long thresh)
  415 +{
  416 + return max(thresh, global_dirty_limit);
  417 +}
  418 +
400 419 /*
401 420 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 421 *
... ... @@ -435,12 +454,20 @@
435 454 }
436 455 *pbackground = background;
437 456 *pdirty = dirty;
  457 + trace_global_dirty_state(background, dirty);
438 458 }
439 459  
440   -/*
  460 +/**
441 461 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
  462 + * @bdi: the backing_dev_info to query
  463 + * @dirty: global dirty limit in pages
442 464 *
443   - * Allocate high/low dirty limits to fast/slow devices, in order to prevent
  465 + * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
  466 + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
  467 + * And the "limit" in the name is not seriously taken as hard limit in
  468 + * balance_dirty_pages().
  469 + *
  470 + * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 471 * - starving fast devices
445 472 * - piling up dirty pages (that will take long time to sync) on slow devices
446 473 *
... ... @@ -468,7 +495,154 @@
468 495 return bdi_dirty;
469 496 }
470 497  
  498 +static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
  499 + unsigned long elapsed,
  500 + unsigned long written)
  501 +{
  502 + const unsigned long period = roundup_pow_of_two(3 * HZ);
  503 + unsigned long avg = bdi->avg_write_bandwidth;
  504 + unsigned long old = bdi->write_bandwidth;
  505 + u64 bw;
  506 +
  507 + /*
  508 + * bw = written * HZ / elapsed
  509 + *
  510 + * bw * elapsed + write_bandwidth * (period - elapsed)
  511 + * write_bandwidth = ---------------------------------------------------
  512 + * period
  513 + */
  514 + bw = written - bdi->written_stamp;
  515 + bw *= HZ;
  516 + if (unlikely(elapsed > period)) {
  517 + do_div(bw, elapsed);
  518 + avg = bw;
  519 + goto out;
  520 + }
  521 + bw += (u64)bdi->write_bandwidth * (period - elapsed);
  522 + bw >>= ilog2(period);
  523 +
  524 + /*
  525 + * one more level of smoothing, for filtering out sudden spikes
  526 + */
  527 + if (avg > old && old >= (unsigned long)bw)
  528 + avg -= (avg - old) >> 3;
  529 +
  530 + if (avg < old && old <= (unsigned long)bw)
  531 + avg += (old - avg) >> 3;
  532 +
  533 +out:
  534 + bdi->write_bandwidth = bw;
  535 + bdi->avg_write_bandwidth = avg;
  536 +}
  537 +
471 538 /*
  539 + * The global dirtyable memory and dirty threshold could be suddenly knocked
  540 + * down by a large amount (eg. on the startup of KVM in a swapless system).
  541 + * This may throw the system into deep dirty exceeded state and throttle
  542 + * heavy/light dirtiers alike. To retain good responsiveness, maintain
  543 + * global_dirty_limit for tracking slowly down to the knocked down dirty
  544 + * threshold.
  545 + */
  546 +static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
  547 +{
  548 + unsigned long limit = global_dirty_limit;
  549 +
  550 + /*
  551 + * Follow up in one step.
  552 + */
  553 + if (limit < thresh) {
  554 + limit = thresh;
  555 + goto update;
  556 + }
  557 +
  558 + /*
  559 + * Follow down slowly. Use the higher one as the target, because thresh
  560 + * may drop below dirty. This is exactly the reason to introduce
  561 + * global_dirty_limit which is guaranteed to lie above the dirty pages.
  562 + */
  563 + thresh = max(thresh, dirty);
  564 + if (limit > thresh) {
  565 + limit -= (limit - thresh) >> 5;
  566 + goto update;
  567 + }
  568 + return;
  569 +update:
  570 + global_dirty_limit = limit;
  571 +}
  572 +
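
A compact user-space rendition of update_dirty_limit()'s asymmetric tracking, with invented numbers: the limit jumps up immediately when the threshold rises, but when the threshold collapses it eases down by 1/32 of the remaining gap per update and never drops below the current dirty count.

    #include <stdio.h>

    /* mirrors update_dirty_limit(): follow a rising threshold in one step,
     * follow a falling one slowly, never going below the dirty count */
    static unsigned long track(unsigned long limit, unsigned long thresh,
                               unsigned long dirty)
    {
        unsigned long target = thresh > dirty ? thresh : dirty;

        if (limit < thresh)
            return thresh;
        if (limit > target)
            limit -= (limit - target) >> 5;
        return limit;
    }

    int main(void)
    {
        /* KVM startup eats memory: thresh drops 100000 -> 20000 while 60000
         * pages are still dirty; the limit eases toward the dirty count
         * instead of snapping below it (step 1 prints 98750) */
        unsigned long limit = 100000;
        int step;

        for (step = 1; step <= 5; step++) {
            limit = track(limit, 20000, 60000);
            printf("step %d: %lu\n", step, limit);
        }
        return 0;
    }
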
  573 +static void global_update_bandwidth(unsigned long thresh,
  574 + unsigned long dirty,
  575 + unsigned long now)
  576 +{
  577 + static DEFINE_SPINLOCK(dirty_lock);
  578 + static unsigned long update_time;
  579 +
  580 + /*
  581 + * check locklessly first to avoid taking the lock most of the time
  582 + */
  583 + if (time_before(now, update_time + BANDWIDTH_INTERVAL))
  584 + return;
  585 +
  586 + spin_lock(&dirty_lock);
  587 + if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
  588 + update_dirty_limit(thresh, dirty);
  589 + update_time = now;
  590 + }
  591 + spin_unlock(&dirty_lock);
  592 +}
  593 +
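
global_update_bandwidth() is a generic "check, lock, recheck" ratelimiter. A stand-alone pthread sketch of the same shape (INTERVAL and the printout are illustrative): the unlocked test keeps the common case lock-free, and repeating the test under the lock ensures only one of several racing threads performs the update.

    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    #define INTERVAL 1      /* seconds, standing in for BANDWIDTH_INTERVAL */

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static time_t update_time;

    static void maybe_update(void)
    {
        time_t now = time(NULL);

        if (now < update_time + INTERVAL)       /* lockless fast path */
            return;

        pthread_mutex_lock(&lock);
        if (now >= update_time + INTERVAL) {    /* recheck under the lock */
            printf("one thread runs the update\n");
            update_time = now;
        }
        pthread_mutex_unlock(&lock);
    }

    static void *worker(void *arg)
    {
        (void)arg;
        maybe_update();
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];
        int i;

        for (i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, worker, NULL);
        for (i = 0; i < 4; i++)
            pthread_join(t[i], NULL);
        return 0;       /* the message should appear exactly once */
    }
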
  594 +void __bdi_update_bandwidth(struct backing_dev_info *bdi,
  595 + unsigned long thresh,
  596 + unsigned long dirty,
  597 + unsigned long bdi_thresh,
  598 + unsigned long bdi_dirty,
  599 + unsigned long start_time)
  600 +{
  601 + unsigned long now = jiffies;
  602 + unsigned long elapsed = now - bdi->bw_time_stamp;
  603 + unsigned long written;
  604 +
  605 + /*
  606 + * rate-limit: update at most once every 200ms.
  607 + */
  608 + if (elapsed < BANDWIDTH_INTERVAL)
  609 + return;
  610 +
  611 + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
  612 +
  613 + /*
  614 + * Skip quiet periods when disk bandwidth is under-utilized.
  615 + * (at least 1s idle time between two flusher runs)
  616 + */
  617 + if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
  618 + goto snapshot;
  619 +
  620 + if (thresh)
  621 + global_update_bandwidth(thresh, dirty, now);
  622 +
  623 + bdi_update_write_bandwidth(bdi, elapsed, written);
  624 +
  625 +snapshot:
  626 + bdi->written_stamp = written;
  627 + bdi->bw_time_stamp = now;
  628 +}
  629 +
  630 +static void bdi_update_bandwidth(struct backing_dev_info *bdi,
  631 + unsigned long thresh,
  632 + unsigned long dirty,
  633 + unsigned long bdi_thresh,
  634 + unsigned long bdi_dirty,
  635 + unsigned long start_time)
  636 +{
  637 + if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
  638 + return;
  639 + spin_lock(&bdi->wb.list_lock);
  640 + __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
  641 + start_time);
  642 + spin_unlock(&bdi->wb.list_lock);
  643 +}
  644 +
  645 +/*
472 646 * balance_dirty_pages() must be called by processes which are generating dirty
473 647 * data. It looks at the number of dirty pages in the machine and will force
474 648 * the caller to perform writeback if the system is over `vm_dirty_ratio'.
475 649  
476 650  
477 651  
478 652  
479 653  
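
Before the function itself, a toy model of the loop shape it implements: the dirtier keeps triggering writeback and napping, with the nap doubling up to a 100ms cap, until the dirty count falls back under the threshold. All numbers are invented and HZ is assumed to be 1000.

    #include <stdio.h>

    #define HZ 1000         /* assumed tick rate, for illustration only */

    /* toy model of the balance_dirty_pages() loop shape */
    int main(void)
    {
        unsigned long nr_dirty = 12000, dirty_thresh = 10000;   /* pages */
        unsigned long pause = 1;                                /* jiffies */

        for (;;) {
            if (nr_dirty <= dirty_thresh)
                break;
            nr_dirty -= 400;    /* pretend writeback cleaned some pages */
            printf("dirty=%lu, sleeping %lu jiffies\n", nr_dirty, pause);
            pause <<= 1;        /* back off... */
            if (pause > HZ / 10)
                pause = HZ / 10;        /* ...but never nap beyond 100ms */
        }
        return 0;
    }
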
... ... @@ -478,27 +652,25 @@
478 652 static void balance_dirty_pages(struct address_space *mapping,
479 653 unsigned long write_chunk)
480 654 {
481   - long nr_reclaimable, bdi_nr_reclaimable;
482   - long nr_writeback, bdi_nr_writeback;
  655 + unsigned long nr_reclaimable, bdi_nr_reclaimable;
  656 + unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
  657 + unsigned long bdi_dirty;
483 658 unsigned long background_thresh;
484 659 unsigned long dirty_thresh;
485 660 unsigned long bdi_thresh;
  661 + unsigned long task_bdi_thresh;
  662 + unsigned long min_task_bdi_thresh;
486 663 unsigned long pages_written = 0;
487 664 unsigned long pause = 1;
488 665 bool dirty_exceeded = false;
  666 + bool clear_dirty_exceeded = true;
489 667 struct backing_dev_info *bdi = mapping->backing_dev_info;
  668 + unsigned long start_time = jiffies;
490 669  
491 670 for (;;) {
492   - struct writeback_control wbc = {
493   - .sync_mode = WB_SYNC_NONE,
494   - .older_than_this = NULL,
495   - .nr_to_write = write_chunk,
496   - .range_cyclic = 1,
497   - };
498   -
499 671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 672 global_page_state(NR_UNSTABLE_NFS);
501   - nr_writeback = global_page_state(NR_WRITEBACK);
  673 + nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 674  
503 675 global_dirty_limits(&background_thresh, &dirty_thresh);
504 676  
505 677  
... ... @@ -507,12 +679,12 @@
507 679 * catch-up. This avoids (excessively) small writeouts
508 680 * when the bdi limits are ramping up.
509 681 */
510   - if (nr_reclaimable + nr_writeback <=
511   - (background_thresh + dirty_thresh) / 2)
  682 + if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
512 683 break;
513 684  
514 685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515   - bdi_thresh = task_dirty_limit(current, bdi_thresh);
  686 + min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
  687 + task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 688  
517 689 /*
518 690 * In order to avoid the stacked BDI deadlock we need
519 691  
520 692  
... ... @@ -524,12 +696,14 @@
524 696 * actually dirty; with m+n sitting in the percpu
525 697 * deltas.
526 698 */
527   - if (bdi_thresh < 2*bdi_stat_error(bdi)) {
  699 + if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529   - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
  701 + bdi_dirty = bdi_nr_reclaimable +
  702 + bdi_stat_sum(bdi, BDI_WRITEBACK);
530 703 } else {
531 704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532   - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
  705 + bdi_dirty = bdi_nr_reclaimable +
  706 + bdi_stat(bdi, BDI_WRITEBACK);
533 707 }
534 708  
535 709 /*
... ... @@ -538,9 +712,10 @@
538 712 * bdi or process from holding back light ones; The latter is
539 713 * the last resort safeguard.
540 714 */
541   - dirty_exceeded =
542   - (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
543   - || (nr_reclaimable + nr_writeback > dirty_thresh);
  715 + dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
  716 + (nr_dirty > dirty_thresh);
  717 + clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
  718 + (nr_dirty <= dirty_thresh);
544 719  
545 720 if (!dirty_exceeded)
546 721 break;
... ... @@ -548,6 +723,9 @@
548 723 if (!bdi->dirty_exceeded)
549 724 bdi->dirty_exceeded = 1;
550 725  
  726 + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
  727 + bdi_thresh, bdi_dirty, start_time);
  728 +
551 729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
552 730 * Unstable writes are a feature of certain networked
553 731 * filesystems (i.e. NFS) in which data may have been
554 732  
555 733  
556 734  
557 735  
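
A worked example of the dirty_exceeded hysteresis computed above (the global nr_dirty half of the test is omitted and the per-task thresholds are illustrative): the flag is raised against the current task's own threshold, but may only be cleared once the bdi drops under the minimum threshold any task can have, so a light dirtier cannot clear a flag that a heavy dirtier still exceeds.

    #include <stdio.h>
    #include <stdbool.h>

    #define TASK_LIMIT_FRACTION 8

    int main(void)
    {
        unsigned long bdi_thresh = 80000;                       /* pages */
        unsigned long min_task_bdi_thresh =
                bdi_thresh - bdi_thresh / TASK_LIMIT_FRACTION;  /* 70000 */
        unsigned long heavy_task_thresh = 70000;        /* did ~all dirtying */
        unsigned long light_task_thresh = 79000;        /* did very little */
        unsigned long bdi_dirty = 75000;

        bool set_by_heavy = bdi_dirty > heavy_task_thresh;      /* true  */
        bool set_by_light = bdi_dirty > light_task_thresh;      /* false */
        bool may_clear    = bdi_dirty <= min_task_bdi_thresh;   /* false */

        printf("heavy sets: %d, light sets: %d, anyone may clear: %d\n",
               set_by_heavy, set_by_light, may_clear);
        return 0;
    }
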
... ... @@ -557,19 +735,42 @@
557 735 * threshold otherwise wait until the disk writes catch
558 736 * up.
559 737 */
560   - trace_wbc_balance_dirty_start(&wbc, bdi);
561   - if (bdi_nr_reclaimable > bdi_thresh) {
562   - writeback_inodes_wb(&bdi->wb, &wbc);
563   - pages_written += write_chunk - wbc.nr_to_write;
564   - trace_wbc_balance_dirty_written(&wbc, bdi);
  738 + trace_balance_dirty_start(bdi);
  739 + if (bdi_nr_reclaimable > task_bdi_thresh) {
  740 + pages_written += writeback_inodes_wb(&bdi->wb,
  741 + write_chunk);
  742 + trace_balance_dirty_written(bdi, pages_written);
565 743 if (pages_written >= write_chunk)
566 744 break; /* We've done our duty */
567 745 }
568   - trace_wbc_balance_dirty_wait(&wbc, bdi);
569 746 __set_current_state(TASK_UNINTERRUPTIBLE);
570 747 io_schedule_timeout(pause);
  748 + trace_balance_dirty_wait(bdi);
571 749  
  750 + dirty_thresh = hard_dirty_limit(dirty_thresh);
572 751 /*
  752 + * max-pause area. If dirty exceeded but still within this
  753 + * area, no need to sleep for more than 200ms: (a) 8 pages per
  754 + * 200ms is typically more than enough to curb heavy dirtiers;
  755 + * (b) the pause time limit makes the dirtiers more responsive.
  756 + */
  757 + if (nr_dirty < dirty_thresh +
  758 + dirty_thresh / DIRTY_MAXPAUSE_AREA &&
  759 + time_after(jiffies, start_time + MAX_PAUSE))
  760 + break;
  761 + /*
  762 + * pass-good area. When some bdi gets blocked (eg. NFS server
  763 + * not responding), or write bandwidth dropped dramatically due
  764 + * to concurrent reads, or dirty threshold suddenly dropped and
  765 + * the dirty pages cannot be brought down anytime soon (eg. on
  766 + * slow USB stick), at least let go of the good bdi's.
  767 + */
  768 + if (nr_dirty < dirty_thresh +
  769 + dirty_thresh / DIRTY_PASSGOOD_AREA &&
  770 + bdi_dirty < bdi_thresh)
  771 + break;
  772 +
  773 + /*
573 774 * Increase the delay for each loop, up to our previous
574 775 * default of taking a 100ms nap.
575 776 */
... ... @@ -578,7 +779,8 @@
578 779 pause = HZ / 10;
579 780 }
580 781  
581   - if (!dirty_exceeded && bdi->dirty_exceeded)
  782 + /* Clear dirty_exceeded flag only when no task can exceed the limit */
  783 + if (clear_dirty_exceeded && bdi->dirty_exceeded)
582 784 bdi->dirty_exceeded = 0;
583 785  
584 786 if (writeback_in_progress(bdi))
585 787  
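
A worked example of the two escape hatches in the loop above. DIRTY_MAXPAUSE_AREA, DIRTY_PASSGOOD_AREA and the MAX_PAUSE timeout are defined outside this hunk, so the values below are illustrative stand-ins only.

    #include <stdio.h>
    #include <stdbool.h>

    #define DIRTY_MAXPAUSE_AREA  4      /* illustrative stand-in */
    #define DIRTY_PASSGOOD_AREA  8      /* illustrative stand-in */

    int main(void)
    {
        unsigned long dirty_thresh = 100000;    /* after hard_dirty_limit() */
        unsigned long nr_dirty = 110000;
        unsigned long bdi_thresh = 30000;
        unsigned long bdi_dirty = 20000;
        bool throttled_too_long = true;         /* stands for the MAX_PAUSE test */

        /* max-pause area: only mildly over the limit and already stuck in
         * the loop for too long -> stop sleeping and return to the caller */
        bool maxpause_escape =
                nr_dirty < dirty_thresh + dirty_thresh / DIRTY_MAXPAUSE_AREA &&
                throttled_too_long;

        /* pass-good area: the global count is mildly over, but this bdi is
         * under its own threshold (some other bdi is stuck) -> let it go */
        bool passgood_escape =
                nr_dirty < dirty_thresh + dirty_thresh / DIRTY_PASSGOOD_AREA &&
                bdi_dirty < bdi_thresh;

        printf("max-pause escape: %d, pass-good escape: %d\n",
               maxpause_escape, passgood_escape);
        return 0;
    }
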
... ... @@ -626,9 +828,13 @@
626 828 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 829 unsigned long nr_pages_dirtied)
628 830 {
  831 + struct backing_dev_info *bdi = mapping->backing_dev_info;
629 832 unsigned long ratelimit;
630 833 unsigned long *p;
631 834  
  835 + if (!bdi_cap_account_dirty(bdi))
  836 + return;
  837 +
632 838 ratelimit = ratelimit_pages;
633 839 if (mapping->backing_dev_info->dirty_exceeded)
634 840 ratelimit = 8;
635 841  
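
The remainder of balance_dirty_pages_ratelimited_nr() (not shown in this hunk) accumulates the dirtied pages in a per-CPU counter and only calls the expensive balance_dirty_pages() once that counter crosses the ratelimit chosen above. A toy single-threaded model of that behaviour, with an invented relaxed ratelimit of 1024 pages:

    #include <stdio.h>

    static unsigned long percpu_count;  /* stands in for the per-CPU counter */

    /* cheap accounting on every call, expensive balancing only once the
     * counter crosses the ratelimit; when the bdi is over its limit the
     * ratelimit collapses to 8 pages, throttling almost immediately */
    static void dirtied(unsigned long nr_pages, int dirty_exceeded)
    {
        unsigned long ratelimit = dirty_exceeded ? 8 : 1024;

        percpu_count += nr_pages;
        if (percpu_count >= ratelimit) {
            printf("balance after %lu pages\n", percpu_count);
            percpu_count = 0;
        }
    }

    int main(void)
    {
        int i;

        for (i = 0; i < 2048; i++)
            dirtied(1, 0);      /* relaxed: balances every ~1024 pages */
        for (i = 0; i < 32; i++)
            dirtied(1, 1);      /* over the limit: balances every 8 pages */
        return 0;
    }
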
... ... @@ -892,12 +1098,12 @@
892 1098 range_whole = 1;
893 1099 cycled = 1; /* ignore range_cyclic tests */
894 1100 }
895   - if (wbc->sync_mode == WB_SYNC_ALL)
  1101 + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 1102 tag = PAGECACHE_TAG_TOWRITE;
897 1103 else
898 1104 tag = PAGECACHE_TAG_DIRTY;
899 1105 retry:
900   - if (wbc->sync_mode == WB_SYNC_ALL)
  1106 + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 1107 tag_pages_for_writeback(mapping, index, end);
902 1108 done_index = index;
903 1109 while (!done && (index <= end)) {
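
The tagged_writepages / WB_SYNC_ALL branch above works on a snapshot: tag_pages_for_writeback() marks the currently dirty pages TOWRITE and the sweep then walks that tag, so pages dirtied while the sweep runs are left for the next pass instead of livelocking it. A toy user-space illustration of the idea, with arrays standing in for the radix-tree tags:

    #include <stdio.h>
    #include <string.h>

    #define NPAGES 8

    int main(void)
    {
        int dirty[NPAGES] = { 1, 1, 0, 1, 0, 0, 0, 0 };
        int towrite[NPAGES];
        int i;

        memcpy(towrite, dirty, sizeof(dirty));  /* "tag_pages_for_writeback" */

        for (i = 0; i < NPAGES; i++) {
            if (i == 1)
                dirty[5] = 1;   /* someone dirties page 5 mid-sweep */
            if (towrite[i]) {
                printf("writing page %d\n", i);
                dirty[i] = towrite[i] = 0;
            }
        }
        printf("page 5 left for the next pass: %d\n", dirty[5]);
        return 0;
    }
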
... ... @@ -31,11 +31,11 @@
31 31 * mmlist_lock (in mmput, drain_mmlist and others)
32 32 * mapping->private_lock (in __set_page_dirty_buffers)
33 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
34   - * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
  34 + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
35 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 37 * in arch-dependent flush_dcache_mmap_lock,
38   - * within inode_wb_list_lock in __sync_single_inode)
  38 + * within bdi.wb->list_lock in __sync_single_inode)
39 39 *
40 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
41 41 * ->tasklist_lock