Commit f01ef569cddb1a8627b1c6b3a134998ad1cf4b22

Authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
  mm: properly reflect task dirty limits in dirty_exceeded logic
  writeback: don't busy retry writeback on new/freeing inodes
  writeback: scale IO chunk size up to half device bandwidth
  writeback: trace global_dirty_state
  writeback: introduce max-pause and pass-good dirty limits
  writeback: introduce smoothed global dirty limit
  writeback: consolidate variable names in balance_dirty_pages()
  writeback: show bdi write bandwidth in debugfs
  writeback: bdi write bandwidth estimation
  writeback: account per-bdi accumulated written pages
  writeback: make writeback_control.nr_to_write straight
  writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
  writeback: trace event writeback_queue_io
  writeback: trace event writeback_single_inode
  writeback: remove .nonblocking and .encountered_congestion
  writeback: remove writeback_control.more_io
  writeback: skip balance_dirty_pages() for in-memory fs
  writeback: add bdi_dirty_limit() kernel-doc
  writeback: avoid extra sync work at enqueue time
  writeback: elevate queue_io() into wb_writeback()
  ...

Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c
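
Note: the heart of the new per-bdi bandwidth estimation (the "bdi write bandwidth
estimation" and "scale IO chunk size up to half device bandwidth" commits above,
visible in the mm/page-writeback.c hunks below) is a period-weighted running
average. The following is a rough, user-space-only sketch of that arithmetic;
the 3*HZ period and the blending formula follow the patch, while the function
name, the HZ value and the sample numbers are made up for illustration.

/*
 * Hypothetical stand-alone model of the blending step in
 * bdi_update_write_bandwidth(): mix the bandwidth observed over the last
 * `elapsed` ticks into the previous estimate, weighted by how much of the
 * ~3 second period that interval covers.
 */
#include <stdio.h>

#define HZ 1000UL

static unsigned long update_bw(unsigned long old_bw,	/* pages per second */
			       unsigned long written,	/* pages written in this interval */
			       unsigned long elapsed)	/* interval length in ticks */
{
	const unsigned long period = 4096;	/* roundup_pow_of_two(3 * HZ) */
	unsigned long long bw = (unsigned long long)written * HZ;

	if (elapsed >= period)			/* sample covers the whole period */
		return (unsigned long)(bw / elapsed);
	bw += (unsigned long long)old_bw * (period - elapsed);
	return (unsigned long)(bw / period);	/* the kernel uses bw >>= ilog2(period) */
}

int main(void)
{
	/* start at ~100 MB/s (25600 4k pages/s), feed one 200 ms sample at ~50 MB/s */
	unsigned long bw = 25600;

	bw = update_bw(bw, 2560, HZ / 5);
	printf("new estimate: %lu pages/s\n", bw);	/* nudged slightly toward 12800 */
	return 0;
}

The second smoothing pass in the patch (avg_write_bandwidth creeping 1/8 of the
way toward the new estimate) and the global_dirty_limit tracking are left out of
the sketch.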

Showing 15 changed files

fs/block_dev.c
... ... @@ -44,24 +44,28 @@
44 44 {
45 45 return &BDEV_I(inode)->bdev;
46 46 }
47   -
48 47 EXPORT_SYMBOL(I_BDEV);
49 48  
50 49 /*
51   - * move the inode from it's current bdi to the a new bdi. if the inode is dirty
52   - * we need to move it onto the dirty list of @dst so that the inode is always
53   - * on the right list.
  50 + * Move the inode from its current bdi to a new bdi. If the inode is dirty we
  51 + * need to move it onto the dirty list of @dst so that the inode is always on
  52 + * the right list.
54 53 */
55 54 static void bdev_inode_switch_bdi(struct inode *inode,
56 55 struct backing_dev_info *dst)
57 56 {
58   - spin_lock(&inode_wb_list_lock);
  57 + struct backing_dev_info *old = inode->i_data.backing_dev_info;
  58 +
  59 + if (unlikely(dst == old)) /* deadlock avoidance */
  60 + return;
  61 + bdi_lock_two(&old->wb, &dst->wb);
59 62 spin_lock(&inode->i_lock);
60 63 inode->i_data.backing_dev_info = dst;
61 64 if (inode->i_state & I_DIRTY)
62 65 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
63 66 spin_unlock(&inode->i_lock);
64   - spin_unlock(&inode_wb_list_lock);
  67 + spin_unlock(&old->wb.list_lock);
  68 + spin_unlock(&dst->wb.list_lock);
65 69 }
66 70  
67 71 static sector_t max_block(struct block_device *bdev)
fs/btrfs/extent_io.c
... ... @@ -2551,7 +2551,6 @@
2551 2551 };
2552 2552 struct writeback_control wbc_writepages = {
2553 2553 .sync_mode = wbc->sync_mode,
2554   - .older_than_this = NULL,
2555 2554 .nr_to_write = 64,
2556 2555 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2557 2556 .range_end = (loff_t)-1,
... ... @@ -2584,7 +2583,6 @@
2584 2583 };
2585 2584 struct writeback_control wbc_writepages = {
2586 2585 .sync_mode = mode,
2587   - .older_than_this = NULL,
2588 2586 .nr_to_write = nr_pages * 2,
2589 2587 .range_start = start,
2590 2588 .range_end = end + 1,
... ... @@ -2741,7 +2741,7 @@
2741 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2742 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2743 2743  
2744   - if (wbc->sync_mode == WB_SYNC_ALL)
  2744 + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2745 2745 tag = PAGECACHE_TAG_TOWRITE;
2746 2746 else
2747 2747 tag = PAGECACHE_TAG_DIRTY;
... ... @@ -2973,7 +2973,7 @@
2973 2973 }
2974 2974  
2975 2975 retry:
2976   - if (wbc->sync_mode == WB_SYNC_ALL)
  2976 + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2977 2977 tag_pages_for_writeback(mapping, index, end);
2978 2978  
2979 2979 while (!ret && wbc->nr_to_write > 0) {
fs/fs-writeback.c
... ... @@ -35,7 +35,9 @@
35 35 struct wb_writeback_work {
36 36 long nr_pages;
37 37 struct super_block *sb;
  38 + unsigned long *older_than_this;
38 39 enum writeback_sync_modes sync_mode;
  40 + unsigned int tagged_writepages:1;
39 41 unsigned int for_kupdate:1;
40 42 unsigned int range_cyclic:1;
41 43 unsigned int for_background:1;
... ... @@ -180,12 +182,13 @@
180 182 */
181 183 void inode_wb_list_del(struct inode *inode)
182 184 {
183   - spin_lock(&inode_wb_list_lock);
  185 + struct backing_dev_info *bdi = inode_to_bdi(inode);
  186 +
  187 + spin_lock(&bdi->wb.list_lock);
184 188 list_del_init(&inode->i_wb_list);
185   - spin_unlock(&inode_wb_list_lock);
  189 + spin_unlock(&bdi->wb.list_lock);
186 190 }
187 191  
188   -
189 192 /*
190 193 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
191 194 * furthest end of its superblock's dirty-inode list.
... ... @@ -195,11 +198,9 @@
195 198 * the case then the inode must have been redirtied while it was being written
196 199 * out and we don't reset its dirtied_when.
197 200 */
198   -static void redirty_tail(struct inode *inode)
  201 +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
199 202 {
200   - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
201   -
202   - assert_spin_locked(&inode_wb_list_lock);
  203 + assert_spin_locked(&wb->list_lock);
203 204 if (!list_empty(&wb->b_dirty)) {
204 205 struct inode *tail;
205 206  
... ... @@ -213,11 +214,9 @@
213 214 /*
214 215 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 216 */
216   -static void requeue_io(struct inode *inode)
  217 +static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
217 218 {
218   - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
219   -
220   - assert_spin_locked(&inode_wb_list_lock);
  219 + assert_spin_locked(&wb->list_lock);
221 220 list_move(&inode->i_wb_list, &wb->b_more_io);
222 221 }
223 222  
... ... @@ -225,7 +224,7 @@
225 224 {
226 225 /*
227 226 * Prevent speculative execution through
228   - * spin_unlock(&inode_wb_list_lock);
  227 + * spin_unlock(&wb->list_lock);
229 228 */
230 229  
231 230 smp_mb();
... ... @@ -250,15 +249,16 @@
250 249 /*
251 250 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
252 251 */
253   -static void move_expired_inodes(struct list_head *delaying_queue,
  252 +static int move_expired_inodes(struct list_head *delaying_queue,
254 253 struct list_head *dispatch_queue,
255   - unsigned long *older_than_this)
  254 + unsigned long *older_than_this)
256 255 {
257 256 LIST_HEAD(tmp);
258 257 struct list_head *pos, *node;
259 258 struct super_block *sb = NULL;
260 259 struct inode *inode;
261 260 int do_sb_sort = 0;
  261 + int moved = 0;
262 262  
263 263 while (!list_empty(delaying_queue)) {
264 264 inode = wb_inode(delaying_queue->prev);
... ... @@ -269,12 +269,13 @@
269 269 do_sb_sort = 1;
270 270 sb = inode->i_sb;
271 271 list_move(&inode->i_wb_list, &tmp);
  272 + moved++;
272 273 }
273 274  
274 275 /* just one sb in list, splice to dispatch_queue and we're done */
275 276 if (!do_sb_sort) {
276 277 list_splice(&tmp, dispatch_queue);
277   - return;
  278 + goto out;
278 279 }
279 280  
280 281 /* Move inodes from one superblock together */
... ... @@ -286,6 +287,8 @@
286 287 list_move(&inode->i_wb_list, dispatch_queue);
287 288 }
288 289 }
  290 +out:
  291 + return moved;
289 292 }
290 293  
291 294 /*
... ... @@ -301,9 +304,11 @@
301 304 */
302 305 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
303 306 {
304   - assert_spin_locked(&inode_wb_list_lock);
  307 + int moved;
  308 + assert_spin_locked(&wb->list_lock);
305 309 list_splice_init(&wb->b_more_io, &wb->b_io);
306   - move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
  310 + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
  311 + trace_writeback_queue_io(wb, older_than_this, moved);
307 312 }
308 313  
309 314 static int write_inode(struct inode *inode, struct writeback_control *wbc)
... ... @@ -316,7 +321,8 @@
316 321 /*
317 322 * Wait for writeback on an inode to complete.
318 323 */
319   -static void inode_wait_for_writeback(struct inode *inode)
  324 +static void inode_wait_for_writeback(struct inode *inode,
  325 + struct bdi_writeback *wb)
320 326 {
321 327 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
322 328 wait_queue_head_t *wqh;
... ... @@ -324,15 +330,15 @@
324 330 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
325 331 while (inode->i_state & I_SYNC) {
326 332 spin_unlock(&inode->i_lock);
327   - spin_unlock(&inode_wb_list_lock);
  333 + spin_unlock(&wb->list_lock);
328 334 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
329   - spin_lock(&inode_wb_list_lock);
  335 + spin_lock(&wb->list_lock);
330 336 spin_lock(&inode->i_lock);
331 337 }
332 338 }
333 339  
334 340 /*
335   - * Write out an inode's dirty pages. Called under inode_wb_list_lock and
  341 + * Write out an inode's dirty pages. Called under wb->list_lock and
336 342 * inode->i_lock. Either the caller has an active reference on the inode or
337 343 * the inode has I_WILL_FREE set.
338 344 *
... ... @@ -343,13 +349,15 @@
343 349 * livelocks, etc.
344 350 */
345 351 static int
346   -writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  352 +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
  353 + struct writeback_control *wbc)
347 354 {
348 355 struct address_space *mapping = inode->i_mapping;
  356 + long nr_to_write = wbc->nr_to_write;
349 357 unsigned dirty;
350 358 int ret;
351 359  
352   - assert_spin_locked(&inode_wb_list_lock);
  360 + assert_spin_locked(&wb->list_lock);
353 361 assert_spin_locked(&inode->i_lock);
354 362  
355 363 if (!atomic_read(&inode->i_count))
... ... @@ -367,14 +375,16 @@
367 375 * completed a full scan of b_io.
368 376 */
369 377 if (wbc->sync_mode != WB_SYNC_ALL) {
370   - requeue_io(inode);
  378 + requeue_io(inode, wb);
  379 + trace_writeback_single_inode_requeue(inode, wbc,
  380 + nr_to_write);
371 381 return 0;
372 382 }
373 383  
374 384 /*
375 385 * It's a data-integrity sync. We must wait.
376 386 */
377   - inode_wait_for_writeback(inode);
  387 + inode_wait_for_writeback(inode, wb);
378 388 }
379 389  
380 390 BUG_ON(inode->i_state & I_SYNC);
... ... @@ -383,7 +393,7 @@
383 393 inode->i_state |= I_SYNC;
384 394 inode->i_state &= ~I_DIRTY_PAGES;
385 395 spin_unlock(&inode->i_lock);
386   - spin_unlock(&inode_wb_list_lock);
  396 + spin_unlock(&wb->list_lock);
387 397  
388 398 ret = do_writepages(mapping, wbc);
389 399  
... ... @@ -414,10 +424,19 @@
414 424 ret = err;
415 425 }
416 426  
417   - spin_lock(&inode_wb_list_lock);
  427 + spin_lock(&wb->list_lock);
418 428 spin_lock(&inode->i_lock);
419 429 inode->i_state &= ~I_SYNC;
420 430 if (!(inode->i_state & I_FREEING)) {
  431 + /*
  432 + * Sync livelock prevention. Each inode is tagged and synced in
  433 + * one shot. If still dirty, it will be redirty_tail()'ed below.
  434 + * Update the dirty time to prevent enqueue and sync it again.
  435 + */
  436 + if ((inode->i_state & I_DIRTY) &&
  437 + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
  438 + inode->dirtied_when = jiffies;
  439 +
421 440 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
422 441 /*
423 442 * We didn't write back all the pages. nfs_writepages()
... ... @@ -428,7 +447,7 @@
428 447 /*
429 448 * slice used up: queue for next turn
430 449 */
431   - requeue_io(inode);
  450 + requeue_io(inode, wb);
432 451 } else {
433 452 /*
434 453 * Writeback blocked by something other than
... ... @@ -437,7 +456,7 @@
437 456 * retrying writeback of the dirty page/inode
438 457 * that cannot be performed immediately.
439 458 */
440   - redirty_tail(inode);
  459 + redirty_tail(inode, wb);
441 460 }
442 461 } else if (inode->i_state & I_DIRTY) {
443 462 /*
... ... @@ -446,7 +465,7 @@
446 465 * submission or metadata updates after data IO
447 466 * completion.
448 467 */
449   - redirty_tail(inode);
  468 + redirty_tail(inode, wb);
450 469 } else {
451 470 /*
452 471 * The inode is clean. At this point we either have
... ... @@ -457,9 +476,41 @@
457 476 }
458 477 }
459 478 inode_sync_complete(inode);
  479 + trace_writeback_single_inode(inode, wbc, nr_to_write);
460 480 return ret;
461 481 }
462 482  
  483 +static long writeback_chunk_size(struct backing_dev_info *bdi,
  484 + struct wb_writeback_work *work)
  485 +{
  486 + long pages;
  487 +
  488 + /*
  489 + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
  490 + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
  491 + * here avoids calling into writeback_inodes_wb() more than once.
  492 + *
  493 + * The intended call sequence for WB_SYNC_ALL writeback is:
  494 + *
  495 + * wb_writeback()
  496 + * writeback_sb_inodes() <== called only once
  497 + * write_cache_pages() <== called once for each inode
  498 + * (quickly) tag currently dirty pages
  499 + * (maybe slowly) sync all tagged pages
  500 + */
  501 + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
  502 + pages = LONG_MAX;
  503 + else {
  504 + pages = min(bdi->avg_write_bandwidth / 2,
  505 + global_dirty_limit / DIRTY_SCOPE);
  506 + pages = min(pages, work->nr_pages);
  507 + pages = round_down(pages + MIN_WRITEBACK_PAGES,
  508 + MIN_WRITEBACK_PAGES);
  509 + }
  510 +
  511 + return pages;
  512 +}
  513 +
463 514 /*
464 515 * Write a portion of b_io inodes which belong to @sb.
465 516 *
... ... @@ -467,24 +518,36 @@
467 518 * inodes. Otherwise write only ones which go sequentially
468 519 * in reverse order.
469 520 *
470   - * Return 1, if the caller writeback routine should be
471   - * interrupted. Otherwise return 0.
  521 + * Return the number of pages and/or inodes written.
472 522 */
473   -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
474   - struct writeback_control *wbc, bool only_this_sb)
  523 +static long writeback_sb_inodes(struct super_block *sb,
  524 + struct bdi_writeback *wb,
  525 + struct wb_writeback_work *work)
475 526 {
  527 + struct writeback_control wbc = {
  528 + .sync_mode = work->sync_mode,
  529 + .tagged_writepages = work->tagged_writepages,
  530 + .for_kupdate = work->for_kupdate,
  531 + .for_background = work->for_background,
  532 + .range_cyclic = work->range_cyclic,
  533 + .range_start = 0,
  534 + .range_end = LLONG_MAX,
  535 + };
  536 + unsigned long start_time = jiffies;
  537 + long write_chunk;
  538 + long wrote = 0; /* count both pages and inodes */
  539 +
476 540 while (!list_empty(&wb->b_io)) {
477   - long pages_skipped;
478 541 struct inode *inode = wb_inode(wb->b_io.prev);
479 542  
480 543 if (inode->i_sb != sb) {
481   - if (only_this_sb) {
  544 + if (work->sb) {
482 545 /*
483 546 * We only want to write back data for this
484 547 * superblock, move all inodes not belonging
485 548 * to it back onto the dirty list.
486 549 */
487   - redirty_tail(inode);
  550 + redirty_tail(inode, wb);
488 551 continue;
489 552 }
490 553  
... ... @@ -493,7 +556,7 @@
493 556 * Bounce back to the caller to unpin this and
494 557 * pin the next superblock.
495 558 */
496   - return 0;
  559 + break;
497 560 }
498 561  
499 562 /*
504 567 spin_lock(&inode->i_lock);
505 568 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
506 569 spin_unlock(&inode->i_lock);
507   - requeue_io(inode);
  570 + redirty_tail(inode, wb);
508 571 continue;
509 572 }
510   -
511   - /*
512   - * Was this inode dirtied after sync_sb_inodes was called?
513   - * This keeps sync from extra jobs and livelock.
514   - */
515   - if (inode_dirtied_after(inode, wbc->wb_start)) {
516   - spin_unlock(&inode->i_lock);
517   - return 1;
518   - }
519   -
520 573 __iget(inode);
  574 + write_chunk = writeback_chunk_size(wb->bdi, work);
  575 + wbc.nr_to_write = write_chunk;
  576 + wbc.pages_skipped = 0;
521 577  
522   - pages_skipped = wbc->pages_skipped;
523   - writeback_single_inode(inode, wbc);
524   - if (wbc->pages_skipped != pages_skipped) {
  578 + writeback_single_inode(inode, wb, &wbc);
  579 +
  580 + work->nr_pages -= write_chunk - wbc.nr_to_write;
  581 + wrote += write_chunk - wbc.nr_to_write;
  582 + if (!(inode->i_state & I_DIRTY))
  583 + wrote++;
  584 + if (wbc.pages_skipped) {
525 585 /*
526 586 * writeback is not making progress due to locked
527 587 * buffers. Skip this inode for now.
528 588 */
529   - redirty_tail(inode);
  589 + redirty_tail(inode, wb);
530 590 }
531 591 spin_unlock(&inode->i_lock);
532   - spin_unlock(&inode_wb_list_lock);
  592 + spin_unlock(&wb->list_lock);
533 593 iput(inode);
534 594 cond_resched();
535   - spin_lock(&inode_wb_list_lock);
536   - if (wbc->nr_to_write <= 0) {
537   - wbc->more_io = 1;
538   - return 1;
  595 + spin_lock(&wb->list_lock);
  596 + /*
  597 + * bail out to wb_writeback() often enough to check
  598 + * background threshold and other termination conditions.
  599 + */
  600 + if (wrote) {
  601 + if (time_is_before_jiffies(start_time + HZ / 10UL))
  602 + break;
  603 + if (work->nr_pages <= 0)
  604 + break;
539 605 }
540   - if (!list_empty(&wb->b_more_io))
541   - wbc->more_io = 1;
542 606 }
543   - /* b_io is empty */
544   - return 1;
  607 + return wrote;
545 608 }
546 609  
547   -void writeback_inodes_wb(struct bdi_writeback *wb,
548   - struct writeback_control *wbc)
  610 +static long __writeback_inodes_wb(struct bdi_writeback *wb,
  611 + struct wb_writeback_work *work)
549 612 {
550   - int ret = 0;
  613 + unsigned long start_time = jiffies;
  614 + long wrote = 0;
551 615  
552   - if (!wbc->wb_start)
553   - wbc->wb_start = jiffies; /* livelock avoidance */
554   - spin_lock(&inode_wb_list_lock);
555   - if (!wbc->for_kupdate || list_empty(&wb->b_io))
556   - queue_io(wb, wbc->older_than_this);
557   -
558 616 while (!list_empty(&wb->b_io)) {
559 617 struct inode *inode = wb_inode(wb->b_io.prev);
560 618 struct super_block *sb = inode->i_sb;
561 619  
562 620 if (!grab_super_passive(sb)) {
563   - requeue_io(inode);
  621 + requeue_io(inode, wb);
564 622 continue;
565 623 }
566   - ret = writeback_sb_inodes(sb, wb, wbc, false);
  624 + wrote += writeback_sb_inodes(sb, wb, work);
567 625 drop_super(sb);
568 626  
569   - if (ret)
570   - break;
  627 + /* refer to the same tests at the end of writeback_sb_inodes */
  628 + if (wrote) {
  629 + if (time_is_before_jiffies(start_time + HZ / 10UL))
  630 + break;
  631 + if (work->nr_pages <= 0)
  632 + break;
  633 + }
571 634 }
572   - spin_unlock(&inode_wb_list_lock);
573 635 /* Leave any unwritten inodes on b_io */
  636 + return wrote;
574 637 }
575 638  
576   -static void __writeback_inodes_sb(struct super_block *sb,
577   - struct bdi_writeback *wb, struct writeback_control *wbc)
  639 +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
578 640 {
579   - WARN_ON(!rwsem_is_locked(&sb->s_umount));
  641 + struct wb_writeback_work work = {
  642 + .nr_pages = nr_pages,
  643 + .sync_mode = WB_SYNC_NONE,
  644 + .range_cyclic = 1,
  645 + };
580 646  
581   - spin_lock(&inode_wb_list_lock);
582   - if (!wbc->for_kupdate || list_empty(&wb->b_io))
583   - queue_io(wb, wbc->older_than_this);
584   - writeback_sb_inodes(sb, wb, wbc, true);
585   - spin_unlock(&inode_wb_list_lock);
  647 + spin_lock(&wb->list_lock);
  648 + if (list_empty(&wb->b_io))
  649 + queue_io(wb, NULL);
  650 + __writeback_inodes_wb(wb, &work);
  651 + spin_unlock(&wb->list_lock);
  652 +
  653 + return nr_pages - work.nr_pages;
586 654 }
587 655  
588   -/*
589   - * The maximum number of pages to writeout in a single bdi flush/kupdate
590   - * operation. We do this so we don't hold I_SYNC against an inode for
591   - * enormous amounts of time, which would block a userspace task which has
592   - * been forced to throttle against that inode. Also, the code reevaluates
593   - * the dirty each time it has written this many pages.
594   - */
595   -#define MAX_WRITEBACK_PAGES 1024
596   -
597 656 static inline bool over_bground_thresh(void)
598 657 {
599 658 unsigned long background_thresh, dirty_thresh;
... ... @@ -605,6 +664,16 @@
605 664 }
606 665  
607 666 /*
  667 + * Called under wb->list_lock. If there are multiple wb per bdi,
  668 + * only the flusher working on the first wb should do it.
  669 + */
  670 +static void wb_update_bandwidth(struct bdi_writeback *wb,
  671 + unsigned long start_time)
  672 +{
  673 + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
  674 +}
  675 +
  676 +/*
608 677 * Explicit flushing or periodic writeback of "old" data.
609 678 *
610 679 * Define "old": the first time one of an inode's pages is dirtied, we mark the
... ... @@ -622,47 +691,16 @@
622 691 static long wb_writeback(struct bdi_writeback *wb,
623 692 struct wb_writeback_work *work)
624 693 {
625   - struct writeback_control wbc = {
626   - .sync_mode = work->sync_mode,
627   - .older_than_this = NULL,
628   - .for_kupdate = work->for_kupdate,
629   - .for_background = work->for_background,
630   - .range_cyclic = work->range_cyclic,
631   - };
  694 + unsigned long wb_start = jiffies;
  695 + long nr_pages = work->nr_pages;
632 696 unsigned long oldest_jif;
633   - long wrote = 0;
634   - long write_chunk;
635 697 struct inode *inode;
  698 + long progress;
636 699  
637   - if (wbc.for_kupdate) {
638   - wbc.older_than_this = &oldest_jif;
639   - oldest_jif = jiffies -
640   - msecs_to_jiffies(dirty_expire_interval * 10);
641   - }
642   - if (!wbc.range_cyclic) {
643   - wbc.range_start = 0;
644   - wbc.range_end = LLONG_MAX;
645   - }
  700 + oldest_jif = jiffies;
  701 + work->older_than_this = &oldest_jif;
646 702  
647   - /*
648   - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
649   - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
650   - * here avoids calling into writeback_inodes_wb() more than once.
651   - *
652   - * The intended call sequence for WB_SYNC_ALL writeback is:
653   - *
654   - * wb_writeback()
655   - * __writeback_inodes_sb() <== called only once
656   - * write_cache_pages() <== called once for each inode
657   - * (quickly) tag currently dirty pages
658   - * (maybe slowly) sync all tagged pages
659   - */
660   - if (wbc.sync_mode == WB_SYNC_NONE)
661   - write_chunk = MAX_WRITEBACK_PAGES;
662   - else
663   - write_chunk = LONG_MAX;
664   -
665   - wbc.wb_start = jiffies; /* livelock avoidance */
  703 + spin_lock(&wb->list_lock);
666 704 for (;;) {
667 705 /*
668 706 * Stop writeback when nr_pages has been consumed
... ... @@ -687,52 +725,54 @@
687 725 if (work->for_background && !over_bground_thresh())
688 726 break;
689 727  
690   - wbc.more_io = 0;
691   - wbc.nr_to_write = write_chunk;
692   - wbc.pages_skipped = 0;
  728 + if (work->for_kupdate) {
  729 + oldest_jif = jiffies -
  730 + msecs_to_jiffies(dirty_expire_interval * 10);
  731 + work->older_than_this = &oldest_jif;
  732 + }
693 733  
694   - trace_wbc_writeback_start(&wbc, wb->bdi);
  734 + trace_writeback_start(wb->bdi, work);
  735 + if (list_empty(&wb->b_io))
  736 + queue_io(wb, work->older_than_this);
695 737 if (work->sb)
696   - __writeback_inodes_sb(work->sb, wb, &wbc);
  738 + progress = writeback_sb_inodes(work->sb, wb, work);
697 739 else
698   - writeback_inodes_wb(wb, &wbc);
699   - trace_wbc_writeback_written(&wbc, wb->bdi);
  740 + progress = __writeback_inodes_wb(wb, work);
  741 + trace_writeback_written(wb->bdi, work);
700 742  
701   - work->nr_pages -= write_chunk - wbc.nr_to_write;
702   - wrote += write_chunk - wbc.nr_to_write;
  743 + wb_update_bandwidth(wb, wb_start);
703 744  
704 745 /*
705   - * If we consumed everything, see if we have more
  746 + * Did we write something? Try for more
  747 + *
  748 + * Dirty inodes are moved to b_io for writeback in batches.
  749 + * The completion of the current batch does not necessarily
  750 + * mean the overall work is done. So we keep looping as long
  751 + * as made some progress on cleaning pages or inodes.
706 752 */
707   - if (wbc.nr_to_write <= 0)
  753 + if (progress)
708 754 continue;
709 755 /*
710   - * Didn't write everything and we don't have more IO, bail
  756 + * No more inodes for IO, bail
711 757 */
712   - if (!wbc.more_io)
  758 + if (list_empty(&wb->b_more_io))
713 759 break;
714 760 /*
715   - * Did we write something? Try for more
716   - */
717   - if (wbc.nr_to_write < write_chunk)
718   - continue;
719   - /*
720 761 * Nothing written. Wait for some inode to
721 762 * become available for writeback. Otherwise
722 763 * we'll just busyloop.
723 764 */
724   - spin_lock(&inode_wb_list_lock);
725 765 if (!list_empty(&wb->b_more_io)) {
  766 + trace_writeback_wait(wb->bdi, work);
726 767 inode = wb_inode(wb->b_more_io.prev);
727   - trace_wbc_writeback_wait(&wbc, wb->bdi);
728 768 spin_lock(&inode->i_lock);
729   - inode_wait_for_writeback(inode);
  769 + inode_wait_for_writeback(inode, wb);
730 770 spin_unlock(&inode->i_lock);
731 771 }
732   - spin_unlock(&inode_wb_list_lock);
733 772 }
  773 + spin_unlock(&wb->list_lock);
734 774  
735   - return wrote;
  775 + return nr_pages - work->nr_pages;
736 776 }
737 777  
738 778 /*
... ... @@ -1063,10 +1103,10 @@
1063 1103 }
1064 1104  
1065 1105 spin_unlock(&inode->i_lock);
1066   - spin_lock(&inode_wb_list_lock);
  1106 + spin_lock(&bdi->wb.list_lock);
1067 1107 inode->dirtied_when = jiffies;
1068 1108 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1069   - spin_unlock(&inode_wb_list_lock);
  1109 + spin_unlock(&bdi->wb.list_lock);
1070 1110  
1071 1111 if (wakeup_bdi)
1072 1112 bdi_wakeup_thread_delayed(bdi);
... ... @@ -1162,10 +1202,11 @@
1162 1202 {
1163 1203 DECLARE_COMPLETION_ONSTACK(done);
1164 1204 struct wb_writeback_work work = {
1165   - .sb = sb,
1166   - .sync_mode = WB_SYNC_NONE,
1167   - .done = &done,
1168   - .nr_pages = nr,
  1205 + .sb = sb,
  1206 + .sync_mode = WB_SYNC_NONE,
  1207 + .tagged_writepages = 1,
  1208 + .done = &done,
  1209 + .nr_pages = nr,
1169 1210 };
1170 1211  
1171 1212 WARN_ON(!rwsem_is_locked(&sb->s_umount));
... ... @@ -1267,6 +1308,7 @@
1267 1308 */
1268 1309 int write_inode_now(struct inode *inode, int sync)
1269 1310 {
  1311 + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1270 1312 int ret;
1271 1313 struct writeback_control wbc = {
1272 1314 .nr_to_write = LONG_MAX,
... ... @@ -1279,11 +1321,11 @@
1279 1321 wbc.nr_to_write = 0;
1280 1322  
1281 1323 might_sleep();
1282   - spin_lock(&inode_wb_list_lock);
  1324 + spin_lock(&wb->list_lock);
1283 1325 spin_lock(&inode->i_lock);
1284   - ret = writeback_single_inode(inode, &wbc);
  1326 + ret = writeback_single_inode(inode, wb, &wbc);
1285 1327 spin_unlock(&inode->i_lock);
1286   - spin_unlock(&inode_wb_list_lock);
  1328 + spin_unlock(&wb->list_lock);
1287 1329 if (sync)
1288 1330 inode_sync_wait(inode);
1289 1331 return ret;
... ... @@ -1303,13 +1345,14 @@
1303 1345 */
1304 1346 int sync_inode(struct inode *inode, struct writeback_control *wbc)
1305 1347 {
  1348 + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1306 1349 int ret;
1307 1350  
1308   - spin_lock(&inode_wb_list_lock);
  1351 + spin_lock(&wb->list_lock);
1309 1352 spin_lock(&inode->i_lock);
1310   - ret = writeback_single_inode(inode, wbc);
  1353 + ret = writeback_single_inode(inode, wb, wbc);
1311 1354 spin_unlock(&inode->i_lock);
1312   - spin_unlock(&inode_wb_list_lock);
  1355 + spin_unlock(&wb->list_lock);
1313 1356 return ret;
1314 1357 }
1315 1358 EXPORT_SYMBOL(sync_inode);
fs/inode.c
... ... @@ -37,7 +37,7 @@
37 37 * inode->i_sb->s_inode_lru, inode->i_lru
38 38 * inode_sb_list_lock protects:
39 39 * sb->s_inodes, inode->i_sb_list
40   - * inode_wb_list_lock protects:
  40 + * bdi->wb.list_lock protects:
41 41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 42 * inode_hash_lock protects:
43 43 * inode_hashtable, inode->i_hash
... ... @@ -48,7 +48,7 @@
48 48 * inode->i_lock
49 49 * inode->i_sb->s_inode_lru_lock
50 50 *
51   - * inode_wb_list_lock
  51 + * bdi->wb.list_lock
52 52 * inode->i_lock
53 53 *
54 54 * inode_hash_lock
... ... @@ -65,7 +65,6 @@
65 65 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
66 66  
67 67 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
68   -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
69 68  
70 69 /*
71 70 * Empty aops. Can be used for the cases where the user does not
fs/nfs/write.c
... ... @@ -1566,8 +1566,7 @@
1566 1566 int status;
1567 1567 bool sync = true;
1568 1568  
1569   - if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
1570   - wbc->for_background)
  1569 + if (wbc->sync_mode == WB_SYNC_NONE)
1571 1570 sync = false;
1572 1571  
1573 1572 status = pnfs_layoutcommit_inode(inode, sync);
include/linux/backing-dev.h
... ... @@ -40,6 +40,7 @@
40 40 enum bdi_stat_item {
41 41 BDI_RECLAIMABLE,
42 42 BDI_WRITEBACK,
  43 + BDI_WRITTEN,
43 44 NR_BDI_STAT_ITEMS
44 45 };
45 46  
... ... @@ -57,6 +58,7 @@
57 58 struct list_head b_dirty; /* dirty inodes */
58 59 struct list_head b_io; /* parked for writeback */
59 60 struct list_head b_more_io; /* parked for more writeback */
  61 + spinlock_t list_lock; /* protects the b_* lists */
60 62 };
61 63  
62 64 struct backing_dev_info {
... ... @@ -71,6 +73,11 @@
71 73  
72 74 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
73 75  
  76 + unsigned long bw_time_stamp; /* last time write bw is updated */
  77 + unsigned long written_stamp; /* pages written at bw_time_stamp */
  78 + unsigned long write_bandwidth; /* the estimated write bandwidth */
  79 + unsigned long avg_write_bandwidth; /* further smoothed write bw */
  80 +
74 81 struct prop_local_percpu completions;
75 82 int dirty_exceeded;
76 83  
... ... @@ -106,6 +113,7 @@
106 113 int bdi_has_dirty_io(struct backing_dev_info *bdi);
107 114 void bdi_arm_supers_timer(void);
108 115 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
  116 +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
109 117  
110 118 extern spinlock_t bdi_lock;
111 119 extern struct list_head bdi_list;
include/linux/writeback.h
... ... @@ -7,10 +7,40 @@
7 7 #include <linux/sched.h>
8 8 #include <linux/fs.h>
9 9  
10   -struct backing_dev_info;
  10 +/*
  11 + * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
  12 + *
  13 + * (thresh - thresh/DIRTY_FULL_SCOPE, thresh)
  14 + *
  15 + * The 1/16 region above the global dirty limit will be put to maximum pauses:
  16 + *
  17 + * (limit, limit + limit/DIRTY_MAXPAUSE_AREA)
  18 + *
  19 + * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put
  20 + * to loops:
  21 + *
  22 + * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA)
  23 + *
  24 + * Further beyond, all dirtier tasks will enter a loop waiting (possibly long
  25 + * time) for the dirty pages to drop, unless written enough pages.
  26 + *
  27 + * The global dirty threshold is normally equal to the global dirty limit,
  28 + * except when the system suddenly allocates a lot of anonymous memory and
  29 + * knocks down the global dirty threshold quickly, in which case the global
  30 + * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
  31 + */
  32 +#define DIRTY_SCOPE 8
  33 +#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2)
  34 +#define DIRTY_MAXPAUSE_AREA 16
  35 +#define DIRTY_PASSGOOD_AREA 8
11 36  
12   -extern spinlock_t inode_wb_list_lock;
  37 +/*
  38 + * 4MB minimal write chunk size
  39 + */
  40 +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
13 41  
  42 +struct backing_dev_info;
  43 +
14 44 /*
15 45 * fs/fs-writeback.c
16 46 */
... ... @@ -26,11 +56,6 @@
26 56 */
27 57 struct writeback_control {
28 58 enum writeback_sync_modes sync_mode;
29   - unsigned long *older_than_this; /* If !NULL, only write back inodes
30   - older than this */
31   - unsigned long wb_start; /* Time writeback_inodes_wb was
32   - called. This is needed to avoid
33   - extra jobs and livelock */
34 59 long nr_to_write; /* Write this many pages, and decrement
35 60 this for each page written */
36 61 long pages_skipped; /* Pages which were not written */
... ... @@ -43,13 +68,11 @@
43 68 loff_t range_start;
44 69 loff_t range_end;
45 70  
46   - unsigned nonblocking:1; /* Don't get stuck on request queues */
47   - unsigned encountered_congestion:1; /* An output: a queue is full */
48 71 unsigned for_kupdate:1; /* A kupdate writeback */
49 72 unsigned for_background:1; /* A background writeback */
  73 + unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */
50 74 unsigned for_reclaim:1; /* Invoked from the page allocator */
51 75 unsigned range_cyclic:1; /* range_start is cyclic */
52   - unsigned more_io:1; /* more io to be dispatched */
53 76 };
54 77  
55 78 /*
... ... @@ -62,8 +85,7 @@
62 85 int writeback_inodes_sb_if_idle(struct super_block *);
63 86 int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
64 87 void sync_inodes_sb(struct super_block *);
65   -void writeback_inodes_wb(struct bdi_writeback *wb,
66   - struct writeback_control *wbc);
  88 +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages);
67 89 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
68 90 void wakeup_flusher_threads(long nr_pages);
69 91  
... ... @@ -94,6 +116,8 @@
94 116 #endif
95 117 void throttle_vm_writeout(gfp_t gfp_mask);
96 118  
  119 +extern unsigned long global_dirty_limit;
  120 +
97 121 /* These are exported to sysctl. */
98 122 extern int dirty_background_ratio;
99 123 extern unsigned long dirty_background_bytes;
... ... @@ -127,6 +151,13 @@
127 151 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
128 152 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
129 153 unsigned long dirty);
  154 +
  155 +void __bdi_update_bandwidth(struct backing_dev_info *bdi,
  156 + unsigned long thresh,
  157 + unsigned long dirty,
  158 + unsigned long bdi_thresh,
  159 + unsigned long bdi_dirty,
  160 + unsigned long start_time);
130 161  
131 162 void page_writeback_init(void);
132 163 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
include/trace/events/btrfs.h
... ... @@ -284,7 +284,6 @@
284 284 __field( long, pages_skipped )
285 285 __field( loff_t, range_start )
286 286 __field( loff_t, range_end )
287   - __field( char, nonblocking )
288 287 __field( char, for_kupdate )
289 288 __field( char, for_reclaim )
290 289 __field( char, range_cyclic )
... ... @@ -299,7 +298,6 @@
299 298 __entry->pages_skipped = wbc->pages_skipped;
300 299 __entry->range_start = wbc->range_start;
301 300 __entry->range_end = wbc->range_end;
302   - __entry->nonblocking = wbc->nonblocking;
303 301 __entry->for_kupdate = wbc->for_kupdate;
304 302 __entry->for_reclaim = wbc->for_reclaim;
305 303 __entry->range_cyclic = wbc->range_cyclic;
... ... @@ -310,13 +308,13 @@
310 308  
311 309 TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, "
312 310 "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, "
313   - "range_end = %llu, nonblocking = %d, for_kupdate = %d, "
  311 + "range_end = %llu, for_kupdate = %d, "
314 312 "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu",
315 313 show_root_type(__entry->root_objectid),
316 314 (unsigned long)__entry->ino, __entry->index,
317 315 __entry->nr_to_write, __entry->pages_skipped,
318 316 __entry->range_start, __entry->range_end,
319   - __entry->nonblocking, __entry->for_kupdate,
  317 + __entry->for_kupdate,
320 318 __entry->for_reclaim, __entry->range_cyclic,
321 319 (unsigned long)__entry->writeback_index)
322 320 );
include/trace/events/ext4.h
... ... @@ -380,7 +380,6 @@
380 380 __field( int, pages_written )
381 381 __field( long, pages_skipped )
382 382 __field( int, sync_mode )
383   - __field( char, more_io )
384 383 __field( pgoff_t, writeback_index )
385 384 ),
386 385  
... ... @@ -391,16 +390,15 @@
391 390 __entry->pages_written = pages_written;
392 391 __entry->pages_skipped = wbc->pages_skipped;
393 392 __entry->sync_mode = wbc->sync_mode;
394   - __entry->more_io = wbc->more_io;
395 393 __entry->writeback_index = inode->i_mapping->writeback_index;
396 394 ),
397 395  
398 396 TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
399   - " more_io %d sync_mode %d writeback_index %lu",
  397 + "sync_mode %d writeback_index %lu",
400 398 MAJOR(__entry->dev), MINOR(__entry->dev),
401 399 (unsigned long) __entry->ino, __entry->ret,
402 400 __entry->pages_written, __entry->pages_skipped,
403   - __entry->more_io, __entry->sync_mode,
  401 + __entry->sync_mode,
404 402 (unsigned long) __entry->writeback_index)
405 403 );
406 404  
include/trace/events/writeback.h
... ... @@ -8,6 +8,19 @@
8 8 #include <linux/device.h>
9 9 #include <linux/writeback.h>
10 10  
  11 +#define show_inode_state(state) \
  12 + __print_flags(state, "|", \
  13 + {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \
  14 + {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \
  15 + {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \
  16 + {I_NEW, "I_NEW"}, \
  17 + {I_WILL_FREE, "I_WILL_FREE"}, \
  18 + {I_FREEING, "I_FREEING"}, \
  19 + {I_CLEAR, "I_CLEAR"}, \
  20 + {I_SYNC, "I_SYNC"}, \
  21 + {I_REFERENCED, "I_REFERENCED"} \
  22 + )
  23 +
11 24 struct wb_writeback_work;
12 25  
13 26 DECLARE_EVENT_CLASS(writeback_work_class,
... ... @@ -49,6 +62,9 @@
49 62 DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
50 63 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
51 64 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
  65 +DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
  66 +DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
  67 +DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);
52 68  
53 69 TRACE_EVENT(writeback_pages_written,
54 70 TP_PROTO(long pages_written),
... ... @@ -88,7 +104,31 @@
88 104 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
89 105 DEFINE_WRITEBACK_EVENT(writeback_thread_start);
90 106 DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
  107 +DEFINE_WRITEBACK_EVENT(balance_dirty_start);
  108 +DEFINE_WRITEBACK_EVENT(balance_dirty_wait);
91 109  
  110 +TRACE_EVENT(balance_dirty_written,
  111 +
  112 + TP_PROTO(struct backing_dev_info *bdi, int written),
  113 +
  114 + TP_ARGS(bdi, written),
  115 +
  116 + TP_STRUCT__entry(
  117 + __array(char, name, 32)
  118 + __field(int, written)
  119 + ),
  120 +
  121 + TP_fast_assign(
  122 + strncpy(__entry->name, dev_name(bdi->dev), 32);
  123 + __entry->written = written;
  124 + ),
  125 +
  126 + TP_printk("bdi %s written %d",
  127 + __entry->name,
  128 + __entry->written
  129 + )
  130 +);
  131 +
92 132 DECLARE_EVENT_CLASS(wbc_class,
93 133 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
94 134 TP_ARGS(wbc, bdi),
... ... @@ -101,8 +141,6 @@
101 141 __field(int, for_background)
102 142 __field(int, for_reclaim)
103 143 __field(int, range_cyclic)
104   - __field(int, more_io)
105   - __field(unsigned long, older_than_this)
106 144 __field(long, range_start)
107 145 __field(long, range_end)
108 146 ),
... ... @@ -116,15 +154,12 @@
116 154 __entry->for_background = wbc->for_background;
117 155 __entry->for_reclaim = wbc->for_reclaim;
118 156 __entry->range_cyclic = wbc->range_cyclic;
119   - __entry->more_io = wbc->more_io;
120   - __entry->older_than_this = wbc->older_than_this ?
121   - *wbc->older_than_this : 0;
122 157 __entry->range_start = (long)wbc->range_start;
123 158 __entry->range_end = (long)wbc->range_end;
124 159 ),
125 160  
126 161 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
127   - "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
  162 + "bgrd=%d reclm=%d cyclic=%d "
128 163 "start=0x%lx end=0x%lx",
129 164 __entry->name,
130 165 __entry->nr_to_write,
... ... @@ -134,8 +169,6 @@
134 169 __entry->for_background,
135 170 __entry->for_reclaim,
136 171 __entry->range_cyclic,
137   - __entry->more_io,
138   - __entry->older_than_this,
139 172 __entry->range_start,
140 173 __entry->range_end)
141 174 )
... ... @@ -144,14 +177,79 @@
144 177 DEFINE_EVENT(wbc_class, name, \
145 178 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
146 179 TP_ARGS(wbc, bdi))
147   -DEFINE_WBC_EVENT(wbc_writeback_start);
148   -DEFINE_WBC_EVENT(wbc_writeback_written);
149   -DEFINE_WBC_EVENT(wbc_writeback_wait);
150   -DEFINE_WBC_EVENT(wbc_balance_dirty_start);
151   -DEFINE_WBC_EVENT(wbc_balance_dirty_written);
152   -DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
153 180 DEFINE_WBC_EVENT(wbc_writepage);
154 181  
  182 +TRACE_EVENT(writeback_queue_io,
  183 + TP_PROTO(struct bdi_writeback *wb,
  184 + unsigned long *older_than_this,
  185 + int moved),
  186 + TP_ARGS(wb, older_than_this, moved),
  187 + TP_STRUCT__entry(
  188 + __array(char, name, 32)
  189 + __field(unsigned long, older)
  190 + __field(long, age)
  191 + __field(int, moved)
  192 + ),
  193 + TP_fast_assign(
  194 + strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
  195 + __entry->older = older_than_this ? *older_than_this : 0;
  196 + __entry->age = older_than_this ?
  197 + (jiffies - *older_than_this) * 1000 / HZ : -1;
  198 + __entry->moved = moved;
  199 + ),
  200 + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d",
  201 + __entry->name,
  202 + __entry->older, /* older_than_this in jiffies */
  203 + __entry->age, /* older_than_this in relative milliseconds */
  204 + __entry->moved)
  205 +);
  206 +
  207 +TRACE_EVENT(global_dirty_state,
  208 +
  209 + TP_PROTO(unsigned long background_thresh,
  210 + unsigned long dirty_thresh
  211 + ),
  212 +
  213 + TP_ARGS(background_thresh,
  214 + dirty_thresh
  215 + ),
  216 +
  217 + TP_STRUCT__entry(
  218 + __field(unsigned long, nr_dirty)
  219 + __field(unsigned long, nr_writeback)
  220 + __field(unsigned long, nr_unstable)
  221 + __field(unsigned long, background_thresh)
  222 + __field(unsigned long, dirty_thresh)
  223 + __field(unsigned long, dirty_limit)
  224 + __field(unsigned long, nr_dirtied)
  225 + __field(unsigned long, nr_written)
  226 + ),
  227 +
  228 + TP_fast_assign(
  229 + __entry->nr_dirty = global_page_state(NR_FILE_DIRTY);
  230 + __entry->nr_writeback = global_page_state(NR_WRITEBACK);
  231 + __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
  232 + __entry->nr_dirtied = global_page_state(NR_DIRTIED);
  233 + __entry->nr_written = global_page_state(NR_WRITTEN);
  234 + __entry->background_thresh = background_thresh;
  235 + __entry->dirty_thresh = dirty_thresh;
  236 + __entry->dirty_limit = global_dirty_limit;
  237 + ),
  238 +
  239 + TP_printk("dirty=%lu writeback=%lu unstable=%lu "
  240 + "bg_thresh=%lu thresh=%lu limit=%lu "
  241 + "dirtied=%lu written=%lu",
  242 + __entry->nr_dirty,
  243 + __entry->nr_writeback,
  244 + __entry->nr_unstable,
  245 + __entry->background_thresh,
  246 + __entry->dirty_thresh,
  247 + __entry->dirty_limit,
  248 + __entry->nr_dirtied,
  249 + __entry->nr_written
  250 + )
  251 +);
  252 +
155 253 DECLARE_EVENT_CLASS(writeback_congest_waited_template,
156 254  
157 255 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
... ... @@ -185,6 +283,63 @@
185 283 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
186 284  
187 285 TP_ARGS(usec_timeout, usec_delayed)
  286 +);
  287 +
  288 +DECLARE_EVENT_CLASS(writeback_single_inode_template,
  289 +
  290 + TP_PROTO(struct inode *inode,
  291 + struct writeback_control *wbc,
  292 + unsigned long nr_to_write
  293 + ),
  294 +
  295 + TP_ARGS(inode, wbc, nr_to_write),
  296 +
  297 + TP_STRUCT__entry(
  298 + __array(char, name, 32)
  299 + __field(unsigned long, ino)
  300 + __field(unsigned long, state)
  301 + __field(unsigned long, age)
  302 + __field(unsigned long, writeback_index)
  303 + __field(long, nr_to_write)
  304 + __field(unsigned long, wrote)
  305 + ),
  306 +
  307 + TP_fast_assign(
  308 + strncpy(__entry->name,
  309 + dev_name(inode->i_mapping->backing_dev_info->dev), 32);
  310 + __entry->ino = inode->i_ino;
  311 + __entry->state = inode->i_state;
  312 + __entry->age = (jiffies - inode->dirtied_when) *
  313 + 1000 / HZ;
  314 + __entry->writeback_index = inode->i_mapping->writeback_index;
  315 + __entry->nr_to_write = nr_to_write;
  316 + __entry->wrote = nr_to_write - wbc->nr_to_write;
  317 + ),
  318 +
  319 + TP_printk("bdi %s: ino=%lu state=%s age=%lu "
  320 + "index=%lu to_write=%ld wrote=%lu",
  321 + __entry->name,
  322 + __entry->ino,
  323 + show_inode_state(__entry->state),
  324 + __entry->age,
  325 + __entry->writeback_index,
  326 + __entry->nr_to_write,
  327 + __entry->wrote
  328 + )
  329 +);
  330 +
  331 +DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue,
  332 + TP_PROTO(struct inode *inode,
  333 + struct writeback_control *wbc,
  334 + unsigned long nr_to_write),
  335 + TP_ARGS(inode, wbc, nr_to_write)
  336 +);
  337 +
  338 +DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
  339 + TP_PROTO(struct inode *inode,
  340 + struct writeback_control *wbc,
  341 + unsigned long nr_to_write),
  342 + TP_ARGS(inode, wbc, nr_to_write)
188 343 );
189 344  
190 345 #endif /* _TRACE_WRITEBACK_H */
mm/backing-dev.c
... ... @@ -45,6 +45,17 @@
45 45 static int bdi_sync_supers(void *);
46 46 static void sync_supers_timer_fn(unsigned long);
47 47  
  48 +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
  49 +{
  50 + if (wb1 < wb2) {
  51 + spin_lock(&wb1->list_lock);
  52 + spin_lock_nested(&wb2->list_lock, 1);
  53 + } else {
  54 + spin_lock(&wb2->list_lock);
  55 + spin_lock_nested(&wb1->list_lock, 1);
  56 + }
  57 +}
  58 +
48 59 #ifdef CONFIG_DEBUG_FS
49 60 #include <linux/debugfs.h>
50 61 #include <linux/seq_file.h>
... ... @@ -67,34 +78,42 @@
67 78 struct inode *inode;
68 79  
69 80 nr_dirty = nr_io = nr_more_io = 0;
70   - spin_lock(&inode_wb_list_lock);
  81 + spin_lock(&wb->list_lock);
71 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 83 nr_dirty++;
73 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 85 nr_io++;
75 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 87 nr_more_io++;
77   - spin_unlock(&inode_wb_list_lock);
  88 + spin_unlock(&wb->list_lock);
78 89  
79 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92  
82 93 #define K(x) ((x) << (PAGE_SHIFT - 10))
83 94 seq_printf(m,
84   - "BdiWriteback: %8lu kB\n"
85   - "BdiReclaimable: %8lu kB\n"
86   - "BdiDirtyThresh: %8lu kB\n"
87   - "DirtyThresh: %8lu kB\n"
88   - "BackgroundThresh: %8lu kB\n"
89   - "b_dirty: %8lu\n"
90   - "b_io: %8lu\n"
91   - "b_more_io: %8lu\n"
92   - "bdi_list: %8u\n"
93   - "state: %8lx\n",
  95 + "BdiWriteback: %10lu kB\n"
  96 + "BdiReclaimable: %10lu kB\n"
  97 + "BdiDirtyThresh: %10lu kB\n"
  98 + "DirtyThresh: %10lu kB\n"
  99 + "BackgroundThresh: %10lu kB\n"
  100 + "BdiWritten: %10lu kB\n"
  101 + "BdiWriteBandwidth: %10lu kBps\n"
  102 + "b_dirty: %10lu\n"
  103 + "b_io: %10lu\n"
  104 + "b_more_io: %10lu\n"
  105 + "bdi_list: %10u\n"
  106 + "state: %10lx\n",
94 107 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 108 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96   - K(bdi_thresh), K(dirty_thresh),
97   - K(background_thresh), nr_dirty, nr_io, nr_more_io,
  109 + K(bdi_thresh),
  110 + K(dirty_thresh),
  111 + K(background_thresh),
  112 + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
  113 + (unsigned long) K(bdi->write_bandwidth),
  114 + nr_dirty,
  115 + nr_io,
  116 + nr_more_io,
98 117 !list_empty(&bdi->bdi_list), bdi->state);
99 118 #undef K
100 119  
... ... @@ -249,18 +268,6 @@
249 268 return wb_has_dirty_io(&bdi->wb);
250 269 }
251 270  
252   -static void bdi_flush_io(struct backing_dev_info *bdi)
253   -{
254   - struct writeback_control wbc = {
255   - .sync_mode = WB_SYNC_NONE,
256   - .older_than_this = NULL,
257   - .range_cyclic = 1,
258   - .nr_to_write = 1024,
259   - };
260   -
261   - writeback_inodes_wb(&bdi->wb, &wbc);
262   -}
263   -
264 271 /*
265 272 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 273 * or we risk deadlocking on ->s_umount. The longer term solution would be
... ... @@ -446,9 +453,10 @@
446 453 if (IS_ERR(task)) {
447 454 /*
448 455 * If thread creation fails, force writeout of
449   - * the bdi from the thread.
  456 + * the bdi from the thread. Hopefully 1024 is
  457 + * large enough for efficient IO.
450 458 */
451   - bdi_flush_io(bdi);
  459 + writeback_inodes_wb(&bdi->wb, 1024);
452 460 } else {
453 461 /*
454 462 * The spinlock makes sure we do not lose
... ... @@ -629,9 +637,15 @@
629 637 INIT_LIST_HEAD(&wb->b_dirty);
630 638 INIT_LIST_HEAD(&wb->b_io);
631 639 INIT_LIST_HEAD(&wb->b_more_io);
  640 + spin_lock_init(&wb->list_lock);
632 641 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
633 642 }
634 643  
  644 +/*
  645 + * Initial write bandwidth: 100 MB/s
  646 + */
  647 +#define INIT_BW (100 << (20 - PAGE_SHIFT))
  648 +
635 649 int bdi_init(struct backing_dev_info *bdi)
636 650 {
637 651 int i, err;
... ... @@ -654,6 +668,13 @@
654 668 }
655 669  
656 670 bdi->dirty_exceeded = 0;
  671 +
  672 + bdi->bw_time_stamp = jiffies;
  673 + bdi->written_stamp = 0;
  674 +
  675 + bdi->write_bandwidth = INIT_BW;
  676 + bdi->avg_write_bandwidth = INIT_BW;
  677 +
657 678 err = prop_local_init_percpu(&bdi->completions);
658 679  
659 680 if (err) {
... ... @@ -677,11 +698,12 @@
677 698 if (bdi_has_dirty_io(bdi)) {
678 699 struct bdi_writeback *dst = &default_backing_dev_info.wb;
679 700  
680   - spin_lock(&inode_wb_list_lock);
  701 + bdi_lock_two(&bdi->wb, dst);
681 702 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
682 703 list_splice(&bdi->wb.b_io, &dst->b_io);
683 704 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
684   - spin_unlock(&inode_wb_list_lock);
  705 + spin_unlock(&bdi->wb.list_lock);
  706 + spin_unlock(&dst->list_lock);
685 707 }
686 708  
687 709 bdi_unregister(bdi);
mm/filemap.c
... ... @@ -78,7 +78,7 @@
78 78 * ->i_mutex (generic_file_buffered_write)
79 79 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 80 *
81   - * inode_wb_list_lock
  81 + * bdi->wb.list_lock
82 82 * sb_lock (fs/fs-writeback.c)
83 83 * ->mapping->tree_lock (__sync_single_inode)
84 84 *
... ... @@ -96,9 +96,9 @@
96 96 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
97 97 * ->private_lock (page_remove_rmap->set_page_dirty)
98 98 * ->tree_lock (page_remove_rmap->set_page_dirty)
99   - * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
  99 + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
100 100 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
101   - * inode_wb_list_lock (zap_pte_range->set_page_dirty)
  101 + * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
102 102 * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 104 *
mm/page-writeback.c
... ... @@ -37,6 +37,16 @@
37 37 #include <trace/events/writeback.h>
38 38  
39 39 /*
  40 + * Sleep at most 200ms at a time in balance_dirty_pages().
  41 + */
  42 +#define MAX_PAUSE max(HZ/5, 1)
  43 +
  44 +/*
  45 + * Estimate write bandwidth at 200ms intervals.
  46 + */
  47 +#define BANDWIDTH_INTERVAL max(HZ/5, 1)
  48 +
  49 +/*
40 50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
41 51 * will look to see if it needs to force writeback or throttling.
42 52 */
... ... @@ -111,6 +121,7 @@
111 121  
112 122 /* End of sysctl-exported parameters */
113 123  
  124 +unsigned long global_dirty_limit;
114 125  
115 126 /*
116 127 * Scale the writeback cache size proportional to the relative writeout speeds.
... ... @@ -219,6 +230,7 @@
219 230 */
220 231 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221 232 {
  233 + __inc_bdi_stat(bdi, BDI_WRITTEN);
222 234 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 235 bdi->max_prop_frac);
224 236 }
... ... @@ -244,13 +256,8 @@
244 256 static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 257 long *numerator, long *denominator)
246 258 {
247   - if (bdi_cap_writeback_dirty(bdi)) {
248   - prop_fraction_percpu(&vm_completions, &bdi->completions,
  259 + prop_fraction_percpu(&vm_completions, &bdi->completions,
249 260 numerator, denominator);
250   - } else {
251   - *numerator = 0;
252   - *denominator = 1;
253   - }
254 261 }
255 262  
256 263 static inline void task_dirties_fraction(struct task_struct *tsk,
... ... @@ -274,12 +281,13 @@
274 281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 282 * dirty threshold may never get throttled.
276 283 */
  284 +#define TASK_LIMIT_FRACTION 8
277 285 static unsigned long task_dirty_limit(struct task_struct *tsk,
278 286 unsigned long bdi_dirty)
279 287 {
280 288 long numerator, denominator;
281 289 unsigned long dirty = bdi_dirty;
282   - u64 inv = dirty >> 3;
  290 + u64 inv = dirty / TASK_LIMIT_FRACTION;
283 291  
284 292 task_dirties_fraction(tsk, &numerator, &denominator);
285 293 inv *= numerator;
... ... @@ -290,6 +298,12 @@
290 298 return max(dirty, bdi_dirty/2);
291 299 }
292 300  
  301 +/* Minimum limit for any task */
  302 +static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
  303 +{
  304 + return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
  305 +}
  306 +
293 307 /*
294 308 *
295 309 */
... ... @@ -397,6 +411,11 @@
397 411 return x + 1; /* Ensure that we never return 0 */
398 412 }
399 413  
  414 +static unsigned long hard_dirty_limit(unsigned long thresh)
  415 +{
  416 + return max(thresh, global_dirty_limit);
  417 +}
  418 +
400 419 /*
401 420 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 421 *
... ... @@ -435,12 +454,20 @@
435 454 }
436 455 *pbackground = background;
437 456 *pdirty = dirty;
  457 + trace_global_dirty_state(background, dirty);
438 458 }
439 459  
440   -/*
  460 +/**
441 461 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
  462 + * @bdi: the backing_dev_info to query
  463 + * @dirty: global dirty limit in pages
442 464 *
443   - * Allocate high/low dirty limits to fast/slow devices, in order to prevent
  465 + * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
  466 + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
  467 + * And the "limit" in the name is not seriously taken as hard limit in
  468 + * balance_dirty_pages().
  469 + *
  470 + * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 471 * - starving fast devices
445 472 * - piling up dirty pages (that will take long time to sync) on slow devices
446 473 *
... ... @@ -468,7 +495,154 @@
468 495 return bdi_dirty;
469 496 }
470 497  
  498 +static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
  499 + unsigned long elapsed,
  500 + unsigned long written)
  501 +{
  502 + const unsigned long period = roundup_pow_of_two(3 * HZ);
  503 + unsigned long avg = bdi->avg_write_bandwidth;
  504 + unsigned long old = bdi->write_bandwidth;
  505 + u64 bw;
  506 +
  507 + /*
  508 + * bw = written * HZ / elapsed
  509 + *
  510 + * bw * elapsed + write_bandwidth * (period - elapsed)
  511 + * write_bandwidth = ---------------------------------------------------
  512 + * period
  513 + */
  514 + bw = written - bdi->written_stamp;
  515 + bw *= HZ;
  516 + if (unlikely(elapsed > period)) {
  517 + do_div(bw, elapsed);
  518 + avg = bw;
  519 + goto out;
  520 + }
  521 + bw += (u64)bdi->write_bandwidth * (period - elapsed);
  522 + bw >>= ilog2(period);
  523 +
  524 + /*
  525 + * one more level of smoothing, for filtering out sudden spikes
  526 + */
  527 + if (avg > old && old >= (unsigned long)bw)
  528 + avg -= (avg - old) >> 3;
  529 +
  530 + if (avg < old && old <= (unsigned long)bw)
  531 + avg += (old - avg) >> 3;
  532 +
  533 +out:
  534 + bdi->write_bandwidth = bw;
  535 + bdi->avg_write_bandwidth = avg;
  536 +}
  537 +
471 538 /*
  539 + * The global dirtyable memory and dirty threshold could be suddenly knocked
  540 + * down by a large amount (eg. on the startup of KVM in a swapless system).
  541 + * This may throw the system into deep dirty exceeded state and throttle
  542 + * heavy/light dirtiers alike. To retain good responsiveness, maintain
  543 + * global_dirty_limit for tracking slowly down to the knocked down dirty
  544 + * threshold.
  545 + */
  546 +static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
  547 +{
  548 + unsigned long limit = global_dirty_limit;
  549 +
  550 + /*
  551 + * Follow up in one step.
  552 + */
  553 + if (limit < thresh) {
  554 + limit = thresh;
  555 + goto update;
  556 + }
  557 +
  558 + /*
  559 + * Follow down slowly. Use the higher one as the target, because thresh
  560 + * may drop below dirty. This is exactly the reason to introduce
  561 + * global_dirty_limit which is guaranteed to lie above the dirty pages.
  562 + */
  563 + thresh = max(thresh, dirty);
  564 + if (limit > thresh) {
  565 + limit -= (limit - thresh) >> 5;
  566 + goto update;
  567 + }
  568 + return;
  569 +update:
  570 + global_dirty_limit = limit;
  571 +}
  572 +
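
A compact user-space rendition of update_dirty_limit()'s asymmetric tracking, with invented numbers: the limit jumps up immediately when the threshold rises, but when the threshold collapses it eases down by 1/32 of the remaining gap per update and never drops below the current dirty count.

    #include <stdio.h>

    /* mirrors update_dirty_limit(): follow a rising threshold in one step,
     * follow a falling one slowly, never going below the dirty count */
    static unsigned long track(unsigned long limit, unsigned long thresh,
                               unsigned long dirty)
    {
        unsigned long target = thresh > dirty ? thresh : dirty;

        if (limit < thresh)
            return thresh;
        if (limit > target)
            limit -= (limit - target) >> 5;
        return limit;
    }

    int main(void)
    {
        /* KVM startup eats memory: thresh drops 100000 -> 20000 while 60000
         * pages are still dirty; the limit eases toward the dirty count
         * instead of snapping below it (step 1 prints 98750) */
        unsigned long limit = 100000;
        int step;

        for (step = 1; step <= 5; step++) {
            limit = track(limit, 20000, 60000);
            printf("step %d: %lu\n", step, limit);
        }
        return 0;
    }
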
  573 +static void global_update_bandwidth(unsigned long thresh,
  574 + unsigned long dirty,
  575 + unsigned long now)
  576 +{
  577 + static DEFINE_SPINLOCK(dirty_lock);
  578 + static unsigned long update_time;
  579 +
  580 + /*
  581 + * check locklessly first to avoid taking the lock most of the time
  582 + */
  583 + if (time_before(now, update_time + BANDWIDTH_INTERVAL))
  584 + return;
  585 +
  586 + spin_lock(&dirty_lock);
  587 + if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
  588 + update_dirty_limit(thresh, dirty);
  589 + update_time = now;
  590 + }
  591 + spin_unlock(&dirty_lock);
  592 +}
  593 +
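
global_update_bandwidth() is a generic "check, lock, recheck" ratelimiter. A stand-alone pthread sketch of the same shape (INTERVAL and the printout are illustrative): the unlocked test keeps the common case lock-free, and repeating the test under the lock ensures only one of several racing threads performs the update.

    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    #define INTERVAL 1      /* seconds, standing in for BANDWIDTH_INTERVAL */

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static time_t update_time;

    static void maybe_update(void)
    {
        time_t now = time(NULL);

        if (now < update_time + INTERVAL)       /* lockless fast path */
            return;

        pthread_mutex_lock(&lock);
        if (now >= update_time + INTERVAL) {    /* recheck under the lock */
            printf("one thread runs the update\n");
            update_time = now;
        }
        pthread_mutex_unlock(&lock);
    }

    static void *worker(void *arg)
    {
        (void)arg;
        maybe_update();
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];
        int i;

        for (i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, worker, NULL);
        for (i = 0; i < 4; i++)
            pthread_join(t[i], NULL);
        return 0;       /* the message should appear exactly once */
    }
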
  594 +void __bdi_update_bandwidth(struct backing_dev_info *bdi,
  595 + unsigned long thresh,
  596 + unsigned long dirty,
  597 + unsigned long bdi_thresh,
  598 + unsigned long bdi_dirty,
  599 + unsigned long start_time)
  600 +{
  601 + unsigned long now = jiffies;
  602 + unsigned long elapsed = now - bdi->bw_time_stamp;
  603 + unsigned long written;
  604 +
  605 + /*
  606 + * rate-limit: update at most once every 200ms.
  607 + */
  608 + if (elapsed < BANDWIDTH_INTERVAL)
  609 + return;
  610 +
  611 + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
  612 +
  613 + /*
  614 + * Skip quiet periods when disk bandwidth is under-utilized.
  615 + * (at least 1s idle time between two flusher runs)
  616 + */
  617 + if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
  618 + goto snapshot;
  619 +
  620 + if (thresh)
  621 + global_update_bandwidth(thresh, dirty, now);
  622 +
  623 + bdi_update_write_bandwidth(bdi, elapsed, written);
  624 +
  625 +snapshot:
  626 + bdi->written_stamp = written;
  627 + bdi->bw_time_stamp = now;
  628 +}
  629 +
  630 +static void bdi_update_bandwidth(struct backing_dev_info *bdi,
  631 + unsigned long thresh,
  632 + unsigned long dirty,
  633 + unsigned long bdi_thresh,
  634 + unsigned long bdi_dirty,
  635 + unsigned long start_time)
  636 +{
  637 + if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
  638 + return;
  639 + spin_lock(&bdi->wb.list_lock);
  640 + __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
  641 + start_time);
  642 + spin_unlock(&bdi->wb.list_lock);
  643 +}
  644 +
  645 +/*
472 646 * balance_dirty_pages() must be called by processes which are generating dirty
473 647 * data. It looks at the number of dirty pages in the machine and will force
474 648 * the caller to perform writeback if the system is over `vm_dirty_ratio'.
475 649  
476 650  
477 651  
478 652  
479 653  
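
Before the function itself, a toy model of the loop shape it implements: the dirtier keeps triggering writeback and napping, with the nap doubling up to a 100ms cap, until the dirty count falls back under the threshold. All numbers are invented and HZ is assumed to be 1000.

    #include <stdio.h>

    #define HZ 1000         /* assumed tick rate, for illustration only */

    /* toy model of the balance_dirty_pages() loop shape */
    int main(void)
    {
        unsigned long nr_dirty = 12000, dirty_thresh = 10000;   /* pages */
        unsigned long pause = 1;                                /* jiffies */

        for (;;) {
            if (nr_dirty <= dirty_thresh)
                break;
            nr_dirty -= 400;    /* pretend writeback cleaned some pages */
            printf("dirty=%lu, sleeping %lu jiffies\n", nr_dirty, pause);
            pause <<= 1;        /* back off... */
            if (pause > HZ / 10)
                pause = HZ / 10;        /* ...but never nap beyond 100ms */
        }
        return 0;
    }
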
... ... @@ -478,27 +652,25 @@
478 652 static void balance_dirty_pages(struct address_space *mapping,
479 653 unsigned long write_chunk)
480 654 {
481   - long nr_reclaimable, bdi_nr_reclaimable;
482   - long nr_writeback, bdi_nr_writeback;
  655 + unsigned long nr_reclaimable, bdi_nr_reclaimable;
  656 + unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
  657 + unsigned long bdi_dirty;
483 658 unsigned long background_thresh;
484 659 unsigned long dirty_thresh;
485 660 unsigned long bdi_thresh;
  661 + unsigned long task_bdi_thresh;
  662 + unsigned long min_task_bdi_thresh;
486 663 unsigned long pages_written = 0;
487 664 unsigned long pause = 1;
488 665 bool dirty_exceeded = false;
  666 + bool clear_dirty_exceeded = true;
489 667 struct backing_dev_info *bdi = mapping->backing_dev_info;
  668 + unsigned long start_time = jiffies;
490 669  
491 670 for (;;) {
492   - struct writeback_control wbc = {
493   - .sync_mode = WB_SYNC_NONE,
494   - .older_than_this = NULL,
495   - .nr_to_write = write_chunk,
496   - .range_cyclic = 1,
497   - };
498   -
499 671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 672 global_page_state(NR_UNSTABLE_NFS);
501   - nr_writeback = global_page_state(NR_WRITEBACK);
  673 + nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 674  
503 675 global_dirty_limits(&background_thresh, &dirty_thresh);
504 676  
505 677  
... ... @@ -507,12 +679,12 @@
507 679 * catch-up. This avoids (excessively) small writeouts
508 680 * when the bdi limits are ramping up.
509 681 */
510   - if (nr_reclaimable + nr_writeback <=
511   - (background_thresh + dirty_thresh) / 2)
  682 + if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
512 683 break;
513 684  
514 685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515   - bdi_thresh = task_dirty_limit(current, bdi_thresh);
  686 + min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
  687 + task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 688  
517 689 /*
518 690 * In order to avoid the stacked BDI deadlock we need
519 691  
520 692  
... ... @@ -524,12 +696,14 @@
524 696 * actually dirty; with m+n sitting in the percpu
525 697 * deltas.
526 698 */
527   - if (bdi_thresh < 2*bdi_stat_error(bdi)) {
  699 + if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529   - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
  701 + bdi_dirty = bdi_nr_reclaimable +
  702 + bdi_stat_sum(bdi, BDI_WRITEBACK);
530 703 } else {
531 704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532   - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
  705 + bdi_dirty = bdi_nr_reclaimable +
  706 + bdi_stat(bdi, BDI_WRITEBACK);
533 707 }
534 708  
535 709 /*
... ... @@ -538,9 +712,10 @@
538 712 * bdi or process from holding back light ones; The latter is
539 713 * the last resort safeguard.
540 714 */
541   - dirty_exceeded =
542   - (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
543   - || (nr_reclaimable + nr_writeback > dirty_thresh);
  715 + dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
  716 + (nr_dirty > dirty_thresh);
  717 + clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
  718 + (nr_dirty <= dirty_thresh);
544 719  
545 720 if (!dirty_exceeded)
546 721 break;
... ... @@ -548,6 +723,9 @@
548 723 if (!bdi->dirty_exceeded)
549 724 bdi->dirty_exceeded = 1;
550 725  
  726 + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
  727 + bdi_thresh, bdi_dirty, start_time);
  728 +
551 729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
552 730 * Unstable writes are a feature of certain networked
553 731 * filesystems (i.e. NFS) in which data may have been
554 732  
555 733  
556 734  
557 735  
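
A worked example of the dirty_exceeded hysteresis computed above (the global nr_dirty half of the test is omitted and the per-task thresholds are illustrative): the flag is raised against the current task's own threshold, but may only be cleared once the bdi drops under the minimum threshold any task can have, so a light dirtier cannot clear a flag that a heavy dirtier still exceeds.

    #include <stdio.h>
    #include <stdbool.h>

    #define TASK_LIMIT_FRACTION 8

    int main(void)
    {
        unsigned long bdi_thresh = 80000;                       /* pages */
        unsigned long min_task_bdi_thresh =
                bdi_thresh - bdi_thresh / TASK_LIMIT_FRACTION;  /* 70000 */
        unsigned long heavy_task_thresh = 70000;        /* did ~all dirtying */
        unsigned long light_task_thresh = 79000;        /* did very little */
        unsigned long bdi_dirty = 75000;

        bool set_by_heavy = bdi_dirty > heavy_task_thresh;      /* true  */
        bool set_by_light = bdi_dirty > light_task_thresh;      /* false */
        bool may_clear    = bdi_dirty <= min_task_bdi_thresh;   /* false */

        printf("heavy sets: %d, light sets: %d, anyone may clear: %d\n",
               set_by_heavy, set_by_light, may_clear);
        return 0;
    }
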
... ... @@ -557,19 +735,42 @@
557 735 * threshold otherwise wait until the disk writes catch
558 736 * up.
559 737 */
560   - trace_wbc_balance_dirty_start(&wbc, bdi);
561   - if (bdi_nr_reclaimable > bdi_thresh) {
562   - writeback_inodes_wb(&bdi->wb, &wbc);
563   - pages_written += write_chunk - wbc.nr_to_write;
564   - trace_wbc_balance_dirty_written(&wbc, bdi);
  738 + trace_balance_dirty_start(bdi);
  739 + if (bdi_nr_reclaimable > task_bdi_thresh) {
  740 + pages_written += writeback_inodes_wb(&bdi->wb,
  741 + write_chunk);
  742 + trace_balance_dirty_written(bdi, pages_written);
565 743 if (pages_written >= write_chunk)
566 744 break; /* We've done our duty */
567 745 }
568   - trace_wbc_balance_dirty_wait(&wbc, bdi);
569 746 __set_current_state(TASK_UNINTERRUPTIBLE);
570 747 io_schedule_timeout(pause);
  748 + trace_balance_dirty_wait(bdi);
571 749  
  750 + dirty_thresh = hard_dirty_limit(dirty_thresh);
572 751 /*
  752 + * max-pause area. If dirty exceeded but still within this
  753 + * area, no need to sleep for more than 200ms: (a) 8 pages per
  754 + * 200ms is typically more than enough to curb heavy dirtiers;
  755 + * (b) the pause time limit makes the dirtiers more responsive.
  756 + */
  757 + if (nr_dirty < dirty_thresh +
  758 + dirty_thresh / DIRTY_MAXPAUSE_AREA &&
  759 + time_after(jiffies, start_time + MAX_PAUSE))
  760 + break;
  761 + /*
  762 + * pass-good area. When some bdi gets blocked (eg. NFS server
  763 + * not responding), or write bandwidth dropped dramatically due
  764 + * to concurrent reads, or dirty threshold suddenly dropped and
  765 + * the dirty pages cannot be brought down anytime soon (eg. on
  766 + * slow USB stick), at least let go of the good bdi's.
  767 + */
  768 + if (nr_dirty < dirty_thresh +
  769 + dirty_thresh / DIRTY_PASSGOOD_AREA &&
  770 + bdi_dirty < bdi_thresh)
  771 + break;
  772 +
  773 + /*
573 774 * Increase the delay for each loop, up to our previous
574 775 * default of taking a 100ms nap.
575 776 */
... ... @@ -578,7 +779,8 @@
578 779 pause = HZ / 10;
579 780 }
580 781  
581   - if (!dirty_exceeded && bdi->dirty_exceeded)
  782 + /* Clear dirty_exceeded flag only when no task can exceed the limit */
  783 + if (clear_dirty_exceeded && bdi->dirty_exceeded)
582 784 bdi->dirty_exceeded = 0;
583 785  
584 786 if (writeback_in_progress(bdi))
585 787  
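
A worked example of the two escape hatches in the loop above. DIRTY_MAXPAUSE_AREA, DIRTY_PASSGOOD_AREA and the MAX_PAUSE timeout are defined outside this hunk, so the values below are illustrative stand-ins only.

    #include <stdio.h>
    #include <stdbool.h>

    #define DIRTY_MAXPAUSE_AREA  4      /* illustrative stand-in */
    #define DIRTY_PASSGOOD_AREA  8      /* illustrative stand-in */

    int main(void)
    {
        unsigned long dirty_thresh = 100000;    /* after hard_dirty_limit() */
        unsigned long nr_dirty = 110000;
        unsigned long bdi_thresh = 30000;
        unsigned long bdi_dirty = 20000;
        bool throttled_too_long = true;         /* stands for the MAX_PAUSE test */

        /* max-pause area: only mildly over the limit and already stuck in
         * the loop for too long -> stop sleeping and return to the caller */
        bool maxpause_escape =
                nr_dirty < dirty_thresh + dirty_thresh / DIRTY_MAXPAUSE_AREA &&
                throttled_too_long;

        /* pass-good area: the global count is mildly over, but this bdi is
         * under its own threshold (some other bdi is stuck) -> let it go */
        bool passgood_escape =
                nr_dirty < dirty_thresh + dirty_thresh / DIRTY_PASSGOOD_AREA &&
                bdi_dirty < bdi_thresh;

        printf("max-pause escape: %d, pass-good escape: %d\n",
               maxpause_escape, passgood_escape);
        return 0;
    }
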
... ... @@ -626,9 +828,13 @@
626 828 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 829 unsigned long nr_pages_dirtied)
628 830 {
  831 + struct backing_dev_info *bdi = mapping->backing_dev_info;
629 832 unsigned long ratelimit;
630 833 unsigned long *p;
631 834  
  835 + if (!bdi_cap_account_dirty(bdi))
  836 + return;
  837 +
632 838 ratelimit = ratelimit_pages;
633 839 if (mapping->backing_dev_info->dirty_exceeded)
634 840 ratelimit = 8;
635 841  
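
The remainder of balance_dirty_pages_ratelimited_nr() (not shown in this hunk) accumulates the dirtied pages in a per-CPU counter and only calls the expensive balance_dirty_pages() once that counter crosses the ratelimit chosen above. A toy single-threaded model of that behaviour, with an invented relaxed ratelimit of 1024 pages:

    #include <stdio.h>

    static unsigned long percpu_count;  /* stands in for the per-CPU counter */

    /* cheap accounting on every call, expensive balancing only once the
     * counter crosses the ratelimit; when the bdi is over its limit the
     * ratelimit collapses to 8 pages, throttling almost immediately */
    static void dirtied(unsigned long nr_pages, int dirty_exceeded)
    {
        unsigned long ratelimit = dirty_exceeded ? 8 : 1024;

        percpu_count += nr_pages;
        if (percpu_count >= ratelimit) {
            printf("balance after %lu pages\n", percpu_count);
            percpu_count = 0;
        }
    }

    int main(void)
    {
        int i;

        for (i = 0; i < 2048; i++)
            dirtied(1, 0);      /* relaxed: balances every ~1024 pages */
        for (i = 0; i < 32; i++)
            dirtied(1, 1);      /* over the limit: balances every 8 pages */
        return 0;
    }
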
... ... @@ -892,12 +1098,12 @@
892 1098 range_whole = 1;
893 1099 cycled = 1; /* ignore range_cyclic tests */
894 1100 }
895   - if (wbc->sync_mode == WB_SYNC_ALL)
  1101 + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 1102 tag = PAGECACHE_TAG_TOWRITE;
897 1103 else
898 1104 tag = PAGECACHE_TAG_DIRTY;
899 1105 retry:
900   - if (wbc->sync_mode == WB_SYNC_ALL)
  1106 + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 1107 tag_pages_for_writeback(mapping, index, end);
902 1108 done_index = index;
903 1109 while (!done && (index <= end)) {
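
The tagged_writepages / WB_SYNC_ALL branch above works on a snapshot: tag_pages_for_writeback() marks the currently dirty pages TOWRITE and the sweep then walks that tag, so pages dirtied while the sweep runs are left for the next pass instead of livelocking it. A toy user-space illustration of the idea, with arrays standing in for the radix-tree tags:

    #include <stdio.h>
    #include <string.h>

    #define NPAGES 8

    int main(void)
    {
        int dirty[NPAGES] = { 1, 1, 0, 1, 0, 0, 0, 0 };
        int towrite[NPAGES];
        int i;

        memcpy(towrite, dirty, sizeof(dirty));  /* "tag_pages_for_writeback" */

        for (i = 0; i < NPAGES; i++) {
            if (i == 1)
                dirty[5] = 1;   /* someone dirties page 5 mid-sweep */
            if (towrite[i]) {
                printf("writing page %d\n", i);
                dirty[i] = towrite[i] = 0;
            }
        }
        printf("page 5 left for the next pass: %d\n", dirty[5]);
        return 0;
    }
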
... ... @@ -31,11 +31,11 @@
31 31 * mmlist_lock (in mmput, drain_mmlist and others)
32 32 * mapping->private_lock (in __set_page_dirty_buffers)
33 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
34   - * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
  34 + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
35 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 37 * in arch-dependent flush_dcache_mmap_lock,
38   - * within inode_wb_list_lock in __sync_single_inode)
  38 + * within bdi.wb->list_lock in __sync_single_inode)
39 39 *
40 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
41 41 * ->tasklist_lock