Commit 001a541ea9163ace5e8243ee0e907ad80a4c0ec2
Exists in master and in 6 other branches
Merge branch 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux
* 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux:
  writeback: move MIN_WRITEBACK_PAGES to fs-writeback.c
  writeback: balanced_rate cannot exceed write bandwidth
  writeback: do strict bdi dirty_exceeded
  writeback: avoid tiny dirty poll intervals
  writeback: max, min and target dirty pause time
  writeback: dirty ratelimit - think time compensation
  btrfs: fix dirtied pages accounting on sub-page writes
  writeback: fix dirtied pages accounting on redirty
  writeback: fix dirtied pages accounting on sub-page writes
  writeback: charge leaked page dirties to active tasks
  writeback: Include all dirty inodes in background writeback
Showing 8 changed files (side-by-side diff)
fs/btrfs/file.c
... | ... | @@ -1136,7 +1136,8 @@ |
1136 | 1136 | GFP_NOFS); |
1137 | 1137 | } |
1138 | 1138 | for (i = 0; i < num_pages; i++) { |
1139 | - clear_page_dirty_for_io(pages[i]); | |
1139 | + if (clear_page_dirty_for_io(pages[i])) | |
1140 | + account_page_redirty(pages[i]); | |
1140 | 1141 | set_page_extent_mapped(pages[i]); |
1141 | 1142 | WARN_ON(!PageLocked(pages[i])); |
1142 | 1143 | } |
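What the hunk fixes: btrfs re-dirties each page right after clear_page_dirty_for_io(), so every sub-page rewrite bumped the dirtied counters again without a matching writeback, and the ratelimit estimation drifted. account_page_redirty() (added in mm/page-writeback.c below) backs the duplicate count out. A minimal user-space model of the invariant being protected; the counters here stand in for the kernel's NR_DIRTIED/NR_WRITTEN and are purely illustrative:

#include <stdio.h>

/* Model: dirtied must track written over time, or balanced_dirty_ratelimit
 * drifts. Re-dirtying without de-accounting inflates the dirtied side. */
static long nr_dirtied, nr_written;

static void dirty_page(void)   { nr_dirtied++; }
static void redirty_page(void) { nr_dirtied--; /* the account_page_redirty() fix */ }
static void write_page(void)   { nr_written++; }

int main(void)
{
	for (int i = 0; i < 100; i++) {
		dirty_page();   /* page dirtied and accounted once */
		dirty_page();   /* sub-page rewrite accounts it a second time */
		redirty_page(); /* de-account the duplicate */
		write_page();   /* one page actually written back */
	}
	printf("dirtied=%ld written=%ld balanced=%s\n",
	       nr_dirtied, nr_written, nr_dirtied == nr_written ? "yes" : "no");
	return 0;
}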
fs/fs-writeback.c
... | ... | @@ -20,6 +20,7 @@ |
20 | 20 | #include <linux/sched.h> |
21 | 21 | #include <linux/fs.h> |
22 | 22 | #include <linux/mm.h> |
23 | +#include <linux/pagemap.h> | |
23 | 24 | #include <linux/kthread.h> |
24 | 25 | #include <linux/freezer.h> |
25 | 26 | #include <linux/writeback.h> |
... | ... | @@ -29,6 +30,11 @@ |
29 | 30 | #include "internal.h" |
30 | 31 | |
31 | 32 | /* |
33 | + * 4MB minimal write chunk size | |
34 | + */ | |
35 | +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) | |
36 | + | |
37 | +/* | |
32 | 38 | * Passed into wb_writeback(), essentially a subset of writeback_control |
33 | 39 | */ |
34 | 40 | struct wb_writeback_work { |
35 | 41 | |
... | ... | @@ -742,11 +748,17 @@ |
742 | 748 | if (work->for_background && !over_bground_thresh(wb->bdi)) |
743 | 749 | break; |
744 | 750 | |
751 | + /* | |
752 | + * Kupdate and background writeback are special: we want to | |
753 | + * include all inodes that need writing. Livelock avoidance is | |
754 | + * handled by these works yielding to any other work, so we | |
755 | + * are safe. | |
756 | + */ | |
745 | 757 | if (work->for_kupdate) { |
746 | 758 | oldest_jif = jiffies - |
747 | 759 | msecs_to_jiffies(dirty_expire_interval * 10); |
748 | - work->older_than_this = &oldest_jif; | |
749 | - } | |
760 | + } else if (work->for_background) | |
761 | + oldest_jif = jiffies; | |
750 | 762 | |
751 | 763 | trace_writeback_start(wb->bdi, work); |
752 | 764 | if (list_empty(&wb->b_io)) |
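Two changes land in this file. MIN_WRITEBACK_PAGES moves here from writeback.h, and its expression converts 4MB (4096, in KB) into pages by shifting out the page size in KB. The older_than_this change makes background writeback use the current jiffies as its cutoff, so every inode dirtied before this moment is eligible, while kupdate keeps its dirty_expire_interval window. A stand-alone check of the chunk-size arithmetic; the PAGE_CACHE_SHIFT values are assumed here for illustration:

#include <stdio.h>

int main(void)
{
	/* 4096 is 4MB in KB; shifting by (PAGE_CACHE_SHIFT - 10) divides by
	 * the page size in KB, yielding the chunk size in pages. */
	for (int page_cache_shift = 12; page_cache_shift <= 16; page_cache_shift += 4)
		printf("page size %lu KB -> MIN_WRITEBACK_PAGES = %lu pages\n",
		       1UL << (page_cache_shift - 10),
		       4096UL >> (page_cache_shift - 10));
	return 0;
}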
include/linux/sched.h
include/linux/writeback.h
... | ... | @@ -7,6 +7,8 @@ |
7 | 7 | #include <linux/sched.h> |
8 | 8 | #include <linux/fs.h> |
9 | 9 | |
10 | +DECLARE_PER_CPU(int, dirty_throttle_leaks); | |
11 | + | |
10 | 12 | /* |
11 | 13 | * The 1/4 region under the global dirty thresh is for smooth dirty throttling: |
12 | 14 | * |
... | ... | @@ -23,11 +25,6 @@ |
23 | 25 | #define DIRTY_SCOPE 8 |
24 | 26 | #define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) |
25 | 27 | |
26 | -/* | |
27 | - * 4MB minimal write chunk size | |
28 | - */ | |
29 | -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) | |
30 | - | |
31 | 28 | struct backing_dev_info; |
32 | 29 | |
33 | 30 | /* |
... | ... | @@ -193,6 +190,8 @@ |
193 | 190 | void writeback_set_ratelimit(void); |
194 | 191 | void tag_pages_for_writeback(struct address_space *mapping, |
195 | 192 | pgoff_t start, pgoff_t end); |
193 | + | |
194 | +void account_page_redirty(struct page *page); | |
196 | 195 | |
197 | 196 | /* pdflush.c */ |
198 | 197 | extern int nr_pdflush_threads; /* Global so it can be exported to sysctl |
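The declaration pairs with the definition added in mm/page-writeback.c and the update added in kernel/exit.c, following the usual kernel per-CPU idiom: the header declares, exactly one translation unit defines, and writers update with preemption disabled. A sketch assembling the three sites from the hunks in this commit (not a standalone program):

/* include/linux/writeback.h: visible to all users */
DECLARE_PER_CPU(int, dirty_throttle_leaks);

/* mm/page-writeback.c: the single definition */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;

/* kernel/exit.c: record unthrottled dirties; this runs after
 * preempt_disable(), so __this_cpu_add() cannot race with migration */
if (tsk->nr_dirtied)
	__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);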
include/trace/events/writeback.h
... | ... | @@ -300,12 +300,13 @@ |
300 | 300 | unsigned long dirty_ratelimit, |
301 | 301 | unsigned long task_ratelimit, |
302 | 302 | unsigned long dirtied, |
303 | + unsigned long period, | |
303 | 304 | long pause, |
304 | 305 | unsigned long start_time), |
305 | 306 | |
306 | 307 | TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, |
307 | 308 | dirty_ratelimit, task_ratelimit, |
308 | - dirtied, pause, start_time), | |
309 | + dirtied, period, pause, start_time), | |
309 | 310 | |
310 | 311 | TP_STRUCT__entry( |
311 | 312 | __array( char, bdi, 32) |
... | ... | @@ -320,6 +321,8 @@ |
320 | 321 | __field(unsigned int, dirtied_pause) |
321 | 322 | __field(unsigned long, paused) |
322 | 323 | __field( long, pause) |
324 | + __field(unsigned long, period) | |
325 | + __field( long, think) | |
323 | 326 | ), |
324 | 327 | |
325 | 328 | TP_fast_assign( |
... | ... | @@ -336,6 +339,9 @@ |
336 | 339 | __entry->task_ratelimit = KBps(task_ratelimit); |
337 | 340 | __entry->dirtied = dirtied; |
338 | 341 | __entry->dirtied_pause = current->nr_dirtied_pause; |
342 | + __entry->think = current->dirty_paused_when == 0 ? 0 : | |
343 | + (long)(jiffies - current->dirty_paused_when) * 1000/HZ; | |
344 | + __entry->period = period * 1000 / HZ; | |
339 | 345 | __entry->pause = pause * 1000 / HZ; |
340 | 346 | __entry->paused = (jiffies - start_time) * 1000 / HZ; |
341 | 347 | ), |
... | ... | @@ -346,7 +352,7 @@ |
346 | 352 | "bdi_setpoint=%lu bdi_dirty=%lu " |
347 | 353 | "dirty_ratelimit=%lu task_ratelimit=%lu " |
348 | 354 | "dirtied=%u dirtied_pause=%u " |
349 | - "paused=%lu pause=%ld", | |
355 | + "paused=%lu pause=%ld period=%lu think=%ld", | |
350 | 356 | __entry->bdi, |
351 | 357 | __entry->limit, |
352 | 358 | __entry->setpoint, |
... | ... | @@ -358,7 +364,9 @@ |
358 | 364 | __entry->dirtied, |
359 | 365 | __entry->dirtied_pause, |
360 | 366 | __entry->paused, /* ms */ |
361 | - __entry->pause /* ms */ | |
367 | + __entry->pause, /* ms */ | |
368 | + __entry->period, /* ms */ | |
369 | + __entry->think /* ms */ | |
362 | 370 | ) |
363 | 371 | ); |
364 | 372 |
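The new period and think fields, like pause and paused, are converted from jiffies to milliseconds in TP_fast_assign(). A stand-alone model of the conversions; HZ and all sample values are assumed for illustration:

#include <stdio.h>

#define HZ 250 /* assumed; configuration dependent in the kernel */

int main(void)
{
	unsigned long jiffies = 100000;          /* illustrative current time */
	unsigned long dirty_paused_when = 99950; /* end of the task's last pause */
	unsigned long period = 25, pause = 10;   /* jiffies */

	long think = dirty_paused_when == 0 ? 0 :
		(long)(jiffies - dirty_paused_when) * 1000 / HZ;

	printf("period=%lums pause=%lums think=%ldms\n",
	       period * 1000 / HZ, pause * 1000 / HZ, think);
	return 0;
}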
kernel/exit.c
... | ... | @@ -51,6 +51,7 @@ |
51 | 51 | #include <trace/events/sched.h> |
52 | 52 | #include <linux/hw_breakpoint.h> |
53 | 53 | #include <linux/oom.h> |
54 | +#include <linux/writeback.h> | |
54 | 55 | |
55 | 56 | #include <asm/uaccess.h> |
56 | 57 | #include <asm/unistd.h> |
... | ... | @@ -1035,6 +1036,8 @@ |
1035 | 1036 | validate_creds_for_do_exit(tsk); |
1036 | 1037 | |
1037 | 1038 | preempt_disable(); |
1039 | + if (tsk->nr_dirtied) | |
1040 | + __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); | |
1038 | 1041 | exit_rcu(); |
1039 | 1042 | /* causes final put_task_struct in finish_task_switch(). */ |
1040 | 1043 | tsk->state = TASK_DEAD; |
kernel/fork.c
mm/page-writeback.c
... | ... | @@ -42,6 +42,12 @@ |
42 | 42 | #define MAX_PAUSE max(HZ/5, 1) |
43 | 43 | |
44 | 44 | /* |
45 | + * Try to keep balance_dirty_pages() call intervals above this many dirtied | |
46 | + * pages, by raising the pause time to max_pause when the interval falls below it. | |
47 | + */ | |
48 | +#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) | |
49 | + | |
50 | +/* | |
45 | 51 | * Estimate write bandwidth at 200ms intervals. |
46 | 52 | */ |
47 | 53 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) |
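DIRTY_POLL_THRESH is 128KB expressed in pages, by the same KB-shift trick as MIN_WRITEBACK_PAGES: 32 pages with 4KB pages, which is the figure the bdi_min_pause() comment below cites. A quick stand-alone check, with PAGE_SHIFT values assumed:

#include <stdio.h>

int main(void)
{
	for (int page_shift = 12; page_shift <= 16; page_shift += 4)
		printf("page size %d KB -> DIRTY_POLL_THRESH = %d pages (128 KB)\n",
		       1 << (page_shift - 10), 128 >> (page_shift - 10));
	return 0;
}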
... | ... | @@ -898,6 +904,11 @@ |
898 | 904 | */ |
899 | 905 | balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, |
900 | 906 | dirty_rate | 1); |
907 | + /* | |
908 | + * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw | |
909 | + */ | |
910 | + if (unlikely(balanced_dirty_ratelimit > write_bw)) | |
911 | + balanced_dirty_ratelimit = write_bw; | |
901 | 912 | |
902 | 913 | /* |
903 | 914 | * We could safely do this and return immediately: |
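The rationale for the clamp: with N tasks dirtying behind one device, dirty_rate is roughly N * task_ratelimit, so balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate comes out near write_bw / N and can only exceed write_bw through transient estimation error. A stand-alone model with purely illustrative numbers:

#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
	unsigned long write_bw = 100000;      /* device bandwidth (assumed units) */
	unsigned long task_ratelimit = 30000; /* per-task dirty rate */
	unsigned long dirty_rate = 20000;     /* transiently underestimated */

	u64 balanced = (u64)task_ratelimit * write_bw / (dirty_rate | 1);
	printf("raw balanced_dirty_ratelimit = %llu\n", balanced);
	if (balanced > write_bw) /* the clamp added by this hunk */
		balanced = write_bw;
	printf("clamped to write_bw         = %llu\n", balanced);
	return 0;
}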
... | ... | @@ -1044,40 +1055,98 @@ |
1044 | 1055 | return 1; |
1045 | 1056 | } |
1046 | 1057 | |
1047 | -static unsigned long bdi_max_pause(struct backing_dev_info *bdi, | |
1048 | - unsigned long bdi_dirty) | |
1058 | +static long bdi_max_pause(struct backing_dev_info *bdi, | |
1059 | + unsigned long bdi_dirty) | |
1049 | 1060 | { |
1050 | - unsigned long bw = bdi->avg_write_bandwidth; | |
1051 | - unsigned long hi = ilog2(bw); | |
1052 | - unsigned long lo = ilog2(bdi->dirty_ratelimit); | |
1053 | - unsigned long t; | |
1061 | + long bw = bdi->avg_write_bandwidth; | |
1062 | + long t; | |
1054 | 1063 | |
1055 | - /* target for 20ms max pause on 1-dd case */ | |
1056 | - t = HZ / 50; | |
1064 | + /* | |
1065 | + * Limit the pause time for small-memory systems: if we sleep for too | |
1066 | + * long, the small pool of dirty/writeback pages may drain completely | |
1067 | + * and leave the disk idle. | |
1068 | + * | |
1069 | + * 8 serves as the safety ratio. | |
1070 | + */ | |
1071 | + t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); | |
1072 | + t++; | |
1057 | 1073 | |
1074 | + return min_t(long, t, MAX_PAUSE); | |
1075 | +} | |
1076 | + | |
1077 | +static long bdi_min_pause(struct backing_dev_info *bdi, | |
1078 | + long max_pause, | |
1079 | + unsigned long task_ratelimit, | |
1080 | + unsigned long dirty_ratelimit, | |
1081 | + int *nr_dirtied_pause) | |
1082 | +{ | |
1083 | + long hi = ilog2(bdi->avg_write_bandwidth); | |
1084 | + long lo = ilog2(bdi->dirty_ratelimit); | |
1085 | + long t; /* target pause */ | |
1086 | + long pause; /* estimated next pause */ | |
1087 | + int pages; /* target nr_dirtied_pause */ | |
1088 | + | |
1089 | + /* target for 10ms pause on 1-dd case */ | |
1090 | + t = max(1, HZ / 100); | |
1091 | + | |
1058 | 1092 | /* |
1059 | 1093 | * Scale up pause time for concurrent dirtiers in order to reduce CPU |
1060 | 1094 | * overheads. |
1061 | 1095 | * |
1062 | - * (N * 20ms) on 2^N concurrent tasks. | |
1096 | + * (N * 10ms) on 2^N concurrent tasks. | |
1063 | 1097 | */ |
1064 | 1098 | if (hi > lo) |
1065 | - t += (hi - lo) * (20 * HZ) / 1024; | |
1099 | + t += (hi - lo) * (10 * HZ) / 1024; | |
1066 | 1100 | |
1067 | 1101 | /* |
1068 | - * Limit pause time for small memory systems. If sleeping for too long | |
1069 | - * time, a small pool of dirty/writeback pages may go empty and disk go | |
1070 | - * idle. | |
1102 | + * This is a bit convoluted. We try to base the next nr_dirtied_pause | |
1103 | + * on the much more stable dirty_ratelimit. However the next pause time | |
1104 | + * will be computed based on task_ratelimit and the two rate limits may | |
1105 | + * diverge considerably at times. Especially if task_ratelimit goes | |
1106 | + * below dirty_ratelimit/2 and the target pause is max_pause, the next | |
1107 | + * pause time will be max_pause*2 _trimmed down_ to max_pause. As a | |
1108 | + * result task_ratelimit won't be executed faithfully, which could | |
1109 | + * eventually bring down dirty_ratelimit. | |
1071 | 1110 | * |
1072 | - * 8 serves as the safety ratio. | |
1111 | + * We apply two rules to fix it up: | |
1112 | + * 1) try to estimate the next pause time and if necessary, use a lower | |
1113 | + * nr_dirtied_pause so as not to exceed max_pause. When this happens, | |
1114 | + * nr_dirtied_pause will be "dancing" with task_ratelimit. | |
1115 | + * 2) limit the target pause time to max_pause/2, so that the normal | |
1116 | + * small fluctuations of task_ratelimit won't trigger rule (1) and | |
1117 | + * nr_dirtied_pause will remain as stable as dirty_ratelimit. | |
1073 | 1118 | */ |
1074 | - t = min(t, bdi_dirty * HZ / (8 * bw + 1)); | |
1119 | + t = min(t, 1 + max_pause / 2); | |
1120 | + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); | |
1075 | 1121 | |
1076 | 1122 | /* |
1077 | - * The pause time will be settled within range (max_pause/4, max_pause). | |
1078 | - * Apply a minimal value of 4 to get a non-zero max_pause/4. | |
1123 | + * Tiny nr_dirtied_pause is found to hurt I/O performance in the test | |
1124 | + * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. | |
1125 | + * When the 16 consecutive reads are often interrupted by some dirty | |
1126 | + * throttling pause during the async writes, cfq will go idle | |
1127 | + * (deadline is fine). So push nr_dirtied_pause as high as possible, | |
1128 | + * until it reaches DIRTY_POLL_THRESH=32 pages. | |
1079 | 1129 | */ |
1080 | - return clamp_val(t, 4, MAX_PAUSE); | |
1130 | + if (pages < DIRTY_POLL_THRESH) { | |
1131 | + t = max_pause; | |
1132 | + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); | |
1133 | + if (pages > DIRTY_POLL_THRESH) { | |
1134 | + pages = DIRTY_POLL_THRESH; | |
1135 | + t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; | |
1136 | + } | |
1137 | + } | |
1138 | + | |
1139 | + pause = HZ * pages / (task_ratelimit + 1); | |
1140 | + if (pause > max_pause) { | |
1141 | + t = max_pause; | |
1142 | + pages = task_ratelimit * t / roundup_pow_of_two(HZ); | |
1143 | + } | |
1144 | + | |
1145 | + *nr_dirtied_pause = pages; | |
1146 | + /* | |
1147 | + * The minimal pause time will normally be half the target pause time. | |
1148 | + */ | |
1149 | + return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; | |
1081 | 1150 | } |
1082 | 1151 | |
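A way to read the two helpers: bdi_max_pause() keeps the sleep short enough that the bdi's dirty pool (bdi_dirty pages draining at bw pages per second) cannot empty while we sleep, with 8 as the safety ratio; bdi_min_pause() targets roughly 10ms pauses, scaled up logarithmically with the number of concurrent dirtiers, then applies the DIRTY_POLL_THRESH floor and the max_pause cap described in its comment. A stand-alone model of both computations; HZ, the helper reimplementations, the argument lists and every number are assumptions for illustration:

#include <stdio.h>

#define HZ 1000 /* assumed */
#define MAX_PAUSE (HZ / 5 > 1 ? HZ / 5 : 1)
#define DIRTY_POLL_THRESH 32 /* 128KB in 4KB pages */

static long ilog2l(long v) { long r = -1; while (v) { r++; v >>= 1; } return r; }
static long rup_pow2(long v) { long r = 1; while (r < v) r <<= 1; return r; }

static long max_pause_fn(long bw, unsigned long bdi_dirty)
{
	long t = bdi_dirty / (1 + bw / rup_pow2(1 + HZ / 8)) + 1;
	return t < MAX_PAUSE ? t : MAX_PAUSE;
}

static long min_pause_fn(long avg_bw, long dirty_ratelimit, long max_pause,
			 long task_ratelimit, int *nr_dirtied_pause)
{
	long hi = ilog2l(avg_bw), lo = ilog2l(dirty_ratelimit);
	long t = HZ / 100 > 1 ? HZ / 100 : 1; /* ~10ms target */
	long pause;
	int pages;

	if (hi > lo)                          /* N*10ms for 2^N dirtiers */
		t += (hi - lo) * (10 * HZ) / 1024;
	if (t > 1 + max_pause / 2)            /* rule (2): stay below max_pause/2 */
		t = 1 + max_pause / 2;
	pages = dirty_ratelimit * t / rup_pow2(HZ);
	if (pages < DIRTY_POLL_THRESH) {      /* avoid tiny poll intervals */
		t = max_pause;
		pages = dirty_ratelimit * t / rup_pow2(HZ);
		if (pages > DIRTY_POLL_THRESH) {
			pages = DIRTY_POLL_THRESH;
			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
		}
	}
	pause = HZ * pages / (task_ratelimit + 1);
	if (pause > max_pause) {              /* rule (1): cap the estimated pause */
		t = max_pause;
		pages = task_ratelimit * t / rup_pow2(HZ);
	}
	*nr_dirtied_pause = pages;
	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}

int main(void)
{
	int nr;
	long max_p = max_pause_fn(25000, 2000); /* 25k pages/s, 2k dirty pages */
	long min_p = min_pause_fn(25000, 12000, max_p, 12000, &nr);
	printf("max_pause=%ld min_pause=%ld nr_dirtied_pause=%d\n",
	       max_p, min_p, nr);
	return 0;
}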
... | ... | @@ -1098,16 +1167,21 @@ |
1098 | 1167 | unsigned long background_thresh; |
1099 | 1168 | unsigned long dirty_thresh; |
1100 | 1169 | unsigned long bdi_thresh; |
1101 | - long pause = 0; | |
1102 | - long uninitialized_var(max_pause); | |
1170 | + long period; | |
1171 | + long pause; | |
1172 | + long max_pause; | |
1173 | + long min_pause; | |
1174 | + int nr_dirtied_pause; | |
1103 | 1175 | bool dirty_exceeded = false; |
1104 | 1176 | unsigned long task_ratelimit; |
1105 | - unsigned long uninitialized_var(dirty_ratelimit); | |
1177 | + unsigned long dirty_ratelimit; | |
1106 | 1178 | unsigned long pos_ratio; |
1107 | 1179 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1108 | 1180 | unsigned long start_time = jiffies; |
1109 | 1181 | |
1110 | 1182 | for (;;) { |
1183 | + unsigned long now = jiffies; | |
1184 | + | |
1111 | 1185 | /* |
1112 | 1186 | * Unstable writes are a feature of certain networked |
1113 | 1187 | * filesystems (i.e. NFS) in which data may have been |
1114 | 1188 | |
... | ... | @@ -1127,8 +1201,13 @@ |
1127 | 1201 | */ |
1128 | 1202 | freerun = dirty_freerun_ceiling(dirty_thresh, |
1129 | 1203 | background_thresh); |
1130 | - if (nr_dirty <= freerun) | |
1204 | + if (nr_dirty <= freerun) { | |
1205 | + current->dirty_paused_when = now; | |
1206 | + current->nr_dirtied = 0; | |
1207 | + current->nr_dirtied_pause = | |
1208 | + dirty_poll_interval(nr_dirty, dirty_thresh); | |
1131 | 1209 | break; |
1210 | + } | |
1132 | 1211 | |
1133 | 1212 | if (unlikely(!writeback_in_progress(bdi))) |
1134 | 1213 | bdi_start_background_writeback(bdi); |
... | ... | @@ -1168,7 +1247,7 @@ |
1168 | 1247 | bdi_stat(bdi, BDI_WRITEBACK); |
1169 | 1248 | } |
1170 | 1249 | |
1171 | - dirty_exceeded = (bdi_dirty > bdi_thresh) || | |
1250 | + dirty_exceeded = (bdi_dirty > bdi_thresh) && | |
1172 | 1251 | (nr_dirty > dirty_thresh); |
1173 | 1252 | if (dirty_exceeded && !bdi->dirty_exceeded) |
1174 | 1253 | bdi->dirty_exceeded = 1; |
1175 | 1254 | |
1176 | 1255 | |
1177 | 1256 | |
... | ... | @@ -1177,20 +1256,34 @@ |
1177 | 1256 | nr_dirty, bdi_thresh, bdi_dirty, |
1178 | 1257 | start_time); |
1179 | 1258 | |
1180 | - max_pause = bdi_max_pause(bdi, bdi_dirty); | |
1181 | - | |
1182 | 1259 | dirty_ratelimit = bdi->dirty_ratelimit; |
1183 | 1260 | pos_ratio = bdi_position_ratio(bdi, dirty_thresh, |
1184 | 1261 | background_thresh, nr_dirty, |
1185 | 1262 | bdi_thresh, bdi_dirty); |
1186 | 1263 | task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> |
1187 | 1264 | RATELIMIT_CALC_SHIFT; |
1265 | + max_pause = bdi_max_pause(bdi, bdi_dirty); | |
1266 | + min_pause = bdi_min_pause(bdi, max_pause, | |
1267 | + task_ratelimit, dirty_ratelimit, | |
1268 | + &nr_dirtied_pause); | |
1269 | + | |
1188 | 1270 | if (unlikely(task_ratelimit == 0)) { |
1271 | + period = max_pause; | |
1189 | 1272 | pause = max_pause; |
1190 | 1273 | goto pause; |
1191 | 1274 | } |
1192 | - pause = HZ * pages_dirtied / task_ratelimit; | |
1193 | - if (unlikely(pause <= 0)) { | |
1275 | + period = HZ * pages_dirtied / task_ratelimit; | |
1276 | + pause = period; | |
1277 | + if (current->dirty_paused_when) | |
1278 | + pause -= now - current->dirty_paused_when; | |
1279 | + /* | |
1280 | + * For less than 1s think time (ext3/4 may block the dirtier | |
1281 | + * for up to 800ms from time to time on 1-HDD; so does xfs, | |
1282 | + * however at a much lower frequency), try to compensate for it in | |
1283 | + * future periods by updating the virtual time; otherwise just | |
1284 | + * do a reset, as it may be a light dirtier. | |
1285 | + */ | |
1286 | + if (pause < min_pause) { | |
1194 | 1287 | trace_balance_dirty_pages(bdi, |
1195 | 1288 | dirty_thresh, |
1196 | 1289 | background_thresh, |
1197 | 1290 | |
1198 | 1291 | |
... | ... | @@ -1200,12 +1293,24 @@ |
1200 | 1293 | dirty_ratelimit, |
1201 | 1294 | task_ratelimit, |
1202 | 1295 | pages_dirtied, |
1203 | - pause, | |
1296 | + period, | |
1297 | + min(pause, 0L), | |
1204 | 1298 | start_time); |
1205 | - pause = 1; /* avoid resetting nr_dirtied_pause below */ | |
1299 | + if (pause < -HZ) { | |
1300 | + current->dirty_paused_when = now; | |
1301 | + current->nr_dirtied = 0; | |
1302 | + } else if (period) { | |
1303 | + current->dirty_paused_when += period; | |
1304 | + current->nr_dirtied = 0; | |
1305 | + } else if (current->nr_dirtied_pause <= pages_dirtied) | |
1306 | + current->nr_dirtied_pause += pages_dirtied; | |
1206 | 1307 | break; |
1207 | 1308 | } |
1208 | - pause = min(pause, max_pause); | |
1309 | + if (unlikely(pause > max_pause)) { | |
1310 | + /* for occasional dropped task_ratelimit */ | |
1311 | + now += min(pause - max_pause, max_pause); | |
1312 | + pause = max_pause; | |
1313 | + } | |
1209 | 1314 | |
1210 | 1315 | pause: |
1211 | 1316 | trace_balance_dirty_pages(bdi, |
1212 | 1317 | |
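The think-time compensation in numbers: the task owes period = HZ * pages_dirtied / task_ratelimit jiffies of sleep, but whatever time it already spent blocked since dirty_paused_when (its think time) is credited against that debt; when the credit covers the whole period, the virtual clock is advanced instead of sleeping. A stand-alone model; HZ and all values are assumed:

#include <stdio.h>

#define HZ 1000 /* assumed */

int main(void)
{
	unsigned long now = 50000;               /* jiffies, illustrative */
	unsigned long dirty_paused_when = 49900; /* end of the previous pause */
	unsigned long pages_dirtied = 240;
	unsigned long task_ratelimit = 4000;     /* pages per second */
	long min_pause = 5, period, pause;

	period = HZ * pages_dirtied / task_ratelimit; /* 60 jiffies owed */
	pause = period;
	if (dirty_paused_when)
		pause -= now - dirty_paused_when;     /* credit 100 jiffies of think time */

	if (pause < min_pause) {
		/* think time covered the period: charge the surplus to future
		 * periods by moving the virtual time forward, do not sleep */
		dirty_paused_when += period;
		printf("no sleep, %ld jiffies of credit carried forward\n", -pause);
	} else {
		dirty_paused_when = now + pause;
		printf("sleep %ld jiffies\n", pause);
	}
	return 0;
}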
... | ... | @@ -1217,11 +1322,16 @@ |
1217 | 1322 | dirty_ratelimit, |
1218 | 1323 | task_ratelimit, |
1219 | 1324 | pages_dirtied, |
1325 | + period, | |
1220 | 1326 | pause, |
1221 | 1327 | start_time); |
1222 | 1328 | __set_current_state(TASK_KILLABLE); |
1223 | 1329 | io_schedule_timeout(pause); |
1224 | 1330 | |
1331 | + current->dirty_paused_when = now + pause; | |
1332 | + current->nr_dirtied = 0; | |
1333 | + current->nr_dirtied_pause = nr_dirtied_pause; | |
1334 | + | |
1225 | 1335 | /* |
1226 | 1336 | * This is typically equal to (nr_dirty < dirty_thresh) and can |
1227 | 1337 | * also keep "1000+ dd on a slow USB stick" under control. |
... | ... | @@ -1249,23 +1359,6 @@ |
1249 | 1359 | if (!dirty_exceeded && bdi->dirty_exceeded) |
1250 | 1360 | bdi->dirty_exceeded = 0; |
1251 | 1361 | |
1252 | - current->nr_dirtied = 0; | |
1253 | - if (pause == 0) { /* in freerun area */ | |
1254 | - current->nr_dirtied_pause = | |
1255 | - dirty_poll_interval(nr_dirty, dirty_thresh); | |
1256 | - } else if (pause <= max_pause / 4 && | |
1257 | - pages_dirtied >= current->nr_dirtied_pause) { | |
1258 | - current->nr_dirtied_pause = clamp_val( | |
1259 | - dirty_ratelimit * (max_pause / 2) / HZ, | |
1260 | - pages_dirtied + pages_dirtied / 8, | |
1261 | - pages_dirtied * 4); | |
1262 | - } else if (pause >= max_pause) { | |
1263 | - current->nr_dirtied_pause = 1 | clamp_val( | |
1264 | - dirty_ratelimit * (max_pause / 2) / HZ, | |
1265 | - pages_dirtied / 4, | |
1266 | - pages_dirtied - pages_dirtied / 8); | |
1267 | - } | |
1268 | - | |
1269 | 1362 | if (writeback_in_progress(bdi)) |
1270 | 1363 | return; |
1271 | 1364 | |
... | ... | @@ -1296,6 +1389,22 @@ |
1296 | 1389 | |
1297 | 1390 | static DEFINE_PER_CPU(int, bdp_ratelimits); |
1298 | 1391 | |
1392 | +/* | |
1393 | + * Normal tasks are throttled by | |
1394 | + * loop { | |
1395 | + * dirty tsk->nr_dirtied_pause pages; | |
1396 | + * take a snap in balance_dirty_pages(); | |
1397 | + * } | |
1398 | + * However there is a worst case: if every task exits immediately after dirtying | |
1399 | + * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be | |
1400 | + * called to throttle the page dirties. The solution is to save the not yet | |
1401 | + * throttled page dirties in dirty_throttle_leaks on task exit and charge them | |
1402 | + * randomly into the running tasks. This works well for the above worst case, | |
1403 | + * as the new task will pick up and accumulate the old task's leaked dirty | |
1404 | + * count and eventually get throttled. | |
1405 | + */ | |
1406 | +DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | |
1407 | + | |
1299 | 1408 | /** |
1300 | 1409 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
1301 | 1410 | * @mapping: address_space which was dirtied |
... | ... | @@ -1324,8 +1433,6 @@ |
1324 | 1433 | if (bdi->dirty_exceeded) |
1325 | 1434 | ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); |
1326 | 1435 | |
1327 | - current->nr_dirtied += nr_pages_dirtied; | |
1328 | - | |
1329 | 1436 | preempt_disable(); |
1330 | 1437 | /* |
1331 | 1438 | * This prevents one CPU to accumulate too many dirtied pages without |
1332 | 1439 | |
... | ... | @@ -1336,13 +1443,21 @@ |
1336 | 1443 | p = &__get_cpu_var(bdp_ratelimits); |
1337 | 1444 | if (unlikely(current->nr_dirtied >= ratelimit)) |
1338 | 1445 | *p = 0; |
1339 | - else { | |
1340 | - *p += nr_pages_dirtied; | |
1341 | - if (unlikely(*p >= ratelimit_pages)) { | |
1342 | - *p = 0; | |
1343 | - ratelimit = 0; | |
1344 | - } | |
1446 | + else if (unlikely(*p >= ratelimit_pages)) { | |
1447 | + *p = 0; | |
1448 | + ratelimit = 0; | |
1345 | 1449 | } |
1450 | + /* | |
1451 | + * Pick up the dirtied pages left behind by exited tasks. This prevents | |
1452 | + * lots of short-lived tasks (e.g. gcc invocations in a kernel build) | |
1453 | + * from escaping dirty throttling and livelocking long-running dirtiers. | |
1454 | + */ | |
1455 | + p = &__get_cpu_var(dirty_throttle_leaks); | |
1456 | + if (*p > 0 && current->nr_dirtied < ratelimit) { | |
1457 | + nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); | |
1458 | + *p -= nr_pages_dirtied; | |
1459 | + current->nr_dirtied += nr_pages_dirtied; | |
1460 | + } | |
1346 | 1461 | preempt_enable(); |
1347 | 1462 | |
1348 | 1463 | if (unlikely(current->nr_dirtied >= ratelimit)) |
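Taken together with the kernel/exit.c hunk, the leak-charging scheme is: exiting tasks deposit their unthrottled nr_dirtied into the per-CPU pool, and whichever dirtier runs next drains the pool into its own nr_dirtied, bounded by its ratelimit, so the leaked pages still trigger balance_dirty_pages() eventually. A single-threaded model of the handoff; the per-CPU aspect is elided and the numbers are illustrative:

#include <stdio.h>

static int dirty_throttle_leaks; /* per-CPU in the kernel; a single int here */

/* an exiting task leaks its not-yet-throttled dirties */
static void task_exit(int nr_dirtied)
{
	dirty_throttle_leaks += nr_dirtied;
}

/* a running dirtier picks leaked dirties up, bounded by its ratelimit */
static int pick_up_leaks(int nr_dirtied, int ratelimit)
{
	if (dirty_throttle_leaks > 0 && nr_dirtied < ratelimit) {
		int take = dirty_throttle_leaks < ratelimit - nr_dirtied ?
			   dirty_throttle_leaks : ratelimit - nr_dirtied;
		dirty_throttle_leaks -= take;
		nr_dirtied += take;
	}
	return nr_dirtied;
}

int main(void)
{
	int ratelimit = 32, nr_dirtied = 0;

	/* many short-lived tasks, each exiting just below the threshold */
	for (int i = 0; i < 10; i++)
		task_exit(ratelimit - 1);

	/* a long-running dirtier accumulates the leaked counts ... */
	while (dirty_throttle_leaks > 0) {
		nr_dirtied = pick_up_leaks(nr_dirtied, ratelimit);
		if (nr_dirtied >= ratelimit) { /* ... and gets throttled for them */
			printf("balance_dirty_pages() runs, nr_dirtied=%d\n",
			       nr_dirtied);
			nr_dirtied = 0;
		}
	}
	return 0;
}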
... | ... | @@ -1823,6 +1938,8 @@ |
1823 | 1938 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); |
1824 | 1939 | __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); |
1825 | 1940 | task_io_account_write(PAGE_CACHE_SIZE); |
1941 | + current->nr_dirtied++; | |
1942 | + this_cpu_inc(bdp_ratelimits); | |
1826 | 1943 | } |
1827 | 1944 | } |
1828 | 1945 | EXPORT_SYMBOL(account_page_dirtied); |
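Moving the per-task and per-CPU counter updates into account_page_dirtied() is what fixes sub-page writes: the counters now move only on a page's clean-to-dirty transition, instead of once per balance_dirty_pages_ratelimited_nr() call, so several small writes to the same page count as one dirtied page. A stand-alone model of the before/after difference (illustrative, not kernel code):

#include <stdio.h>
#include <stdbool.h>

static int nr_dirtied_old, nr_dirtied_new;

/* old scheme: the ratelimit path counted every call */
static void write_chunk_old(bool *page_dirty)
{
	*page_dirty = true;
	nr_dirtied_old++; /* counted per write, even to an already-dirty page */
}

/* new scheme: count only the clean->dirty transition */
static void write_chunk_new(bool *page_dirty)
{
	if (!*page_dirty) {
		*page_dirty = true;
		nr_dirtied_new++; /* the account_page_dirtied() path */
	}
}

int main(void)
{
	bool d_old = false, d_new = false;

	for (int i = 0; i < 8; i++) { /* eight 512-byte writes to one 4KB page */
		write_chunk_old(&d_old);
		write_chunk_new(&d_new);
	}
	printf("old accounting: %d, new accounting: %d dirtied page(s)\n",
	       nr_dirtied_old, nr_dirtied_new);
	return 0;
}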
... | ... | @@ -1883,6 +2000,24 @@ |
1883 | 2000 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); |
1884 | 2001 | |
1885 | 2002 | /* |
2003 | + * Call this whenever redirtying a page, to de-account the dirty counters | |
2004 | + * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written | |
2005 | + * counters (NR_WRITTEN, BDI_WRITTEN) in the long term. Any mismatch will lead to | |
2006 | + * systematic errors in balanced_dirty_ratelimit and the dirty-pages position | |
2007 | + * control. | |
2008 | + */ | |
2009 | +void account_page_redirty(struct page *page) | |
2010 | +{ | |
2011 | + struct address_space *mapping = page->mapping; | |
2012 | + if (mapping && mapping_cap_account_dirty(mapping)) { | |
2013 | + current->nr_dirtied--; | |
2014 | + dec_zone_page_state(page, NR_DIRTIED); | |
2015 | + dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); | |
2016 | + } | |
2017 | +} | |
2018 | +EXPORT_SYMBOL(account_page_redirty); | |
2019 | + | |
2020 | +/* | |
1886 | 2021 | * When a writepage implementation decides that it doesn't want to write this |
1887 | 2022 | * page for some reason, it should redirty the locked page via |
1888 | 2023 | * redirty_page_for_writepage() and it should then unlock the page and return 0 |
... | ... | @@ -1890,6 +2025,7 @@ |
1890 | 2025 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) |
1891 | 2026 | { |
1892 | 2027 | wbc->pages_skipped++; |
2028 | + account_page_redirty(page); | |
1893 | 2029 | return __set_page_dirty_nobuffers(page); |
1894 | 2030 | } |
1895 | 2031 | EXPORT_SYMBOL(redirty_page_for_writepage); |