Commit ffd1f609ab10532e8137b4b981fdf903ef4d0b32

Authored by Wu Fengguang
1 parent c42843f2f0

writeback: introduce max-pause and pass-good dirty limits

The max-pause limit helps to keep the sleep time inside
balance_dirty_pages() within MAX_PAUSE=200ms. The 200ms max sleep means
a per-task rate limit of 8 pages per 200ms = 160KB/s when the dirty limit
is exceeded, which is normally enough to stop dirtiers from pushing the
dirty pages any higher, unless there is a sufficiently large number of
slow dirtiers (eg. 500 tasks doing 160KB/s each still sum up to 80MB/s,
exceeding the write bandwidth of a slow disk and hence accumulating more
and more dirty pages).
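
For illustration, the arithmetic behind the 160KB/s and 80MB/s figures
can be written out as below. This is not code from the patch; it simply
mirrors the example above, assuming 4KB pages, 8 pages dirtied per
pause, and 500 concurrent dirtiers.

	/* back-of-the-envelope numbers for the max-pause rate limit;
	 * illustrative only, not part of the patch */
	#include <stdio.h>

	int main(void)
	{
		const int page_kb         = 4;    /* assume 4KB pages */
		const int pages_per_pause = 8;    /* pages dirtied between pauses */
		const int max_pause_ms    = 200;  /* MAX_PAUSE = 200ms */
		const int nr_tasks        = 500;  /* many slow dirtiers */

		/* 8 pages / 200ms = 32KB / 0.2s = 160KB/s per task */
		int per_task_kbps = pages_per_pause * page_kb * 1000 / max_pause_ms;

		/* 500 such tasks still add up to about 80MB/s */
		int total_mbs = per_task_kbps * nr_tasks / 1000;

		printf("per task: %dKB/s, %d tasks: ~%dMB/s\n",
		       per_task_kbps, nr_tasks, total_mbs);
		return 0;
	}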

The pass-good limit helps to let go of the good bdi's in the presence of
a blocked bdi (ie. an NFS server not responding) or a slow USB disk which
for some reason has built up a large number of initial dirty pages that
refuse to go away anytime soon.

For example, given two bdi's A and B and the initial state

	bdi_thresh_A = dirty_thresh / 2
	bdi_thresh_B = dirty_thresh / 2
	bdi_dirty_A  = dirty_thresh / 2
	bdi_dirty_B  = dirty_thresh / 2

Then A gets blocked; after a dozen seconds

	bdi_thresh_A = 0
	bdi_thresh_B = dirty_thresh
	bdi_dirty_A  = dirty_thresh / 2
	bdi_dirty_B  = dirty_thresh / 2
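
The reason bdi_thresh_A collapses is that each bdi's threshold is carved
out of dirty_thresh in proportion to that bdi's share of recent writeout
completions; a blocked bdi completes nothing, so its share decays to
zero over the tracking period. The fragment below is a simplified model
of that proportioning, not the kernel's actual bdi_dirty_limit() code.

	#include <stdio.h>

	/* Simplified model, not the kernel's bdi_dirty_limit(): a bdi's
	 * threshold is the global dirty threshold scaled by that bdi's
	 * fraction of recently completed writeback. */
	static unsigned long bdi_share_thresh(unsigned long dirty_thresh,
					      unsigned long bdi_completions,
					      unsigned long total_completions)
	{
		if (!total_completions)
			return dirty_thresh;
		return dirty_thresh * bdi_completions / total_completions;
	}

	int main(void)
	{
		unsigned long dirty_thresh = 1000;	/* pages, arbitrary unit */

		/* both bdi's completing writeback equally: each gets half */
		printf("A: %lu  B: %lu\n",
		       bdi_share_thresh(dirty_thresh, 50, 100),
		       bdi_share_thresh(dirty_thresh, 50, 100));

		/* A blocked: its completions age out, its threshold decays
		 * towards 0 while B's grows towards the full dirty_thresh */
		printf("A: %lu  B: %lu\n",
		       bdi_share_thresh(dirty_thresh, 0, 100),
		       bdi_share_thresh(dirty_thresh, 100, 100));
		return 0;
	}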

The (bdi_dirty_B < bdi_thresh_B) test is now useless and the dirty pages
will be effectively throttled by condition (nr_dirty < dirty_thresh).
This has two problems:
(1) we lose the protections for light dirtiers
(2) balance_dirty_pages() effectively becomes IO-less because the
    (bdi_nr_reclaimable > bdi_thresh) test won't be true. This is good
    for IO, but balance_dirty_pages() loses an important way to break
    out of the loop, which leads to more spread-out throttle delays
    (see the sketch below).
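
To make the failure concrete, the snippet below plugs the example
numbers into the two tests named above, with dirty_thresh taken as an
arbitrary 1000 pages. It is an illustrative sketch in the spirit of
balance_dirty_pages(), not the function itself.

	#include <stdio.h>

	/* blocked-bdi example from above, dirty_thresh = 1000 pages */
	int main(void)
	{
		unsigned long dirty_thresh = 1000;
		unsigned long nr_dirty     = 1000;	/* bdi_dirty_A + bdi_dirty_B */
		unsigned long bdi_thresh_B = 1000;	/* grew to the full dirty_thresh */
		unsigned long bdi_dirty_B  = 500;	/* still at dirty_thresh / 2 */

		/* global test alone: 1000 < 1000 is false, so B's dirtiers
		 * keep looping even though B is under its own threshold */
		int global_ok = nr_dirty < dirty_thresh;

		/* pass-good test from this patch: 1000 < 1000 + 1000/8 and
		 * 500 < 1000, so tasks dirtying against the healthy bdi B
		 * are let go while A stays blocked */
		int pass_good = nr_dirty < dirty_thresh + dirty_thresh / 8 &&
				bdi_dirty_B < bdi_thresh_B;

		printf("global test: %d, pass-good test: %d\n",
		       global_ok, pass_good);
		return 0;
	}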

DIRTY_PASSGOOD_AREA can eliminate the above issues. The only problem is
that DIRTY_PASSGOOD_AREA needs to be defined as 2 to fully cover the above
example, while this patch uses the more conservative value 8 so as not to
surprise people with more dirty pages than expected.
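
The 2 versus 8 trade-off can be quantified: in the example, A's stuck
pages pin half of dirty_thresh while B may legitimately dirty up to its
full bdi_thresh_B = dirty_thresh, so nr_dirty can approach
1.5 * dirty_thresh = limit + limit/2, which is what DIRTY_PASSGOOD_AREA
= 2 would allow. The sketch below (purely illustrative values, assuming
a hypothetical 1GB global dirty limit) shows how much extra dirty memory
each choice tolerates above the limit before the pass-good escape stops
applying.

	#include <stdio.h>

	int main(void)
	{
		/* hypothetical 1GB global dirty limit */
		unsigned long limit_mb = 1024;

		/* extra dirty memory tolerated above the limit */
		printf("PASSGOOD_AREA=8: +%luMB (limit/8)\n", limit_mb / 8);
		printf("PASSGOOD_AREA=2: +%luMB (limit/2)\n", limit_mb / 2);
		return 0;
	}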

The max-pause limit won't noticeably impact the speed at which dirty
pages are knocked down when there is a sudden drop of the global/bdi
dirty thresholds, because the heavy dirtiers will be throttled below
160KB/s, which is slow enough. It does help to avoid long dirty throttle
delays and especially will make light dirtiers more responsive.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>

Showing 2 changed files with 54 additions and 0 deletions

include/linux/writeback.h
... ... @@ -7,6 +7,27 @@
7 7 #include <linux/sched.h>
8 8 #include <linux/fs.h>
9 9  
  10 +/*
  11 + * The 1/16 region above the global dirty limit will be put to maximum pauses:
  12 + *
  13 + * (limit, limit + limit/DIRTY_MAXPAUSE_AREA)
  14 + *
  15 + * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put
  16 + * to loops:
  17 + *
  18 + * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA)
  19 + *
  20 + * Further beyond, all dirtier tasks will enter a loop waiting (possibly long
  21 + * time) for the dirty pages to drop, unless written enough pages.
  22 + *
  23 + * The global dirty threshold is normally equal to the global dirty limit,
  24 + * except when the system suddenly allocates a lot of anonymous memory and
  25 + * knocks down the global dirty threshold quickly, in which case the global
  26 + * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
  27 + */
  28 +#define DIRTY_MAXPAUSE_AREA 16
  29 +#define DIRTY_PASSGOOD_AREA 8
  30 +
10 31 struct backing_dev_info;
11 32  
12 33 /*

mm/page-writeback.c
... ... @@ -37,6 +37,11 @@
37 37 #include <trace/events/writeback.h>
38 38  
39 39 /*
  40 + * Sleep at most 200ms at a time in balance_dirty_pages().
  41 + */
  42 +#define MAX_PAUSE max(HZ/5, 1)
  43 +
  44 +/*
40 45 * Estimate write bandwidth at 200ms intervals.
41 46 */
42 47 #define BANDWIDTH_INTERVAL max(HZ/5, 1)
... ... @@ -399,6 +404,11 @@
399 404 return x + 1; /* Ensure that we never return 0 */
400 405 }
401 406  
  407 +static unsigned long hard_dirty_limit(unsigned long thresh)
  408 +{
  409 + return max(thresh, global_dirty_limit);
  410 +}
  411 +
402 412 /*
403 413 * global_dirty_limits - background-writeback and dirty-throttling thresholds
404 414 *
... ... @@ -722,6 +732,29 @@
722 732 __set_current_state(TASK_UNINTERRUPTIBLE);
723 733 io_schedule_timeout(pause);
724 734 trace_balance_dirty_wait(bdi);
  735 +
  736 + dirty_thresh = hard_dirty_limit(dirty_thresh);
  737 + /*
  738 + * max-pause area. If dirty exceeded but still within this
  739 + * area, no need to sleep for more than 200ms: (a) 8 pages per
  740 + * 200ms is typically more than enough to curb heavy dirtiers;
  741 + * (b) the pause time limit makes the dirtiers more responsive.
  742 + */
  743 + if (nr_dirty < dirty_thresh +
  744 + dirty_thresh / DIRTY_MAXPAUSE_AREA &&
  745 + time_after(jiffies, start_time + MAX_PAUSE))
  746 + break;
  747 + /*
  748 + * pass-good area. When some bdi gets blocked (eg. NFS server
  749 + * not responding), or write bandwidth dropped dramatically due
  750 + * to concurrent reads, or dirty threshold suddenly dropped and
  751 + * the dirty pages cannot be brought down anytime soon (eg. on
  752 + * slow USB stick), at least let go of the good bdi's.
  753 + */
  754 + if (nr_dirty < dirty_thresh +
  755 + dirty_thresh / DIRTY_PASSGOOD_AREA &&
  756 + bdi_dirty < bdi_thresh)
  757 + break;
725 758  
726 759 /*
727 760 * Increase the delay for each loop, up to our previous