Commit 9d823e8f6b1b7b39f952d7d1795f29162143a433

Authored by Wu Fengguang
1 parent 7381131cbc

writeback: per task dirty rate limit

Add two fields to task_struct:

1) nr_dirtied accounts the pages dirtied by the individual task, for accuracy
2) nr_dirtied_pause is the per-task balance_dirty_pages() call interval, for
   flexibility

The balance_dirty_pages() call interval (i.e. nr_dirtied_pause) scales
near-sqrt with the safety gap between the number of dirty pages and the
dirty threshold.

The main problem with a per-task nr_dirtied is that if 1k+ tasks start
dirtying pages at exactly the same time, each task will be assigned a large
initial nr_dirtied_pause, so the dirty threshold will be exceeded long
before any task reaches its nr_dirtied_pause and hence calls
balance_dirty_pages().

The solution is to watch the number of pages dirtied on each CPU between
calls into balance_dirty_pages(). If it exceeds ratelimit_pages (1/32, i.e.
roughly 3%, of the dirty threshold), force a call to balance_dirty_pages()
for a chance to set bdi->dirty_exceeded. In normal situations this
safeguard is not expected to trigger at all; the sketch below models when
it does.
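
To make the mechanism concrete before the diff, here is a hypothetical
userspace model of the logic that balance_dirty_pages_ratelimited_nr()
gains below. The names should_balance(), struct task and cpu_count are
inventions of this sketch, and it assumes 4 KB pages; the real code works
on current, keeps the counter in per-CPU data under preempt_disable(), and
resets nr_dirtied inside balance_dirty_pages():

  /* Hypothetical userspace model of the per-task + per-CPU ratelimit. */
  #include <stdio.h>

  #define PAGE_SHIFT 12                   /* assume 4 KB pages */

  static int ratelimit_pages = 32;        /* per-CPU safeguard threshold */

  struct task {
          int nr_dirtied;
          int nr_dirtied_pause;
  };

  /* Returns nonzero when the task should call balance_dirty_pages(). */
  static int should_balance(struct task *t, int pages, int dirty_exceeded,
                            int *cpu_count)
  {
          int ratelimit = t->nr_dirtied_pause;

          /* Over the limit: pause at least every 8 pages (32 KB). */
          if (dirty_exceeded && ratelimit > (32 >> (PAGE_SHIFT - 10)))
                  ratelimit = 32 >> (PAGE_SHIFT - 10);

          t->nr_dirtied += pages;

          if (t->nr_dirtied >= ratelimit) {
                  *cpu_count = 0;         /* task will pause anyway */
          } else {
                  *cpu_count += pages;
                  if (*cpu_count >= ratelimit_pages) {
                          *cpu_count = 0; /* safeguard: force the call */
                          ratelimit = 0;
                  }
          }
          return t->nr_dirtied >= ratelimit;
  }

  int main(void)
  {
          /* One task with an over-large pause interval, as in the 1k+
           * tasks scenario: the per-CPU counter still forces a call. */
          struct task t = { 0, 1000 };
          int cpu_count = 0, i;

          for (i = 1; i <= 40; i++)
                  if (should_balance(&t, 1, 0, &cpu_count))
                          printf("forced call after %d pages\n", i);
          return 0;
  }

Running it prints a single "forced call after 32 pages" line: the per-CPU
counter, not the task's own (overlarge) interval, forces the call, which is
exactly the safeguard's purpose.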

On the sqrt in dirty_poll_interval():

It serves as the initial guess while dirty pages are still in the freerun
area.

When dirty pages are floating inside the dirty control scope [freerun,
limit], a followup patch will use a more refined dirty poll interval to get
the desired pause time.

   thresh-dirty gap (MB)    sqrt (pages)
                       1      16
                       2      22
                       4      32
                       8      45
                      16      64
                      32      90
                      64     128
                     128     181
                     256     256
                     512     362
                    1024     512

The table reads as follows: given a 1MB (or 1GB) gap, with every dd task
polling balance_dirty_pages() once per 16 (or 512) dirtied pages, the dirty
limit won't be exceeded as long as there are fewer than 16 (or 512)
concurrent dd tasks. (With 4KB pages, a 1MB gap is 256 pages, and
sqrt(256) = 16.)

So the sqrt scaling naturally yields lower overhead and allows more
concurrent tasks to run safely on large-memory servers, which have large
(thresh - freerun) gaps.
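
Note that the kernel does not take an exact square root: dirty_poll_interval(),
added in mm/page-writeback.c below, computes 1UL << (ilog2(thresh - dirty) >> 1),
i.e. it rounds the gap down to a power of two before halving the exponent, so
the result stays within a factor of sqrt(2) of the table. A throwaway userspace
check of both columns, assuming 4 KB pages (flog2() is a stand-in for the
kernel's ilog2()):

  /* Compare the exact sqrt column with the kernel's power-of-two
   * approximation; build with: cc demo.c -lm */
  #include <math.h>
  #include <stdio.h>

  static unsigned long flog2(unsigned long n)     /* floor(log2(n)) */
  {
          unsigned long l = 0;
          while (n >>= 1)
                  l++;
          return l;
  }

  int main(void)
  {
          unsigned long mb;

          for (mb = 1; mb <= 1024; mb *= 2) {
                  unsigned long pages = mb << (20 - 12);  /* 4 KB pages */
                  printf("%4lu MB: sqrt = %3lu pages, approx = %3lu pages\n",
                         mb, (unsigned long)sqrt(pages),
                         1UL << (flog2(pages) >> 1));
          }
          return 0;
  }

The approximation column comes out as 16, 16, 32, 32, ... 512: coarser than
the table, but the same order of magnitude, which is all the initial guess
needs.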

peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case

CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Andrea Righi <andrea@betterlinux.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>

Showing 3 changed files with 60 additions and 39 deletions

include/linux/sched.h

@@ -1525,6 +1525,13 @@
 	int make_it_fail;
 #endif
 	struct prop_local_single dirties;
+	/*
+	 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
+	 * balance_dirty_pages() for some dirty throttling pause
+	 */
+	int nr_dirtied;
+	int nr_dirtied_pause;
+
 #ifdef CONFIG_LATENCYTOP
 	int latency_record_count;
 	struct latency_record latency_record[LT_SAVECOUNT];
kernel/fork.c

@@ -1302,6 +1302,9 @@
 	p->pdeath_signal = 0;
 	p->exit_state = 0;

+	p->nr_dirtied = 0;
+	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+
 	/*
 	 * Ok, make it visible to the rest of the system.
 	 * We dont wake it up yet.
mm/page-writeback.c

@@ -54,20 +54,6 @@
  */
 static long ratelimit_pages = 32;

-/*
- * When balance_dirty_pages decides that the caller needs to perform some
- * non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than dirtied pages to ensure that reasonably
- * large amounts of I/O are submitted.
- */
-static inline long sync_writeback_pages(unsigned long dirtied)
-{
-	if (dirtied < ratelimit_pages)
-		dirtied = ratelimit_pages;
-
-	return dirtied + dirtied / 2;
-}
-
 /* The following parameters are exported via /proc/sys/vm */

 /*
@@ -169,6 +155,8 @@
 	int shift = calc_period_shift();
 	prop_change_shift(&vm_completions, shift);
 	prop_change_shift(&vm_dirties, shift);
+
+	writeback_set_ratelimit();
 }

 int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -979,6 +967,23 @@
 }

 /*
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * will look to see if it needs to start dirty throttling.
+ *
+ * If dirty_poll_interval is too low, big NUMA machines will call the expensive
+ * global_page_state() too often. So scale it near-sqrt to the safety margin
+ * (the number of pages we may dirty without exceeding the dirty limits).
+ */
+static unsigned long dirty_poll_interval(unsigned long dirty,
+					 unsigned long thresh)
+{
+	if (thresh > dirty)
+		return 1UL << (ilog2(thresh - dirty) >> 1);
+
+	return 1;
+}
+
+/*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data. It looks at the number of dirty pages in the machine and will force
  * the caller to perform writeback if the system is over `vm_dirty_ratio'.
@@ -1112,6 +1117,9 @@
 	if (clear_dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;

+	current->nr_dirtied = 0;
+	current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh);
+
 	if (writeback_in_progress(bdi))
 		return;
@@ -1138,7 +1146,7 @@
 	}
 }

-static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+static DEFINE_PER_CPU(int, bdp_ratelimits);

 /**
  * balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -1158,31 +1166,39 @@
 					unsigned long nr_pages_dirtied)
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	unsigned long ratelimit;
-	unsigned long *p;
+	int ratelimit;
+	int *p;

 	if (!bdi_cap_account_dirty(bdi))
 		return;

-	ratelimit = ratelimit_pages;
-	if (mapping->backing_dev_info->dirty_exceeded)
-		ratelimit = 8;
+	ratelimit = current->nr_dirtied_pause;
+	if (bdi->dirty_exceeded)
+		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

+	current->nr_dirtied += nr_pages_dirtied;
+
+	preempt_disable();
 	/*
-	 * Check the rate limiting. Also, we do not want to throttle real-time
-	 * tasks in balance_dirty_pages(). Period.
+	 * This prevents one CPU to accumulate too many dirtied pages without
+	 * calling into balance_dirty_pages(), which can happen when there are
+	 * 1000+ tasks, all of them start dirtying pages at exactly the same
+	 * time, hence all honoured too large initial task->nr_dirtied_pause.
 	 */
-	preempt_disable();
 	p = &__get_cpu_var(bdp_ratelimits);
-	*p += nr_pages_dirtied;
-	if (unlikely(*p >= ratelimit)) {
-		ratelimit = sync_writeback_pages(*p);
+	if (unlikely(current->nr_dirtied >= ratelimit))
 		*p = 0;
-		preempt_enable();
-		balance_dirty_pages(mapping, ratelimit);
-		return;
+	else {
+		*p += nr_pages_dirtied;
+		if (unlikely(*p >= ratelimit_pages)) {
+			*p = 0;
+			ratelimit = 0;
+		}
 	}
 	preempt_enable();
+
+	if (unlikely(current->nr_dirtied >= ratelimit))
+		balance_dirty_pages(mapping, current->nr_dirtied);
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
@@ -1277,22 +1293,17 @@
  *
  * Here we set ratelimit_pages to a level which ensures that when all CPUs are
  * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
- * thresholds before writeback cuts in.
- *
- * But the limit should not be set too high. Because it also controls the
- * amount of memory which the balance_dirty_pages() caller has to write back.
- * If this is too large then the caller will block on the IO queue all the
- * time. So limit it to four megabytes - the balance_dirty_pages() caller
- * will write six megabyte chunks, max.
+ * thresholds.
  */

 void writeback_set_ratelimit(void)
 {
-	ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
 	if (ratelimit_pages < 16)
 		ratelimit_pages = 16;
-	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
-		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
 }

 static int __cpuinit