Commit 9d823e8f6b1b7b39f952d7d1795f29162143a433

Authored by Wu Fengguang
1 parent 7381131cbc

writeback: per task dirty rate limit

Add two fields to task_struct:

1) nr_dirtied accounts the pages dirtied by the individual task, for accuracy
2) nr_dirtied_pause is the per-task balance_dirty_pages() call interval, for
   flexibility

The balance_dirty_pages() call interval (i.e. nr_dirtied_pause) scales
near-sqrt with the safety gap between the number of dirty pages and the
dirty threshold.

The main problem with a per-task nr_dirtied is that if 1k+ tasks start
dirtying pages at exactly the same time, each task will be assigned a large
initial nr_dirtied_pause, so the dirty threshold will be exceeded long
before any task reaches its nr_dirtied_pause and hence calls
balance_dirty_pages().

The solution is to watch the number of pages dirtied on each CPU between
calls into balance_dirty_pages(). If it exceeds ratelimit_pages (1/32, i.e.
roughly 3%, of the dirty threshold), force a call to balance_dirty_pages()
for a chance to set bdi->dirty_exceeded. In normal situations this
safeguard is not expected to trigger at all; the sketch below models when
it does.
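
To make the mechanism concrete before the diff, here is a hypothetical
userspace model of the logic that balance_dirty_pages_ratelimited_nr()
gains below. The names should_balance(), struct task and cpu_count are
inventions of this sketch, and it assumes 4 KB pages; the real code works
on current, keeps the counter in per-CPU data under preempt_disable(), and
resets nr_dirtied inside balance_dirty_pages():

  /* Hypothetical userspace model of the per-task + per-CPU ratelimit. */
  #include <stdio.h>

  #define PAGE_SHIFT 12                   /* assume 4 KB pages */

  static int ratelimit_pages = 32;        /* per-CPU safeguard threshold */

  struct task {
          int nr_dirtied;
          int nr_dirtied_pause;
  };

  /* Returns nonzero when the task should call balance_dirty_pages(). */
  static int should_balance(struct task *t, int pages, int dirty_exceeded,
                            int *cpu_count)
  {
          int ratelimit = t->nr_dirtied_pause;

          /* Over the limit: pause at least every 8 pages (32 KB). */
          if (dirty_exceeded && ratelimit > (32 >> (PAGE_SHIFT - 10)))
                  ratelimit = 32 >> (PAGE_SHIFT - 10);

          t->nr_dirtied += pages;

          if (t->nr_dirtied >= ratelimit) {
                  *cpu_count = 0;         /* task will pause anyway */
          } else {
                  *cpu_count += pages;
                  if (*cpu_count >= ratelimit_pages) {
                          *cpu_count = 0; /* safeguard: force the call */
                          ratelimit = 0;
                  }
          }
          return t->nr_dirtied >= ratelimit;
  }

  int main(void)
  {
          /* One task with an over-large pause interval, as in the 1k+
           * tasks scenario: the per-CPU counter still forces a call. */
          struct task t = { 0, 1000 };
          int cpu_count = 0, i;

          for (i = 1; i <= 40; i++)
                  if (should_balance(&t, 1, 0, &cpu_count))
                          printf("forced call after %d pages\n", i);
          return 0;
  }

Running it prints a single "forced call after 32 pages" line: the per-CPU
counter, not the task's own (overlarge) interval, forces the call, which is
exactly the safeguard's purpose.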

On the sqrt in dirty_poll_interval():

It serves as the initial guess while dirty pages are still in the freerun
area.

When dirty pages are floating inside the dirty control scope [freerun,
limit], a followup patch will use a more refined dirty poll interval to get
the desired pause time.

   thresh-dirty gap (MB)    sqrt (pages)
                       1      16
                       2      22
                       4      32
                       8      45
                      16      64
                      32      90
                      64     128
                     128     181
                     256     256
                     512     362
                    1024     512

The table reads as follows: given a 1MB (or 1GB) gap, with every dd task
polling balance_dirty_pages() once per 16 (or 512) dirtied pages, the dirty
limit won't be exceeded as long as there are fewer than 16 (or 512)
concurrent dd tasks. (With 4KB pages, a 1MB gap is 256 pages, and
sqrt(256) = 16.)

So the sqrt scaling naturally yields lower overhead and allows more
concurrent tasks to run safely on large-memory servers, which have large
(thresh - freerun) gaps.
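
Note that the kernel does not take an exact square root: dirty_poll_interval(),
added in mm/page-writeback.c below, computes 1UL << (ilog2(thresh - dirty) >> 1),
i.e. it rounds the gap down to a power of two before halving the exponent, so
the result stays within a factor of sqrt(2) of the table. A throwaway userspace
check of both columns, assuming 4 KB pages (flog2() is a stand-in for the
kernel's ilog2()):

  /* Compare the exact sqrt column with the kernel's power-of-two
   * approximation; build with: cc demo.c -lm */
  #include <math.h>
  #include <stdio.h>

  static unsigned long flog2(unsigned long n)     /* floor(log2(n)) */
  {
          unsigned long l = 0;
          while (n >>= 1)
                  l++;
          return l;
  }

  int main(void)
  {
          unsigned long mb;

          for (mb = 1; mb <= 1024; mb *= 2) {
                  unsigned long pages = mb << (20 - 12);  /* 4 KB pages */
                  printf("%4lu MB: sqrt = %3lu pages, approx = %3lu pages\n",
                         mb, (unsigned long)sqrt(pages),
                         1UL << (flog2(pages) >> 1));
          }
          return 0;
  }

The approximation column comes out as 16, 16, 32, 32, ... 512: coarser than
the table, but the same order of magnitude, which is all the initial guess
needs.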

peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case

CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Andrea Righi <andrea@betterlinux.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>

Showing 3 changed files with 60 additions and 39 deletions

include/linux/sched.h

@@ -1525,6 +1525,13 @@
 	int make_it_fail;
 #endif
 	struct prop_local_single dirties;
+	/*
+	 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
+	 * balance_dirty_pages() for some dirty throttling pause
+	 */
+	int nr_dirtied;
+	int nr_dirtied_pause;
+
 #ifdef CONFIG_LATENCYTOP
 	int latency_record_count;
 	struct latency_record latency_record[LT_SAVECOUNT];
kernel/fork.c

@@ -1302,6 +1302,9 @@
 	p->pdeath_signal = 0;
 	p->exit_state = 0;

+	p->nr_dirtied = 0;
+	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+
 	/*
 	 * Ok, make it visible to the rest of the system.
 	 * We dont wake it up yet.
mm/page-writeback.c

@@ -54,20 +54,6 @@
  */
 static long ratelimit_pages = 32;

-/*
- * When balance_dirty_pages decides that the caller needs to perform some
- * non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than dirtied pages to ensure that reasonably
- * large amounts of I/O are submitted.
- */
-static inline long sync_writeback_pages(unsigned long dirtied)
-{
-	if (dirtied < ratelimit_pages)
-		dirtied = ratelimit_pages;
-
-	return dirtied + dirtied / 2;
-}
-
 /* The following parameters are exported via /proc/sys/vm */

 /*
@@ -169,6 +155,8 @@
 	int shift = calc_period_shift();
 	prop_change_shift(&vm_completions, shift);
 	prop_change_shift(&vm_dirties, shift);
+
+	writeback_set_ratelimit();
 }

 int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -979,6 +967,23 @@
 }

 /*
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * will look to see if it needs to start dirty throttling.
+ *
+ * If dirty_poll_interval is too low, big NUMA machines will call the expensive
+ * global_page_state() too often. So scale it near-sqrt to the safety margin
+ * (the number of pages we may dirty without exceeding the dirty limits).
+ */
+static unsigned long dirty_poll_interval(unsigned long dirty,
+					 unsigned long thresh)
+{
+	if (thresh > dirty)
+		return 1UL << (ilog2(thresh - dirty) >> 1);
+
+	return 1;
+}
+
+/*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data. It looks at the number of dirty pages in the machine and will force
  * the caller to perform writeback if the system is over `vm_dirty_ratio'.
@@ -1112,6 +1117,9 @@
 	if (clear_dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;

+	current->nr_dirtied = 0;
+	current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh);
+
 	if (writeback_in_progress(bdi))
 		return;
@@ -1138,7 +1146,7 @@
 	}
 }

-static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+static DEFINE_PER_CPU(int, bdp_ratelimits);

 /**
  * balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -1158,31 +1166,39 @@
 					unsigned long nr_pages_dirtied)
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	unsigned long ratelimit;
-	unsigned long *p;
+	int ratelimit;
+	int *p;

 	if (!bdi_cap_account_dirty(bdi))
 		return;

-	ratelimit = ratelimit_pages;
-	if (mapping->backing_dev_info->dirty_exceeded)
-		ratelimit = 8;
+	ratelimit = current->nr_dirtied_pause;
+	if (bdi->dirty_exceeded)
+		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

+	current->nr_dirtied += nr_pages_dirtied;
+
+	preempt_disable();
 	/*
-	 * Check the rate limiting. Also, we do not want to throttle real-time
-	 * tasks in balance_dirty_pages(). Period.
+	 * This prevents one CPU to accumulate too many dirtied pages without
+	 * calling into balance_dirty_pages(), which can happen when there are
+	 * 1000+ tasks, all of them start dirtying pages at exactly the same
+	 * time, hence all honoured too large initial task->nr_dirtied_pause.
 	 */
-	preempt_disable();
 	p = &__get_cpu_var(bdp_ratelimits);
-	*p += nr_pages_dirtied;
-	if (unlikely(*p >= ratelimit)) {
-		ratelimit = sync_writeback_pages(*p);
+	if (unlikely(current->nr_dirtied >= ratelimit))
 		*p = 0;
-		preempt_enable();
-		balance_dirty_pages(mapping, ratelimit);
-		return;
+	else {
+		*p += nr_pages_dirtied;
+		if (unlikely(*p >= ratelimit_pages)) {
+			*p = 0;
+			ratelimit = 0;
+		}
 	}
 	preempt_enable();
+
+	if (unlikely(current->nr_dirtied >= ratelimit))
+		balance_dirty_pages(mapping, current->nr_dirtied);
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
@@ -1277,22 +1293,17 @@
  *
  * Here we set ratelimit_pages to a level which ensures that when all CPUs are
  * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
- * thresholds before writeback cuts in.
- *
- * But the limit should not be set too high. Because it also controls the
- * amount of memory which the balance_dirty_pages() caller has to write back.
- * If this is too large then the caller will block on the IO queue all the
- * time. So limit it to four megabytes - the balance_dirty_pages() caller
- * will write six megabyte chunks, max.
+ * thresholds.
  */

 void writeback_set_ratelimit(void)
 {
-	ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
 	if (ratelimit_pages < 16)
 		ratelimit_pages = 16;
-	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
-		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
 }

 static int __cpuinit