Commit 7ccb9ad5364d6ac0c803096c67e76a7545cf7a77

Authored by Wu Fengguang
1 parent 83712358ba

writeback: max, min and target dirty pause time

Control the pause time and the call intervals to balance_dirty_pages()
with three parameters:

1) max_pause, limited by bdi_dirty and MAX_PAUSE

2) the target pause time, grows with the number of dd tasks
   and is normally limited by max_pause/2

3) the minimal pause, set to half the target pause
   and is used to skip short sleeps and accumulate them into bigger ones

The typical behaviors after patch:

- whenever task_ratelimit falls far below dirty_ratelimit, the pause time
  will remain constant at max_pause and nr_dirtied_pause will be
  fluctuating with task_ratelimit

- in the normal cases, nr_dirtied_pause will remain stable (keeping pace
  with dirty_ratelimit) and the pause time will be fluctuating
  with task_ratelimit

In summary, someone has to fluctuate with task_ratelimit, because

	task_ratelimit = nr_dirtied_pause / pause

We normally prefer a stable nr_dirtied_pause, until reaching max_pause.

The notable behavior changes are:

- in stable workloads, there will no longer be sudden big trajectory
  switching of nr_dirtied_pause as concerned by Peter. It will be as
  smooth as dirty_ratelimit and changing proportionally with it (as
  always, assuming bdi bandwidth does not fluctuate across 2^N lines,
  otherwise nr_dirtied_pause will show up in 2+ parallel trajectories)

- in the rare cases when something keeps task_ratelimit far below
  dirty_ratelimit, the smoothness can no longer be retained and
  nr_dirtied_pause will be "dancing" with task_ratelimit. This fixes a
  (not that destructive but still not good) bug that
	  dirty_ratelimit gets brought down undesirably
	  <= balanced_dirty_ratelimit is underestimated
	  <= weakly executed task_ratelimit
	  <= pause goes too large and gets trimmed down to max_pause
	  <= nr_dirtied_pause (based on dirty_ratelimit) is set too large
	  <= dirty_ratelimit being much larger than task_ratelimit

- introduce min_pause to avoid small pause sleeps

- when pause is trimmed down to max_pause, try to compensate it at the
  next pause time

The "refactor" type of changes are:

The max_pause equation is slightly transformed to make it more
efficient.

We now scale target_pause by (N * 10ms) on 2^N concurrent tasks, which
is effectively equal to the original scaling max_pause by (N * 20ms)
because the original code does implicit target_pause ~= max_pause / 2.
Based on the same implicit ratio, target_pause starts with 10ms on 1 dd.

CC: Jan Kara <jack@suse.cz>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>

Showing 1 changed file with 81 additions and 44 deletions Side-by-side Diff

... ... @@ -962,40 +962,81 @@
962 962 return 1;
963 963 }
964 964  
965   -static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
966   - unsigned long bdi_dirty)
  965 +static long bdi_max_pause(struct backing_dev_info *bdi,
  966 + unsigned long bdi_dirty)
967 967 {
968   - unsigned long bw = bdi->avg_write_bandwidth;
969   - unsigned long hi = ilog2(bw);
970   - unsigned long lo = ilog2(bdi->dirty_ratelimit);
971   - unsigned long t;
  968 + long bw = bdi->avg_write_bandwidth;
  969 + long t;
972 970  
973   - /* target for 20ms max pause on 1-dd case */
974   - t = HZ / 50;
  971 + /*
  972 + * Limit pause time for small memory systems. If sleeping for too long
  973 + * time, a small pool of dirty/writeback pages may go empty and disk go
  974 + * idle.
  975 + *
  976 + * 8 serves as the safety ratio.
  977 + */
  978 + t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
  979 + t++;
975 980  
  981 + return min_t(long, t, MAX_PAUSE);
  982 +}
  983 +
  984 +static long bdi_min_pause(struct backing_dev_info *bdi,
  985 + long max_pause,
  986 + unsigned long task_ratelimit,
  987 + unsigned long dirty_ratelimit,
  988 + int *nr_dirtied_pause)
  989 +{
  990 + long hi = ilog2(bdi->avg_write_bandwidth);
  991 + long lo = ilog2(bdi->dirty_ratelimit);
  992 + long t; /* target pause */
  993 + long pause; /* estimated next pause */
  994 + int pages; /* target nr_dirtied_pause */
  995 +
  996 + /* target for 10ms pause on 1-dd case */
  997 + t = max(1, HZ / 100);
  998 +
976 999 /*
977 1000 * Scale up pause time for concurrent dirtiers in order to reduce CPU
978 1001 * overheads.
979 1002 *
980   - * (N * 20ms) on 2^N concurrent tasks.
  1003 + * (N * 10ms) on 2^N concurrent tasks.
981 1004 */
982 1005 if (hi > lo)
983   - t += (hi - lo) * (20 * HZ) / 1024;
  1006 + t += (hi - lo) * (10 * HZ) / 1024;
984 1007  
985 1008 /*
986   - * Limit pause time for small memory systems. If sleeping for too long
987   - * time, a small pool of dirty/writeback pages may go empty and disk go
988   - * idle.
  1009 + * This is a bit convoluted. We try to base the next nr_dirtied_pause
  1010 + * on the much more stable dirty_ratelimit. However the next pause time
  1011 + * will be computed based on task_ratelimit and the two rate limits may
  1012 + * depart considerably at some time. Especially if task_ratelimit goes
  1013 + * below dirty_ratelimit/2 and the target pause is max_pause, the next
  1014 + * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
  1015 + * result task_ratelimit won't be executed faithfully, which could
  1016 + * eventually bring down dirty_ratelimit.
989 1017 *
990   - * 8 serves as the safety ratio.
  1018 + * We apply two rules to fix it up:
  1019 + * 1) try to estimate the next pause time and if necessary, use a lower
  1020 + * nr_dirtied_pause so as not to exceed max_pause. When this happens,
  1021 + * nr_dirtied_pause will be "dancing" with task_ratelimit.
  1022 + * 2) limit the target pause time to max_pause/2, so that the normal
  1023 + * small fluctuations of task_ratelimit won't trigger rule (1) and
  1024 + * nr_dirtied_pause will remain as stable as dirty_ratelimit.
991 1025 */
992   - t = min(t, bdi_dirty * HZ / (8 * bw + 1));
  1026 + t = min(t, 1 + max_pause / 2);
  1027 + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
993 1028  
  1029 + pause = HZ * pages / (task_ratelimit + 1);
  1030 + if (pause > max_pause) {
  1031 + t = max_pause;
  1032 + pages = task_ratelimit * t / roundup_pow_of_two(HZ);
  1033 + }
  1034 +
  1035 + *nr_dirtied_pause = pages;
994 1036 /*
995   - * The pause time will be settled within range (max_pause/4, max_pause).
996   - * Apply a minimal value of 4 to get a non-zero max_pause/4.
  1037 + * The minimal pause time will normally be half the target pause time.
997 1038 */
998   - return clamp_val(t, 4, MAX_PAUSE);
  1039 + return 1 + t / 2;
999 1040 }
1000 1041  
1001 1042 /*
1002 1043  
... ... @@ -1017,11 +1058,13 @@
1017 1058 unsigned long dirty_thresh;
1018 1059 unsigned long bdi_thresh;
1019 1060 long period;
1020   - long pause = 0;
1021   - long uninitialized_var(max_pause);
  1061 + long pause;
  1062 + long max_pause;
  1063 + long min_pause;
  1064 + int nr_dirtied_pause;
1022 1065 bool dirty_exceeded = false;
1023 1066 unsigned long task_ratelimit;
1024   - unsigned long uninitialized_var(dirty_ratelimit);
  1067 + unsigned long dirty_ratelimit;
1025 1068 unsigned long pos_ratio;
1026 1069 struct backing_dev_info *bdi = mapping->backing_dev_info;
1027 1070 unsigned long start_time = jiffies;
... ... @@ -1051,6 +1094,8 @@
1051 1094 if (nr_dirty <= freerun) {
1052 1095 current->dirty_paused_when = now;
1053 1096 current->nr_dirtied = 0;
  1097 + current->nr_dirtied_pause =
  1098 + dirty_poll_interval(nr_dirty, dirty_thresh);
1054 1099 break;
1055 1100 }
1056 1101  
1057 1102  
... ... @@ -1101,14 +1146,17 @@
1101 1146 nr_dirty, bdi_thresh, bdi_dirty,
1102 1147 start_time);
1103 1148  
1104   - max_pause = bdi_max_pause(bdi, bdi_dirty);
1105   -
1106 1149 dirty_ratelimit = bdi->dirty_ratelimit;
1107 1150 pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
1108 1151 background_thresh, nr_dirty,
1109 1152 bdi_thresh, bdi_dirty);
1110 1153 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
1111 1154 RATELIMIT_CALC_SHIFT;
  1155 + max_pause = bdi_max_pause(bdi, bdi_dirty);
  1156 + min_pause = bdi_min_pause(bdi, max_pause,
  1157 + task_ratelimit, dirty_ratelimit,
  1158 + &nr_dirtied_pause);
  1159 +
1112 1160 if (unlikely(task_ratelimit == 0)) {
1113 1161 period = max_pause;
1114 1162 pause = max_pause;
... ... @@ -1125,7 +1173,7 @@
1125 1173 * future periods by updating the virtual time; otherwise just
1126 1174 * do a reset, as it may be a light dirtier.
1127 1175 */
1128   - if (unlikely(pause <= 0)) {
  1176 + if (pause < min_pause) {
1129 1177 trace_balance_dirty_pages(bdi,
1130 1178 dirty_thresh,
1131 1179 background_thresh,
... ... @@ -1136,7 +1184,7 @@
1136 1184 task_ratelimit,
1137 1185 pages_dirtied,
1138 1186 period,
1139   - pause,
  1187 + min(pause, 0L),
1140 1188 start_time);
1141 1189 if (pause < -HZ) {
1142 1190 current->dirty_paused_when = now;
1143 1191  
... ... @@ -1144,11 +1192,15 @@
1144 1192 } else if (period) {
1145 1193 current->dirty_paused_when += period;
1146 1194 current->nr_dirtied = 0;
1147   - }
1148   - pause = 1; /* avoid resetting nr_dirtied_pause below */
  1195 + } else if (current->nr_dirtied_pause <= pages_dirtied)
  1196 + current->nr_dirtied_pause += pages_dirtied;
1149 1197 break;
1150 1198 }
1151   - pause = min(pause, max_pause);
  1199 + if (unlikely(pause > max_pause)) {
  1200 + /* for occasional dropped task_ratelimit */
  1201 + now += min(pause - max_pause, max_pause);
  1202 + pause = max_pause;
  1203 + }
1152 1204  
1153 1205 pause:
1154 1206 trace_balance_dirty_pages(bdi,
... ... @@ -1168,6 +1220,7 @@
1168 1220  
1169 1221 current->dirty_paused_when = now + pause;
1170 1222 current->nr_dirtied = 0;
  1223 + current->nr_dirtied_pause = nr_dirtied_pause;
1171 1224  
1172 1225 /*
1173 1226 * This is typically equal to (nr_dirty < dirty_thresh) and can
... ... @@ -1195,22 +1248,6 @@
1195 1248  
1196 1249 if (!dirty_exceeded && bdi->dirty_exceeded)
1197 1250 bdi->dirty_exceeded = 0;
1198   -
1199   - if (pause == 0) { /* in freerun area */
1200   - current->nr_dirtied_pause =
1201   - dirty_poll_interval(nr_dirty, dirty_thresh);
1202   - } else if (period <= max_pause / 4 &&
1203   - pages_dirtied >= current->nr_dirtied_pause) {
1204   - current->nr_dirtied_pause = clamp_val(
1205   - dirty_ratelimit * (max_pause / 2) / HZ,
1206   - pages_dirtied + pages_dirtied / 8,
1207   - pages_dirtied * 4);
1208   - } else if (pause >= max_pause) {
1209   - current->nr_dirtied_pause = 1 | clamp_val(
1210   - dirty_ratelimit * (max_pause / 2) / HZ,
1211   - pages_dirtied / 4,
1212   - pages_dirtied - pages_dirtied / 8);
1213   - }
1214 1251  
1215 1252 if (writeback_in_progress(bdi))
1216 1253 return;