Commit 7ccb9ad5364d6ac0c803096c67e76a7545cf7a77
1 parent
83712358ba
Exists in
master
and in
20 other branches
writeback: max, min and target dirty pause time
Control the pause time and the call intervals to balance_dirty_pages() with three parameters:

1) max_pause, limited by bdi_dirty and MAX_PAUSE

2) the target pause time, which grows with the number of dd tasks and is normally limited by max_pause/2

3) the minimal pause, set to half the target pause and used to skip short sleeps and accumulate them into bigger ones

The typical behaviors after the patch:

- if task_ratelimit is ever far below dirty_ratelimit, the pause time will remain constant at max_pause and nr_dirtied_pause will fluctuate with task_ratelimit

- in the normal cases, nr_dirtied_pause will remain stable (keeping the same pace as dirty_ratelimit) and the pause time will fluctuate with task_ratelimit

In summary, something has to fluctuate with task_ratelimit, because

	task_ratelimit = nr_dirtied_pause / pause

We normally prefer a stable nr_dirtied_pause, until reaching max_pause.

The notable behavior changes are:

- in stable workloads, there will no longer be the sudden big trajectory switching of nr_dirtied_pause that concerned Peter. It will be as smooth as dirty_ratelimit and change proportionally with it (as always, assuming bdi bandwidth does not fluctuate across 2^N lines; otherwise nr_dirtied_pause will show up in 2+ parallel trajectories)

- in the rare cases when something keeps task_ratelimit far below dirty_ratelimit, the smoothness can no longer be retained and nr_dirtied_pause will be "dancing" with task_ratelimit.
This fixes a (not that destructive, but still not good) bug in which dirty_ratelimit gets brought down undesirably:

	dirty_ratelimit gets brought down undesirably
	<= balanced_dirty_ratelimit is underestimated
	<= task_ratelimit is weakly executed
	<= pause goes too large and gets trimmed down to max_pause
	<= nr_dirtied_pause (based on dirty_ratelimit) is set too large
	<= dirty_ratelimit being much larger than task_ratelimit

The fixes:

- introduce min_pause to avoid small pause sleeps

- when pause is trimmed down to max_pause, try to compensate for it at the next pause time

The "refactor" type of changes are:

The max_pause equation is slightly transformed to make it slightly more efficient. We now scale target_pause by (N * 10ms) on 2^N concurrent tasks, which is effectively equal to the original scaling of max_pause by (N * 20ms), because the original code does an implicit target_pause ~= max_pause / 2. Based on the same implicit ratio, target_pause starts at 10ms on 1 dd.

CC: Jan Kara <jack@suse.cz>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Showing 1 changed file with 81 additions and 44 deletions Side-by-side Diff
mm/page-writeback.c
... | ... | @@ -962,40 +962,81 @@ |
962 | 962 | return 1; |
963 | 963 | } |
964 | 964 | |
965 | -static unsigned long bdi_max_pause(struct backing_dev_info *bdi, | |
966 | - unsigned long bdi_dirty) | |
965 | +static long bdi_max_pause(struct backing_dev_info *bdi, | |
966 | + unsigned long bdi_dirty) | |
967 | 967 | { |
968 | - unsigned long bw = bdi->avg_write_bandwidth; | |
969 | - unsigned long hi = ilog2(bw); | |
970 | - unsigned long lo = ilog2(bdi->dirty_ratelimit); | |
971 | - unsigned long t; | |
968 | + long bw = bdi->avg_write_bandwidth; | |
969 | + long t; | |
972 | 970 | |
973 | - /* target for 20ms max pause on 1-dd case */ | |
974 | - t = HZ / 50; | |
971 | + /* | |
972 | + * Limit pause time for small memory systems. If sleeping for too long | |
973 | + * time, a small pool of dirty/writeback pages may go empty and disk go | |
974 | + * idle. | |
975 | + * | |
976 | + * 8 serves as the safety ratio. | |
977 | + */ | |
978 | + t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); | |
979 | + t++; | |
975 | 980 | |
981 | + return min_t(long, t, MAX_PAUSE); | |
982 | +} | |
983 | + | |
984 | +static long bdi_min_pause(struct backing_dev_info *bdi, | |
985 | + long max_pause, | |
986 | + unsigned long task_ratelimit, | |
987 | + unsigned long dirty_ratelimit, | |
988 | + int *nr_dirtied_pause) | |
989 | +{ | |
990 | + long hi = ilog2(bdi->avg_write_bandwidth); | |
991 | + long lo = ilog2(bdi->dirty_ratelimit); | |
992 | + long t; /* target pause */ | |
993 | + long pause; /* estimated next pause */ | |
994 | + int pages; /* target nr_dirtied_pause */ | |
995 | + | |
996 | + /* target for 10ms pause on 1-dd case */ | |
997 | + t = max(1, HZ / 100); | |
998 | + | |
976 | 999 | /* |
977 | 1000 | * Scale up pause time for concurrent dirtiers in order to reduce CPU |
978 | 1001 | * overheads. |
979 | 1002 | * |
980 | - * (N * 20ms) on 2^N concurrent tasks. | |
1003 | + * (N * 10ms) on 2^N concurrent tasks. | |
981 | 1004 | */ |
982 | 1005 | if (hi > lo) |
983 | - t += (hi - lo) * (20 * HZ) / 1024; | |
1006 | + t += (hi - lo) * (10 * HZ) / 1024; | |
984 | 1007 | |
985 | 1008 | /* |
986 | - * Limit pause time for small memory systems. If sleeping for too long | |
987 | - * time, a small pool of dirty/writeback pages may go empty and disk go | |
988 | - * idle. | |
1009 | + * This is a bit convoluted. We try to base the next nr_dirtied_pause | |
1010 | + * on the much more stable dirty_ratelimit. However the next pause time | |
1011 | + * will be computed based on task_ratelimit and the two rate limits may | |
1012 | + * depart considerably at some time. Especially if task_ratelimit goes | |
1013 | + * below dirty_ratelimit/2 and the target pause is max_pause, the next | |
1014 | + * pause time will be max_pause*2 _trimmed down_ to max_pause. As a | |
1015 | + * result task_ratelimit won't be executed faithfully, which could | |
1016 | + * eventually bring down dirty_ratelimit. | |
989 | 1017 | * |
990 | - * 8 serves as the safety ratio. | |
1018 | + * We apply two rules to fix it up: | |
1019 | + * 1) try to estimate the next pause time and if necessary, use a lower | |
1020 | + * nr_dirtied_pause so as not to exceed max_pause. When this happens, | |
1021 | + * nr_dirtied_pause will be "dancing" with task_ratelimit. | |
1022 | + * 2) limit the target pause time to max_pause/2, so that the normal | |
1023 | + * small fluctuations of task_ratelimit won't trigger rule (1) and | |
1024 | + * nr_dirtied_pause will remain as stable as dirty_ratelimit. | |
991 | 1025 | */ |
992 | - t = min(t, bdi_dirty * HZ / (8 * bw + 1)); | |
1026 | + t = min(t, 1 + max_pause / 2); | |
1027 | + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); | |
993 | 1028 | |
1029 | + pause = HZ * pages / (task_ratelimit + 1); | |
1030 | + if (pause > max_pause) { | |
1031 | + t = max_pause; | |
1032 | + pages = task_ratelimit * t / roundup_pow_of_two(HZ); | |
1033 | + } | |
1034 | + | |
1035 | + *nr_dirtied_pause = pages; | |
994 | 1036 | /* |
995 | - * The pause time will be settled within range (max_pause/4, max_pause). | |
996 | - * Apply a minimal value of 4 to get a non-zero max_pause/4. | |
1037 | + * The minimal pause time will normally be half the target pause time. | |
997 | 1038 | */ |
998 | - return clamp_val(t, 4, MAX_PAUSE); | |
1039 | + return 1 + t / 2; | |
999 | 1040 | } |
1000 | 1041 | |
1001 | 1042 | /* |
1002 | 1043 | |
... | ... | @@ -1017,11 +1058,13 @@ |
1017 | 1058 | unsigned long dirty_thresh; |
1018 | 1059 | unsigned long bdi_thresh; |
1019 | 1060 | long period; |
1020 | - long pause = 0; | |
1021 | - long uninitialized_var(max_pause); | |
1061 | + long pause; | |
1062 | + long max_pause; | |
1063 | + long min_pause; | |
1064 | + int nr_dirtied_pause; | |
1022 | 1065 | bool dirty_exceeded = false; |
1023 | 1066 | unsigned long task_ratelimit; |
1024 | - unsigned long uninitialized_var(dirty_ratelimit); | |
1067 | + unsigned long dirty_ratelimit; | |
1025 | 1068 | unsigned long pos_ratio; |
1026 | 1069 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1027 | 1070 | unsigned long start_time = jiffies; |
... | ... | @@ -1051,6 +1094,8 @@ |
1051 | 1094 | if (nr_dirty <= freerun) { |
1052 | 1095 | current->dirty_paused_when = now; |
1053 | 1096 | current->nr_dirtied = 0; |
1097 | + current->nr_dirtied_pause = | |
1098 | + dirty_poll_interval(nr_dirty, dirty_thresh); | |
1054 | 1099 | break; |
1055 | 1100 | } |
1056 | 1101 | |
1057 | 1102 | |
... | ... | @@ -1101,14 +1146,17 @@ |
1101 | 1146 | nr_dirty, bdi_thresh, bdi_dirty, |
1102 | 1147 | start_time); |
1103 | 1148 | |
1104 | - max_pause = bdi_max_pause(bdi, bdi_dirty); | |
1105 | - | |
1106 | 1149 | dirty_ratelimit = bdi->dirty_ratelimit; |
1107 | 1150 | pos_ratio = bdi_position_ratio(bdi, dirty_thresh, |
1108 | 1151 | background_thresh, nr_dirty, |
1109 | 1152 | bdi_thresh, bdi_dirty); |
1110 | 1153 | task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> |
1111 | 1154 | RATELIMIT_CALC_SHIFT; |
1155 | + max_pause = bdi_max_pause(bdi, bdi_dirty); | |
1156 | + min_pause = bdi_min_pause(bdi, max_pause, | |
1157 | + task_ratelimit, dirty_ratelimit, | |
1158 | + &nr_dirtied_pause); | |
1159 | + | |
1112 | 1160 | if (unlikely(task_ratelimit == 0)) { |
1113 | 1161 | period = max_pause; |
1114 | 1162 | pause = max_pause; |
... | ... | @@ -1125,7 +1173,7 @@ |
1125 | 1173 | * future periods by updating the virtual time; otherwise just |
1126 | 1174 | * do a reset, as it may be a light dirtier. |
1127 | 1175 | */ |
1128 | - if (unlikely(pause <= 0)) { | |
1176 | + if (pause < min_pause) { | |
1129 | 1177 | trace_balance_dirty_pages(bdi, |
1130 | 1178 | dirty_thresh, |
1131 | 1179 | background_thresh, |
... | ... | @@ -1136,7 +1184,7 @@ |
1136 | 1184 | task_ratelimit, |
1137 | 1185 | pages_dirtied, |
1138 | 1186 | period, |
1139 | - pause, | |
1187 | + min(pause, 0L), | |
1140 | 1188 | start_time); |
1141 | 1189 | if (pause < -HZ) { |
1142 | 1190 | current->dirty_paused_when = now; |
1143 | 1191 | |
... | ... | @@ -1144,11 +1192,15 @@ |
1144 | 1192 | } else if (period) { |
1145 | 1193 | current->dirty_paused_when += period; |
1146 | 1194 | current->nr_dirtied = 0; |
1147 | - } | |
1148 | - pause = 1; /* avoid resetting nr_dirtied_pause below */ | |
1195 | + } else if (current->nr_dirtied_pause <= pages_dirtied) | |
1196 | + current->nr_dirtied_pause += pages_dirtied; | |
1149 | 1197 | break; |
1150 | 1198 | } |
1151 | - pause = min(pause, max_pause); | |
1199 | + if (unlikely(pause > max_pause)) { | |
1200 | + /* for occasional dropped task_ratelimit */ | |
1201 | + now += min(pause - max_pause, max_pause); | |
1202 | + pause = max_pause; | |
1203 | + } | |
1152 | 1204 | |
1153 | 1205 | pause: |
1154 | 1206 | trace_balance_dirty_pages(bdi, |
... | ... | @@ -1168,6 +1220,7 @@ |
1168 | 1220 | |
1169 | 1221 | current->dirty_paused_when = now + pause; |
1170 | 1222 | current->nr_dirtied = 0; |
1223 | + current->nr_dirtied_pause = nr_dirtied_pause; | |
1171 | 1224 | |
1172 | 1225 | /* |
1173 | 1226 | * This is typically equal to (nr_dirty < dirty_thresh) and can |
... | ... | @@ -1195,22 +1248,6 @@ |
1195 | 1248 | |
1196 | 1249 | if (!dirty_exceeded && bdi->dirty_exceeded) |
1197 | 1250 | bdi->dirty_exceeded = 0; |
1198 | - | |
1199 | - if (pause == 0) { /* in freerun area */ | |
1200 | - current->nr_dirtied_pause = | |
1201 | - dirty_poll_interval(nr_dirty, dirty_thresh); | |
1202 | - } else if (period <= max_pause / 4 && | |
1203 | - pages_dirtied >= current->nr_dirtied_pause) { | |
1204 | - current->nr_dirtied_pause = clamp_val( | |
1205 | - dirty_ratelimit * (max_pause / 2) / HZ, | |
1206 | - pages_dirtied + pages_dirtied / 8, | |
1207 | - pages_dirtied * 4); | |
1208 | - } else if (pause >= max_pause) { | |
1209 | - current->nr_dirtied_pause = 1 | clamp_val( | |
1210 | - dirty_ratelimit * (max_pause / 2) / HZ, | |
1211 | - pages_dirtied / 4, | |
1212 | - pages_dirtied - pages_dirtied / 8); | |
1213 | - } | |
1214 | 1251 | |
1215 | 1252 | if (writeback_in_progress(bdi)) |
1216 | 1253 | return; |