Commit 001a541ea9163ace5e8243ee0e907ad80a4c0ec2
Exists in master and in 6 other branches
Merge branch 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux
* 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux:
  writeback: move MIN_WRITEBACK_PAGES to fs-writeback.c
  writeback: balanced_rate cannot exceed write bandwidth
  writeback: do strict bdi dirty_exceeded
  writeback: avoid tiny dirty poll intervals
  writeback: max, min and target dirty pause time
  writeback: dirty ratelimit - think time compensation
  btrfs: fix dirtied pages accounting on sub-page writes
  writeback: fix dirtied pages accounting on redirty
  writeback: fix dirtied pages accounting on sub-page writes
  writeback: charge leaked page dirties to active tasks
  writeback: Include all dirty inodes in background writeback
Showing 8 changed files (side-by-side diff)
fs/btrfs/file.c
... | ... | @@ -1136,7 +1136,8 @@ |
1136 | 1136 | GFP_NOFS); |
1137 | 1137 | } |
1138 | 1138 | for (i = 0; i < num_pages; i++) { |
1139 | - clear_page_dirty_for_io(pages[i]); | |
1139 | + if (clear_page_dirty_for_io(pages[i])) | |
1140 | + account_page_redirty(pages[i]); | |
1140 | 1141 | set_page_extent_mapped(pages[i]); |
1141 | 1142 | WARN_ON(!PageLocked(pages[i])); |
1142 | 1143 | } |
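What the hunk fixes: btrfs re-dirties each page right after clear_page_dirty_for_io(), so every sub-page rewrite bumped the dirtied counters again without a matching writeback, and the ratelimit estimation drifted. account_page_redirty() (added in mm/page-writeback.c below) backs the duplicate count out. A minimal user-space model of the invariant being protected; the counters here stand in for the kernel's NR_DIRTIED/NR_WRITTEN and are purely illustrative:

#include <stdio.h>

/* Model: dirtied must track written over time, or balanced_dirty_ratelimit
 * drifts. Re-dirtying without de-accounting inflates the dirtied side. */
static long nr_dirtied, nr_written;

static void dirty_page(void)   { nr_dirtied++; }
static void redirty_page(void) { nr_dirtied--; /* the account_page_redirty() fix */ }
static void write_page(void)   { nr_written++; }

int main(void)
{
	for (int i = 0; i < 100; i++) {
		dirty_page();   /* page dirtied and accounted once */
		dirty_page();   /* sub-page rewrite accounts it a second time */
		redirty_page(); /* de-account the duplicate */
		write_page();   /* one page actually written back */
	}
	printf("dirtied=%ld written=%ld balanced=%s\n",
	       nr_dirtied, nr_written, nr_dirtied == nr_written ? "yes" : "no");
	return 0;
}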
fs/fs-writeback.c
... | ... | @@ -20,6 +20,7 @@ |
20 | 20 | #include <linux/sched.h> |
21 | 21 | #include <linux/fs.h> |
22 | 22 | #include <linux/mm.h> |
23 | +#include <linux/pagemap.h> | |
23 | 24 | #include <linux/kthread.h> |
24 | 25 | #include <linux/freezer.h> |
25 | 26 | #include <linux/writeback.h> |
... | ... | @@ -29,6 +30,11 @@ |
29 | 30 | #include "internal.h" |
30 | 31 | |
31 | 32 | /* |
33 | + * 4MB minimal write chunk size | |
34 | + */ | |
35 | +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) | |
36 | + | |
37 | +/* | |
32 | 38 | * Passed into wb_writeback(), essentially a subset of writeback_control |
33 | 39 | */ |
34 | 40 | struct wb_writeback_work { |
35 | 41 | |
... | ... | @@ -742,11 +748,17 @@ |
742 | 748 | if (work->for_background && !over_bground_thresh(wb->bdi)) |
743 | 749 | break; |
744 | 750 | |
751 | + /* | |
752 | + * Kupdate and background writeback are special: we want to | |
753 | + * include all inodes that need writing. Livelock avoidance is | |
754 | + * handled by these works yielding to any other work, so we | |
755 | + * are safe. | |
756 | + */ | |
745 | 757 | if (work->for_kupdate) { |
746 | 758 | oldest_jif = jiffies - |
747 | 759 | msecs_to_jiffies(dirty_expire_interval * 10); |
748 | - work->older_than_this = &oldest_jif; | |
749 | - } | |
760 | + } else if (work->for_background) | |
761 | + oldest_jif = jiffies; | |
750 | 762 | |
751 | 763 | trace_writeback_start(wb->bdi, work); |
752 | 764 | if (list_empty(&wb->b_io)) |
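Two changes land in this file. MIN_WRITEBACK_PAGES moves here from writeback.h, and its expression converts 4MB (4096, in KB) into pages by shifting out the page size in KB. The older_than_this change makes background writeback use the current jiffies as its cutoff, so every inode dirtied before this moment is eligible, while kupdate keeps its dirty_expire_interval window. A stand-alone check of the chunk-size arithmetic; the PAGE_CACHE_SHIFT values are assumed here for illustration:

#include <stdio.h>

int main(void)
{
	/* 4096 is 4MB in KB; shifting by (PAGE_CACHE_SHIFT - 10) divides by
	 * the page size in KB, yielding the chunk size in pages. */
	for (int page_cache_shift = 12; page_cache_shift <= 16; page_cache_shift += 4)
		printf("page size %lu KB -> MIN_WRITEBACK_PAGES = %lu pages\n",
		       1UL << (page_cache_shift - 10),
		       4096UL >> (page_cache_shift - 10));
	return 0;
}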
include/linux/sched.h
include/linux/writeback.h
... | ... | @@ -7,6 +7,8 @@ |
7 | 7 | #include <linux/sched.h> |
8 | 8 | #include <linux/fs.h> |
9 | 9 | |
10 | +DECLARE_PER_CPU(int, dirty_throttle_leaks); | |
11 | + | |
10 | 12 | /* |
11 | 13 | * The 1/4 region under the global dirty thresh is for smooth dirty throttling: |
12 | 14 | * |
... | ... | @@ -23,11 +25,6 @@ |
23 | 25 | #define DIRTY_SCOPE 8 |
24 | 26 | #define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) |
25 | 27 | |
26 | -/* | |
27 | - * 4MB minimal write chunk size | |
28 | - */ | |
29 | -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) | |
30 | - | |
31 | 28 | struct backing_dev_info; |
32 | 29 | |
33 | 30 | /* |
... | ... | @@ -193,6 +190,8 @@ |
193 | 190 | void writeback_set_ratelimit(void); |
194 | 191 | void tag_pages_for_writeback(struct address_space *mapping, |
195 | 192 | pgoff_t start, pgoff_t end); |
193 | + | |
194 | +void account_page_redirty(struct page *page); | |
196 | 195 | |
197 | 196 | /* pdflush.c */ |
198 | 197 | extern int nr_pdflush_threads; /* Global so it can be exported to sysctl |
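The declaration pairs with the definition added in mm/page-writeback.c and the update added in kernel/exit.c, following the usual kernel per-CPU idiom: the header declares, exactly one translation unit defines, and writers update with preemption disabled. A sketch assembling the three sites from the hunks in this commit (not a standalone program):

/* include/linux/writeback.h: visible to all users */
DECLARE_PER_CPU(int, dirty_throttle_leaks);

/* mm/page-writeback.c: the single definition */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;

/* kernel/exit.c: record unthrottled dirties; this runs after
 * preempt_disable(), so __this_cpu_add() cannot race with migration */
if (tsk->nr_dirtied)
	__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);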
include/trace/events/writeback.h
... | ... | @@ -300,12 +300,13 @@ |
300 | 300 | unsigned long dirty_ratelimit, |
301 | 301 | unsigned long task_ratelimit, |
302 | 302 | unsigned long dirtied, |
303 | + unsigned long period, | |
303 | 304 | long pause, |
304 | 305 | unsigned long start_time), |
305 | 306 | |
306 | 307 | TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, |
307 | 308 | dirty_ratelimit, task_ratelimit, |
308 | - dirtied, pause, start_time), | |
309 | + dirtied, period, pause, start_time), | |
309 | 310 | |
310 | 311 | TP_STRUCT__entry( |
311 | 312 | __array( char, bdi, 32) |
... | ... | @@ -320,6 +321,8 @@ |
320 | 321 | __field(unsigned int, dirtied_pause) |
321 | 322 | __field(unsigned long, paused) |
322 | 323 | __field( long, pause) |
324 | + __field(unsigned long, period) | |
325 | + __field( long, think) | |
323 | 326 | ), |
324 | 327 | |
325 | 328 | TP_fast_assign( |
... | ... | @@ -336,6 +339,9 @@ |
336 | 339 | __entry->task_ratelimit = KBps(task_ratelimit); |
337 | 340 | __entry->dirtied = dirtied; |
338 | 341 | __entry->dirtied_pause = current->nr_dirtied_pause; |
342 | + __entry->think = current->dirty_paused_when == 0 ? 0 : | |
343 | + (long)(jiffies - current->dirty_paused_when) * 1000/HZ; | |
344 | + __entry->period = period * 1000 / HZ; | |
339 | 345 | __entry->pause = pause * 1000 / HZ; |
340 | 346 | __entry->paused = (jiffies - start_time) * 1000 / HZ; |
341 | 347 | ), |
... | ... | @@ -346,7 +352,7 @@ |
346 | 352 | "bdi_setpoint=%lu bdi_dirty=%lu " |
347 | 353 | "dirty_ratelimit=%lu task_ratelimit=%lu " |
348 | 354 | "dirtied=%u dirtied_pause=%u " |
349 | - "paused=%lu pause=%ld", | |
355 | + "paused=%lu pause=%ld period=%lu think=%ld", | |
350 | 356 | __entry->bdi, |
351 | 357 | __entry->limit, |
352 | 358 | __entry->setpoint, |
... | ... | @@ -358,7 +364,9 @@ |
358 | 364 | __entry->dirtied, |
359 | 365 | __entry->dirtied_pause, |
360 | 366 | __entry->paused, /* ms */ |
361 | - __entry->pause /* ms */ | |
367 | + __entry->pause, /* ms */ | |
368 | + __entry->period, /* ms */ | |
369 | + __entry->think /* ms */ | |
362 | 370 | ) |
363 | 371 | ); |
364 | 372 |
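The new period and think fields, like pause and paused, are converted from jiffies to milliseconds in TP_fast_assign(). A stand-alone model of the conversions; HZ and all sample values are assumed for illustration:

#include <stdio.h>

#define HZ 250 /* assumed; configuration dependent in the kernel */

int main(void)
{
	unsigned long jiffies = 100000;          /* illustrative current time */
	unsigned long dirty_paused_when = 99950; /* end of the task's last pause */
	unsigned long period = 25, pause = 10;   /* jiffies */

	long think = dirty_paused_when == 0 ? 0 :
		(long)(jiffies - dirty_paused_when) * 1000 / HZ;

	printf("period=%lums pause=%lums think=%ldms\n",
	       period * 1000 / HZ, pause * 1000 / HZ, think);
	return 0;
}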
kernel/exit.c
... | ... | @@ -51,6 +51,7 @@ |
51 | 51 | #include <trace/events/sched.h> |
52 | 52 | #include <linux/hw_breakpoint.h> |
53 | 53 | #include <linux/oom.h> |
54 | +#include <linux/writeback.h> | |
54 | 55 | |
55 | 56 | #include <asm/uaccess.h> |
56 | 57 | #include <asm/unistd.h> |
... | ... | @@ -1035,6 +1036,8 @@ |
1035 | 1036 | validate_creds_for_do_exit(tsk); |
1036 | 1037 | |
1037 | 1038 | preempt_disable(); |
1039 | + if (tsk->nr_dirtied) | |
1040 | + __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); | |
1038 | 1041 | exit_rcu(); |
1039 | 1042 | /* causes final put_task_struct in finish_task_switch(). */ |
1040 | 1043 | tsk->state = TASK_DEAD; |
kernel/fork.c
mm/page-writeback.c
... | ... | @@ -42,6 +42,12 @@ |
42 | 42 | #define MAX_PAUSE max(HZ/5, 1) |
43 | 43 | |
44 | 44 | /* |
45 | + * Try to keep balance_dirty_pages() call intervals above this many dirtied | |
46 | + * pages, by raising the pause time to max_pause when the interval falls below it. | |
47 | + */ | |
48 | +#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) | |
49 | + | |
50 | +/* | |
45 | 51 | * Estimate write bandwidth at 200ms intervals. |
46 | 52 | */ |
47 | 53 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) |
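DIRTY_POLL_THRESH is 128KB expressed in pages, by the same KB-shift trick as MIN_WRITEBACK_PAGES: 32 pages with 4KB pages, which is the figure the bdi_min_pause() comment below cites. A quick stand-alone check, with PAGE_SHIFT values assumed:

#include <stdio.h>

int main(void)
{
	for (int page_shift = 12; page_shift <= 16; page_shift += 4)
		printf("page size %d KB -> DIRTY_POLL_THRESH = %d pages (128 KB)\n",
		       1 << (page_shift - 10), 128 >> (page_shift - 10));
	return 0;
}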
... | ... | @@ -898,6 +904,11 @@ |
898 | 904 | */ |
899 | 905 | balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, |
900 | 906 | dirty_rate | 1); |
907 | + /* | |
908 | + * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw | |
909 | + */ | |
910 | + if (unlikely(balanced_dirty_ratelimit > write_bw)) | |
911 | + balanced_dirty_ratelimit = write_bw; | |
901 | 912 | |
902 | 913 | /* |
903 | 914 | * We could safely do this and return immediately: |
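The rationale for the clamp: with N tasks dirtying behind one device, dirty_rate is roughly N * task_ratelimit, so balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate comes out near write_bw / N and can only exceed write_bw through transient estimation error. A stand-alone model with purely illustrative numbers:

#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
	unsigned long write_bw = 100000;      /* device bandwidth (assumed units) */
	unsigned long task_ratelimit = 30000; /* per-task dirty rate */
	unsigned long dirty_rate = 20000;     /* transiently underestimated */

	u64 balanced = (u64)task_ratelimit * write_bw / (dirty_rate | 1);
	printf("raw balanced_dirty_ratelimit = %llu\n", balanced);
	if (balanced > write_bw) /* the clamp added by this hunk */
		balanced = write_bw;
	printf("clamped to write_bw         = %llu\n", balanced);
	return 0;
}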
... | ... | @@ -1044,40 +1055,98 @@ |
1044 | 1055 | return 1; |
1045 | 1056 | } |
1046 | 1057 | |
1047 | -static unsigned long bdi_max_pause(struct backing_dev_info *bdi, | |
1048 | - unsigned long bdi_dirty) | |
1058 | +static long bdi_max_pause(struct backing_dev_info *bdi, | |
1059 | + unsigned long bdi_dirty) | |
1049 | 1060 | { |
1050 | - unsigned long bw = bdi->avg_write_bandwidth; | |
1051 | - unsigned long hi = ilog2(bw); | |
1052 | - unsigned long lo = ilog2(bdi->dirty_ratelimit); | |
1053 | - unsigned long t; | |
1061 | + long bw = bdi->avg_write_bandwidth; | |
1062 | + long t; | |
1054 | 1063 | |
1055 | - /* target for 20ms max pause on 1-dd case */ | |
1056 | - t = HZ / 50; | |
1064 | + /* | |
1065 | + * Limit the pause time for small-memory systems: if we sleep for too | |
1066 | + * long, the small pool of dirty/writeback pages may drain completely | |
1067 | + * and leave the disk idle. | |
1068 | + * | |
1069 | + * 8 serves as the safety ratio. | |
1070 | + */ | |
1071 | + t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); | |
1072 | + t++; | |
1057 | 1073 | |
1074 | + return min_t(long, t, MAX_PAUSE); | |
1075 | +} | |
1076 | + | |
1077 | +static long bdi_min_pause(struct backing_dev_info *bdi, | |
1078 | + long max_pause, | |
1079 | + unsigned long task_ratelimit, | |
1080 | + unsigned long dirty_ratelimit, | |
1081 | + int *nr_dirtied_pause) | |
1082 | +{ | |
1083 | + long hi = ilog2(bdi->avg_write_bandwidth); | |
1084 | + long lo = ilog2(bdi->dirty_ratelimit); | |
1085 | + long t; /* target pause */ | |
1086 | + long pause; /* estimated next pause */ | |
1087 | + int pages; /* target nr_dirtied_pause */ | |
1088 | + | |
1089 | + /* target for 10ms pause on 1-dd case */ | |
1090 | + t = max(1, HZ / 100); | |
1091 | + | |
1058 | 1092 | /* |
1059 | 1093 | * Scale up pause time for concurrent dirtiers in order to reduce CPU |
1060 | 1094 | * overheads. |
1061 | 1095 | * |
1062 | - * (N * 20ms) on 2^N concurrent tasks. | |
1096 | + * (N * 10ms) on 2^N concurrent tasks. | |
1063 | 1097 | */ |
1064 | 1098 | if (hi > lo) |
1065 | - t += (hi - lo) * (20 * HZ) / 1024; | |
1099 | + t += (hi - lo) * (10 * HZ) / 1024; | |
1066 | 1100 | |
1067 | 1101 | /* |
1068 | - * Limit pause time for small memory systems. If sleeping for too long | |
1069 | - * time, a small pool of dirty/writeback pages may go empty and disk go | |
1070 | - * idle. | |
1102 | + * This is a bit convoluted. We try to base the next nr_dirtied_pause | |
1103 | + * on the much more stable dirty_ratelimit. However the next pause time | |
1104 | + * will be computed based on task_ratelimit and the two rate limits may | |
1105 | + * diverge considerably at times. Especially if task_ratelimit goes | |
1106 | + * below dirty_ratelimit/2 and the target pause is max_pause, the next | |
1107 | + * pause time will be max_pause*2 _trimmed down_ to max_pause. As a | |
1108 | + * result task_ratelimit won't be executed faithfully, which could | |
1109 | + * eventually bring down dirty_ratelimit. | |
1071 | 1110 | * |
1072 | - * 8 serves as the safety ratio. | |
1111 | + * We apply two rules to fix it up: | |
1112 | + * 1) try to estimate the next pause time and if necessary, use a lower | |
1113 | + * nr_dirtied_pause so as not to exceed max_pause. When this happens, | |
1114 | + * nr_dirtied_pause will be "dancing" with task_ratelimit. | |
1115 | + * 2) limit the target pause time to max_pause/2, so that the normal | |
1116 | + * small fluctuations of task_ratelimit won't trigger rule (1) and | |
1117 | + * nr_dirtied_pause will remain as stable as dirty_ratelimit. | |
1073 | 1118 | */ |
1074 | - t = min(t, bdi_dirty * HZ / (8 * bw + 1)); | |
1119 | + t = min(t, 1 + max_pause / 2); | |
1120 | + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); | |
1075 | 1121 | |
1076 | 1122 | /* |
1077 | - * The pause time will be settled within range (max_pause/4, max_pause). | |
1078 | - * Apply a minimal value of 4 to get a non-zero max_pause/4. | |
1123 | + * Tiny nr_dirtied_pause is found to hurt I/O performance in the test | |
1124 | + * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. | |
1125 | + * When the 16 consecutive reads are often interrupted by some dirty | |
1126 | + * throttling pause during the async writes, cfq will go idle | |
1127 | + * (deadline is fine). So push nr_dirtied_pause as high as possible, | |
1128 | + * until it reaches DIRTY_POLL_THRESH=32 pages. | |
1079 | 1129 | */ |
1080 | - return clamp_val(t, 4, MAX_PAUSE); | |
1130 | + if (pages < DIRTY_POLL_THRESH) { | |
1131 | + t = max_pause; | |
1132 | + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); | |
1133 | + if (pages > DIRTY_POLL_THRESH) { | |
1134 | + pages = DIRTY_POLL_THRESH; | |
1135 | + t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; | |
1136 | + } | |
1137 | + } | |
1138 | + | |
1139 | + pause = HZ * pages / (task_ratelimit + 1); | |
1140 | + if (pause > max_pause) { | |
1141 | + t = max_pause; | |
1142 | + pages = task_ratelimit * t / roundup_pow_of_two(HZ); | |
1143 | + } | |
1144 | + | |
1145 | + *nr_dirtied_pause = pages; | |
1146 | + /* | |
1147 | + * The minimal pause time will normally be half the target pause time. | |
1148 | + */ | |
1149 | + return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; | |
1081 | 1150 | } |
1082 | 1151 | |
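A way to read the two helpers: bdi_max_pause() keeps the sleep short enough that the bdi's dirty pool (bdi_dirty pages draining at bw pages per second) cannot empty while we sleep, with 8 as the safety ratio; bdi_min_pause() targets roughly 10ms pauses, scaled up logarithmically with the number of concurrent dirtiers, then applies the DIRTY_POLL_THRESH floor and the max_pause cap described in its comment. A stand-alone model of both computations; HZ, the helper reimplementations, the argument lists and every number are assumptions for illustration:

#include <stdio.h>

#define HZ 1000 /* assumed */
#define MAX_PAUSE (HZ / 5 > 1 ? HZ / 5 : 1)
#define DIRTY_POLL_THRESH 32 /* 128KB in 4KB pages */

static long ilog2l(long v) { long r = -1; while (v) { r++; v >>= 1; } return r; }
static long rup_pow2(long v) { long r = 1; while (r < v) r <<= 1; return r; }

static long max_pause_fn(long bw, unsigned long bdi_dirty)
{
	long t = bdi_dirty / (1 + bw / rup_pow2(1 + HZ / 8)) + 1;
	return t < MAX_PAUSE ? t : MAX_PAUSE;
}

static long min_pause_fn(long avg_bw, long dirty_ratelimit, long max_pause,
			 long task_ratelimit, int *nr_dirtied_pause)
{
	long hi = ilog2l(avg_bw), lo = ilog2l(dirty_ratelimit);
	long t = HZ / 100 > 1 ? HZ / 100 : 1; /* ~10ms target */
	long pause;
	int pages;

	if (hi > lo)                          /* N*10ms for 2^N dirtiers */
		t += (hi - lo) * (10 * HZ) / 1024;
	if (t > 1 + max_pause / 2)            /* rule (2): stay below max_pause/2 */
		t = 1 + max_pause / 2;
	pages = dirty_ratelimit * t / rup_pow2(HZ);
	if (pages < DIRTY_POLL_THRESH) {      /* avoid tiny poll intervals */
		t = max_pause;
		pages = dirty_ratelimit * t / rup_pow2(HZ);
		if (pages > DIRTY_POLL_THRESH) {
			pages = DIRTY_POLL_THRESH;
			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
		}
	}
	pause = HZ * pages / (task_ratelimit + 1);
	if (pause > max_pause) {              /* rule (1): cap the estimated pause */
		t = max_pause;
		pages = task_ratelimit * t / rup_pow2(HZ);
	}
	*nr_dirtied_pause = pages;
	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}

int main(void)
{
	int nr;
	long max_p = max_pause_fn(25000, 2000); /* 25k pages/s, 2k dirty pages */
	long min_p = min_pause_fn(25000, 12000, max_p, 12000, &nr);
	printf("max_pause=%ld min_pause=%ld nr_dirtied_pause=%d\n",
	       max_p, min_p, nr);
	return 0;
}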
... | ... | @@ -1098,16 +1167,21 @@ |
1098 | 1167 | unsigned long background_thresh; |
1099 | 1168 | unsigned long dirty_thresh; |
1100 | 1169 | unsigned long bdi_thresh; |
1101 | - long pause = 0; | |
1102 | - long uninitialized_var(max_pause); | |
1170 | + long period; | |
1171 | + long pause; | |
1172 | + long max_pause; | |
1173 | + long min_pause; | |
1174 | + int nr_dirtied_pause; | |
1103 | 1175 | bool dirty_exceeded = false; |
1104 | 1176 | unsigned long task_ratelimit; |
1105 | - unsigned long uninitialized_var(dirty_ratelimit); | |
1177 | + unsigned long dirty_ratelimit; | |
1106 | 1178 | unsigned long pos_ratio; |
1107 | 1179 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1108 | 1180 | unsigned long start_time = jiffies; |
1109 | 1181 | |
1110 | 1182 | for (;;) { |
1183 | + unsigned long now = jiffies; | |
1184 | + | |
1111 | 1185 | /* |
1112 | 1186 | * Unstable writes are a feature of certain networked |
1113 | 1187 | * filesystems (i.e. NFS) in which data may have been |
1114 | 1188 | |
... | ... | @@ -1127,8 +1201,13 @@ |
1127 | 1201 | */ |
1128 | 1202 | freerun = dirty_freerun_ceiling(dirty_thresh, |
1129 | 1203 | background_thresh); |
1130 | - if (nr_dirty <= freerun) | |
1204 | + if (nr_dirty <= freerun) { | |
1205 | + current->dirty_paused_when = now; | |
1206 | + current->nr_dirtied = 0; | |
1207 | + current->nr_dirtied_pause = | |
1208 | + dirty_poll_interval(nr_dirty, dirty_thresh); | |
1131 | 1209 | break; |
1210 | + } | |
1132 | 1211 | |
1133 | 1212 | if (unlikely(!writeback_in_progress(bdi))) |
1134 | 1213 | bdi_start_background_writeback(bdi); |
... | ... | @@ -1168,7 +1247,7 @@ |
1168 | 1247 | bdi_stat(bdi, BDI_WRITEBACK); |
1169 | 1248 | } |
1170 | 1249 | |
1171 | - dirty_exceeded = (bdi_dirty > bdi_thresh) || | |
1250 | + dirty_exceeded = (bdi_dirty > bdi_thresh) && | |
1172 | 1251 | (nr_dirty > dirty_thresh); |
1173 | 1252 | if (dirty_exceeded && !bdi->dirty_exceeded) |
1174 | 1253 | bdi->dirty_exceeded = 1; |
1175 | 1254 | |
1176 | 1255 | |
1177 | 1256 | |
... | ... | @@ -1177,20 +1256,34 @@ |
1177 | 1256 | nr_dirty, bdi_thresh, bdi_dirty, |
1178 | 1257 | start_time); |
1179 | 1258 | |
1180 | - max_pause = bdi_max_pause(bdi, bdi_dirty); | |
1181 | - | |
1182 | 1259 | dirty_ratelimit = bdi->dirty_ratelimit; |
1183 | 1260 | pos_ratio = bdi_position_ratio(bdi, dirty_thresh, |
1184 | 1261 | background_thresh, nr_dirty, |
1185 | 1262 | bdi_thresh, bdi_dirty); |
1186 | 1263 | task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> |
1187 | 1264 | RATELIMIT_CALC_SHIFT; |
1265 | + max_pause = bdi_max_pause(bdi, bdi_dirty); | |
1266 | + min_pause = bdi_min_pause(bdi, max_pause, | |
1267 | + task_ratelimit, dirty_ratelimit, | |
1268 | + &nr_dirtied_pause); | |
1269 | + | |
1188 | 1270 | if (unlikely(task_ratelimit == 0)) { |
1271 | + period = max_pause; | |
1189 | 1272 | pause = max_pause; |
1190 | 1273 | goto pause; |
1191 | 1274 | } |
1192 | - pause = HZ * pages_dirtied / task_ratelimit; | |
1193 | - if (unlikely(pause <= 0)) { | |
1275 | + period = HZ * pages_dirtied / task_ratelimit; | |
1276 | + pause = period; | |
1277 | + if (current->dirty_paused_when) | |
1278 | + pause -= now - current->dirty_paused_when; | |
1279 | + /* | |
1280 | + * For less than 1s think time (ext3/4 may block the dirtier | |
1281 | + * for up to 800ms from time to time on 1-HDD; so does xfs, | |
1282 | + * however at a much lower frequency), try to compensate for it in | |
1283 | + * future periods by updating the virtual time; otherwise just | |
1284 | + * do a reset, as it may be a light dirtier. | |
1285 | + */ | |
1286 | + if (pause < min_pause) { | |
1194 | 1287 | trace_balance_dirty_pages(bdi, |
1195 | 1288 | dirty_thresh, |
1196 | 1289 | background_thresh, |
1197 | 1290 | |
1198 | 1291 | |
... | ... | @@ -1200,12 +1293,24 @@ |
1200 | 1293 | dirty_ratelimit, |
1201 | 1294 | task_ratelimit, |
1202 | 1295 | pages_dirtied, |
1203 | - pause, | |
1296 | + period, | |
1297 | + min(pause, 0L), | |
1204 | 1298 | start_time); |
1205 | - pause = 1; /* avoid resetting nr_dirtied_pause below */ | |
1299 | + if (pause < -HZ) { | |
1300 | + current->dirty_paused_when = now; | |
1301 | + current->nr_dirtied = 0; | |
1302 | + } else if (period) { | |
1303 | + current->dirty_paused_when += period; | |
1304 | + current->nr_dirtied = 0; | |
1305 | + } else if (current->nr_dirtied_pause <= pages_dirtied) | |
1306 | + current->nr_dirtied_pause += pages_dirtied; | |
1206 | 1307 | break; |
1207 | 1308 | } |
1208 | - pause = min(pause, max_pause); | |
1309 | + if (unlikely(pause > max_pause)) { | |
1310 | + /* for occasional dropped task_ratelimit */ | |
1311 | + now += min(pause - max_pause, max_pause); | |
1312 | + pause = max_pause; | |
1313 | + } | |
1209 | 1314 | |
1210 | 1315 | pause: |
1211 | 1316 | trace_balance_dirty_pages(bdi, |
1212 | 1317 | |
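The think-time compensation in numbers: the task owes period = HZ * pages_dirtied / task_ratelimit jiffies of sleep, but whatever time it already spent blocked since dirty_paused_when (its think time) is credited against that debt; when the credit covers the whole period, the virtual clock is advanced instead of sleeping. A stand-alone model; HZ and all values are assumed:

#include <stdio.h>

#define HZ 1000 /* assumed */

int main(void)
{
	unsigned long now = 50000;               /* jiffies, illustrative */
	unsigned long dirty_paused_when = 49900; /* end of the previous pause */
	unsigned long pages_dirtied = 240;
	unsigned long task_ratelimit = 4000;     /* pages per second */
	long min_pause = 5, period, pause;

	period = HZ * pages_dirtied / task_ratelimit; /* 60 jiffies owed */
	pause = period;
	if (dirty_paused_when)
		pause -= now - dirty_paused_when;     /* credit 100 jiffies of think time */

	if (pause < min_pause) {
		/* think time covered the period: charge the surplus to future
		 * periods by moving the virtual time forward, do not sleep */
		dirty_paused_when += period;
		printf("no sleep, %ld jiffies of credit carried forward\n", -pause);
	} else {
		dirty_paused_when = now + pause;
		printf("sleep %ld jiffies\n", pause);
	}
	return 0;
}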
... | ... | @@ -1217,11 +1322,16 @@ |
1217 | 1322 | dirty_ratelimit, |
1218 | 1323 | task_ratelimit, |
1219 | 1324 | pages_dirtied, |
1325 | + period, | |
1220 | 1326 | pause, |
1221 | 1327 | start_time); |
1222 | 1328 | __set_current_state(TASK_KILLABLE); |
1223 | 1329 | io_schedule_timeout(pause); |
1224 | 1330 | |
1331 | + current->dirty_paused_when = now + pause; | |
1332 | + current->nr_dirtied = 0; | |
1333 | + current->nr_dirtied_pause = nr_dirtied_pause; | |
1334 | + | |
1225 | 1335 | /* |
1226 | 1336 | * This is typically equal to (nr_dirty < dirty_thresh) and can |
1227 | 1337 | * also keep "1000+ dd on a slow USB stick" under control. |
... | ... | @@ -1249,23 +1359,6 @@ |
1249 | 1359 | if (!dirty_exceeded && bdi->dirty_exceeded) |
1250 | 1360 | bdi->dirty_exceeded = 0; |
1251 | 1361 | |
1252 | - current->nr_dirtied = 0; | |
1253 | - if (pause == 0) { /* in freerun area */ | |
1254 | - current->nr_dirtied_pause = | |
1255 | - dirty_poll_interval(nr_dirty, dirty_thresh); | |
1256 | - } else if (pause <= max_pause / 4 && | |
1257 | - pages_dirtied >= current->nr_dirtied_pause) { | |
1258 | - current->nr_dirtied_pause = clamp_val( | |
1259 | - dirty_ratelimit * (max_pause / 2) / HZ, | |
1260 | - pages_dirtied + pages_dirtied / 8, | |
1261 | - pages_dirtied * 4); | |
1262 | - } else if (pause >= max_pause) { | |
1263 | - current->nr_dirtied_pause = 1 | clamp_val( | |
1264 | - dirty_ratelimit * (max_pause / 2) / HZ, | |
1265 | - pages_dirtied / 4, | |
1266 | - pages_dirtied - pages_dirtied / 8); | |
1267 | - } | |
1268 | - | |
1269 | 1362 | if (writeback_in_progress(bdi)) |
1270 | 1363 | return; |
1271 | 1364 | |
... | ... | @@ -1296,6 +1389,22 @@ |
1296 | 1389 | |
1297 | 1390 | static DEFINE_PER_CPU(int, bdp_ratelimits); |
1298 | 1391 | |
1392 | +/* | |
1393 | + * Normal tasks are throttled by | |
1394 | + * loop { | |
1395 | + * dirty tsk->nr_dirtied_pause pages; | |
1396 | + * take a snap in balance_dirty_pages(); | |
1397 | + * } | |
1398 | + * However there is a worst case: if every task exits immediately after dirtying | |
1399 | + * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be | |
1400 | + * called to throttle the page dirties. The solution is to save the not yet | |
1401 | + * throttled page dirties in dirty_throttle_leaks on task exit and charge them | |
1402 | + * randomly into the running tasks. This works well for the above worst case, | |
1403 | + * as the new task will pick up and accumulate the old task's leaked dirty | |
1404 | + * count and eventually get throttled. | |
1405 | + */ | |
1406 | +DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | |
1407 | + | |
1299 | 1408 | /** |
1300 | 1409 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
1301 | 1410 | * @mapping: address_space which was dirtied |
... | ... | @@ -1324,8 +1433,6 @@ |
1324 | 1433 | if (bdi->dirty_exceeded) |
1325 | 1434 | ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); |
1326 | 1435 | |
1327 | - current->nr_dirtied += nr_pages_dirtied; | |
1328 | - | |
1329 | 1436 | preempt_disable(); |
1330 | 1437 | /* |
1331 | 1438 | * This prevents one CPU to accumulate too many dirtied pages without |
1332 | 1439 | |
... | ... | @@ -1336,13 +1443,21 @@ |
1336 | 1443 | p = &__get_cpu_var(bdp_ratelimits); |
1337 | 1444 | if (unlikely(current->nr_dirtied >= ratelimit)) |
1338 | 1445 | *p = 0; |
1339 | - else { | |
1340 | - *p += nr_pages_dirtied; | |
1341 | - if (unlikely(*p >= ratelimit_pages)) { | |
1342 | - *p = 0; | |
1343 | - ratelimit = 0; | |
1344 | - } | |
1446 | + else if (unlikely(*p >= ratelimit_pages)) { | |
1447 | + *p = 0; | |
1448 | + ratelimit = 0; | |
1345 | 1449 | } |
1450 | + /* | |
1451 | + * Pick up the dirtied pages left behind by exited tasks. This prevents | |
1452 | + * lots of short-lived tasks (e.g. gcc invocations in a kernel build) | |
1453 | + * from escaping dirty throttling and livelocking long-running dirtiers. | |
1454 | + */ | |
1455 | + p = &__get_cpu_var(dirty_throttle_leaks); | |
1456 | + if (*p > 0 && current->nr_dirtied < ratelimit) { | |
1457 | + nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); | |
1458 | + *p -= nr_pages_dirtied; | |
1459 | + current->nr_dirtied += nr_pages_dirtied; | |
1460 | + } | |
1346 | 1461 | preempt_enable(); |
1347 | 1462 | |
1348 | 1463 | if (unlikely(current->nr_dirtied >= ratelimit)) |
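Taken together with the kernel/exit.c hunk, the leak-charging scheme is: exiting tasks deposit their unthrottled nr_dirtied into the per-CPU pool, and whichever dirtier runs next drains the pool into its own nr_dirtied, bounded by its ratelimit, so the leaked pages still trigger balance_dirty_pages() eventually. A single-threaded model of the handoff; the per-CPU aspect is elided and the numbers are illustrative:

#include <stdio.h>

static int dirty_throttle_leaks; /* per-CPU in the kernel; a single int here */

/* an exiting task leaks its not-yet-throttled dirties */
static void task_exit(int nr_dirtied)
{
	dirty_throttle_leaks += nr_dirtied;
}

/* a running dirtier picks leaked dirties up, bounded by its ratelimit */
static int pick_up_leaks(int nr_dirtied, int ratelimit)
{
	if (dirty_throttle_leaks > 0 && nr_dirtied < ratelimit) {
		int take = dirty_throttle_leaks < ratelimit - nr_dirtied ?
			   dirty_throttle_leaks : ratelimit - nr_dirtied;
		dirty_throttle_leaks -= take;
		nr_dirtied += take;
	}
	return nr_dirtied;
}

int main(void)
{
	int ratelimit = 32, nr_dirtied = 0;

	/* many short-lived tasks, each exiting just below the threshold */
	for (int i = 0; i < 10; i++)
		task_exit(ratelimit - 1);

	/* a long-running dirtier accumulates the leaked counts ... */
	while (dirty_throttle_leaks > 0) {
		nr_dirtied = pick_up_leaks(nr_dirtied, ratelimit);
		if (nr_dirtied >= ratelimit) { /* ... and gets throttled for them */
			printf("balance_dirty_pages() runs, nr_dirtied=%d\n",
			       nr_dirtied);
			nr_dirtied = 0;
		}
	}
	return 0;
}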
... | ... | @@ -1823,6 +1938,8 @@ |
1823 | 1938 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); |
1824 | 1939 | __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); |
1825 | 1940 | task_io_account_write(PAGE_CACHE_SIZE); |
1941 | + current->nr_dirtied++; | |
1942 | + this_cpu_inc(bdp_ratelimits); | |
1826 | 1943 | } |
1827 | 1944 | } |
1828 | 1945 | EXPORT_SYMBOL(account_page_dirtied); |
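Moving the per-task and per-CPU counter updates into account_page_dirtied() is what fixes sub-page writes: the counters now move only on a page's clean-to-dirty transition, instead of once per balance_dirty_pages_ratelimited_nr() call, so several small writes to the same page count as one dirtied page. A stand-alone model of the before/after difference (illustrative, not kernel code):

#include <stdio.h>
#include <stdbool.h>

static int nr_dirtied_old, nr_dirtied_new;

/* old scheme: the ratelimit path counted every call */
static void write_chunk_old(bool *page_dirty)
{
	*page_dirty = true;
	nr_dirtied_old++; /* counted per write, even to an already-dirty page */
}

/* new scheme: count only the clean->dirty transition */
static void write_chunk_new(bool *page_dirty)
{
	if (!*page_dirty) {
		*page_dirty = true;
		nr_dirtied_new++; /* the account_page_dirtied() path */
	}
}

int main(void)
{
	bool d_old = false, d_new = false;

	for (int i = 0; i < 8; i++) { /* eight 512-byte writes to one 4KB page */
		write_chunk_old(&d_old);
		write_chunk_new(&d_new);
	}
	printf("old accounting: %d, new accounting: %d dirtied page(s)\n",
	       nr_dirtied_old, nr_dirtied_new);
	return 0;
}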
... | ... | @@ -1883,6 +2000,24 @@ |
1883 | 2000 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); |
1884 | 2001 | |
1885 | 2002 | /* |
2003 | + * Call this whenever redirtying a page, to de-account the dirty counters | |
2004 | + * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written | |
2005 | + * counters (NR_WRITTEN, BDI_WRITTEN) in the long term. Any mismatch will lead to | |
2006 | + * systematic errors in balanced_dirty_ratelimit and the dirty-pages position | |
2007 | + * control. | |
2008 | + */ | |
2009 | +void account_page_redirty(struct page *page) | |
2010 | +{ | |
2011 | + struct address_space *mapping = page->mapping; | |
2012 | + if (mapping && mapping_cap_account_dirty(mapping)) { | |
2013 | + current->nr_dirtied--; | |
2014 | + dec_zone_page_state(page, NR_DIRTIED); | |
2015 | + dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); | |
2016 | + } | |
2017 | +} | |
2018 | +EXPORT_SYMBOL(account_page_redirty); | |
2019 | + | |
2020 | +/* | |
1886 | 2021 | * When a writepage implementation decides that it doesn't want to write this |
1887 | 2022 | * page for some reason, it should redirty the locked page via |
1888 | 2023 | * redirty_page_for_writepage() and it should then unlock the page and return 0 |
... | ... | @@ -1890,6 +2025,7 @@ |
1890 | 2025 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) |
1891 | 2026 | { |
1892 | 2027 | wbc->pages_skipped++; |
2028 | + account_page_redirty(page); | |
1893 | 2029 | return __set_page_dirty_nobuffers(page); |
1894 | 2030 | } |
1895 | 2031 | EXPORT_SYMBOL(redirty_page_for_writepage); |