Commit 001a541ea9163ace5e8243ee0e907ad80a4c0ec2

Authored by Linus Torvalds

Merge branch 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux

* 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux:
  writeback: move MIN_WRITEBACK_PAGES to fs-writeback.c
  writeback: balanced_rate cannot exceed write bandwidth
  writeback: do strict bdi dirty_exceeded
  writeback: avoid tiny dirty poll intervals
  writeback: max, min and target dirty pause time
  writeback: dirty ratelimit - think time compensation
  btrfs: fix dirtied pages accounting on sub-page writes
  writeback: fix dirtied pages accounting on redirty
  writeback: fix dirtied pages accounting on sub-page writes
  writeback: charge leaked page dirties to active tasks
  writeback: Include all dirty inodes in background writeback

Showing 8 changed files:

@@ -1136,7 +1136,8 @@
 				GFP_NOFS);
 	}
 	for (i = 0; i < num_pages; i++) {
-		clear_page_dirty_for_io(pages[i]);
+		if (clear_page_dirty_for_io(pages[i]))
+			account_page_redirty(pages[i]);
 		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
 	}
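The hunk above (presumably btrfs's buffered-write page preparation in fs/btrfs/file.c) targets double accounting: a page touched by several sub-page writes bumps the dirtied counters on every write but is written back only once. Below is a standalone illustration, not part of the commit, of the skew this causes in the dirtied/written ratio that the ratelimit estimation depends on; the write size and page count are invented assumptions.

    /* Hedged illustration (userspace, not kernel code). */
    #include <stdio.h>

    int main(void)
    {
            long writes_per_page = 8;    /* assume 512-byte writes into a 4KB page */
            long pages_written = 1000;   /* each page reaches the disk once */

            long dirtied_unfixed = pages_written * writes_per_page;
            long dirtied_fixed = pages_written;  /* duplicates backed out via
                                                    account_page_redirty() */

            printf("dirtied/written without fix: %ld\n", dirtied_unfixed / pages_written);
            printf("dirtied/written with fix:    %ld\n", dirtied_fixed / pages_written);
            return 0;
    }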
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/writeback.h>
@@ -29,6 +30,11 @@
 #include "internal.h"

 /*
+ * 4MB minimal write chunk size
+ */
+#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
+
+/*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
 struct wb_writeback_work {
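For reference, the value of the macro just moved here, evaluated under a common configuration; PAGE_CACHE_SHIFT == 12 (4KB pages) is an assumption, and this snippet only reproduces the arithmetic, it is not kernel code.

    #include <stdio.h>

    #define PAGE_CACHE_SHIFT 12   /* assumption for illustration */
    #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))

    int main(void)
    {
            /* 4096UL >> 2 == 1024 pages == 4MB of 4KB pages */
            printf("MIN_WRITEBACK_PAGES = %lu pages = %lu MB\n",
                   MIN_WRITEBACK_PAGES,
                   MIN_WRITEBACK_PAGES << PAGE_CACHE_SHIFT >> 20);
            return 0;
    }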
@@ -742,11 +748,17 @@
 		if (work->for_background && !over_bground_thresh(wb->bdi))
 			break;

+		/*
+		 * Kupdate and background works are special and we want to
+		 * include all inodes that need writing. Livelock avoidance is
+		 * handled by these works yielding to any other work so we are
+		 * safe.
+		 */
 		if (work->for_kupdate) {
 			oldest_jif = jiffies -
 				msecs_to_jiffies(dirty_expire_interval * 10);
-			work->older_than_this = &oldest_jif;
-		}
+		} else if (work->for_background)
+			oldest_jif = jiffies;

 		trace_writeback_start(wb->bdi, work);
 		if (list_empty(&wb->b_io))
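The hunk above widens background writeback: kupdate keeps its age cutoff, while background work now sets the cutoff to the current time, so every dirty inode qualifies (livelock is avoided because these works yield to other queued work, as the new comment notes). A standalone sketch of the two cutoffs follows; HZ, the jiffies value and the default dirty_expire_interval of 3000 centiseconds are assumptions.

    #include <stdio.h>

    #define HZ 1000
    static unsigned long jiffies = 1000000;            /* pretend current time */
    static unsigned int dirty_expire_interval = 3000;  /* centisecs, i.e. 30s */

    static unsigned long msecs_to_jiffies(unsigned int ms) { return ms * HZ / 1000; }

    int main(void)
    {
            /* kupdate: only inodes dirtied before this cutoff are written */
            unsigned long kupdate_cutoff = jiffies -
                            msecs_to_jiffies(dirty_expire_interval * 10);
            /* background: cutoff == now, i.e. no dirty inode is excluded */
            unsigned long background_cutoff = jiffies;

            printf("kupdate writes inodes older than %lu jiffies (%us)\n",
                   jiffies - kupdate_cutoff, dirty_expire_interval / 100);
            printf("background cutoff = now (%lu)\n", background_cutoff);
            return 0;
    }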
include/linux/sched.h
@@ -1544,6 +1544,7 @@
 	 */
 	int nr_dirtied;
 	int nr_dirtied_pause;
+	unsigned long dirty_paused_when; /* start of a write-and-pause period */

 #ifdef CONFIG_LATENCYTOP
 	int latency_record_count;
include/linux/writeback.h
@@ -7,6 +7,8 @@
 #include <linux/sched.h>
 #include <linux/fs.h>

+DECLARE_PER_CPU(int, dirty_throttle_leaks);
+
 /*
  * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
  *
@@ -23,11 +25,6 @@
 #define DIRTY_SCOPE		8
 #define DIRTY_FULL_SCOPE	(DIRTY_SCOPE / 2)

-/*
- * 4MB minimal write chunk size
- */
-#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
-
 struct backing_dev_info;

 /*
@@ -193,6 +190,8 @@
 void writeback_set_ratelimit(void);
 void tag_pages_for_writeback(struct address_space *mapping,
 			     pgoff_t start, pgoff_t end);
+
+void account_page_redirty(struct page *page);

 /* pdflush.c */
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
include/trace/events/writeback.h
@@ -300,12 +300,13 @@
 		 unsigned long dirty_ratelimit,
 		 unsigned long task_ratelimit,
 		 unsigned long dirtied,
+		 unsigned long period,
 		 long pause,
 		 unsigned long start_time),

 	TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
 		dirty_ratelimit, task_ratelimit,
-		dirtied, pause, start_time),
+		dirtied, period, pause, start_time),

 	TP_STRUCT__entry(
 		__array(char, bdi, 32)
@@ -320,6 +321,8 @@
 		__field(unsigned int, dirtied_pause)
 		__field(unsigned long, paused)
 		__field(long, pause)
+		__field(unsigned long, period)
+		__field(long, think)
 	),

 	TP_fast_assign(
@@ -336,6 +339,9 @@
 		__entry->task_ratelimit = KBps(task_ratelimit);
 		__entry->dirtied = dirtied;
 		__entry->dirtied_pause = current->nr_dirtied_pause;
+		__entry->think = current->dirty_paused_when == 0 ? 0 :
+			 (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
+		__entry->period = period * 1000 / HZ;
 		__entry->pause = pause * 1000 / HZ;
 		__entry->paused = (jiffies - start_time) * 1000 / HZ;
 	),
@@ -346,7 +352,7 @@
 		  "bdi_setpoint=%lu bdi_dirty=%lu "
 		  "dirty_ratelimit=%lu task_ratelimit=%lu "
 		  "dirtied=%u dirtied_pause=%u "
-		  "paused=%lu pause=%ld",
+		  "paused=%lu pause=%ld period=%lu think=%ld",
 		  __entry->bdi,
 		  __entry->limit,
 		  __entry->setpoint,
@@ -358,7 +364,9 @@
 		  __entry->dirtied,
 		  __entry->dirtied_pause,
 		  __entry->paused,	/* ms */
-		  __entry->pause	/* ms */
+		  __entry->pause,	/* ms */
+		  __entry->period,	/* ms */
+		  __entry->think	/* ms */
 	)
 );
@@ -51,6 +51,7 @@
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/oom.h>
+#include <linux/writeback.h>

 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1035,6 +1036,8 @@
 	validate_creds_for_do_exit(tsk);

 	preempt_disable();
+	if (tsk->nr_dirtied)
+		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 	exit_rcu();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
@@ -1294,6 +1294,7 @@

 	p->nr_dirtied = 0;
 	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+	p->dirty_paused_when = 0;

 	/*
 	 * Ok, make it visible to the rest of the system.
@@ -42,6 +42,12 @@
 #define MAX_PAUSE		max(HZ/5, 1)

 /*
+ * Try to keep balance_dirty_pages() call intervals higher than this many pages
+ * by raising pause time to max_pause when falls below it.
+ */
+#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))
+
+/*
  * Estimate write bandwidth at 200ms intervals.
  */
 #define BANDWIDTH_INTERVAL	max(HZ/5, 1)
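For reference (not part of the commit), the new threshold evaluated for 4KB pages, with PAGE_SHIFT == 12 as an assumption; note it equals the initial tsk->nr_dirtied_pause of 128 >> (PAGE_SHIFT - 10) set in the fork-path hunk earlier.

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumption for illustration */
    #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))

    int main(void)
    {
            printf("DIRTY_POLL_THRESH = %d pages = %d KB\n",
                   DIRTY_POLL_THRESH, DIRTY_POLL_THRESH << (PAGE_SHIFT - 10));
            return 0;
    }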
@@ -898,6 +904,11 @@
 	 */
 	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
					   dirty_rate | 1);
+	/*
+	 * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
+	 */
+	if (unlikely(balanced_dirty_ratelimit > write_bw))
+		balanced_dirty_ratelimit = write_bw;

 	/*
 	 * We could safely do this and return immediately:
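A standalone numeric illustration of why the cap is needed: with N dirtiers, balanced_dirty_ratelimit is roughly write_bw / N, but a momentary dip in the measured dirty rate can push the quotient above the device bandwidth. All figures below are invented for illustration; only the final clamp mirrors the hunk above.

    #include <stdio.h>

    int main(void)
    {
            unsigned long long write_bw = 100 << 10;        /* ~100 MB/s in KB/s */
            unsigned long long task_ratelimit = 60 << 10;   /* KB/s */
            unsigned long long dirty_rate = 40 << 10;       /* momentary dip, KB/s */

            unsigned long long balanced = task_ratelimit * write_bw / (dirty_rate | 1);

            printf("uncapped balanced_dirty_ratelimit = %llu KB/s\n", balanced);
            if (balanced > write_bw)        /* the clamp added in this hunk */
                    balanced = write_bw;
            printf("capped   balanced_dirty_ratelimit = %llu KB/s\n", balanced);
            return 0;
    }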
@@ -1044,40 +1055,98 @@
 		return 1;
 }

-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-				   unsigned long bdi_dirty)
+static long bdi_max_pause(struct backing_dev_info *bdi,
+			  unsigned long bdi_dirty)
 {
-	unsigned long bw = bdi->avg_write_bandwidth;
-	unsigned long hi = ilog2(bw);
-	unsigned long lo = ilog2(bdi->dirty_ratelimit);
-	unsigned long t;
+	long bw = bdi->avg_write_bandwidth;
+	long t;

-	/* target for 20ms max pause on 1-dd case */
-	t = HZ / 50;
+	/*
+	 * Limit pause time for small memory systems. If sleeping for too long
+	 * time, a small pool of dirty/writeback pages may go empty and disk go
+	 * idle.
+	 *
+	 * 8 serves as the safety ratio.
+	 */
+	t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+	t++;

+	return min_t(long, t, MAX_PAUSE);
+}
+
+static long bdi_min_pause(struct backing_dev_info *bdi,
+			  long max_pause,
+			  unsigned long task_ratelimit,
+			  unsigned long dirty_ratelimit,
+			  int *nr_dirtied_pause)
+{
+	long hi = ilog2(bdi->avg_write_bandwidth);
+	long lo = ilog2(bdi->dirty_ratelimit);
+	long t;		/* target pause */
+	long pause;	/* estimated next pause */
+	int pages;	/* target nr_dirtied_pause */
+
+	/* target for 10ms pause on 1-dd case */
+	t = max(1, HZ / 100);
+
 	/*
 	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
 	 * overheads.
 	 *
-	 * (N * 20ms) on 2^N concurrent tasks.
+	 * (N * 10ms) on 2^N concurrent tasks.
 	 */
 	if (hi > lo)
-		t += (hi - lo) * (20 * HZ) / 1024;
+		t += (hi - lo) * (10 * HZ) / 1024;

 	/*
-	 * Limit pause time for small memory systems. If sleeping for too long
-	 * time, a small pool of dirty/writeback pages may go empty and disk go
-	 * idle.
+	 * This is a bit convoluted. We try to base the next nr_dirtied_pause
+	 * on the much more stable dirty_ratelimit. However the next pause time
+	 * will be computed based on task_ratelimit and the two rate limits may
+	 * depart considerably at some time. Especially if task_ratelimit goes
+	 * below dirty_ratelimit/2 and the target pause is max_pause, the next
+	 * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
+	 * result task_ratelimit won't be executed faithfully, which could
+	 * eventually bring down dirty_ratelimit.
 	 *
-	 * 8 serves as the safety ratio.
+	 * We apply two rules to fix it up:
+	 * 1) try to estimate the next pause time and if necessary, use a lower
+	 *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
+	 *    nr_dirtied_pause will be "dancing" with task_ratelimit.
+	 * 2) limit the target pause time to max_pause/2, so that the normal
+	 *    small fluctuations of task_ratelimit won't trigger rule (1) and
+	 *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
 	 */
-	t = min(t, bdi_dirty * HZ / (8 * bw + 1));
+	t = min(t, 1 + max_pause / 2);
+	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);

 	/*
-	 * The pause time will be settled within range (max_pause/4, max_pause).
-	 * Apply a minimal value of 4 to get a non-zero max_pause/4.
+	 * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
+	 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
+	 * When the 16 consecutive reads are often interrupted by some dirty
+	 * throttling pause during the async writes, cfq will go into idles
+	 * (deadline is fine). So push nr_dirtied_pause as high as possible
+	 * until reaches DIRTY_POLL_THRESH=32 pages.
 	 */
-	return clamp_val(t, 4, MAX_PAUSE);
+	if (pages < DIRTY_POLL_THRESH) {
+		t = max_pause;
+		pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+		if (pages > DIRTY_POLL_THRESH) {
+			pages = DIRTY_POLL_THRESH;
+			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
+		}
+	}
+
+	pause = HZ * pages / (task_ratelimit + 1);
+	if (pause > max_pause) {
+		t = max_pause;
+		pages = task_ratelimit * t / roundup_pow_of_two(HZ);
+	}
+
+	*nr_dirtied_pause = pages;
+	/*
+	 * The minimal pause time will normally be half the target pause time.
+	 */
+	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
 }

 /*
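A rough userspace walk-through of the new bdi_min_pause() arithmetic, not kernel code: HZ, the bandwidth and the ratelimits are invented example values, and units follow the kernel's (jiffies, pages and pages per second).

    #include <stdio.h>

    #define HZ 1000
    #define MAX_PAUSE (HZ / 5)          /* 200ms, as in the kernel macro */
    #define DIRTY_POLL_THRESH 32        /* 128KB of 4KB pages */

    static long ilog2l(unsigned long v) { long r = -1; while (v) { v >>= 1; r++; } return r; }
    static unsigned long rpow2(unsigned long v) { unsigned long p = 1; while (p < v) p <<= 1; return p; }

    int main(void)
    {
            unsigned long write_bw = 25600;         /* ~100MB/s in 4KB pages/s */
            unsigned long dirty_ratelimit = 12800;  /* pages/s */
            unsigned long task_ratelimit = 12800;   /* pages/s */
            long max_pause = MAX_PAUSE, t, pause;
            int pages;

            t = HZ / 100;                           /* 10ms target for one dirtier */
            if (ilog2l(write_bw) > ilog2l(dirty_ratelimit))
                    t += (ilog2l(write_bw) - ilog2l(dirty_ratelimit)) * (10 * HZ) / 1024;
            t = t < 1 + max_pause / 2 ? t : 1 + max_pause / 2;      /* rule (2) */
            pages = dirty_ratelimit * t / rpow2(HZ);
            if (pages < DIRTY_POLL_THRESH) {        /* keep polls >= 32 pages apart */
                    t = max_pause;
                    pages = dirty_ratelimit * t / rpow2(HZ);
                    if (pages > DIRTY_POLL_THRESH) {
                            pages = DIRTY_POLL_THRESH;
                            t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
                    }
            }
            pause = HZ * pages / (task_ratelimit + 1);
            if (pause > max_pause)                  /* rule (1) */
                    pages = task_ratelimit * max_pause / rpow2(HZ);

            printf("nr_dirtied_pause = %d pages, min pause ~ %ld jiffies\n",
                   pages, pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t);
            return 0;
    }

With these inputs the target pause lands near 19ms, nr_dirtied_pause stays well above DIRTY_POLL_THRESH, and the returned minimum is about half the target.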
@@ -1098,16 +1167,21 @@
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
-	long pause = 0;
-	long uninitialized_var(max_pause);
+	long period;
+	long pause;
+	long max_pause;
+	long min_pause;
+	int nr_dirtied_pause;
 	bool dirty_exceeded = false;
 	unsigned long task_ratelimit;
-	unsigned long uninitialized_var(dirty_ratelimit);
+	unsigned long dirty_ratelimit;
 	unsigned long pos_ratio;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned long start_time = jiffies;

 	for (;;) {
+		unsigned long now = jiffies;
+
 		/*
 		 * Unstable writes are a feature of certain networked
 		 * filesystems (i.e. NFS) in which data may have been
@@ -1127,8 +1201,13 @@
 		 */
 		freerun = dirty_freerun_ceiling(dirty_thresh,
 						background_thresh);
-		if (nr_dirty <= freerun)
+		if (nr_dirty <= freerun) {
+			current->dirty_paused_when = now;
+			current->nr_dirtied = 0;
+			current->nr_dirtied_pause =
+				dirty_poll_interval(nr_dirty, dirty_thresh);
 			break;
+		}

 		if (unlikely(!writeback_in_progress(bdi)))
 			bdi_start_background_writeback(bdi);
@@ -1168,7 +1247,7 @@
 				    bdi_stat(bdi, BDI_WRITEBACK);
 		}

-		dirty_exceeded = (bdi_dirty > bdi_thresh) ||
+		dirty_exceeded = (bdi_dirty > bdi_thresh) &&
 				 (nr_dirty > dirty_thresh);
 		if (dirty_exceeded && !bdi->dirty_exceeded)
 			bdi->dirty_exceeded = 1;
@@ -1177,20 +1256,34 @@
 				     nr_dirty, bdi_thresh, bdi_dirty,
 				     start_time);

-		max_pause = bdi_max_pause(bdi, bdi_dirty);
-
 		dirty_ratelimit = bdi->dirty_ratelimit;
 		pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
 					       background_thresh, nr_dirty,
 					       bdi_thresh, bdi_dirty);
 		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
 							RATELIMIT_CALC_SHIFT;
+		max_pause = bdi_max_pause(bdi, bdi_dirty);
+		min_pause = bdi_min_pause(bdi, max_pause,
+					  task_ratelimit, dirty_ratelimit,
+					  &nr_dirtied_pause);
+
 		if (unlikely(task_ratelimit == 0)) {
+			period = max_pause;
 			pause = max_pause;
 			goto pause;
 		}
-		pause = HZ * pages_dirtied / task_ratelimit;
-		if (unlikely(pause <= 0)) {
+		period = HZ * pages_dirtied / task_ratelimit;
+		pause = period;
+		if (current->dirty_paused_when)
+			pause -= now - current->dirty_paused_when;
+		/*
+		 * For less than 1s think time (ext3/4 may block the dirtier
+		 * for up to 800ms from time to time on 1-HDD; so does xfs,
+		 * however at much less frequency), try to compensate it in
+		 * future periods by updating the virtual time; otherwise just
+		 * do a reset, as it may be a light dirtier.
+		 */
+		if (pause < min_pause) {
 			trace_balance_dirty_pages(bdi,
 						  dirty_thresh,
 						  background_thresh,
@@ -1200,12 +1293,24 @@
 						  dirty_ratelimit,
 						  task_ratelimit,
 						  pages_dirtied,
-						  pause,
+						  period,
+						  min(pause, 0L),
 						  start_time);
-			pause = 1; /* avoid resetting nr_dirtied_pause below */
+			if (pause < -HZ) {
+				current->dirty_paused_when = now;
+				current->nr_dirtied = 0;
+			} else if (period) {
+				current->dirty_paused_when += period;
+				current->nr_dirtied = 0;
+			} else if (current->nr_dirtied_pause <= pages_dirtied)
+				current->nr_dirtied_pause += pages_dirtied;
 			break;
 		}
-		pause = min(pause, max_pause);
+		if (unlikely(pause > max_pause)) {
+			/* for occasional dropped task_ratelimit */
+			now += min(pause - max_pause, max_pause);
+			pause = max_pause;
+		}

 pause:
 		trace_balance_dirty_pages(bdi,
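The compensation above subtracts the task's think time (time elapsed since its last pause ended, tracked in dirty_paused_when) from the period it owes for the pages just dirtied, so a task that was blocked elsewhere, e.g. inside the filesystem, sleeps correspondingly less. A small arithmetic example, not part of the commit, with HZ and the rates being assumptions:

    #include <stdio.h>

    #define HZ 1000

    int main(void)
    {
            unsigned long task_ratelimit = 12800;   /* pages/s */
            unsigned long pages_dirtied = 237;      /* one nr_dirtied_pause batch */
            long think = 8;                         /* jiffies spent off dirtying */

            long period = HZ * pages_dirtied / task_ratelimit;  /* full budget */
            long pause = period - think;            /* sleep only the remainder */

            printf("period = %ld ms, think = %ld ms, pause = %ld ms\n",
                   period * 1000 / HZ, think * 1000 / HZ, pause * 1000 / HZ);
            return 0;
    }

If the think time swallows the whole period the pause goes negative and the task is let through; the branches above then either advance the virtual time to compensate (think time under roughly a second) or reset the bookkeeping for what is likely a light dirtier.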
@@ -1217,11 +1322,16 @@
 					  dirty_ratelimit,
 					  task_ratelimit,
 					  pages_dirtied,
+					  period,
 					  pause,
 					  start_time);
 		__set_current_state(TASK_KILLABLE);
 		io_schedule_timeout(pause);

+		current->dirty_paused_when = now + pause;
+		current->nr_dirtied = 0;
+		current->nr_dirtied_pause = nr_dirtied_pause;
+
 		/*
 		 * This is typically equal to (nr_dirty < dirty_thresh) and can
 		 * also keep "1000+ dd on a slow USB stick" under control.
@@ -1249,23 +1359,6 @@
 	if (!dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;

-	current->nr_dirtied = 0;
-	if (pause == 0) { /* in freerun area */
-		current->nr_dirtied_pause =
-			dirty_poll_interval(nr_dirty, dirty_thresh);
-	} else if (pause <= max_pause / 4 &&
-		   pages_dirtied >= current->nr_dirtied_pause) {
-		current->nr_dirtied_pause = clamp_val(
-			dirty_ratelimit * (max_pause / 2) / HZ,
-			pages_dirtied + pages_dirtied / 8,
-			pages_dirtied * 4);
-	} else if (pause >= max_pause) {
-		current->nr_dirtied_pause = 1 | clamp_val(
-			dirty_ratelimit * (max_pause / 2) / HZ,
-			pages_dirtied / 4,
-			pages_dirtied - pages_dirtied / 8);
-	}
-
 	if (writeback_in_progress(bdi))
 		return;
@@ -1296,6 +1389,22 @@

 static DEFINE_PER_CPU(int, bdp_ratelimits);

+/*
+ * Normal tasks are throttled by
+ *	loop {
+ *		dirty tsk->nr_dirtied_pause pages;
+ *		take a snap in balance_dirty_pages();
+ *	}
+ * However there is a worst case. If every task exit immediately when dirtied
+ * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
+ * called to throttle the page dirties. The solution is to save the not yet
+ * throttled page dirties in dirty_throttle_leaks on task exit and charge them
+ * randomly into the running tasks. This works well for the above worst case,
+ * as the new task will pick up and accumulate the old task's leaked dirty
+ * count and eventually get throttled.
+ */
+DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
+
 /**
  * balance_dirty_pages_ratelimited_nr - balance dirty memory state
  * @mapping: address_space which was dirtied
@@ -1324,8 +1433,6 @@
 	if (bdi->dirty_exceeded)
 		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

-	current->nr_dirtied += nr_pages_dirtied;
-
 	preempt_disable();
 	/*
 	 * This prevents one CPU to accumulate too many dirtied pages without
@@ -1336,13 +1443,21 @@
 	p = &__get_cpu_var(bdp_ratelimits);
 	if (unlikely(current->nr_dirtied >= ratelimit))
 		*p = 0;
-	else {
-		*p += nr_pages_dirtied;
-		if (unlikely(*p >= ratelimit_pages)) {
-			*p = 0;
-			ratelimit = 0;
-		}
+	else if (unlikely(*p >= ratelimit_pages)) {
+		*p = 0;
+		ratelimit = 0;
 	}
+	/*
+	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
+	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
+	 * the dirty throttling and livelock other long-run dirtiers.
+	 */
+	p = &__get_cpu_var(dirty_throttle_leaks);
+	if (*p > 0 && current->nr_dirtied < ratelimit) {
+		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
+		*p -= nr_pages_dirtied;
+		current->nr_dirtied += nr_pages_dirtied;
+	}
 	preempt_enable();

 	if (unlikely(current->nr_dirtied >= ratelimit))
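A userspace model, not kernel code, of the leak pool this hunk consumes: exiting tasks park their not-yet-throttled nr_dirtied in the per-CPU dirty_throttle_leaks (see the do_exit() hunk earlier), and later dirtiers on that CPU absorb it up to their own ratelimit, so streams of short-lived writers cannot dodge balance_dirty_pages(). The task count and ratelimit below are assumptions.

    #include <stdio.h>

    #define RATELIMIT 32                    /* pages before balance_dirty_pages() */

    static int dirty_throttle_leaks;        /* per-CPU in the kernel */

    static void short_lived_task(int pages)
    {
            dirty_throttle_leaks += pages;  /* what do_exit() now does */
    }

    int main(void)
    {
            int i, nr_dirtied = 0, throttled = 0;

            for (i = 0; i < 20; i++)
                    short_lived_task(RATELIMIT - 1);        /* each one dodges the limit */

            while (dirty_throttle_leaks > 0) {      /* a long-run dirtier picks them up */
                    int room = RATELIMIT - nr_dirtied;
                    int take = dirty_throttle_leaks < room ? dirty_throttle_leaks : room;

                    nr_dirtied += take;
                    dirty_throttle_leaks -= take;
                    if (nr_dirtied >= RATELIMIT) {  /* would call balance_dirty_pages() */
                            throttled++;
                            nr_dirtied = 0;
                    }
            }
            printf("leaked pages forced %d throttle points\n", throttled);
            return 0;
    }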
@@ -1823,6 +1938,8 @@
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
 		task_io_account_write(PAGE_CACHE_SIZE);
+		current->nr_dirtied++;
+		this_cpu_inc(bdp_ratelimits);
 	}
 }
 EXPORT_SYMBOL(account_page_dirtied);
@@ -1883,6 +2000,24 @@
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);

 /*
+ * Call this whenever redirtying a page, to de-account the dirty counters
+ * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
+ * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
+ * systematic errors in balanced_dirty_ratelimit and the dirty pages position
+ * control.
+ */
+void account_page_redirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	if (mapping && mapping_cap_account_dirty(mapping)) {
+		current->nr_dirtied--;
+		dec_zone_page_state(page, NR_DIRTIED);
+		dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+	}
+}
+EXPORT_SYMBOL(account_page_redirty);
+
+/*
  * When a writepage implementation decides that it doesn't want to write this
  * page for some reason, it should redirty the locked page via
  * redirty_page_for_writepage() and it should then unlock the page and return 0
@@ -1890,6 +2025,7 @@
 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
 {
 	wbc->pages_skipped++;
+	account_page_redirty(page);
 	return __set_page_dirty_nobuffers(page);
 }
 EXPORT_SYMBOL(redirty_page_for_writepage);
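To close, a hedged userspace model, not taken from the kernel, of the bookkeeping symmetry the account_page_redirty() comment describes: every redirty now pairs a decrement with the increment performed when the page was dirtied, so N redirties of a page still amount to one net dirtied event per eventual writeback. The counters below are plain stand-ins for tsk->nr_dirtied, NR_DIRTIED and BDI_DIRTIED.

    #include <stdio.h>

    static long tsk_nr_dirtied, zone_nr_dirtied, bdi_nr_dirtied;    /* stand-ins */

    static void account_dirtied(void) { tsk_nr_dirtied++; zone_nr_dirtied++; bdi_nr_dirtied++; }
    static void account_redirty(void) { tsk_nr_dirtied--; zone_nr_dirtied--; bdi_nr_dirtied--; }

    int main(void)
    {
            int redirties = 5, i;

            account_dirtied();                      /* initial dirtying */
            for (i = 0; i < redirties; i++) {
                    account_redirty();              /* redirty_page_for_writepage() undo */
                    account_dirtied();              /* __set_page_dirty_nobuffers() recount */
            }
            printf("net dirtied events: %ld (the page will be written once)\n",
                   zone_nr_dirtied);
            return 0;
    }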