Commit ffd1f609ab10532e8137b4b981fdf903ef4d0b32
1 parent
c42843f2f0
Exists in
master
and in
4 other branches
writeback: introduce max-pause and pass-good dirty limits
The max-pause limit helps to keep the sleep time inside balance_dirty_pages() within MAX_PAUSE=200ms. The 200ms max sleep means per task rate limit of 8pages/200ms=160KB/s when dirty exceeded, which normally is enough to stop dirtiers from continue pushing the dirty pages high, unless there are a sufficient large number of slow dirtiers (eg. 500 tasks doing 160KB/s will still sum up to 80MB/s, exceeding the write bandwidth of a slow disk and hence accumulating more and more dirty pages). The pass-good limit helps to let go of the good bdi's in the presence of a blocked bdi (ie. NFS server not responding) or slow USB disk which for some reason build up a large number of initial dirty pages that refuse to go away anytime soon. For example, given two bdi's A and B and the initial state bdi_thresh_A = dirty_thresh / 2 bdi_thresh_B = dirty_thresh / 2 bdi_dirty_A = dirty_thresh / 2 bdi_dirty_B = dirty_thresh / 2 Then A get blocked, after a dozen seconds bdi_thresh_A = 0 bdi_thresh_B = dirty_thresh bdi_dirty_A = dirty_thresh / 2 bdi_dirty_B = dirty_thresh / 2 The (bdi_dirty_B < bdi_thresh_B) test is now useless and the dirty pages will be effectively throttled by condition (nr_dirty < dirty_thresh). This has two problems: (1) we lose the protections for light dirtiers (2) balance_dirty_pages() effectively becomes IO-less because the (bdi_nr_reclaimable > bdi_thresh) test won't be true. This is good for IO, but balance_dirty_pages() loses an important way to break out of the loop which leads to more spread out throttle delays. DIRTY_PASSGOOD_AREA can eliminate the above issues. The only problem is, DIRTY_PASSGOOD_AREA needs to be defined as 2 to fully cover the above example while this patch uses the more conservative value 8 so as not to surprise people with too many dirty pages than expected. The max-pause limit won't noticeably impact the speed dirty pages are knocked down when there is a sudden drop of global/bdi dirty thresholds. 
Because the heavy dirtiers will be throttled below 160KB/s, which is slow enough. It does help to avoid long dirty-throttle delays and especially will make light dirtiers more responsive. Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Showing 2 changed files with 54 additions and 0 deletions Side-by-side Diff
include/linux/writeback.h
... | ... | @@ -7,6 +7,27 @@ |
7 | 7 | #include <linux/sched.h> |
8 | 8 | #include <linux/fs.h> |
9 | 9 | |
10 | +/* | |
11 | + * The 1/16 region above the global dirty limit will be put to maximum pauses: | |
12 | + * | |
13 | + * (limit, limit + limit/DIRTY_MAXPAUSE_AREA) | |
14 | + * | |
15 | + * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put | |
16 | + * to loops: | |
17 | + * | |
18 | + * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA) | |
19 | + * | |
20 | + * Further beyond, all dirtier tasks will enter a loop waiting (possibly long | |
21 | + * time) for the dirty pages to drop, unless written enough pages. | |
22 | + * | |
23 | + * The global dirty threshold is normally equal to the global dirty limit, | |
24 | + * except when the system suddenly allocates a lot of anonymous memory and | |
25 | + * knocks down the global dirty threshold quickly, in which case the global | |
26 | + * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. | |
27 | + */ | |
28 | +#define DIRTY_MAXPAUSE_AREA 16 | |
29 | +#define DIRTY_PASSGOOD_AREA 8 | |
30 | + | |
10 | 31 | struct backing_dev_info; |
11 | 32 | |
12 | 33 | /* |
mm/page-writeback.c
... | ... | @@ -37,6 +37,11 @@ |
37 | 37 | #include <trace/events/writeback.h> |
38 | 38 | |
39 | 39 | /* |
40 | + * Sleep at most 200ms at a time in balance_dirty_pages(). | |
41 | + */ | |
42 | +#define MAX_PAUSE max(HZ/5, 1) | |
43 | + | |
44 | +/* | |
40 | 45 | * Estimate write bandwidth at 200ms intervals. |
41 | 46 | */ |
42 | 47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) |
... | ... | @@ -399,6 +404,11 @@ |
399 | 404 | return x + 1; /* Ensure that we never return 0 */ |
400 | 405 | } |
401 | 406 | |
407 | +static unsigned long hard_dirty_limit(unsigned long thresh) | |
408 | +{ | |
409 | + return max(thresh, global_dirty_limit); | |
410 | +} | |
411 | + | |
402 | 412 | /* |
403 | 413 | * global_dirty_limits - background-writeback and dirty-throttling thresholds |
404 | 414 | * |
... | ... | @@ -722,6 +732,29 @@ |
722 | 732 | __set_current_state(TASK_UNINTERRUPTIBLE); |
723 | 733 | io_schedule_timeout(pause); |
724 | 734 | trace_balance_dirty_wait(bdi); |
735 | + | |
736 | + dirty_thresh = hard_dirty_limit(dirty_thresh); | |
737 | + /* | |
738 | + * max-pause area. If dirty exceeded but still within this | |
739 | + * area, no need to sleep for more than 200ms: (a) 8 pages per | |
740 | + * 200ms is typically more than enough to curb heavy dirtiers; | |
741 | + * (b) the pause time limit makes the dirtiers more responsive. | |
742 | + */ | |
743 | + if (nr_dirty < dirty_thresh + | |
744 | + dirty_thresh / DIRTY_MAXPAUSE_AREA && | |
745 | + time_after(jiffies, start_time + MAX_PAUSE)) | |
746 | + break; | |
747 | + /* | |
748 | + * pass-good area. When some bdi gets blocked (eg. NFS server | |
749 | + * not responding), or write bandwidth dropped dramatically due | |
750 | + * to concurrent reads, or dirty threshold suddenly dropped and | |
751 | + * the dirty pages cannot be brought down anytime soon (eg. on | |
752 | + * slow USB stick), at least let go of the good bdi's. | |
753 | + */ | |
754 | + if (nr_dirty < dirty_thresh + | |
755 | + dirty_thresh / DIRTY_PASSGOOD_AREA && | |
756 | + bdi_dirty < bdi_thresh) | |
757 | + break; | |
725 | 758 | |
726 | 759 | /* |
727 | 760 | * Increase the delay for each loop, up to our previous |