mm/page-writeback.c
/*
 * mm/page-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002	Andrew Morton
 *		Initial version
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/buffer_head.h>	/* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>

#include "internal.h"
/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 */
#define MAX_PAUSE		max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising pause time to max_pause when it falls below it.
 */
#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL	max(HZ/5, 1)

#define RATELIMIT_CALC_SHIFT	10
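/*
 * Illustrative numbers for the constants above (an assumption for this
 * example, not kernel policy): with HZ == 1000 and 4KB pages, MAX_PAUSE is
 * 200 jiffies (200ms), DIRTY_POLL_THRESH is 128 >> 2 = 32 pages (128KB),
 * and a pos_ratio of 1.0 in the fixed-point math below is represented as
 * 1 << RATELIMIT_CALC_SHIFT == 1024.
 */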
/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;
/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 */
int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 20;
/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

struct wb_domain global_wb_domain;
/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
	struct wb_domain	*dom;
	struct dirty_throttle_control *gdtc;	/* only set in memcg dtc's */
#endif
	struct bdi_writeback	*wb;
	struct fprop_local_percpu *wb_completions;

	unsigned long		avail;		/* dirtyable */
	unsigned long		dirty;		/* file_dirty + write + nfs */
	unsigned long		thresh;		/* dirty threshold */
	unsigned long		bg_thresh;	/* dirty background threshold */

	unsigned long		wb_dirty;	/* per-wb counterparts */
	unsigned long		wb_thresh;
	unsigned long		wb_bg_thresh;

	unsigned long		pos_ratio;
};
/*
 * Length of period for aging writeout fractions of bdis. This is an
 * arbitrarily chosen number. The longer the period, the slower fractions will
 * reflect changes in current writeout rate.
 */
#define VM_COMPLETIONS_PERIOD_LEN	(3*HZ)
#ifdef CONFIG_CGROUP_WRITEBACK

#define GDTC_INIT(__wb)		.wb = (__wb),				\
				.dom = &global_wb_domain,		\
				.wb_completions = &(__wb)->completions

#define GDTC_INIT_NO_WB		.dom = &global_wb_domain

#define MDTC_INIT(__wb, __gdtc)	.wb = (__wb),				\
				.dom = mem_cgroup_wb_domain(__wb),	\
				.wb_completions = &(__wb)->memcg_completions, \
				.gdtc = __gdtc

static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
	return dtc->dom;
}

static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
	return dtc->dom;
}

static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
	return mdtc->gdtc;
}

static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
	return &wb->memcg_completions;
}

static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{
	unsigned long this_bw = wb->avg_write_bandwidth;
	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
	unsigned long long min = wb->bdi->min_ratio;
	unsigned long long max = wb->bdi->max_ratio;

	/*
	 * @wb may already be clean by the time control reaches here and
	 * the total may not include its bw.
	 */
	if (this_bw < tot_bw) {
		if (min) {
			min *= this_bw;
			do_div(min, tot_bw);
		}
		if (max < 100) {
			max *= this_bw;
			do_div(max, tot_bw);
		}
	}

	*minp = min;
	*maxp = max;
}

#else	/* CONFIG_CGROUP_WRITEBACK */
#define GDTC_INIT(__wb)		.wb = (__wb),				\
				.wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)

static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
	return false;
}

static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
	return &global_wb_domain;
}

static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
	return NULL;
}

static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
	return NULL;
}

static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{
	*minp = wb->bdi->min_ratio;
	*maxp = wb->bdi->max_ratio;
}

#endif	/* CONFIG_CGROUP_WRITEBACK */
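/*
 * Worked example for the cgroup-writeback variant of wb_min_max_ratio()
 * above (illustrative numbers only): if a bdi is configured with
 * min_ratio = 20 and max_ratio = 40, and this wb currently contributes a
 * quarter of the bdi's total write bandwidth, the wb gets back min = 5 and
 * max = 10, i.e. it inherits its bandwidth-proportional share of the
 * bdi-wide bounds.
 */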
/*
 * In a memory zone, there is a certain amount of pages we consider
 * available for the page cache, which is essentially the number of
 * free and reclaimable pages, minus some zone reserves to protect
 * lowmem and the ability to uphold the zone's watermarks without
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value against which the
 * user-configurable dirty ratio determines the effective number of pages
 * that are allowed to be actually dirtied. Per individual zone, or
 * globally by using the sum of dirtyable pages over all zones.
 *
 * Because the user is allowed to specify the dirty limit globally as
 * absolute number of bytes, calculating the per-zone dirty limit can
 * require translating the configured limit into a percentage of
 * global dirtyable memory first.
 */
/**
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
 * Returns the node's number of pages potentially available for dirty
 * page cache. This is the base value for the per-node dirty limits.
 */
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
	unsigned long nr_pages = 0;
	int z;

	for (z = 0; z < MAX_NR_ZONES; z++) {
		struct zone *zone = pgdat->node_zones + z;

		if (!populated_zone(zone))
			continue;

		nr_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/*
	 * Pages reserved for the kernel should not be considered
	 * dirtyable, to prevent a situation where reclaim has to
	 * clean pages in order to balance the zones.
	 */
	nr_pages -= min(nr_pages, pgdat->totalreserve_pages);

	nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
	nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);

	return nr_pages;
}
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
	int node;
	unsigned long x = 0;
	int i;

	for_each_node_state(node, N_HIGH_MEMORY) {
		for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
			struct zone *z;
			unsigned long nr_pages;

			if (!is_highmem_idx(i))
				continue;

			z = &NODE_DATA(node)->node_zones[i];
			if (!populated_zone(z))
				continue;

			nr_pages = zone_page_state(z, NR_FREE_PAGES);
			/* watch for underflows */
			nr_pages -= min(nr_pages, high_wmark_pages(z));
			nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
			nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
			x += nr_pages;
		}
	}

	/*
	 * Unreclaimable memory (kernel memory or anonymous memory
	 * without swap) can bring down the dirtyable pages below
	 * the zone's dirty balance reserve and the above calculation
	 * will underflow.  However we still want to add in nodes
	 * which are below threshold (negative values) to get a more
	 * accurate calculation but make sure that the total never
	 * underflows.
	 */
	if ((long)x < 0)
		x = 0;

	/*
	 * Make sure that the number of highmem pages is never larger
	 * than the number of the total dirtyable memory. This can only
	 * occur in very strange VM situations but we want to make sure
	 * that this does not occur.
	 */
	return min(x, total);
#else
	return 0;
#endif
}
/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Returns the global number of pages potentially available for dirty
 * page cache. This is the base value for the global dirty limits.
 */
static unsigned long global_dirtyable_memory(void)
{
	unsigned long x;

	x = global_page_state(NR_FREE_PAGES);
	/*
	 * Pages reserved for the kernel should not be considered
	 * dirtyable, to prevent a situation where reclaim has to
	 * clean pages in order to balance the zones.
	 */
	x -= min(x, totalreserve_pages);

	x += global_node_page_state(NR_INACTIVE_FILE);
	x += global_node_page_state(NR_ACTIVE_FILE);

	if (!vm_highmem_is_dirtyable)
		x -= highmem_dirtyable_memory(x);

	return x + 1;	/* Ensure that we never return 0 */
}
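/*
 * Note: the "+ 1" above also guards callers that divide by
 * global_dirtyable_memory(), e.g. node_dirty_limit() below, against a
 * divide-by-zero in pathological situations where everything else
 * cancels out.
 */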
/**
 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
 * @dtc: dirty_throttle_control of interest
 *
 * Calculate @dtc->thresh and ->bg_thresh considering
 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
 * must ensure that @dtc->avail is set before calling this function.  The
 * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
 * real-time tasks.
 */
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
	const unsigned long available_memory = dtc->avail;
	struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
	unsigned long bytes = vm_dirty_bytes;
	unsigned long bg_bytes = dirty_background_bytes;
	/* convert ratios to per-PAGE_SIZE for higher precision */
	unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
	unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
	unsigned long thresh;
	unsigned long bg_thresh;
	struct task_struct *tsk;

	/* gdtc is !NULL iff @dtc is for memcg domain */
	if (gdtc) {
		unsigned long global_avail = gdtc->avail;

		/*
		 * The byte settings can't be applied directly to memcg
		 * domains.  Convert them to ratios by scaling against
		 * globally available memory.  As the ratios are in
		 * per-PAGE_SIZE, they can be obtained by dividing bytes by
		 * number of pages.
		 */
		if (bytes)
			ratio = min(DIV_ROUND_UP(bytes, global_avail),
				    PAGE_SIZE);
		if (bg_bytes)
			bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
				       PAGE_SIZE);
		bytes = bg_bytes = 0;
	}

	if (bytes)
		thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
	else
		thresh = (ratio * available_memory) / PAGE_SIZE;

	if (bg_bytes)
		bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
	else
		bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;

	if (bg_thresh >= thresh)
		bg_thresh = thresh / 2;
	tsk = current;
	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
		bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
		thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
	}
	dtc->thresh = thresh;
	dtc->bg_thresh = bg_thresh;

	/* we should eventually report the domain in the TP */
	if (!gdtc)
		trace_global_dirty_state(bg_thresh, thresh);
}

/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 * @pbackground: out parameter for bg_thresh
 * @pdirty: out parameter for thresh
 *
 * Calculate bg_thresh and thresh for global_wb_domain.  See
 * domain_dirty_limits() for details.
 */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };

	gdtc.avail = global_dirtyable_memory();
	domain_dirty_limits(&gdtc);

	*pbackground = gdtc.bg_thresh;
	*pdirty = gdtc.thresh;
}
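/*
 * Worked example for domain_dirty_limits() (illustrative numbers, assuming
 * 4KB pages): vm_dirty_ratio = 20 becomes ratio = (20 * 4096) / 100 = 819
 * parts per 4096, i.e. ~19.995%, giving finer granularity than whole
 * percents.  With available_memory = 1,000,000 pages, that yields
 * thresh = 819 * 1,000,000 / 4096 = 199,951 pages of dirty headroom.
 */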
/**
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
 * Returns the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
 */
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
	unsigned long node_memory = node_dirtyable_memory(pgdat);
	struct task_struct *tsk = current;
	unsigned long dirty;

	if (vm_dirty_bytes)
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
			node_memory / global_dirtyable_memory();
	else
		dirty = vm_dirty_ratio * node_memory / 100;

	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
		dirty += dirty / 4;

	return dirty;
}

/**
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
 * Returns %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */
bool node_dirty_ok(struct pglist_data *pgdat)
{
	unsigned long limit = node_dirty_limit(pgdat);
	unsigned long nr_pages = 0;

	nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
	nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
	nr_pages += node_page_state(pgdat, NR_WRITEBACK);

	return nr_pages <= limit;
}
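/*
 * node_dirty_ok() is what lets the page allocator avoid piling page-cache
 * writes onto a node that is already full of dirty pages (see the
 * spread_dirty_pages handling in mm/page_alloc.c).
 */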
int dirty_background_ratio_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_bytes = 0;
	return ret;
}

int dirty_background_bytes_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_ratio = 0;
	return ret;
}

int dirty_ratio_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int old_ratio = vm_dirty_ratio;
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
		writeback_set_ratelimit();
		vm_dirty_bytes = 0;
	}
	return ret;
}

int dirty_bytes_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	unsigned long old_bytes = vm_dirty_bytes;
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
		writeback_set_ratelimit();
		vm_dirty_ratio = 0;
	}
	return ret;
}
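/*
 * The four handlers above implement a "last writer wins" convention for
 * the paired sysctls: setting a *_ratio knob zeroes the corresponding
 * *_bytes knob and vice versa, so only one of each pair is ever in effect.
 */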
static unsigned long wp_next_time(unsigned long cur_time)
{
	cur_time += VM_COMPLETIONS_PERIOD_LEN;
	/* 0 has a special meaning... */
	if (!cur_time)
		return 1;
	return cur_time;
}
static void wb_domain_writeout_inc(struct wb_domain *dom,
				   struct fprop_local_percpu *completions,
				   unsigned int max_prop_frac)
{
	__fprop_inc_percpu_max(&dom->completions, completions,
			       max_prop_frac);
	/* First event after period switching was turned off? */
	if (!unlikely(dom->period_time)) {
		/*
		 * We can race with other __bdi_writeout_inc calls here but
		 * it does not cause any harm since the resulting time when
		 * the timer will fire and what is in writeout_period_time
		 * will be roughly the same.
		 */
		dom->period_time = wp_next_time(jiffies);
		mod_timer(&dom->period_timer, dom->period_time);
	}
}
/*
 * Increment @wb's writeout completion count and the global writeout
 * completion count. Called from test_clear_page_writeback().
 */
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
	struct wb_domain *cgdom;

	__inc_wb_stat(wb, WB_WRITTEN);
	wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
			       wb->bdi->max_prop_frac);

	cgdom = mem_cgroup_wb_domain(wb);
	if (cgdom)
		wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
				       wb->bdi->max_prop_frac);
}

void wb_writeout_inc(struct bdi_writeback *wb)
{
	unsigned long flags;

	local_irq_save(flags);
	__wb_writeout_inc(wb);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);
/*
 * On idle system, we can be called long after we scheduled because we use
 * deferred timers so count with missed periods.
 */
static void writeout_period(unsigned long t)
{
	struct wb_domain *dom = (void *)t;
	int miss_periods = (jiffies - dom->period_time) /
				 VM_COMPLETIONS_PERIOD_LEN;

	if (fprop_new_period(&dom->completions, miss_periods + 1)) {
		dom->period_time = wp_next_time(dom->period_time +
				miss_periods * VM_COMPLETIONS_PERIOD_LEN);
		mod_timer(&dom->period_timer, dom->period_time);
	} else {
		/*
		 * Aging has zeroed all fractions. Stop wasting CPU on period
		 * updates.
		 */
		dom->period_time = 0;
	}
}

int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
{
	memset(dom, 0, sizeof(*dom));

	spin_lock_init(&dom->lock);

	init_timer_deferrable(&dom->period_timer);
	dom->period_timer.function = writeout_period;
	dom->period_timer.data = (unsigned long)dom;

	dom->dirty_limit_tstamp = jiffies;

	return fprop_global_init(&dom->completions, gfp);
}

#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
	del_timer_sync(&dom->period_timer);
	fprop_global_destroy(&dom->completions);
}
#endif
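/*
 * Rough sketch of the mechanism above: each wb's share of writeout is
 * tracked as a "flush proportion" (lib/flex_proportions.c).  Every
 * VM_COMPLETIONS_PERIOD_LEN (3s) the period timer ages all counters, so a
 * device that stops writing sees its fraction decay instead of staying
 * pinned at its historical share.
 */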
/*
 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
 * registered backing devices, which, for obvious reasons, can not
 * exceed 100%.
 */
static unsigned int bdi_min_ratio;

int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
	int ret = 0;

	spin_lock_bh(&bdi_lock);
	if (min_ratio > bdi->max_ratio) {
		ret = -EINVAL;
	} else {
		min_ratio -= bdi->min_ratio;
		if (bdi_min_ratio + min_ratio < 100) {
			bdi_min_ratio += min_ratio;
			bdi->min_ratio += min_ratio;
		} else {
			ret = -EINVAL;
		}
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}

int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
	int ret = 0;

	if (max_ratio > 100)
		return -EINVAL;

	spin_lock_bh(&bdi_lock);
	if (bdi->min_ratio > max_ratio) {
		ret = -EINVAL;
	} else {
		bdi->max_ratio = max_ratio;
		bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}
EXPORT_SYMBOL(bdi_set_max_ratio);
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{
	return (thresh + bg_thresh) / 2;
}

static unsigned long hard_dirty_limit(struct wb_domain *dom,
				      unsigned long thresh)
{
	return max(thresh, dom->dirty_limit);
}
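/*
 * Worked example (illustrative numbers): with thresh = 200,000 pages and
 * bg_thresh = 100,000 pages, dirty_freerun_ceiling() returns 150,000.
 * Below that point tasks dirty pages at full speed; only above it does
 * balance_dirty_pages() start throttling them.
 */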
/*
 * Memory which can be further allocated to a memcg domain is capped by
 * system-wide clean memory excluding the amount being used in the domain.
 */
static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
			    unsigned long filepages, unsigned long headroom)
{
	struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
	unsigned long clean = filepages - min(filepages, mdtc->dirty);
	unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
	unsigned long other_clean = global_clean - min(global_clean, clean);

	mdtc->avail = filepages + min(headroom, other_clean);
}
/**
 * __wb_calc_thresh - @wb's share of dirty throttling threshold
 * @dtc: dirty_throttle_control of interest
 *
 * Returns @wb's dirty limit in pages. The term "dirty" in the context of
 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
 *
 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 * when sleeping max_pause per page is not enough to keep the dirty pages under
 * control. For example, when the device is completely stalled due to some error
 * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
 * In the other normal situations, it acts more gently by throttling the tasks
 * more (rather than completely blocking them) when the wb dirty pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take a long time to sync) on slow devices
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 */
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
	struct wb_domain *dom = dtc_dom(dtc);
	unsigned long thresh = dtc->thresh;
	u64 wb_thresh;
	long numerator, denominator;
	unsigned long wb_min_ratio, wb_max_ratio;

	/*
	 * Calculate this BDI's share of the thresh ratio.
	 */
	fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
			      &numerator, &denominator);

	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
	wb_thresh *= numerator;
	do_div(wb_thresh, denominator);

	wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);

	wb_thresh += (thresh * wb_min_ratio) / 100;
	if (wb_thresh > (thresh * wb_max_ratio) / 100)
		wb_thresh = thresh * wb_max_ratio / 100;

	return wb_thresh;
}

unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
					       .thresh = thresh };
	return __wb_calc_thresh(&gdtc);
}
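/*
 * Worked example for __wb_calc_thresh() (illustrative numbers): if a wb
 * has recently completed 1/4 of all writeouts (fprop fraction = 1/4), no
 * min/max ratios are configured, and the global thresh is 200,000 pages,
 * the wb is allotted roughly 200,000 * 1/4 = 50,000 dirty pages.
 */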
/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * it's a 3rd order polynomial that is subject to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0	 => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 */
static long long pos_ratio_polynom(unsigned long setpoint,
				   unsigned long dirty,
				   unsigned long limit)
{
	long long pos_ratio;
	long x;

	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
		      (limit - setpoint) | 1);
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;

	return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}
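/*
 * Spot-checking the polynomial (with RATELIMIT_CALC_SHIFT = 10, so 1024
 * stands for 1.0): at dirty == setpoint, x = 0 and pos_ratio = 1024 (1.0);
 * at dirty == limit, x = -1024 and pos_ratio = 1024 + ((-1024)^3 >> 20)
 * = 1024 - 1024 = 0; between freerun and setpoint, x > 0 and pos_ratio
 * exceeds 1.0, letting dirtiers run faster so the dirty count drifts up
 * toward the setpoint.
 */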
/*
 * Dirty position control.
 *
 * (o) global/bdi setpoints
 *
 * We want the dirty pages to be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * (o) global control line
 *
 *     ^ pos_ratio
 *     |
 *     |            |<===== global dirty control scope ======>|
 * 2.0 .............*
 *     |            .*
 *     |            . *
 *     |            .   *
 *     |            .     *
 *     |            .        *
 *     |            .            *
 * 1.0 ................................*
 *     |            .                  .     *
 *     |            .                  .        *
 *     |            .                  .           *
 *     |            .                  .              *
 *     |            .                  .                 *
 *   0 +------------.------------------.----------------------*------------->
 *           freerun^          setpoint^                 limit^   dirty pages
 *
 * (o) wb control line
 *
 *     ^ pos_ratio
 *     |
 *     |            *
 *     |              *
 *     |                *
 *     |                  *
 *     |                    * |<=========== span ============>|
 * 1.0 .......................*
 *     |                      . *
 *     |                      .   *
 *     |                      .     *
 *     |                      .       *
 *     |                      .         *
 *     |                      .           *
 *     |                      .             *
 *     |                      .               *
 *     |                      .                 *
 *     |                      .                   *
 * 1/4 ...............................................* * * * * * * * * * * *
 *     |                      .                          .
 *     |                      .                            .
 *     |                      .                              .
 *   0 +----------------------.-------------------------------.------------->
 *                wb_setpoint^                    x_intercept^
 *
 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
 * be smoothly throttled down to normal if it starts high in situations like
 * - start writing to a slow SD card and a fast disk at the same time. The SD
 *   card's wb_dirty may rush to many times higher than wb_setpoint.
 * - the wb dirty thresh drops quickly due to change of JBOD workload
 */
6c14ae1e9
|
857 |
*/ |
daddfa3cb
|
858 |
static void wb_position_ratio(struct dirty_throttle_control *dtc) |
6c14ae1e9
|
859 |
{ |
2bc00aef0
|
860 |
struct bdi_writeback *wb = dtc->wb; |
a88a341a7
|
861 |
unsigned long write_bw = wb->avg_write_bandwidth; |
2bc00aef0
|
862 |
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); |
c7981433e
|
863 |
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); |
2bc00aef0
|
864 |
unsigned long wb_thresh = dtc->wb_thresh; |
6c14ae1e9
|
865 866 |
unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ |
de1fff37b
|
867 |
unsigned long wb_setpoint; |
6c14ae1e9
|
868 869 870 |
unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; |
daddfa3cb
|
871 |
dtc->pos_ratio = 0; |
2bc00aef0
|
872 |
if (unlikely(dtc->dirty >= limit)) |
daddfa3cb
|
873 |
return; |
6c14ae1e9
|
874 875 876 877 |
/* * global setpoint * |
5a5374856
|
878 879 880 |
* See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; |
2bc00aef0
|
881 |
pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); |
5a5374856
|
882 883 884 885 |
/* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For |
de1fff37b
|
886 887 |
* such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". |
5a5374856
|
888 889 890 891 |
* This is especially important for fuse which sets bdi->max_ratio to * 1% by default. Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". |
6c14ae1e9
|
892 |
* |
a88a341a7
|
893 |
* Here, in wb_position_ratio(), we calculate pos_ratio based on |
de1fff37b
|
894 |
* two values: wb_dirty and wb_thresh. Let's consider an example: |
5a5374856
|
895 896 |
* total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). |
de1fff37b
|
897 |
* Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. |
0d960a383
|
898 |
* wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is |
de1fff37b
|
899 |
* about ~6K pages (as the average of background and throttle wb |
5a5374856
|
900 |
* limits). The 3rd order polynomial will provide positive feedback if |
de1fff37b
|
901 |
* wb_dirty is under wb_setpoint and vice versa. |
6c14ae1e9
|
902 |
* |
5a5374856
|
903 |
* Note, that we cannot use global counters in these calculations |
de1fff37b
|
904 |
* because we want to throttle process writing to a strictlimit wb |
5a5374856
|
905 906 |
* much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). |
6c14ae1e9
|
907 |
*/ |
a88a341a7
|
908 |
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { |
de1fff37b
|
909 |
long long wb_pos_ratio; |
5a5374856
|
910 |
|
daddfa3cb
|
911 912 913 914 915 |
if (dtc->wb_dirty < 8) { dtc->pos_ratio = min_t(long long, pos_ratio * 2, 2 << RATELIMIT_CALC_SHIFT); return; } |
5a5374856
|
916 |
|
2bc00aef0
|
917 |
if (dtc->wb_dirty >= wb_thresh) |
daddfa3cb
|
918 |
return; |
5a5374856
|
919 |
|
970fb01ad
|
920 921 |
wb_setpoint = dirty_freerun_ceiling(wb_thresh, dtc->wb_bg_thresh); |
5a5374856
|
922 |
|
de1fff37b
|
923 |
if (wb_setpoint == 0 || wb_setpoint == wb_thresh) |
daddfa3cb
|
924 |
return; |
5a5374856
|
925 |
|
2bc00aef0
|
926 |
wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, |
de1fff37b
|
927 |
wb_thresh); |
5a5374856
|
928 929 |
/* |
de1fff37b
|
930 931 |
* Typically, for strictlimit case, wb_setpoint << setpoint * and pos_ratio >> wb_pos_ratio. In the other words global |
5a5374856
|
932 |
* state ("dirty") is not limiting factor and we have to |
de1fff37b
|
933 |
* make decision based on wb counters. But there is an |
5a5374856
|
934 935 |
* important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other |
de1fff37b
|
936 |
* wb's) while given strictlimit wb is below limit. |
5a5374856
|
937 |
* |
de1fff37b
|
938 |
* "pos_ratio * wb_pos_ratio" would work for the case above, |
5a5374856
|
939 |
* but it would look too non-natural for the case of all |
de1fff37b
|
940 |
* activity in the system coming from a single strictlimit wb |
5a5374856
|
941 942 943 944 |
* with bdi->max_ratio == 100%. * * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 |
de1fff37b
|
945 |
* (when globally we are at freerun and wb is well below wb |
5a5374856
|
946 947 948 949 |
* setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ |
daddfa3cb
|
950 951 |
dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); return; |
5a5374856
|
952 |
} |
6c14ae1e9
|
953 954 955 |
/* * We have computed basic pos_ratio above based on global situation. If |
de1fff37b
|
956 |
* the wb is over/under its share of dirty pages, we want to scale |
6c14ae1e9
|
957 958 959 960 |
* pos_ratio further down/up. That is done by the following mechanism. */ /* |
de1fff37b
|
961 |
* wb setpoint |
6c14ae1e9
|
962 |
* |
de1fff37b
|
963 |
* f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) |
6c14ae1e9
|
964 |
* |
de1fff37b
|
965 |
* x_intercept - wb_dirty |
6c14ae1e9
|
966 |
* := -------------------------- |
de1fff37b
|
967 |
* x_intercept - wb_setpoint |
6c14ae1e9
|
968 |
* |
de1fff37b
|
969 |
* The main wb control line is a linear function that subjects to |
6c14ae1e9
|
970 |
* |
de1fff37b
|
971 972 973 |
* (1) f(wb_setpoint) = 1.0 * (2) k = - 1 / (8 * write_bw) (in single wb case) * or equally: x_intercept = wb_setpoint + 8 * write_bw |
6c14ae1e9
|
974 |
* |
de1fff37b
|
975 |
* For single wb case, the dirty pages are observed to fluctuate |
6c14ae1e9
|
976 |
* regularly within range |
de1fff37b
|
977 |
* [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] |
6c14ae1e9
|
978 979 980 |
* for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * |
de1fff37b
|
981 |
* For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its |
6c14ae1e9
|
982 |
* own size, so move the slope over accordingly and choose a slope that |
de1fff37b
|
983 |
* yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. |
6c14ae1e9
|
984 |
*/ |
2bc00aef0
|
985 986 |
if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; |
aed21ad28
|
987 |
/* |
de1fff37b
|
988 |
* It's very possible that wb_thresh is close to 0 not because the |
aed21ad28
|
989 990 991 992 993 |
* device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ |
2bc00aef0
|
994 |
wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); |
6c14ae1e9
|
995 |
/* |
de1fff37b
|
996 997 |
* scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh |
6c14ae1e9
|
998 |
*/ |
e4bc13adf
|
999 |
x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); |
de1fff37b
|
1000 |
wb_setpoint = setpoint * (u64)x >> 16; |
6c14ae1e9
|
1001 |
/* |
de1fff37b
|
1002 1003 |
* Use span=(8*write_bw) in single wb case as indicated by * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. |
6c14ae1e9
|
1004 |
* |
de1fff37b
|
1005 1006 1007 |
* wb_thresh thresh - wb_thresh * span = --------- * (8 * write_bw) + ------------------ * wb_thresh * thresh thresh |
6c14ae1e9
|
1008 |
*/ |
2bc00aef0
|
1009 |
span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; |
de1fff37b
|
1010 |
x_intercept = wb_setpoint + span; |
6c14ae1e9
|
1011 |
|
2bc00aef0
|
1012 1013 |
if (dtc->wb_dirty < x_intercept - span / 4) { pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), |
e4bc13adf
|
1014 |
(x_intercept - wb_setpoint) | 1); |
6c14ae1e9
|
1015 1016 |
} else pos_ratio /= 4; |
8927f66c4
|
1017 |
/* |
de1fff37b
|
1018 |
* wb reserve area, safeguard against dirty pool underrun and disk idle |
8927f66c4
|
1019 1020 1021 |
* It may push the desired control point of global dirty pages higher * than setpoint. */ |
de1fff37b
|
1022 |
x_intercept = wb_thresh / 2; |
2bc00aef0
|
1023 1024 1025 1026 |
if (dtc->wb_dirty < x_intercept) { if (dtc->wb_dirty > x_intercept / 8) pos_ratio = div_u64(pos_ratio * x_intercept, dtc->wb_dirty); |
50657fc4d
|
1027 |
else |
8927f66c4
|
1028 1029 |
pos_ratio *= 8; } |
daddfa3cb
|
1030 |
dtc->pos_ratio = pos_ratio; |
6c14ae1e9
|
1031 |
} |
static void wb_update_write_bandwidth(struct bdi_writeback *wb,
				      unsigned long elapsed,
				      unsigned long written)
{
	const unsigned long period = roundup_pow_of_two(3 * HZ);
	unsigned long avg = wb->avg_write_bandwidth;
	unsigned long old = wb->write_bandwidth;
	u64 bw;

	/*
	 * bw = written * HZ / elapsed
	 *
	 *                   bw * elapsed + write_bandwidth * (period - elapsed)
	 * write_bandwidth = ---------------------------------------------------
	 *                                          period
	 *
	 * @written may have decreased due to account_page_redirty().
	 * Avoid underflowing @bw calculation.
	 */
	bw = written - min(written, wb->written_stamp);
	bw *= HZ;
	if (unlikely(elapsed > period)) {
		do_div(bw, elapsed);
		avg = bw;
		goto out;
	}
	bw += (u64)wb->write_bandwidth * (period - elapsed);
	bw >>= ilog2(period);

	/*
	 * one more level of smoothing, for filtering out sudden spikes
	 */
	if (avg > old && old >= (unsigned long)bw)
		avg -= (avg - old) >> 3;

	if (avg < old && old <= (unsigned long)bw)
		avg += (old - avg) >> 3;

out:
	/* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
	avg = max(avg, 1LU);
	if (wb_has_dirty_io(wb)) {
		long delta = avg - wb->avg_write_bandwidth;
		WARN_ON_ONCE(atomic_long_add_return(delta,
					&wb->bdi->tot_write_bandwidth) <= 0);
	}
	wb->write_bandwidth = bw;
	wb->avg_write_bandwidth = avg;
}
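/*
 * Worked example for the smoothing above (illustrative numbers, HZ == 1000
 * so period = roundup_pow_of_two(3000) = 4096 jiffies): if 200ms elapsed
 * and the instantaneous rate is 100MB/s while the stored write_bandwidth
 * is 60MB/s, the new estimate is roughly
 * (100 * 200 + 60 * 3896) / 4096 ~= 62MB/s, heavily biased toward history,
 * which is what keeps transient spikes from whipsawing the throttle.
 */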
static void update_dirty_limit(struct dirty_throttle_control *dtc)
{
	struct wb_domain *dom = dtc_dom(dtc);
	unsigned long thresh = dtc->thresh;
	unsigned long limit = dom->dirty_limit;

	/*
	 * Follow up in one step.
	 */
	if (limit < thresh) {
		limit = thresh;
		goto update;
	}

	/*
	 * Follow down slowly. Use the higher one as the target, because thresh
	 * may drop below dirty. This is exactly the reason to introduce
	 * dom->dirty_limit which is guaranteed to lie above the dirty pages.
	 */
	thresh = max(thresh, dtc->dirty);
	if (limit > thresh) {
		limit -= (limit - thresh) >> 5;
		goto update;
	}
	return;
update:
	dom->dirty_limit = limit;
}
static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
				    unsigned long now)
{
	struct wb_domain *dom = dtc_dom(dtc);

	/*
	 * check locklessly first to optimize away locking most of the time
	 */
	if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
		return;

	spin_lock(&dom->lock);
	if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
		update_dirty_limit(dtc);
		dom->dirty_limit_tstamp = now;
	}
	spin_unlock(&dom->lock);
}
/*
 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
 *
 * Normal wb tasks will be curbed at or below it in long term.
 * Obviously it should be around (write_bw / N) when there are N dd tasks.
 */
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
				      unsigned long dirtied,
				      unsigned long elapsed)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long dirty = dtc->dirty;
	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
	unsigned long setpoint = (freerun + limit) / 2;
	unsigned long write_bw = wb->avg_write_bandwidth;
	unsigned long dirty_ratelimit = wb->dirty_ratelimit;
	unsigned long dirty_rate;
	unsigned long task_ratelimit;
	unsigned long balanced_dirty_ratelimit;
	unsigned long step;
	unsigned long x;
	unsigned long shift;

	/*
	 * The dirty rate will match the writeout rate in long term, except
	 * when dirty pages are truncated by userspace or re-dirtied by FS.
	 */
	dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;

	/*
	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
	 */
	task_ratelimit = (u64)dirty_ratelimit *
					dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
	task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */

	/*
	 * A linear estimation of the "balanced" throttle rate. The theory is,
	 * if there are N dd tasks, each throttled at task_ratelimit, the wb's
	 * dirty_rate will be measured to be (N * task_ratelimit). So the below
	 * formula will yield the balanced rate limit (write_bw / N).
	 *
	 * Note that the expanded form is not a pure rate feedback:
	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate)		     (1)
	 * but also takes pos_ratio into account:
	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
	 *
	 * (1) is not realistic because pos_ratio also takes part in balancing
	 * the dirty rate.  Consider the state
	 *	pos_ratio = 0.5						     (3)
	 *	rate = 2 * (write_bw / N)				     (4)
	 * If (1) is used, it will get stuck in that state! Because each dd
	 * will be throttled at
	 *	task_ratelimit = pos_ratio * rate = (write_bw / N)	     (5)
	 * yielding
	 *	dirty_rate = N * task_ratelimit = write_bw		     (6)
	 * put (6) into (1) we get
	 *	rate_(i+1) = rate_(i)					     (7)
	 *
	 * So we end up using (2) to always keep
	 *	rate_(i+1) ~= (write_bw / N)				     (8)
	 * regardless of the value of pos_ratio. As long as (8) is satisfied,
	 * pos_ratio is able to drive itself to 1.0, which is not only where
	 * the dirty count meets the setpoint, but also where the slope of
	 * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
	 */
	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
					   dirty_rate | 1);
	/*
	 * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
	 */
	if (unlikely(balanced_dirty_ratelimit > write_bw))
		balanced_dirty_ratelimit = write_bw;

	/*
	 * We could safely do this and return immediately:
	 *
	 *	wb->dirty_ratelimit = balanced_dirty_ratelimit;
	 *
	 * However to get a more stable dirty_ratelimit, the below elaborated
	 * code makes use of task_ratelimit to filter out singular points and
	 * limit the step size.
	 *
	 * The below code essentially only uses the relative value of
	 *
	 *	task_ratelimit - dirty_ratelimit
	 *	= (pos_ratio - 1) * dirty_ratelimit
	 *
	 * which reflects the direction and size of dirty position error.
	 */

	/*
	 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
	 * task_ratelimit is on the same side of dirty_ratelimit, too.
	 * For example, when
	 * - dirty_ratelimit > balanced_dirty_ratelimit
	 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
	 * lowering dirty_ratelimit will help meet both the position and rate
	 * control targets. Otherwise, don't update dirty_ratelimit if it will
	 * only help meet the rate target. After all, what users ultimately
	 * feel and care about is a stable dirty rate and small position error.
	 *
	 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
	 * and filter out the singular points of balanced_dirty_ratelimit,
	 * which keeps jumping around randomly and can even leap far away at
	 * times due to the small 200ms estimation period of dirty_rate (we
	 * want to keep that period small to reduce time lags).
	 */
	step = 0;

	/*
	 * For strictlimit case, calculations above were based on wb counters
	 * and limits (starting from pos_ratio = wb_position_ratio() and up to
	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
	 * Hence, to calculate "step" properly, we have to use wb_dirty as
	 * "dirty" and wb_setpoint as "setpoint".
	 *
	 * We rampup dirty_ratelimit forcibly if wb_dirty is low because
	 * it's possible that wb_thresh is close to zero due to inactivity
	 * of backing device.
	 */
	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
		dirty = dtc->wb_dirty;
		if (dtc->wb_dirty < 8)
			setpoint = dtc->wb_dirty + 1;
		else
			setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
	}

	if (dirty < setpoint) {
		x = min3(wb->balanced_dirty_ratelimit,
			 balanced_dirty_ratelimit, task_ratelimit);
		if (dirty_ratelimit < x)
			step = x - dirty_ratelimit;
	} else {
		x = max3(wb->balanced_dirty_ratelimit,
			 balanced_dirty_ratelimit, task_ratelimit);
		if (dirty_ratelimit > x)
			step = dirty_ratelimit - x;
	}

	/*
	 * Don't pursue 100% rate matching. It's impossible since the balanced
	 * rate itself is constantly fluctuating. So decrease the track speed
	 * when it gets close to the target. Helps eliminate pointless tremors.
	 */
	shift = dirty_ratelimit / (2 * step + 1);
	if (shift < BITS_PER_LONG)
		step = DIV_ROUND_UP(step >> shift, 8);
	else
		step = 0;

	if (dirty_ratelimit < balanced_dirty_ratelimit)
		dirty_ratelimit += step;
	else
		dirty_ratelimit -= step;

	wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
	wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;

	trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}
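/*
 * Worked example for the balanced rate (illustrative numbers): four dd
 * tasks writing to a 100MB/s disk are each throttled near task_ratelimit
 * = 25MB/s, so the measured dirty_rate approaches 100MB/s and
 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate stays
 * near 25MB/s = write_bw / N, exactly the steady state the comment above
 * derives in (8).
 */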
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
				  struct dirty_throttle_control *mdtc,
				  unsigned long start_time,
				  bool update_ratelimit)
{
	struct bdi_writeback *wb = gdtc->wb;
	unsigned long now = jiffies;
	unsigned long elapsed = now - wb->bw_time_stamp;
	unsigned long dirtied;
	unsigned long written;

	lockdep_assert_held(&wb->list_lock);

	/*
	 * rate-limit, only update once every 200ms.
	 */
	if (elapsed < BANDWIDTH_INTERVAL)
		return;

	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

	/*
	 * Skip quiet periods when disk bandwidth is under-utilized.
	 * (at least 1s idle time between two flusher runs)
	 */
	if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
		goto snapshot;

	if (update_ratelimit) {
		domain_update_bandwidth(gdtc, now);
		wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);

		/*
		 * @mdtc is always NULL if !CGROUP_WRITEBACK but the
		 * compiler has no way to figure that out.  Help it.
		 */
		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
			domain_update_bandwidth(mdtc, now);
			wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
		}
	}
	wb_update_write_bandwidth(wb, elapsed, written);

snapshot:
	wb->dirtied_stamp = dirtied;
	wb->written_stamp = written;
	wb->bw_time_stamp = now;
}

void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };

	__wb_update_bandwidth(&gdtc, NULL, start_time, false);
}
/*
 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
 * will look to see if it needs to start dirty throttling.
 *
 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
 * global_page_state() too often. So scale it near-sqrt to the safety margin
 * (the number of pages we may dirty without exceeding the dirty limits).
 */
static unsigned long dirty_poll_interval(unsigned long dirty,
					 unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2(thresh - dirty) >> 1);

	return 1;
}
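/*
 * Example of the near-sqrt scaling (illustrative numbers): with a safety
 * margin of thresh - dirty = 100,000 pages, ilog2(100000) = 16, so a task
 * may dirty 1 << 8 = 256 more pages before polling the limits again; at a
 * margin of 1,000 pages it drops to 1 << 4 = 16 pages.
 */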
static unsigned long wb_max_pause(struct bdi_writeback *wb,
				  unsigned long wb_dirty)
{
	unsigned long bw = wb->avg_write_bandwidth;
	unsigned long t;

	/*
	 * Limit pause time for small memory systems. If we sleep for too
	 * long, a small pool of dirty/writeback pages may go empty and the
	 * disk may go idle.
	 *
	 * 8 serves as the safety ratio.
	 */
	t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
	t++;

	return min_t(unsigned long, t, MAX_PAUSE);
}

static long wb_min_pause(struct bdi_writeback *wb,
			 long max_pause,
			 unsigned long task_ratelimit,
			 unsigned long dirty_ratelimit,
			 int *nr_dirtied_pause)
{
	long hi = ilog2(wb->avg_write_bandwidth);
	long lo = ilog2(wb->dirty_ratelimit);
	long t;		/* target pause */
	long pause;	/* estimated next pause */
	int pages;	/* target nr_dirtied_pause */

	/* target for 10ms pause on 1-dd case */
	t = max(1, HZ / 100);

	/*
	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
	 * overheads.
	 *
	 * (N * 10ms) on 2^N concurrent tasks.
	 */
	if (hi > lo)
		t += (hi - lo) * (10 * HZ) / 1024;

	/*
	 * This is a bit convoluted.  We try to base the next nr_dirtied_pause
	 * on the much more stable dirty_ratelimit.  However the next pause
	 * time will be computed based on task_ratelimit and the two rate
	 * limits may diverge considerably at times.  Especially if
	 * task_ratelimit goes below dirty_ratelimit/2 and the target pause is
	 * max_pause, the next pause time will be max_pause*2 _trimmed down_
	 * to max_pause.  As a result task_ratelimit won't be executed
	 * faithfully, which could eventually bring down dirty_ratelimit.
	 *
	 * We apply two rules to fix it up:
	 * 1) try to estimate the next pause time and if necessary, use a
	 *    lower nr_dirtied_pause so as not to exceed max_pause.  When this
	 *    happens, nr_dirtied_pause will be "dancing" with task_ratelimit.
	 * 2) limit the target pause time to max_pause/2, so that the normal
	 *    small fluctuations of task_ratelimit won't trigger rule (1) and
	 *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
	 */
	t = min(t, 1 + max_pause / 2);
	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);

	/*
	 * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
	 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
	 * When the 16 consecutive reads are often interrupted by some dirty
	 * throttling pause during the async writes, cfq will go idle
	 * (deadline is fine).  So push nr_dirtied_pause as high as possible
	 * until it reaches DIRTY_POLL_THRESH=32 pages.
	 */
	if (pages < DIRTY_POLL_THRESH) {
		t = max_pause;
		pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
		if (pages > DIRTY_POLL_THRESH) {
			pages = DIRTY_POLL_THRESH;
			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
		}
	}

	pause = HZ * pages / (task_ratelimit + 1);
	if (pause > max_pause) {
		t = max_pause;
		pages = task_ratelimit * t / roundup_pow_of_two(HZ);
	}

	*nr_dirtied_pause = pages;
	/*
	 * The minimal pause time will normally be half the target pause time.
	 */
	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}
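
/*
 * A worked example of the concurrency scaling (illustrative numbers,
 * HZ=1000): with avg_write_bandwidth = 25600 pages/s shared by 8
 * dd-style dirtiers, dirty_ratelimit ~= 3200 pages/s per task, so
 * hi - lo = ilog2(25600) - ilog2(3200) = 14 - 11 = 3 and the 10ms base
 * target grows by 3 * (10 * HZ) / 1024 ~= 29 jiffies to ~39ms (before
 * the max_pause/2 clamp), matching the "(N * 10ms) on 2^N concurrent
 * tasks" rule above.
 */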

static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long wb_reclaimable;

	/*
	 * wb_thresh is not treated as a hard limit the way dirty_thresh is,
	 * for two reasons:
	 * - in JBOD setup, wb_thresh can fluctuate a lot
	 * - in a system with HDD and USB key, the USB key may somehow
	 *   go into state (wb_dirty >> wb_thresh) either because
	 *   wb_dirty starts high, or because wb_thresh drops low.
	 *   In this case we don't want to hard throttle the USB key
	 *   dirtiers for 100 seconds until wb_dirty drops under
	 *   wb_thresh.  Instead the auxiliary wb control line in
	 *   wb_position_ratio() will let the dirtier task progress
	 *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
	 */
	dtc->wb_thresh = __wb_calc_thresh(dtc);
	dtc->wb_bg_thresh = dtc->thresh ?
		div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;

	/*
	 * In order to avoid the stacked BDI deadlock we need
	 * to ensure we accurately count the 'dirty' pages when
	 * the threshold is low.
	 *
	 * Otherwise it would be possible to get thresh+n pages
	 * reported dirty, even though there are thresh-m pages
	 * actually dirty; with m+n sitting in the percpu
	 * deltas.
	 */
	if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
		wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
		dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
	} else {
		wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
		dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
	}
}
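
/*
 * A worked example of the proportional wb_bg_thresh above (illustrative
 * numbers): with a global thresh of 2000 pages, bg_thresh of 1000 pages
 * and a per-wb share of wb_thresh = 400 pages, wb_bg_thresh scales to
 * 400 * 1000 / 2000 = 200 pages, preserving the global bg/dirty ratio
 * within this wb's share.
 */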

/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 */
static void balance_dirty_pages(struct address_space *mapping,
				struct bdi_writeback *wb,
				unsigned long pages_dirtied)
{
	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
	struct dirty_throttle_control * const gdtc = &gdtc_stor;
	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
						     &mdtc_stor : NULL;
	struct dirty_throttle_control *sdtc;
	unsigned long nr_reclaimable;	/* = file_dirty + unstable_nfs */
	long period;
	long pause;
	long max_pause;
	long min_pause;
	int nr_dirtied_pause;
	bool dirty_exceeded = false;
	unsigned long task_ratelimit;
	unsigned long dirty_ratelimit;
	struct backing_dev_info *bdi = wb->bdi;
	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
	unsigned long start_time = jiffies;

	for (;;) {
		unsigned long now = jiffies;
		unsigned long dirty, thresh, bg_thresh;
		unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
		unsigned long m_thresh = 0;
		unsigned long m_bg_thresh = 0;

		/*
		 * Unstable writes are a feature of certain networked
		 * filesystems (e.g. NFS) in which data may have been
		 * written to the server's write cache, but has not yet
		 * been flushed to permanent storage.
		 */
		nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
					global_node_page_state(NR_UNSTABLE_NFS);
		gdtc->avail = global_dirtyable_memory();
		gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);

		domain_dirty_limits(gdtc);

		if (unlikely(strictlimit)) {
			wb_dirty_limits(gdtc);

			dirty = gdtc->wb_dirty;
			thresh = gdtc->wb_thresh;
			bg_thresh = gdtc->wb_bg_thresh;
		} else {
			dirty = gdtc->dirty;
			thresh = gdtc->thresh;
			bg_thresh = gdtc->bg_thresh;
		}

		if (mdtc) {
			unsigned long filepages, headroom, writeback;

			/*
			 * If @wb belongs to !root memcg, repeat the same
			 * basic calculations for the memcg domain.
			 */
			mem_cgroup_wb_stats(wb, &filepages, &headroom,
					    &mdtc->dirty, &writeback);
			mdtc->dirty += writeback;
			mdtc_calc_avail(mdtc, filepages, headroom);

			domain_dirty_limits(mdtc);

			if (unlikely(strictlimit)) {
				wb_dirty_limits(mdtc);
				m_dirty = mdtc->wb_dirty;
				m_thresh = mdtc->wb_thresh;
				m_bg_thresh = mdtc->wb_bg_thresh;
			} else {
				m_dirty = mdtc->dirty;
				m_thresh = mdtc->thresh;
				m_bg_thresh = mdtc->bg_thresh;
			}
		}

		/*
		 * Throttle it only when the background writeback cannot
		 * catch up.  This avoids (excessively) small writeouts
		 * when the wb limits are ramping up in case of !strictlimit.
		 *
		 * In strictlimit case make decision based on the wb counters
		 * and limits.  Small writeouts when the wb limits are ramping
		 * up are the price we consciously pay for strictlimit-ing.
		 *
		 * If memcg domain is in effect, @dirty should be under
		 * both global and memcg freerun ceilings.
		 */
		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
		    (!mdtc ||
		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
			unsigned long intv = dirty_poll_interval(dirty, thresh);
			unsigned long m_intv = ULONG_MAX;

			current->dirty_paused_when = now;
			current->nr_dirtied = 0;
			if (mdtc)
				m_intv = dirty_poll_interval(m_dirty, m_thresh);
			current->nr_dirtied_pause = min(intv, m_intv);
			break;
		}

		if (unlikely(!writeback_in_progress(wb)))
			wb_start_background_writeback(wb);

		/*
		 * Calculate global domain's pos_ratio and select the
		 * global dtc by default.
		 */
		if (!strictlimit)
			wb_dirty_limits(gdtc);

		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
			((gdtc->dirty > gdtc->thresh) || strictlimit);

		wb_position_ratio(gdtc);
		sdtc = gdtc;

		if (mdtc) {
			/*
			 * If memcg domain is in effect, calculate its
			 * pos_ratio.  @wb should satisfy constraints from
			 * both global and memcg domains.  Choose the one
			 * w/ lower pos_ratio.
			 */
			if (!strictlimit)
				wb_dirty_limits(mdtc);

			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
				((mdtc->dirty > mdtc->thresh) || strictlimit);

			wb_position_ratio(mdtc);
			if (mdtc->pos_ratio < gdtc->pos_ratio)
				sdtc = mdtc;
		}

		if (dirty_exceeded && !wb->dirty_exceeded)
			wb->dirty_exceeded = 1;

		if (time_is_before_jiffies(wb->bw_time_stamp +
					   BANDWIDTH_INTERVAL)) {
			spin_lock(&wb->list_lock);
			__wb_update_bandwidth(gdtc, mdtc, start_time, true);
			spin_unlock(&wb->list_lock);
		}

		/* throttle according to the chosen dtc */
		dirty_ratelimit = wb->dirty_ratelimit;
		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
							RATELIMIT_CALC_SHIFT;
		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
		min_pause = wb_min_pause(wb, max_pause,
					 task_ratelimit, dirty_ratelimit,
					 &nr_dirtied_pause);

		if (unlikely(task_ratelimit == 0)) {
			period = max_pause;
			pause = max_pause;
			goto pause;
		}
		period = HZ * pages_dirtied / task_ratelimit;
		pause = period;
		if (current->dirty_paused_when)
			pause -= now - current->dirty_paused_when;
		/*
		 * For less than 1s think time (ext3/4 may block the dirtier
		 * for up to 800ms from time to time on 1-HDD; so does xfs,
		 * however at much less frequency), try to compensate it in
		 * future periods by updating the virtual time; otherwise just
		 * do a reset, as it may be a light dirtier.
		 */
		if (pause < min_pause) {
			trace_balance_dirty_pages(wb,
						  sdtc->thresh,
						  sdtc->bg_thresh,
						  sdtc->dirty,
						  sdtc->wb_thresh,
						  sdtc->wb_dirty,
						  dirty_ratelimit,
						  task_ratelimit,
						  pages_dirtied,
						  period,
						  min(pause, 0L),
						  start_time);
			if (pause < -HZ) {
				current->dirty_paused_when = now;
				current->nr_dirtied = 0;
			} else if (period) {
				current->dirty_paused_when += period;
				current->nr_dirtied = 0;
			} else if (current->nr_dirtied_pause <= pages_dirtied)
				current->nr_dirtied_pause += pages_dirtied;
			break;
		}
		if (unlikely(pause > max_pause)) {
			/* for occasional dropped task_ratelimit */
			now += min(pause - max_pause, max_pause);
			pause = max_pause;
		}

pause:
		trace_balance_dirty_pages(wb,
					  sdtc->thresh,
					  sdtc->bg_thresh,
					  sdtc->dirty,
					  sdtc->wb_thresh,
					  sdtc->wb_dirty,
					  dirty_ratelimit,
					  task_ratelimit,
					  pages_dirtied,
					  period,
					  pause,
					  start_time);
		__set_current_state(TASK_KILLABLE);
		io_schedule_timeout(pause);

		current->dirty_paused_when = now + pause;
		current->nr_dirtied = 0;
		current->nr_dirtied_pause = nr_dirtied_pause;

		/*
		 * This is typically equal to (dirty < thresh) and can also
		 * keep "1000+ dd on a slow USB stick" under control.
		 */
		if (task_ratelimit)
			break;

		/*
		 * In the case of an unresponsive NFS server and the NFS dirty
		 * pages exceeding dirty_thresh, give the other good wb's a
		 * pipe to go through, so that tasks on them still remain
		 * responsive.
		 *
		 * In theory 1 page is enough to keep the consumer-producer
		 * pipe going: the flusher cleans 1 page => the task dirties 1
		 * more page.  However wb_dirty has accounting errors.  So use
		 * the larger and more IO friendly wb_stat_error.
		 */
		if (sdtc->wb_dirty <= wb_stat_error(wb))
			break;

		if (fatal_signal_pending(current))
			break;
	}

	if (!dirty_exceeded && wb->dirty_exceeded)
		wb->dirty_exceeded = 0;

	if (writeback_in_progress(wb))
		return;

	/*
	 * In laptop mode, we wait until hitting the higher threshold before
	 * starting background writeout, and then write out all the way down
	 * to the lower threshold.  So slow writers cause minimal disk activity.
	 *
	 * In normal mode, we start background writeout at the lower
	 * background_thresh, to keep the amount of dirty memory low.
	 */
	if (laptop_mode)
		return;

	if (nr_reclaimable > gdtc->bg_thresh)
		wb_start_background_writeback(wb);
}
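
/*
 * A worked example of the pause arithmetic in balance_dirty_pages()
 * (illustrative numbers, HZ=1000): with task_ratelimit = 1600 pages/s
 * and pages_dirtied = 16, period = 1000 * 16 / 1600 = 10 jiffies.  If
 * 4 jiffies of think time have already passed since dirty_paused_when,
 * the task sleeps for the remaining pause = 6 jiffies, so its long-term
 * dirtying rate still averages ~1600 pages/s.
 */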

static DEFINE_PER_CPU(int, bdp_ratelimits);

/*
 * Normal tasks are throttled by
 *	loop {
 *		dirty tsk->nr_dirtied_pause pages;
 *		take a snap in balance_dirty_pages();
 *	}
 * However there is a worst case.  If every task exits immediately after
 * dirtying (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will
 * never be called to throttle the page dirties.  The solution is to save
 * the not yet throttled page dirties in dirty_throttle_leaks on task exit
 * and charge them randomly into the running tasks.  This works well for
 * the above worst case, as the new task will pick up and accumulate the
 * old task's leaked dirty count and eventually get throttled.
 */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
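
/*
 * For example (illustrative numbers): if short-lived tasks each exit
 * after dirtying 31 pages with nr_dirtied_pause = 32, none of them is
 * ever throttled; their 31-page leaks accumulate in dirty_throttle_leaks
 * and are charged to whichever task dirties pages next on that CPU,
 * which then hits balance_dirty_pages() on schedule.
 */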

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, get_writeback_state is expensive, so try to avoid
 * calling it too often (ratelimiting).  But once we're over the dirty memory
 * limit we decrease the ratelimiting by a lot, to prevent individual processes
 * from overshooting the limit by (ratelimit_pages) each.
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;
	int ratelimit;
	int *p;

	if (!bdi_cap_account_dirty(bdi))
		return;

	if (inode_cgwb_enabled(inode))
		wb = wb_get_create_current(bdi, GFP_KERNEL);
	if (!wb)
		wb = &bdi->wb;

	ratelimit = current->nr_dirtied_pause;
	if (wb->dirty_exceeded)
		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
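	/*
	 * Illustrative note: with 4K pages (PAGE_SHIFT = 12) the clamp
	 * above is 32 >> 2 = 8 pages, i.e. 32KB, so a task dirtying
	 * through an over-limit wb re-enters balance_dirty_pages() much
	 * more frequently until the wb recovers.
	 */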

	preempt_disable();
	/*
	 * This prevents one CPU from accumulating too many dirtied pages
	 * without calling into balance_dirty_pages(), which can happen when
	 * there are 1000+ tasks that all start dirtying pages at exactly the
	 * same time, hence all honour a too-large initial
	 * task->nr_dirtied_pause.
	 */
	p = this_cpu_ptr(&bdp_ratelimits);
	if (unlikely(current->nr_dirtied >= ratelimit))
		*p = 0;
	else if (unlikely(*p >= ratelimit_pages)) {
		*p = 0;
		ratelimit = 0;
	}
	/*
	 * Pick up the pages dirtied by exited tasks.  This avoids lots of
	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
	 * the dirty throttling and livelocking other long-running dirtiers.
	 */
	p = this_cpu_ptr(&dirty_throttle_leaks);
	if (*p > 0 && current->nr_dirtied < ratelimit) {
		unsigned long nr_pages_dirtied;

		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
		*p -= nr_pages_dirtied;
		current->nr_dirtied += nr_pages_dirtied;
	}
	preempt_enable();

	if (unlikely(current->nr_dirtied >= ratelimit))
		balance_dirty_pages(mapping, wb, current->nr_dirtied);

	wb_put(wb);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);

/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
 * clean enough.  Returns %true if writeback should continue.
 */
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
	struct dirty_throttle_control * const gdtc = &gdtc_stor;
	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
						     &mdtc_stor : NULL;

	/*
	 * Similar to balance_dirty_pages() but ignores pages being written
	 * as we're trying to decide whether to put more under writeback.
	 */
	gdtc->avail = global_dirtyable_memory();
	gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
		      global_node_page_state(NR_UNSTABLE_NFS);
	domain_dirty_limits(gdtc);

	if (gdtc->dirty > gdtc->bg_thresh)
		return true;

	if (wb_stat(wb, WB_RECLAIMABLE) >
	    wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
		return true;

	if (mdtc) {
		unsigned long filepages, headroom, writeback;

		mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
				    &writeback);
		mdtc_calc_avail(mdtc, filepages, headroom);
		domain_dirty_limits(mdtc);	/* ditto, ignore writeback */

		if (mdtc->dirty > mdtc->bg_thresh)
			return true;

		if (wb_stat(wb, WB_RECLAIMABLE) >
		    wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
			return true;
	}

	return false;
}
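
/*
 * For example (illustrative numbers): with bg_thresh = 1000 pages and
 * gdtc->dirty = 800, the global test fails, but a wb holding 300
 * reclaimable pages against a wb_calc_thresh() share of 250 still
 * returns true, so background writeback keeps working on that wb.
 */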

/*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 */
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, buffer, length, ppos);
	return 0;
}

#ifdef CONFIG_BLOCK
void laptop_mode_timer_fn(unsigned long data)
{
	struct request_queue *q = (struct request_queue *)data;
	int nr_pages = global_node_page_state(NR_FILE_DIRTY) +
		global_node_page_state(NR_UNSTABLE_NFS);
	struct bdi_writeback *wb;

	/*
	 * We want to write everything out, not just down to the dirty
	 * threshold
	 */
	if (!bdi_has_dirty_io(&q->backing_dev_info))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
		if (wb_has_dirty_io(wb))
			wb_start_writeback(wb, nr_pages, true,
					   WB_REASON_LAPTOP_TIMER);
	rcu_read_unlock();
}

/*
 * We've spun up the disk and we're in laptop mode: schedule writeback
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 */
void laptop_io_completion(struct backing_dev_info *info)
{
	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}

/*
 * We're in laptop mode and we've just synced.  The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion.
 * Nothing needs to be written back anymore, so we unschedule the writeback.
 */
void laptop_sync_completion(void)
{
	struct backing_dev_info *bdi;

	rcu_read_lock();

	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
		del_timer(&bdi->laptop_mode_wb_timer);

	rcu_read_unlock();
}
#endif

/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 * If it is too low then SMP machines will call the (expensive)
 * get_writeback_state too often.
 *
 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 * thresholds.
 */
void writeback_set_ratelimit(void)
{
	struct wb_domain *dom = &global_wb_domain;
	unsigned long background_thresh;
	unsigned long dirty_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);
	dom->dirty_limit = dirty_thresh;
	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
	if (ratelimit_pages < 16)
		ratelimit_pages = 16;
}
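
/*
 * A worked example (illustrative numbers): with dirty_thresh = 409600
 * pages (1.6GB of 4K pages) on an 8-CPU machine, ratelimit_pages =
 * 409600 / (8 * 32) = 1600, so even with every CPU dirtying 1600 pages
 * between checks the combined overshoot stays within 1/32 (~3%) of the
 * limit.
 */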

static int page_writeback_cpu_online(unsigned int cpu)
{
	writeback_set_ratelimit();
	return 0;
}

/*
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory
 * related to pages that could be allocated for buffers (by
 * comparing nr_free_buffer_pages() to vm_total_pages).
 *
 * However, that was when we used "dirty_ratio" to scale with
 * all memory, and we don't do that any more.  "dirty_ratio"
 * is now applied to total non-HIGHPAGE memory (by subtracting
 * totalhigh_pages from vm_total_pages), and as such we can't
 * get into the old insane situation any more where we had
 * large amounts of dirty pages compared to a small amount of
 * non-HIGHMEM memory.
 *
 * But we might still want to scale the dirty_ratio by how
 * much memory the box has.
 */
void __init page_writeback_init(void)
{
	BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));

	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
			  page_writeback_cpu_online, NULL);
	cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
			  page_writeback_cpu_online);
}

/**
 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag.  The idea is
 * that write_cache_pages (or whoever calls this function) will then use
 * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
 * used to avoid livelocking of writeback by a process steadily creating new
 * dirty pages in the file (thus it is important for this function to be quick
 * so that it can tag pages faster than a dirtying process can create them).
 */
/*
 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
 */
void tag_pages_for_writeback(struct address_space *mapping,
			     pgoff_t start, pgoff_t end)
{
#define WRITEBACK_TAG_BATCH 4096
	unsigned long tagged;

	do {
		spin_lock_irq(&mapping->tree_lock);
		tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
				&start, end, WRITEBACK_TAG_BATCH,
				PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
		spin_unlock_irq(&mapping->tree_lock);
		WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
		cond_resched();
		/* We check 'start' to handle wrapping when end == ~0UL */
	} while (tagged >= WRITEBACK_TAG_BATCH && start);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
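
/*
 * Batching example (illustrative numbers): tagging a 1M-page dirty range
 * proceeds in 1048576 / 4096 = 256 batches of WRITEBACK_TAG_BATCH pages,
 * dropping tree_lock and calling cond_resched() between batches so the
 * tagging pass cannot monopolize the lock or the CPU.
 */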

/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * To avoid livelocks (when another process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them.  For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set).  The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t uninitialized_var(writeback_index);
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int cycled;
	int range_whole = 0;
	int tag;

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		if (index == 0)
			cycled = 1;
		else
			cycled = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		cycled = 1; /* ignore range_cyclic tests */
	}
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping.  However, page->index will not change
			 * because we have a reference on the page.
			 */
			if (page->index > end) {
				/*
				 * can't be range_cyclic (1st pass) because
				 * end == -1 in that case.
				 */
				done = 1;
				break;
			}

			done_index = page->index;

			lock_page(page);

			/*
			 * Page truncated or invalidated.  We can freely skip
			 * it then, even for data integrity operations: the
			 * page has disappeared concurrently, so there could
			 * be no real expectation of this data integrity
			 * operation even if there is now a new, dirty page at
			 * the same pagecache address.
			 */
			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
			ret = (*writepage)(page, wbc, data);
			if (unlikely(ret)) {
				if (ret == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					ret = 0;
				} else {
					/*
					 * done_index is set past this page,
					 * so media errors will not choke
					 * background writeout for the entire
					 * file.  This has consequences for
					 * range_cyclic semantics (ie. it may
					 * not be suitable for data integrity
					 * writeout).
					 */
					done_index = page->index + 1;
					done = 1;
					break;
				}
			}

			/*
			 * We stop writing back only if we are not doing
			 * integrity sync.  In case of integrity sync we have
			 * to keep going until we have written all the pages
			 * we tagged for writeback prior to entering this loop.
			 */
			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!cycled && !done) {
		/*
		 * range_cyclic:
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
EXPORT_SYMBOL(write_cache_pages);

/*
 * Function used by generic_writepages to call the real writepage
 * function and set the mapping flags on error
 */
static int __writepage(struct page *page, struct writeback_control *wbc,
		       void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);
	mapping_set_error(mapping, ret);
	return ret;
}

/**
 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 */
int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret;

	/* deal with chardevs and other special files */
	if (!mapping->a_ops->writepage)
		return 0;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
	blk_finish_plug(&plug);
	return ret;
}

EXPORT_SYMBOL(generic_writepages);

int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;

	if (wbc->nr_to_write <= 0)
		return 0;
	if (mapping->a_ops->writepages)
		ret = mapping->a_ops->writepages(mapping, wbc);
	else
		ret = generic_writepages(mapping, wbc);
	return ret;
}

/**
 * write_one_page - write out a single page and optionally wait on I/O
 * @page: the page to write
 * @wait: if true, wait on writeout
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * write_one_page() returns a negative error code if I/O failed.
 */
int write_one_page(struct page *page, int wait)
{
	struct address_space *mapping = page->mapping;
	int ret = 0;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 1,
	};

	BUG_ON(!PageLocked(page));

	if (wait)
		wait_on_page_writeback(page);

	if (clear_page_dirty_for_io(page)) {
		get_page(page);
		ret = mapping->a_ops->writepage(page, &wbc);
		if (ret == 0 && wait) {
			wait_on_page_writeback(page);
			if (PageError(page))
				ret = -EIO;
		}
		put_page(page);
	} else {
		unlock_page(page);
	}
	return ret;
}
EXPORT_SYMBOL(write_one_page);

/*
 * For address_spaces which do not use buffers nor write back.
 */
int __set_page_dirty_no_writeback(struct page *page)
{
	if (!PageDirty(page))
		return !TestSetPageDirty(page);
	return 0;
}

/*
 * Helper function for set_page_dirty family.
 *
 * Caller must hold lock_page_memcg().
 *
 * NOTE: This relies on being atomic wrt interrupts.
 */
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
	struct inode *inode = mapping->host;

	trace_writeback_dirty_page(page, mapping);

	if (mapping_cap_account_dirty(mapping)) {
		struct bdi_writeback *wb;

		inode_attach_wb(inode, page);
		wb = inode_to_wb(inode);

		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
		__inc_node_page_state(page, NR_FILE_DIRTY);
		__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		__inc_node_page_state(page, NR_DIRTIED);
		__inc_wb_stat(wb, WB_RECLAIMABLE);
		__inc_wb_stat(wb, WB_DIRTIED);
		task_io_account_write(PAGE_SIZE);
		current->nr_dirtied++;
		this_cpu_inc(bdp_ratelimits);
	}
}
EXPORT_SYMBOL(account_page_dirtied);

/*
 * Helper function for de-accounting a dirty page without writeback.
 *
 * Caller must hold lock_page_memcg().
 */
void account_page_cleaned(struct page *page, struct address_space *mapping,
			  struct bdi_writeback *wb)
{
	if (mapping_cap_account_dirty(mapping)) {
		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
		dec_node_page_state(page, NR_FILE_DIRTY);
		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		dec_wb_stat(wb, WB_RECLAIMABLE);
		task_io_account_cancelled_write(PAGE_SIZE);
	}
}

/*
 * For address_spaces which do not use buffers.  Just tag the page as dirty in
 * its radix tree.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 *
 * The caller must ensure this doesn't race with truncation.  Most will simply
 * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
 * the pte lock held, which also locks out truncation.
 */
int __set_page_dirty_nobuffers(struct page *page)
{
	lock_page_memcg(page);
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);
		unsigned long flags;

		if (!mapping) {
			unlock_page_memcg(page);
			return 1;
		}

		spin_lock_irqsave(&mapping->tree_lock, flags);
		BUG_ON(page_mapping(page) != mapping);
		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
		account_page_dirtied(page, mapping);
		radix_tree_tag_set(&mapping->page_tree, page_index(page),
				   PAGECACHE_TAG_DIRTY);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		unlock_page_memcg(page);

		if (mapping->host) {
			/* !PageAnon && !swapper_space */
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		}
		return 1;
	}
	unlock_page_memcg(page);
	return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);

/*
 * Call this whenever redirtying a page, to de-account the dirty counters
 * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
 * counters (NR_WRITTEN, BDI_WRITTEN) in the long term.  The mismatches will
 * lead to systematic errors in balanced_dirty_ratelimit and the dirty pages
 * position control.
 */
void account_page_redirty(struct page *page)
{
	struct address_space *mapping = page->mapping;

	if (mapping && mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		bool locked;

		wb = unlocked_inode_to_wb_begin(inode, &locked);
		current->nr_dirtied--;
		dec_node_page_state(page, NR_DIRTIED);
		dec_wb_stat(wb, WB_DIRTIED);
		unlocked_inode_to_wb_end(inode, locked);
	}
}
EXPORT_SYMBOL(account_page_redirty);

/*
 * When a writepage implementation decides that it doesn't want to write this
 * page for some reason, it should redirty the locked page via
 * redirty_page_for_writepage() and it should then unlock the page and return 0
 */
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
	int ret;

	wbc->pages_skipped++;
	ret = __set_page_dirty_nobuffers(page);
	account_page_redirty(page);
	return ret;
}
EXPORT_SYMBOL(redirty_page_for_writepage);

/*
 * Dirty a page.
 *
 * For pages with a mapping this should be done under the page lock
 * for the benefit of asynchronous memory errors which prefer a consistent
 * dirty state.  This rule can be broken in some special cases,
 * but it is better not to.
 *
 * If the mapping doesn't provide a set_page_dirty a_op, then
 * just fall through and assume that it wants buffer_heads.
 */
int set_page_dirty(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	page = compound_head(page);
	if (likely(mapping)) {
		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
		/*
		 * readahead/lru_deactivate_page could remain
		 * PG_readahead/PG_reclaim due to a race with
		 * end_page_writeback.  As for readahead, if the page is
		 * written, the flags would be reset, so no problem.  As for
		 * lru_deactivate_page, if the page is redirtied, the flag
		 * will be reset, so no problem either.  But if the page is
		 * used by readahead, it will confuse readahead and make it
		 * restart the size rampup process.  That is a trivial
		 * problem, though.
		 */
		if (PageReclaim(page))
			ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
		if (!spd)
			spd = __set_page_dirty_buffers;
#endif
		return (*spd)(page);
	}
	if (!PageDirty(page)) {
		if (!TestSetPageDirty(page))
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL(set_page_dirty);

/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
int set_page_dirty_lock(struct page *page)
{
	int ret;

	lock_page(page);
	ret = set_page_dirty(page);
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL(set_page_dirty_lock);

/*
 * This cancels just the dirty bit on the kernel page itself, it does NOT
 * actually remove dirty bits on any mmap's that may be around.  It also
 * leaves the page tagged dirty, so any sync activity will still find it on
 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
 * look at the dirty bits in the VM.
 *
 * Doing this should *normally* only ever be done when a page is truncated,
 * and is not actually mapped anywhere at all.  However, fs/buffer.c does
 * this when it notices that somebody has cleaned out all the buffers on a
 * page without actually doing it through the VM.  Can you say "ext3 is
 * horribly ugly"?  Thought you could.
 */
void cancel_dirty_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		bool locked;

		lock_page_memcg(page);
		wb = unlocked_inode_to_wb_begin(inode, &locked);

		if (TestClearPageDirty(page))
			account_page_cleaned(page, mapping, wb);

		unlocked_inode_to_wb_end(inode, locked);
		unlock_page_memcg(page);
	} else {
		ClearPageDirty(page);
	}
}
EXPORT_SYMBOL(cancel_dirty_page);

/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout.  We leave the page
 * tagged as dirty in the radix tree so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
 * at which stage we bring the page's dirty flag and radix-tree dirty tag
 * back into sync.
 *
 * This incoherency between the page's dirty flag and radix-tree tag is
 * unfortunate, but it only exists while the page is locked.
 */
int clear_page_dirty_for_io(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret = 0;

	BUG_ON(!PageLocked(page));

	if (mapping && mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		bool locked;

		/*
		 * Yes, Virginia, this is indeed insane.
		 *
		 * We use this sequence to make sure that
		 *  (a) we account for dirty stats properly
		 *  (b) we tell the low-level filesystem to
		 *      mark the whole page dirty if it was
		 *      dirty in a pagetable.  Only to then
		 *  (c) clean the page again and return 1 to
		 *      cause the writeback.
		 *
		 * This way we avoid all nasty races with the
		 * dirty bit in multiple places and clearing
		 * them concurrently from different threads.
		 *
		 * Note! Normally the "set_page_dirty(page)"
		 * has no effect on the actual dirty bit - since
		 * that will already usually be set.  But we
		 * need the side effects, and it can help us
		 * avoid races.
		 *
		 * We basically use the page "master dirty bit"
		 * as a serialization point for all the different
		 * threads doing their things.
		 */
		if (page_mkclean(page))
			set_page_dirty(page);
		/*
		 * We carefully synchronise fault handlers against
		 * installing a dirty pte and marking the page dirty
		 * at this point.  We do this by having them hold the
		 * page lock while dirtying the page, and pages are
		 * always locked coming in here, so we get the desired
		 * exclusion.
		 */
		wb = unlocked_inode_to_wb_begin(inode, &locked);
		if (TestClearPageDirty(page)) {
			mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
			dec_node_page_state(page, NR_FILE_DIRTY);
			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
			dec_wb_stat(wb, WB_RECLAIMABLE);
			ret = 1;
		}
		unlocked_inode_to_wb_end(inode, locked);
		return ret;
	}
	return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);

int test_clear_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret;

	lock_page_memcg(page);
	if (mapping && mapping_use_writeback_tags(mapping)) {
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		ret = TestClearPageWriteback(page);
		if (ret) {
			radix_tree_tag_clear(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi)) {
				struct bdi_writeback *wb = inode_to_wb(inode);

				__dec_wb_stat(wb, WB_WRITEBACK);
				__wb_writeout_inc(wb);
			}
		}

		if (mapping->host && !mapping_tagged(mapping,
						     PAGECACHE_TAG_WRITEBACK))
			sb_clear_inode_writeback(mapping->host);

		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	} else {
		ret = TestClearPageWriteback(page);
	}
	if (ret) {
		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
		dec_node_page_state(page, NR_WRITEBACK);
		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		inc_node_page_state(page, NR_WRITTEN);
	}
	unlock_page_memcg(page);
	return ret;
}

int __test_set_page_writeback(struct page *page, bool keep_write)
{
	struct address_space *mapping = page_mapping(page);
	int ret;

	lock_page_memcg(page);
	if (mapping && mapping_use_writeback_tags(mapping)) {
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		ret = TestSetPageWriteback(page);
		if (!ret) {
			bool on_wblist;

			on_wblist = mapping_tagged(mapping,
						   PAGECACHE_TAG_WRITEBACK);

			radix_tree_tag_set(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi))
				__inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);

			/*
			 * We can come through here when swapping anonymous
			 * pages, so we don't necessarily have an inode to
			 * track for sync.
			 */
			if (mapping->host && !on_wblist)
				sb_mark_inode_writeback(mapping->host);
		}
		if (!PageDirty(page))
			radix_tree_tag_clear(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_DIRTY);
		if (!keep_write)
			radix_tree_tag_clear(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_TOWRITE);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	} else {
		ret = TestSetPageWriteback(page);
	}
	if (!ret) {
		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
		inc_node_page_state(page, NR_WRITEBACK);
		inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
	}
	unlock_page_memcg(page);
	return ret;
}
EXPORT_SYMBOL(__test_set_page_writeback);

/*
 * Return true if any of the pages in the mapping are marked with the
 * passed tag.
 */
int mapping_tagged(struct address_space *mapping, int tag)
{
	return radix_tree_tagged(&mapping->page_tree, tag);
}
EXPORT_SYMBOL(mapping_tagged);

/**
 * wait_for_stable_page() - wait for writeback to finish, if necessary.
 * @page: The page to wait on.
 *
 * This function determines if the given page is related to a backing device
 * that requires page contents to be held stable during writeback.  If so, then
 * it will wait for any pending writeback to complete.
 */
void wait_for_stable_page(struct page *page)
{
	if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
		wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);