Blame view
mm/page-writeback.c
85 KB
1da177e4c Linux-2.6.12-rc2 |
1 |
/* |
f30c22695 fix file specific... |
2 |
* mm/page-writeback.c |
1da177e4c Linux-2.6.12-rc2 |
3 4 |
* * Copyright (C) 2002, Linus Torvalds. |
90eec103b treewide: Remove ... |
5 |
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra |
1da177e4c Linux-2.6.12-rc2 |
6 7 8 9 |
* * Contains functions related to writing back dirty pages at the * address_space level. * |
e1f8e8744 Remove Andrew Mor... |
10 |
* 10Apr2002 Andrew Morton |
1da177e4c Linux-2.6.12-rc2 |
11 12 13 14 |
* Initial version */ #include <linux/kernel.h> |
b95f1b31b mm: Map most file... |
15 |
#include <linux/export.h> |
1da177e4c Linux-2.6.12-rc2 |
16 17 18 19 20 21 22 23 24 |
#include <linux/spinlock.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/init.h> #include <linux/backing-dev.h> |
55e829af0 [PATCH] io-accoun... |
25 |
#include <linux/task_io_accounting_ops.h> |
1da177e4c Linux-2.6.12-rc2 |
26 27 |
#include <linux/blkdev.h> #include <linux/mpage.h> |
d08b3851d [PATCH] mm: track... |
28 |
#include <linux/rmap.h> |
1da177e4c Linux-2.6.12-rc2 |
29 30 31 32 33 34 |
#include <linux/percpu.h> #include <linux/notifier.h> #include <linux/smp.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> |
ff01bb483 fs: move code out... |
35 |
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ |
811d736f9 [PATCH] BLOCK: Di... |
36 |
#include <linux/pagevec.h> |
eb608e3a3 block: Convert BD... |
37 |
#include <linux/timer.h> |
8bd75c77b sched/rt: Move rt... |
38 |
#include <linux/sched/rt.h> |
f361bf4a6 sched/headers: Pr... |
39 |
#include <linux/sched/signal.h> |
6e543d578 mm: vmscan: fix d... |
40 |
#include <linux/mm_inline.h> |
028c2dd18 writeback: Add tr... |
41 |
#include <trace/events/writeback.h> |
1da177e4c Linux-2.6.12-rc2 |
42 |
|
6e543d578 mm: vmscan: fix d... |
43 |
#include "internal.h" |
1da177e4c Linux-2.6.12-rc2 |
44 |
/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 */
#define MAX_PAUSE		max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising pause time to max_pause when it falls below it.
 */
#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL	max(HZ/5, 1)

/* fixed-point shift used by the dirty ratelimit calculations */
#define RATELIMIT_CALC_SHIFT	10

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;
1da177e4c Linux-2.6.12-rc2 |
65 66 67 |
/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 */
int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

/* the default (global) dirty-throttling domain; memcg domains hang off it */
struct wb_domain global_wb_domain;
1da177e4c Linux-2.6.12-rc2 |
121 |
|
2bc00aef0 writeback: consol... |
122 123 |
/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
	struct wb_domain	*dom;
	struct dirty_throttle_control *gdtc;	/* only set in memcg dtc's */
#endif
	struct bdi_writeback	*wb;
	struct fprop_local_percpu *wb_completions;

	unsigned long		avail;		/* dirtyable */
	unsigned long		dirty;		/* file_dirty + write + nfs */
	unsigned long		thresh;		/* dirty threshold */
	unsigned long		bg_thresh;	/* dirty background threshold */

	unsigned long		wb_dirty;	/* per-wb counterparts */
	unsigned long		wb_thresh;
	unsigned long		wb_bg_thresh;

	/* scaling factor applied to the dirty ratelimit for this wb */
	unsigned long		pos_ratio;
};
eb608e3a3 block: Convert BD... |
142 143 144 145 146 147 |
/* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will * reflect changes in current writeout rate. */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) |
04fbfdc14 mm: per device di... |
148 |
|
693108a8a writeback: make b... |
149 |
#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Designated-initializer fragments for dirty_throttle_control: GDTC_* set
 * up a global-domain dtc, MDTC_INIT a memcg-domain dtc linked to its
 * global counterpart via ->gdtc.
 */
#define GDTC_INIT(__wb)		.wb = (__wb),				\
				.dom = &global_wb_domain,		\
				.wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB		.dom = &global_wb_domain
#define MDTC_INIT(__wb, __gdtc)	.wb = (__wb),				\
				.dom = mem_cgroup_wb_domain(__wb),	\
				.wb_completions = &(__wb)->memcg_completions, \
				.gdtc = __gdtc

/* a memcg dtc is valid iff it carries a wb_domain */
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
	return dtc->dom;
}

static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
	return dtc->dom;
}

/* returns the paired global dtc for a memcg dtc, NULL for a global one */
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
	return mdtc->gdtc;
}

static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
	return &wb->memcg_completions;
}

/*
 * Scale the bdi's min/max ratios by @wb's share of the bdi's total write
 * bandwidth, so per-cgroup wb's split the bdi-wide limits proportionally.
 */
static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{
	unsigned long this_bw = wb->avg_write_bandwidth;
	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
	unsigned long long min = wb->bdi->min_ratio;
	unsigned long long max = wb->bdi->max_ratio;

	/*
	 * @wb may already be clean by the time control reaches here and
	 * the total may not include its bw.
	 */
	if (this_bw < tot_bw) {
		if (min) {
			min *= this_bw;
			do_div(min, tot_bw);
		}
		if (max < 100) {
			max *= this_bw;
			do_div(max, tot_bw);
		}
	}

	*minp = min;
	*maxp = max;
}

#else	/* CONFIG_CGROUP_WRITEBACK */

#define GDTC_INIT(__wb)		.wb = (__wb),				\
				.wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)

static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
	return false;
}

static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
	return &global_wb_domain;
}

static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
	return NULL;
}

static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
	return NULL;
}

/* without cgroup writeback there is a single wb; use the bdi ratios as-is */
static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{
	*minp = wb->bdi->min_ratio;
	*maxp = wb->bdi->max_ratio;
}

#endif	/* CONFIG_CGROUP_WRITEBACK */
04fbfdc14 mm: per device di... |
235 |
/* |
a756cf590 mm: try to distri... |
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 |
 * In a memory zone, there is a certain amount of pages we consider
 * available for the page cache, which is essentially the number of
 * free and reclaimable pages, minus some zone reserves to protect
 * lowmem and the ability to uphold the zone's watermarks without
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value of which the
 * user-configurable dirty ratio is the effective number of pages that
 * are allowed to be actually dirtied.  Per individual zone, or
 * globally by using the sum of dirtyable pages over all zones.
 *
 * Because the user is allowed to specify the dirty limit globally as
 * absolute number of bytes, calculating the per-zone dirty limit can
 * require translating the configured limit into a percentage of
 * global dirtyable memory first.
 */
a804552b9 mm/page-writeback... |
252 |
/** |
281e37265 mm, page_alloc: c... |
253 254 |
* node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node |
a804552b9 mm/page-writeback... |
255 |
* |
281e37265 mm, page_alloc: c... |
256 257 |
* Returns the node's number of pages potentially available for dirty * page cache. This is the base value for the per-node dirty limits. |
a804552b9 mm/page-writeback... |
258 |
*/ |
281e37265 mm, page_alloc: c... |
259 |
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
	unsigned long nr_pages = 0;
	int z;

	/* sum free pages over every populated zone of the node */
	for (z = 0; z < MAX_NR_ZONES; z++) {
		struct zone *zone = pgdat->node_zones + z;

		if (!populated_zone(zone))
			continue;

		nr_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/*
	 * Pages reserved for the kernel should not be considered
	 * dirtyable, to prevent a situation where reclaim has to
	 * clean pages in order to balance the zones.
	 */
	nr_pages -= min(nr_pages, pgdat->totalreserve_pages);

	/* file LRU pages are reclaimable and therefore count as dirtyable */
	nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
	nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);

	return nr_pages;
}
1edf22348 mm/page-writeback... |
285 286 287 288 |
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
	int node;
	unsigned long x = 0;
	int i;

	/* walk every highmem zone of every node with highmem */
	for_each_node_state(node, N_HIGH_MEMORY) {
		for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
			struct zone *z;
			unsigned long nr_pages;

			if (!is_highmem_idx(i))
				continue;

			z = &NODE_DATA(node)->node_zones[i];
			if (!populated_zone(z))
				continue;

			nr_pages = zone_page_state(z, NR_FREE_PAGES);
			/* watch for underflows */
			nr_pages -= min(nr_pages, high_wmark_pages(z));
			nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
			nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
			x += nr_pages;
		}
	}

	/*
	 * Unreclaimable memory (kernel memory or anonymous memory
	 * without swap) can bring down the dirtyable pages below
	 * the zone's dirty balance reserve and the above calculation
	 * will underflow.  However we still want to add in nodes
	 * which are below threshold (negative values) to get a more
	 * accurate calculation but make sure that the total never
	 * underflows.
	 */
	if ((long)x < 0)
		x = 0;

	/*
	 * Make sure that the number of highmem pages is never larger
	 * than the number of the total dirtyable memory. This can only
	 * occur in very strange VM situations but we want to make sure
	 * that this does not occur.
	 */
	return min(x, total);
#else
	return 0;
#endif
}

/**
ccafa2879 mm: writeback: cl... |
338 |
* global_dirtyable_memory - number of globally dirtyable pages |
1edf22348 mm/page-writeback... |
339 |
* |
ccafa2879 mm: writeback: cl... |
340 341 |
* Returns the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. |
1edf22348 mm/page-writeback... |
342 |
*/ |
18cf8cf8b mm: page-writebac... |
343 |
static unsigned long global_dirtyable_memory(void) |
1edf22348 mm/page-writeback... |
344 345 |
{ unsigned long x; |
c41f012ad mm: rename global... |
346 |
x = global_zone_page_state(NR_FREE_PAGES); |
a8d014373 mm: page_alloc: g... |
347 348 349 350 351 352 |
/* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ x -= min(x, totalreserve_pages); |
1edf22348 mm/page-writeback... |
353 |
|
599d0c954 mm, vmscan: move ... |
354 355 |
x += global_node_page_state(NR_INACTIVE_FILE); x += global_node_page_state(NR_ACTIVE_FILE); |
a804552b9 mm/page-writeback... |
356 |
|
1edf22348 mm/page-writeback... |
357 358 359 360 361 |
if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); return x + 1; /* Ensure that we never return 0 */ } |
9fc3a43e1 writeback: separa... |
362 363 364 |
/** * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain * @dtc: dirty_throttle_control of interest |
ccafa2879 mm: writeback: cl... |
365 |
* |
9fc3a43e1 writeback: separa... |
366 367 368 369 |
* Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and |
ccafa2879 mm: writeback: cl... |
370 371 |
* real-time tasks. */ |
9fc3a43e1 writeback: separa... |
372 |
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
	const unsigned long available_memory = dtc->avail;
	struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
	unsigned long bytes = vm_dirty_bytes;
	unsigned long bg_bytes = dirty_background_bytes;
	/* convert ratios to per-PAGE_SIZE for higher precision */
	unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
	unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
	unsigned long thresh;
	unsigned long bg_thresh;
	struct task_struct *tsk;

	/* gdtc is !NULL iff @dtc is for memcg domain */
	if (gdtc) {
		unsigned long global_avail = gdtc->avail;

		/*
		 * The byte settings can't be applied directly to memcg
		 * domains.  Convert them to ratios by scaling against
		 * globally available memory.  As the ratios are in
		 * per-PAGE_SIZE, they can be obtained by dividing bytes by
		 * number of pages.
		 */
		if (bytes)
			ratio = min(DIV_ROUND_UP(bytes, global_avail),
				    PAGE_SIZE);
		if (bg_bytes)
			bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
				       PAGE_SIZE);
		/* from here on only the ratio forms are used */
		bytes = bg_bytes = 0;
	}

	if (bytes)
		thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
	else
		thresh = (ratio * available_memory) / PAGE_SIZE;

	if (bg_bytes)
		bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
	else
		bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;

	/* background must kick in strictly before the hard limit */
	if (bg_thresh >= thresh)
		bg_thresh = thresh / 2;
	tsk = current;
	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
		bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
		thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
	}
	dtc->thresh = thresh;
	dtc->bg_thresh = bg_thresh;

	/* we should eventually report the domain in the TP */
	if (!gdtc)
		trace_global_dirty_state(bg_thresh, thresh);
}

/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 * @pbackground: out parameter for bg_thresh
 * @pdirty: out parameter for thresh
 *
 * Calculate bg_thresh and thresh for global_wb_domain.  See
 * domain_dirty_limits() for details.
 */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };

	gdtc.avail = global_dirtyable_memory();
	domain_dirty_limits(&gdtc);

	*pbackground = gdtc.bg_thresh;
	*pdirty = gdtc.thresh;
}
a756cf590 mm: try to distri... |
447 |
/** |
281e37265 mm, page_alloc: c... |
448 449 |
* node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node |
a756cf590 mm: try to distri... |
450 |
* |
281e37265 mm, page_alloc: c... |
451 452 |
* Returns the maximum number of dirty pages allowed in a node, based * on the node's dirtyable memory. |
a756cf590 mm: try to distri... |
453 |
*/ |
281e37265 mm, page_alloc: c... |
454 |
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
	unsigned long node_memory = node_dirtyable_memory(pgdat);
	struct task_struct *tsk = current;
	unsigned long dirty;

	/* scale the global byte limit by this node's share of dirtyable memory */
	if (vm_dirty_bytes)
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
			node_memory / global_dirtyable_memory();
	else
		dirty = vm_dirty_ratio * node_memory / 100;

	/* lift the limit by 1/4 for less-throttled (e.g. nfsd) and rt tasks */
	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
		dirty += dirty / 4;

	return dirty;
}

/**
281e37265 mm, page_alloc: c... |
473 474 |
* node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check |
a756cf590 mm: try to distri... |
475 |
* |
281e37265 mm, page_alloc: c... |
476 |
* Returns %true when the dirty pages in @pgdat are within the node's |
a756cf590 mm: try to distri... |
477 478 |
* dirty limit, %false if the limit is exceeded. */ |
281e37265 mm, page_alloc: c... |
479 |
bool node_dirty_ok(struct pglist_data *pgdat) |
a756cf590 mm: try to distri... |
480 |
{ |
281e37265 mm, page_alloc: c... |
481 482 |
unsigned long limit = node_dirty_limit(pgdat); unsigned long nr_pages = 0; |
11fb99898 mm: move most fil... |
483 484 485 |
nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); nr_pages += node_page_state(pgdat, NR_WRITEBACK); |
a756cf590 mm: try to distri... |
486 |
|
281e37265 mm, page_alloc: c... |
487 |
return nr_pages <= limit; |
a756cf590 mm: try to distri... |
488 |
} |
2da02997e mm: add dirty_bac... |
489 |
int dirty_background_ratio_handler(struct ctl_table *table, int write, |
8d65af789 sysctl: remove "s... |
490 |
void __user *buffer, size_t *lenp, |
2da02997e mm: add dirty_bac... |
491 492 493 |
loff_t *ppos) { int ret; |
8d65af789 sysctl: remove "s... |
494 |
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
2da02997e mm: add dirty_bac... |
495 496 497 498 499 500 |
if (ret == 0 && write) dirty_background_bytes = 0; return ret; } int dirty_background_bytes_handler(struct ctl_table *table, int write, |
8d65af789 sysctl: remove "s... |
501 |
void __user *buffer, size_t *lenp, |
2da02997e mm: add dirty_bac... |
502 503 504 |
loff_t *ppos) { int ret; |
8d65af789 sysctl: remove "s... |
505 |
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
2da02997e mm: add dirty_bac... |
506 507 508 509 |
if (ret == 0 && write) dirty_background_ratio = 0; return ret; } |
04fbfdc14 mm: per device di... |
510 |
int dirty_ratio_handler(struct ctl_table *table, int write, |
8d65af789 sysctl: remove "s... |
511 |
void __user *buffer, size_t *lenp, |
04fbfdc14 mm: per device di... |
512 513 514 |
loff_t *ppos) { int old_ratio = vm_dirty_ratio; |
2da02997e mm: add dirty_bac... |
515 |
int ret; |
8d65af789 sysctl: remove "s... |
516 |
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
04fbfdc14 mm: per device di... |
517 |
if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
eb608e3a3 block: Convert BD... |
518 |
writeback_set_ratelimit(); |
2da02997e mm: add dirty_bac... |
519 520 521 522 |
vm_dirty_bytes = 0; } return ret; } |
2da02997e mm: add dirty_bac... |
523 |
int dirty_bytes_handler(struct ctl_table *table, int write, |
8d65af789 sysctl: remove "s... |
524 |
void __user *buffer, size_t *lenp, |
2da02997e mm: add dirty_bac... |
525 526 |
loff_t *ppos) { |
fc3501d41 mm: fix dirty_byt... |
527 |
unsigned long old_bytes = vm_dirty_bytes; |
2da02997e mm: add dirty_bac... |
528 |
int ret; |
8d65af789 sysctl: remove "s... |
529 |
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
2da02997e mm: add dirty_bac... |
530 |
if (ret == 0 && write && vm_dirty_bytes != old_bytes) { |
eb608e3a3 block: Convert BD... |
531 |
writeback_set_ratelimit(); |
2da02997e mm: add dirty_bac... |
532 |
vm_dirty_ratio = 0; |
04fbfdc14 mm: per device di... |
533 534 535 |
} return ret; } |
eb608e3a3 block: Convert BD... |
536 537 538 539 540 541 542 543 |
static unsigned long wp_next_time(unsigned long cur_time)
{
	unsigned long next = cur_time + VM_COMPLETIONS_PERIOD_LEN;

	/* 0 has a special meaning (timer disabled), so never return it */
	if (!next)
		next = 1;
	return next;
}
c7981433e writeback: make _... |
544 545 546 |
/*
 * Record one writeout completion in @dom's aged proportion counters and,
 * if period aging had been idled, restart the aging timer.
 */
static void wb_domain_writeout_inc(struct wb_domain *dom,
				   struct fprop_local_percpu *completions,
				   unsigned int max_prop_frac)
{
	__fprop_inc_percpu_max(&dom->completions, completions,
			       max_prop_frac);
	/* First event after period switching was turned off? */
	if (unlikely(!dom->period_time)) {
		/*
		 * We can race with other __bdi_writeout_inc calls here but
		 * it does not cause any harm since the resulting time when
		 * timer will fire and what is in writeout_period_time will be
		 * roughly the same.
		 */
		dom->period_time = wp_next_time(jiffies);
		mod_timer(&dom->period_timer, dom->period_time);
	}
}
c7981433e writeback: make _... |
562 563 564 565 566 |
/*
 * Increment @wb's writeout completion count and the global writeout
 * completion count. Called from test_clear_page_writeback().
 */
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
	struct wb_domain *cgdom;

	inc_wb_stat(wb, WB_WRITTEN);
	wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
			       wb->bdi->max_prop_frac);

	/* also credit the memcg domain, if cgroup writeback is active */
	cgdom = mem_cgroup_wb_domain(wb);
	if (cgdom)
		wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
				       wb->bdi->max_prop_frac);
}
dd5656e59 mm: bdi: export b... |
579 |
|
93f78d882 writeback: move b... |
580 |
/* irq-safe wrapper around __wb_writeout_inc() for external callers */
void wb_writeout_inc(struct bdi_writeback *wb)
{
	unsigned long flags;

	local_irq_save(flags);
	__wb_writeout_inc(wb);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);
04fbfdc14 mm: per device di... |
589 |
|
04fbfdc14 mm: per device di... |
590 |
/*
 * On idle system, we can be called long after we scheduled because we use
 * deferred timers so count with missed periods.
 */
static void writeout_period(unsigned long t)
{
	/* timer-callback data carries the wb_domain (pre-timer_setup() API) */
	struct wb_domain *dom = (void *)t;
	int miss_periods = (jiffies - dom->period_time) /
						 VM_COMPLETIONS_PERIOD_LEN;

	if (fprop_new_period(&dom->completions, miss_periods + 1)) {
		dom->period_time = wp_next_time(dom->period_time +
				miss_periods * VM_COMPLETIONS_PERIOD_LEN);
		mod_timer(&dom->period_timer, dom->period_time);
	} else {
		/*
		 * Aging has zeroed all fractions. Stop wasting CPU on period
		 * updates.
		 */
		dom->period_time = 0;
	}
}
380c27ca3 writeback: implem... |
611 612 613 |
/*
 * Initialize a dirty-throttling domain: zeroed state, deferrable aging
 * timer and the per-domain proportion counters.  Returns 0 or -ENOMEM
 * from fprop_global_init().
 */
int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
{
	memset(dom, 0, sizeof(*dom));

	spin_lock_init(&dom->lock);

	setup_deferrable_timer(&dom->period_timer, writeout_period,
			       (unsigned long)dom);

	dom->dirty_limit_tstamp = jiffies;

	return fprop_global_init(&dom->completions, gfp);
}
841710aa6 writeback: implem... |
622 623 624 625 626 627 628 |
#ifdef CONFIG_CGROUP_WRITEBACK
/* tear down a (memcg) wb_domain; pairs with wb_domain_init() */
void wb_domain_exit(struct wb_domain *dom)
{
	del_timer_sync(&dom->period_timer);
	fprop_global_destroy(&dom->completions);
}
#endif
eb608e3a3 block: Convert BD... |
629 |
/* |
d08c429b0 mm/page-writeback... |
630 631 632 |
* bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. |
189d3c4a9 mm: bdi: allow se... |
633 |
*/ |
189d3c4a9 mm: bdi: allow se... |
634 635 636 637 638 |
static unsigned int bdi_min_ratio;

int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
	int ret = 0;

	spin_lock_bh(&bdi_lock);
	if (min_ratio > bdi->max_ratio) {
		ret = -EINVAL;
	} else {
		/*
		 * Turn the requested value into a delta against the current
		 * setting.  NOTE(review): when lowering the ratio this
		 * subtraction wraps (unsigned), but the wrap cancels out in
		 * the additions below — presumably intentional; confirm.
		 */
		min_ratio -= bdi->min_ratio;
		if (bdi_min_ratio + min_ratio < 100) {
			bdi_min_ratio += min_ratio;
			bdi->min_ratio += min_ratio;
		} else {
			ret = -EINVAL;
		}
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}

int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
	int ret = 0;

	if (max_ratio > 100)
		return -EINVAL;

	spin_lock_bh(&bdi_lock);
	if (bdi->min_ratio > max_ratio) {
		ret = -EINVAL;
	} else {
		bdi->max_ratio = max_ratio;
		/* precompute the fixed-point fraction used by fprop */
		bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}
EXPORT_SYMBOL(bdi_set_max_ratio);
189d3c4a9 mm: bdi: allow se... |
675 |
|
6c14ae1e9 writeback: dirty ... |
676 677 678 679 680 |
/* midpoint between the background and hard dirty thresholds: below this
 * a dirtier runs free without any throttling */
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{
	unsigned long total = thresh + bg_thresh;

	return total / 2;
}
c7981433e writeback: make _... |
681 682 |
static unsigned long hard_dirty_limit(struct wb_domain *dom, unsigned long thresh) |
ffd1f609a writeback: introd... |
683 |
{ |
dcc25ae76 writeback: move g... |
684 |
return max(thresh, dom->dirty_limit); |
ffd1f609a writeback: introd... |
685 |
} |
c5edf9cdc writeback: fix in... |
686 687 688 689 690 691 |
/*
 * Memory which can be further allocated to a memcg domain is capped by
 * system-wide clean memory excluding the amount being used in the domain.
 */
static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
			    unsigned long filepages, unsigned long headroom)
{
	struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
	/* each min() clamp below guards the subtraction against underflow */
	unsigned long clean = filepages - min(filepages, mdtc->dirty);
	unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
	unsigned long other_clean = global_clean - min(global_clean, clean);

	mdtc->avail = filepages + min(headroom, other_clean);
}
6f7186562 writeback: add bd... |
700 |
/**
 * __wb_calc_thresh - @wb's share of dirty throttling threshold
 * @dtc: dirty_throttle_context of interest
 *
 * Returns @wb's dirty limit in pages. The term "dirty" in the context of
 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
 *
 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 * when sleeping max_pause per page is not enough to keep the dirty pages under
 * control. For example, when the device is completely stalled due to some error
 * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
 * In the other normal situations, it acts more gently by throttling the tasks
 * more (rather than completely block them) when the wb dirty pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take long time to sync) on slow devices
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 */
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
	struct wb_domain *dom = dtc_dom(dtc);
	unsigned long thresh = dtc->thresh;
	u64 wb_thresh;
	long numerator, denominator;
	unsigned long wb_min_ratio, wb_max_ratio;

	/*
	 * Calculate this BDI's share of the thresh ratio: the fraction of
	 * recent writeback completions attributed to this wb, as tracked by
	 * the flex-proportions code.
	 */
	fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
			      &numerator, &denominator);

	/*
	 * Distribute (100 - bdi_min_ratio)% of @thresh proportionally to the
	 * completion fraction; the reserved bdi_min_ratio part is added back
	 * as per-wb minimum shares below.
	 */
	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
	wb_thresh *= numerator;
	do_div(wb_thresh, denominator);

	wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);

	/* honour this wb's configured min share; cap at its max share */
	wb_thresh += (thresh * wb_min_ratio) / 100;
	if (wb_thresh > (thresh * wb_max_ratio) / 100)
		wb_thresh = thresh * wb_max_ratio / 100;

	return wb_thresh;
}
/*
 * wb_calc_thresh - compute @wb's share of @thresh in the global domain.
 *
 * Convenience wrapper around __wb_calc_thresh() using a global-domain
 * (GDTC_INIT) throttle context.
 */
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
					       .thresh = thresh };
	return __wb_calc_thresh(&gdtc);
}
6c14ae1e9 writeback: dirty ... |
753 |
/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * it's a 3rd order polynomial that subjects to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0	 => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 *
 * Evaluated in RATELIMIT_CALC_SHIFT fixed point; the result is clamped
 * to [0, 2.0] in that representation.
 */
static long long pos_ratio_polynom(unsigned long setpoint,
				   unsigned long dirty,
				   unsigned long limit)
{
	long long pos_ratio;
	long x;

	/* "| 1" keeps the divisor nonzero when limit == setpoint */
	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
		      (limit - setpoint) | 1);
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;

	return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}

/*
 * Dirty position control.
 *
 * (o) global/bdi setpoints
 *
 * We want the dirty pages be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * (o) global control line
 *
 *     ^ pos_ratio
 *     |
 *     |            |<===== global dirty control scope ======>|
 * 2.0 .............*
 *     |            .*
 *     |            . *
 *     |            .   *
 *     |            .     *
 *     |            .        *
 *     |            .            *
 * 1.0 ................................*
 *     |            .                  .     *
 *     |            .                  .          *
 *     |            .                  .              *
 *     |            .                  .                 *
 *     |            .                  .                    *
 *   0 +------------.------------------.----------------------*------------->
 *           freerun^          setpoint^                 limit^   dirty pages
 *
 * (o) wb control line
 *
 *     ^ pos_ratio
 *     |
 *     |            *
 *     |              *
 *     |                *
 *     |                  *
 *     |                    * |<=========== span ============>|
 * 1.0 .......................*
 *     |                      . *
 *     |                      .   *
 *     |                      .     *
 *     |                      .       *
 *     |                      .         *
 *     |                      .           *
 *     |                      .             *
 *     |                      .               *
 *     |                      .                 *
 *     |                      .                   *
 *     |                      .                     *
 * 1/4 ...............................................* * * * * * * * * * * *
 *     |                      .                           .
 *     |                      .                             .
 *     |                      .                               .
 *   0 +----------------------.-------------------------------.------------->
 *                wb_setpoint^                    x_intercept^
 *
 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
 * be smoothly throttled down to normal if it starts high in situations like
 * - start writing to a slow SD card and a fast disk at the same time. The SD
 *   card's wb_dirty may rush to many times higher than wb_setpoint.
 * - the wb dirty thresh drops quickly due to change of JBOD workload
 */
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long write_bw = wb->avg_write_bandwidth;
	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
	unsigned long wb_thresh = dtc->wb_thresh;
	unsigned long x_intercept;
	unsigned long setpoint;		/* dirty pages' target balance point */
	unsigned long wb_setpoint;
	unsigned long span;
	long long pos_ratio;		/* for scaling up/down the rate limit */
	long x;

	/* default to "fully throttle" until computed otherwise below */
	dtc->pos_ratio = 0;

	/* at or above the hard limit: pos_ratio stays 0 */
	if (unlikely(dtc->dirty >= limit))
		return;

	/*
	 * global setpoint
	 *
	 * See comment for pos_ratio_polynom().
	 */
	setpoint = (freerun + limit) / 2;
	pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);

	/*
	 * The strictlimit feature is a tool preventing mistrusted filesystems
	 * from growing a large number of dirty pages before throttling. For
	 * such filesystems balance_dirty_pages always checks wb counters
	 * against wb limits. Even if global "nr_dirty" is under "freerun".
	 * This is especially important for fuse which sets bdi->max_ratio to
	 * 1% by default. Without strictlimit feature, fuse writeback may
	 * consume arbitrary amount of RAM because it is accounted in
	 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
	 *
	 * Here, in wb_position_ratio(), we calculate pos_ratio based on
	 * two values: wb_dirty and wb_thresh. Let's consider an example:
	 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
	 * limits are set by default to 10% and 20% (background and throttle).
	 * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
	 * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
	 * about ~6K pages (as the average of background and throttle wb
	 * limits). The 3rd order polynomial will provide positive feedback if
	 * wb_dirty is under wb_setpoint and vice versa.
	 *
	 * Note, that we cannot use global counters in these calculations
	 * because we want to throttle process writing to a strictlimit wb
	 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
	 * in the example above).
	 */
	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
		long long wb_pos_ratio;

		/* very few wb dirty pages: ramp up aggressively (capped at 2.0) */
		if (dtc->wb_dirty < 8) {
			dtc->pos_ratio = min_t(long long, pos_ratio * 2,
					   2 << RATELIMIT_CALC_SHIFT);
			return;
		}

		if (dtc->wb_dirty >= wb_thresh)
			return;

		wb_setpoint = dirty_freerun_ceiling(wb_thresh,
						    dtc->wb_bg_thresh);

		if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
			return;

		wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
						 wb_thresh);

		/*
		 * Typically, for strictlimit case, wb_setpoint << setpoint
		 * and pos_ratio >> wb_pos_ratio. In the other words global
		 * state ("dirty") is not limiting factor and we have to
		 * make decision based on wb counters. But there is an
		 * important case when global pos_ratio should get precedence:
		 * global limits are exceeded (e.g. due to activities on other
		 * wb's) while given strictlimit wb is below limit.
		 *
		 * "pos_ratio * wb_pos_ratio" would work for the case above,
		 * but it would look too non-natural for the case of all
		 * activity in the system coming from a single strictlimit wb
		 * with bdi->max_ratio == 100%.
		 *
		 * Note that min() below somewhat changes the dynamics of the
		 * control system. Normally, pos_ratio value can be well over 3
		 * (when globally we are at freerun and wb is well below wb
		 * setpoint). Now the maximum pos_ratio in the same situation
		 * is 2. We might want to tweak this if we observe the control
		 * system is too slow to adapt.
		 */
		dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
		return;
	}

	/*
	 * We have computed basic pos_ratio above based on global situation. If
	 * the wb is over/under its share of dirty pages, we want to scale
	 * pos_ratio further down/up. That is done by the following mechanism.
	 */

	/*
	 * wb setpoint
	 *
	 *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
	 *
	 *                        x_intercept - wb_dirty
	 *                     := --------------------------
	 *                        x_intercept - wb_setpoint
	 *
	 * The main wb control line is a linear function that subjects to
	 *
	 * (1) f(wb_setpoint) = 1.0
	 * (2) k = - 1 / (8 * write_bw)  (in single wb case)
	 *     or equally: x_intercept = wb_setpoint + 8 * write_bw
	 *
	 * For single wb case, the dirty pages are observed to fluctuate
	 * regularly within range
	 *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
	 * for various filesystems, where (2) can yield in a reasonable 12.5%
	 * fluctuation range for pos_ratio.
	 *
	 * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
	 * own size, so move the slope over accordingly and choose a slope that
	 * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
	 */
	if (unlikely(wb_thresh > dtc->thresh))
		wb_thresh = dtc->thresh;
	/*
	 * It's very possible that wb_thresh is close to 0 not because the
	 * device is slow, but that it has remained inactive for long time.
	 * Honour such devices a reasonable good (hopefully IO efficient)
	 * threshold, so that the occasional writes won't be blocked and active
	 * writes can rampup the threshold quickly.
	 */
	wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
	/*
	 * scale global setpoint to wb's:
	 *	wb_setpoint = setpoint * wb_thresh / thresh
	 * ("| 1" guards the division against thresh == 0)
	 */
	x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
	wb_setpoint = setpoint * (u64)x >> 16;
	/*
	 * Use span=(8*write_bw) in single wb case as indicated by
	 * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
	 *
	 *        wb_thresh                    thresh - wb_thresh
	 * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
	 *          thresh                          thresh
	 */
	span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
	x_intercept = wb_setpoint + span;

	if (dtc->wb_dirty < x_intercept - span / 4) {
		pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
				      (x_intercept - wb_setpoint) | 1);
	} else
		pos_ratio /= 4;	/* beyond the control line: floor at 1/4 */

	/*
	 * wb reserve area, safeguard against dirty pool underrun and disk idle
	 * It may push the desired control point of global dirty pages higher
	 * than setpoint.
	 */
	x_intercept = wb_thresh / 2;
	if (dtc->wb_dirty < x_intercept) {
		if (dtc->wb_dirty > x_intercept / 8)
			pos_ratio = div_u64(pos_ratio * x_intercept,
					    dtc->wb_dirty);
		else
			pos_ratio *= 8;
	}

	dtc->pos_ratio = pos_ratio;
}
/*
 * Update @wb's measured and smoothed write bandwidth estimates from the
 * number of pages @written during the last @elapsed jiffies.  Called with
 * the bandwidth update interval (~200ms) pacing; see __wb_update_bandwidth().
 */
static void wb_update_write_bandwidth(struct bdi_writeback *wb,
				      unsigned long elapsed,
				      unsigned long written)
{
	const unsigned long period = roundup_pow_of_two(3 * HZ);
	unsigned long avg = wb->avg_write_bandwidth;
	unsigned long old = wb->write_bandwidth;
	u64 bw;

	/*
	 * bw = written * HZ / elapsed
	 *
	 *                   bw * elapsed + write_bandwidth * (period - elapsed)
	 * write_bandwidth = ---------------------------------------------------
	 *                                          period
	 *
	 * @written may have decreased due to account_page_redirty().
	 * Avoid underflowing @bw calculation.
	 */
	bw = written - min(written, wb->written_stamp);
	bw *= HZ;
	if (unlikely(elapsed > period)) {
		/* sample older than the whole period: take it as-is */
		do_div(bw, elapsed);
		avg = bw;
		goto out;
	}
	bw += (u64)wb->write_bandwidth * (period - elapsed);
	bw >>= ilog2(period);	/* divide by period (a power of two) */

	/*
	 * one more level of smoothing, for filtering out sudden spikes
	 */
	if (avg > old && old >= (unsigned long)bw)
		avg -= (avg - old) >> 3;

	if (avg < old && old <= (unsigned long)bw)
		avg += (old - avg) >> 3;

out:
	/* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
	avg = max(avg, 1LU);
	if (wb_has_dirty_io(wb)) {
		/* keep the bdi-wide aggregate in sync with this wb's change */
		long delta = avg - wb->avg_write_bandwidth;
		WARN_ON_ONCE(atomic_long_add_return(delta,
					&wb->bdi->tot_write_bandwidth) <= 0);
	}
	wb->write_bandwidth = bw;
	wb->avg_write_bandwidth = avg;
}
/*
 * Track dom->dirty_limit against the current dirty threshold: jump up to
 * @dtc->thresh immediately, but decay downwards gradually so the limit
 * never drops below the number of currently dirty pages.
 * Caller holds dom->lock (see domain_update_bandwidth()).
 */
static void update_dirty_limit(struct dirty_throttle_control *dtc)
{
	struct wb_domain *dom = dtc_dom(dtc);
	unsigned long thresh = dtc->thresh;
	unsigned long limit = dom->dirty_limit;

	/*
	 * Follow up in one step.
	 */
	if (limit < thresh) {
		limit = thresh;
		goto update;
	}

	/*
	 * Follow down slowly. Use the higher one as the target, because thresh
	 * may drop below dirty. This is exactly the reason to introduce
	 * dom->dirty_limit which is guaranteed to lie above the dirty pages.
	 */
	thresh = max(thresh, dtc->dirty);
	if (limit > thresh) {
		limit -= (limit - thresh) >> 5;	/* ~3% of the gap per update */
		goto update;
	}
	return;
update:
	dom->dirty_limit = limit;
}
/*
 * Refresh the domain's dirty_limit at most once per BANDWIDTH_INTERVAL.
 * Uses an unlocked time check first, then re-checks under dom->lock to
 * serialize concurrent updaters.
 */
static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
				    unsigned long now)
{
	struct wb_domain *dom = dtc_dom(dtc);

	/*
	 * check locklessly first to optimize away locking for the most time
	 */
	if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
		return;

	spin_lock(&dom->lock);
	if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
		update_dirty_limit(dtc);
		dom->dirty_limit_tstamp = now;
	}
	spin_unlock(&dom->lock);
}
/*
 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
 *
 * Normal wb tasks will be curbed at or below it in long term.
 * Obviously it should be around (write_bw / N) when there are N dd tasks.
 *
 * @dirtied:	pages dirtied on this wb (cumulative counter snapshot)
 * @elapsed:	jiffies since the previous bandwidth update
 */
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
				      unsigned long dirtied,
				      unsigned long elapsed)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long dirty = dtc->dirty;
	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
	unsigned long setpoint = (freerun + limit) / 2;
	unsigned long write_bw = wb->avg_write_bandwidth;
	unsigned long dirty_ratelimit = wb->dirty_ratelimit;
	unsigned long dirty_rate;
	unsigned long task_ratelimit;
	unsigned long balanced_dirty_ratelimit;
	unsigned long step;
	unsigned long x;
	unsigned long shift;

	/*
	 * The dirty rate will match the writeout rate in long term, except
	 * when dirty pages are truncated by userspace or re-dirtied by FS.
	 */
	dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;

	/*
	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
	 */
	task_ratelimit = (u64)dirty_ratelimit *
					dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
	task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */

	/*
	 * A linear estimation of the "balanced" throttle rate. The theory is,
	 * if there are N dd tasks, each throttled at task_ratelimit, the wb's
	 * dirty_rate will be measured to be (N * task_ratelimit). So the below
	 * formula will yield the balanced rate limit (write_bw / N).
	 *
	 * Note that the expanded form is not a pure rate feedback:
	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate)		     (1)
	 * but also takes pos_ratio into account:
	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
	 *
	 * (1) is not realistic because pos_ratio also takes part in balancing
	 * the dirty rate.  Consider the state
	 *	pos_ratio = 0.5						     (3)
	 *	rate = 2 * (write_bw / N)				     (4)
	 * If (1) is used, it will stuck in that state! Because each dd will
	 * be throttled at
	 *	task_ratelimit = pos_ratio * rate = (write_bw / N)	     (5)
	 * yielding
	 *	dirty_rate = N * task_ratelimit = write_bw		     (6)
	 * put (6) into (1) we get
	 *	rate_(i+1) = rate_(i)					     (7)
	 *
	 * So we end up using (2) to always keep
	 *	rate_(i+1) ~= (write_bw / N)				     (8)
	 * regardless of the value of pos_ratio. As long as (8) is satisfied,
	 * pos_ratio is able to drive itself to 1.0, which is not only where
	 * the dirty count meet the setpoint, but also where the slope of
	 * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
	 */
	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
					   dirty_rate | 1);
	/*
	 * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
	 */
	if (unlikely(balanced_dirty_ratelimit > write_bw))
		balanced_dirty_ratelimit = write_bw;

	/*
	 * We could safely do this and return immediately:
	 *
	 *	wb->dirty_ratelimit = balanced_dirty_ratelimit;
	 *
	 * However to get a more stable dirty_ratelimit, the below elaborated
	 * code makes use of task_ratelimit to filter out singular points and
	 * limit the step size.
	 *
	 * The below code essentially only uses the relative value of
	 *
	 *	task_ratelimit - dirty_ratelimit
	 *	= (pos_ratio - 1) * dirty_ratelimit
	 *
	 * which reflects the direction and size of dirty position error.
	 */

	/*
	 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
	 * task_ratelimit is on the same side of dirty_ratelimit, too.
	 * For example, when
	 * - dirty_ratelimit > balanced_dirty_ratelimit
	 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
	 * lowering dirty_ratelimit will help meet both the position and rate
	 * control targets. Otherwise, don't update dirty_ratelimit if it will
	 * only help meet the rate target. After all, what the users ultimately
	 * feel and care are stable dirty rate and small position error.
	 *
	 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
	 * and filter out the singular points of balanced_dirty_ratelimit. Which
	 * keeps jumping around randomly and can even leap far away at times
	 * due to the small 200ms estimation period of dirty_rate (we want to
	 * keep that period small to reduce time lags).
	 */
	step = 0;

	/*
	 * For strictlimit case, calculations above were based on wb counters
	 * and limits (starting from pos_ratio = wb_position_ratio() and up to
	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
	 * Hence, to calculate "step" properly, we have to use wb_dirty as
	 * "dirty" and wb_setpoint as "setpoint".
	 *
	 * We rampup dirty_ratelimit forcibly if wb_dirty is low because
	 * it's possible that wb_thresh is close to zero due to inactivity
	 * of backing device.
	 */
	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
		dirty = dtc->wb_dirty;
		if (dtc->wb_dirty < 8)
			setpoint = dtc->wb_dirty + 1;
		else
			setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
	}

	if (dirty < setpoint) {
		/* below setpoint: only ever step the ratelimit upwards */
		x = min3(wb->balanced_dirty_ratelimit,
			 balanced_dirty_ratelimit, task_ratelimit);
		if (dirty_ratelimit < x)
			step = x - dirty_ratelimit;
	} else {
		/* above setpoint: only ever step the ratelimit downwards */
		x = max3(wb->balanced_dirty_ratelimit,
			 balanced_dirty_ratelimit, task_ratelimit);
		if (dirty_ratelimit > x)
			step = dirty_ratelimit - x;
	}

	/*
	 * Don't pursue 100% rate matching. It's impossible since the balanced
	 * rate itself is constantly fluctuating. So decrease the track speed
	 * when it gets close to the target. Helps eliminate pointless tremors.
	 */
	shift = dirty_ratelimit / (2 * step + 1);
	if (shift < BITS_PER_LONG)
		step = DIV_ROUND_UP(step >> shift, 8);
	else
		step = 0;	/* shifting by >= BITS_PER_LONG would be UB */

	if (dirty_ratelimit < balanced_dirty_ratelimit)
		dirty_ratelimit += step;
	else
		dirty_ratelimit -= step;

	wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
	wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;

	trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}
/*
 * Refresh @wb's write-bandwidth estimate and, when @update_ratelimit is
 * set, the dirty ratelimits of the global domain @gdtc (and the memcg
 * domain @mdtc when CGROUP_WRITEBACK is in effect).
 *
 * Caller must hold wb->list_lock.  @start_time is the moment the current
 * writeback/throttling episode began; it is used to detect idle gaps.
 */
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
				  struct dirty_throttle_control *mdtc,
				  unsigned long start_time,
				  bool update_ratelimit)
{
	struct bdi_writeback *wb = gdtc->wb;
	unsigned long now = jiffies;
	unsigned long elapsed = now - wb->bw_time_stamp;
	unsigned long dirtied;
	unsigned long written;

	lockdep_assert_held(&wb->list_lock);

	/*
	 * rate-limit, only update once every 200ms.
	 */
	if (elapsed < BANDWIDTH_INTERVAL)
		return;

	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

	/*
	 * Skip quiet periods when disk bandwidth is under-utilized.
	 * (at least 1s idle time between two flusher runs)
	 */
	if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
		goto snapshot;

	if (update_ratelimit) {
		domain_update_bandwidth(gdtc, now);
		wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);

		/*
		 * @mdtc is always NULL if !CGROUP_WRITEBACK but the
		 * compiler has no way to figure that out.  Help it.
		 */
		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
			domain_update_bandwidth(mdtc, now);
			wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
		}
	}
	wb_update_write_bandwidth(wb, elapsed, written);

snapshot:
	/* Record the counters and time for the next interval's deltas. */
	wb->dirtied_stamp = dirtied;
	wb->written_stamp = written;
	wb->bw_time_stamp = now;
}
/*
 * Public wrapper around __wb_update_bandwidth(): refresh only the write
 * bandwidth estimate for @wb's global domain, without touching the dirty
 * ratelimit and without a memcg domain.
 */
void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };

	__wb_update_bandwidth(&gdtc, NULL, start_time, false);
}
/*
 * Number of pages a task may dirty before balance_dirty_pages_ratelimited()
 * looks again at whether dirty throttling must start.
 *
 * If this interval is too low, big NUMA machines end up calling the
 * expensive global_zone_page_state() too often, so it is scaled roughly
 * as the square root of the safety margin (the number of pages we may
 * still dirty without exceeding the dirty limits).
 */
static unsigned long dirty_poll_interval(unsigned long dirty,
					 unsigned long thresh)
{
	unsigned long margin;

	if (dirty >= thresh)
		return 1;

	margin = thresh - dirty;
	return 1UL << (ilog2(margin) >> 1);
}
a88a341a7 writeback: move b... |
1353 |
static unsigned long wb_max_pause(struct bdi_writeback *wb, |
de1fff37b writeback: s/bdi/... |
1354 |
unsigned long wb_dirty) |
c8462cc9d writeback: limit ... |
1355 |
{ |
a88a341a7 writeback: move b... |
1356 |
unsigned long bw = wb->avg_write_bandwidth; |
e3b6c655b writeback: fix ne... |
1357 |
unsigned long t; |
c8462cc9d writeback: limit ... |
1358 |
|
7ccb9ad53 writeback: max, m... |
1359 1360 1361 1362 1363 1364 1365 |
/* * Limit pause time for small memory systems. If sleeping for too long * time, a small pool of dirty/writeback pages may go empty and disk go * idle. * * 8 serves as the safety ratio. */ |
de1fff37b writeback: s/bdi/... |
1366 |
t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); |
7ccb9ad53 writeback: max, m... |
1367 |
t++; |
e3b6c655b writeback: fix ne... |
1368 |
return min_t(unsigned long, t, MAX_PAUSE); |
7ccb9ad53 writeback: max, m... |
1369 |
} |
/*
 * Compute the minimal acceptable pause for a dirtying task on @wb and, via
 * @nr_dirtied_pause, the number of pages the task may dirty before pausing
 * again.  @max_pause bounds the result; @task_ratelimit and
 * @dirty_ratelimit are the per-task and per-wb throttle rates.
 */
static long wb_min_pause(struct bdi_writeback *wb,
			 long max_pause,
			 unsigned long task_ratelimit,
			 unsigned long dirty_ratelimit,
			 int *nr_dirtied_pause)
{
	long hi = ilog2(wb->avg_write_bandwidth);
	long lo = ilog2(wb->dirty_ratelimit);
	long t;		/* target pause */
	long pause;	/* estimated next pause */
	int pages;	/* target nr_dirtied_pause */

	/* target for 10ms pause on 1-dd case */
	t = max(1, HZ / 100);

	/*
	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
	 * overheads.
	 *
	 * (N * 10ms) on 2^N concurrent tasks.
	 */
	if (hi > lo)
		t += (hi - lo) * (10 * HZ) / 1024;

	/*
	 * This is a bit convoluted. We try to base the next nr_dirtied_pause
	 * on the much more stable dirty_ratelimit. However the next pause time
	 * will be computed based on task_ratelimit and the two rate limits may
	 * depart considerably at some time. Especially if task_ratelimit goes
	 * below dirty_ratelimit/2 and the target pause is max_pause, the next
	 * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
	 * result task_ratelimit won't be executed faithfully, which could
	 * eventually bring down dirty_ratelimit.
	 *
	 * We apply two rules to fix it up:
	 * 1) try to estimate the next pause time and if necessary, use a lower
	 *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
	 *    nr_dirtied_pause will be "dancing" with task_ratelimit.
	 * 2) limit the target pause time to max_pause/2, so that the normal
	 *    small fluctuations of task_ratelimit won't trigger rule (1) and
	 *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
	 */
	t = min(t, 1 + max_pause / 2);
	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);

	/*
	 * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
	 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
	 * When the 16 consecutive reads are often interrupted by some dirty
	 * throttling pause during the async writes, cfq will go into idles
	 * (deadline is fine). So push nr_dirtied_pause as high as possible
	 * until reaches DIRTY_POLL_THRESH=32 pages.
	 */
	if (pages < DIRTY_POLL_THRESH) {
		t = max_pause;
		pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
		if (pages > DIRTY_POLL_THRESH) {
			pages = DIRTY_POLL_THRESH;
			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
		}
	}

	/* Rule (1): shrink pages if the estimated pause would exceed max. */
	pause = HZ * pages / (task_ratelimit + 1);
	if (pause > max_pause) {
		t = max_pause;
		pages = task_ratelimit * t / roundup_pow_of_two(HZ);
	}

	*nr_dirtied_pause = pages;
	/*
	 * The minimal pause time will normally be half the target pause time.
	 */
	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}
/*
 * Fill in the per-wb members of @dtc: wb_thresh, wb_bg_thresh and wb_dirty,
 * derived from the domain-wide thresh/bg_thresh already computed in @dtc.
 */
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long wb_reclaimable;

	/*
	 * wb_thresh is not treated as some limiting factor as
	 * dirty_thresh, due to reasons
	 * - in JBOD setup, wb_thresh can fluctuate a lot
	 * - in a system with HDD and USB key, the USB key may somehow
	 *   go into state (wb_dirty >> wb_thresh) either because
	 *   wb_dirty starts high, or because wb_thresh drops low.
	 *   In this case we don't want to hard throttle the USB key
	 *   dirtiers for 100 seconds until wb_dirty drops under
	 *   wb_thresh. Instead the auxiliary wb control line in
	 *   wb_position_ratio() will let the dirtier task progress
	 *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
	 */
	dtc->wb_thresh = __wb_calc_thresh(dtc);
	/* Scale the background threshold by the same wb/domain ratio. */
	dtc->wb_bg_thresh = dtc->thresh ?
		div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;

	/*
	 * In order to avoid the stacked BDI deadlock we need
	 * to ensure we accurately count the 'dirty' pages when
	 * the threshold is low.
	 *
	 * Otherwise it would be possible to get thresh+n pages
	 * reported dirty, even though there are thresh-m pages
	 * actually dirty; with m+n sitting in the percpu
	 * deltas.
	 */
	if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
		/* Low threshold: take the exact (slower) percpu sum. */
		wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
		dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
	} else {
		/* Approximate percpu read is accurate enough here. */
		wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
		dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
	}
}
/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 */
static void balance_dirty_pages(struct address_space *mapping,
				struct bdi_writeback *wb,
				unsigned long pages_dirtied)
{
	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
	struct dirty_throttle_control * const gdtc = &gdtc_stor;
	/* mdtc is non-NULL only when @wb belongs to a !root memcg domain. */
	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
						     &mdtc_stor : NULL;
	struct dirty_throttle_control *sdtc;	/* the chosen (stricter) dtc */
	unsigned long nr_reclaimable;	/* = file_dirty + unstable_nfs */
	long period;
	long pause;
	long max_pause;
	long min_pause;
	int nr_dirtied_pause;
	bool dirty_exceeded = false;
	unsigned long task_ratelimit;
	unsigned long dirty_ratelimit;
	struct backing_dev_info *bdi = wb->bdi;
	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
	unsigned long start_time = jiffies;

	for (;;) {
		unsigned long now = jiffies;
		unsigned long dirty, thresh, bg_thresh;
		unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
		unsigned long m_thresh = 0;
		unsigned long m_bg_thresh = 0;

		/*
		 * Unstable writes are a feature of certain networked
		 * filesystems (i.e. NFS) in which data may have been
		 * written to the server's write cache, but has not yet
		 * been flushed to permanent storage.
		 */
		nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
					global_node_page_state(NR_UNSTABLE_NFS);
		gdtc->avail = global_dirtyable_memory();
		gdtc->dirty = nr_reclaimable +
					global_node_page_state(NR_WRITEBACK);

		domain_dirty_limits(gdtc);

		/* In strictlimit mode, judge by the per-wb counters/limits. */
		if (unlikely(strictlimit)) {
			wb_dirty_limits(gdtc);

			dirty = gdtc->wb_dirty;
			thresh = gdtc->wb_thresh;
			bg_thresh = gdtc->wb_bg_thresh;
		} else {
			dirty = gdtc->dirty;
			thresh = gdtc->thresh;
			bg_thresh = gdtc->bg_thresh;
		}

		if (mdtc) {
			unsigned long filepages, headroom, writeback;

			/*
			 * If @wb belongs to !root memcg, repeat the same
			 * basic calculations for the memcg domain.
			 */
			mem_cgroup_wb_stats(wb, &filepages, &headroom,
					    &mdtc->dirty, &writeback);
			mdtc->dirty += writeback;
			mdtc_calc_avail(mdtc, filepages, headroom);

			domain_dirty_limits(mdtc);

			if (unlikely(strictlimit)) {
				wb_dirty_limits(mdtc);
				m_dirty = mdtc->wb_dirty;
				m_thresh = mdtc->wb_thresh;
				m_bg_thresh = mdtc->wb_bg_thresh;
			} else {
				m_dirty = mdtc->dirty;
				m_thresh = mdtc->thresh;
				m_bg_thresh = mdtc->bg_thresh;
			}
		}

		/*
		 * Throttle it only when the background writeback cannot
		 * catch-up. This avoids (excessively) small writeouts
		 * when the wb limits are ramping up in case of !strictlimit.
		 *
		 * In strictlimit case make decision based on the wb counters
		 * and limits. Small writeouts when the wb limits are ramping
		 * up are the price we consciously pay for strictlimit-ing.
		 *
		 * If memcg domain is in effect, @dirty should be under
		 * both global and memcg freerun ceilings.
		 */
		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
		    (!mdtc ||
		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
			unsigned long intv = dirty_poll_interval(dirty, thresh);
			unsigned long m_intv = ULONG_MAX;

			/* Below the freerun ceiling: no throttling needed. */
			current->dirty_paused_when = now;
			current->nr_dirtied = 0;
			if (mdtc)
				m_intv = dirty_poll_interval(m_dirty, m_thresh);
			current->nr_dirtied_pause = min(intv, m_intv);
			break;
		}

		if (unlikely(!writeback_in_progress(wb)))
			wb_start_background_writeback(wb);

		/*
		 * Calculate global domain's pos_ratio and select the
		 * global dtc by default.
		 */
		if (!strictlimit)
			wb_dirty_limits(gdtc);

		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
			((gdtc->dirty > gdtc->thresh) || strictlimit);

		wb_position_ratio(gdtc);
		sdtc = gdtc;

		if (mdtc) {
			/*
			 * If memcg domain is in effect, calculate its
			 * pos_ratio.  @wb should satisfy constraints from
			 * both global and memcg domains.  Choose the one
			 * w/ lower pos_ratio.
			 */
			if (!strictlimit)
				wb_dirty_limits(mdtc);

			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
				((mdtc->dirty > mdtc->thresh) || strictlimit);

			wb_position_ratio(mdtc);
			if (mdtc->pos_ratio < gdtc->pos_ratio)
				sdtc = mdtc;
		}

		if (dirty_exceeded && !wb->dirty_exceeded)
			wb->dirty_exceeded = 1;

		/* Refresh bandwidth/ratelimit estimates at most every 200ms. */
		if (time_is_before_jiffies(wb->bw_time_stamp +
					   BANDWIDTH_INTERVAL)) {
			spin_lock(&wb->list_lock);
			__wb_update_bandwidth(gdtc, mdtc, start_time, true);
			spin_unlock(&wb->list_lock);
		}

		/* throttle according to the chosen dtc */
		dirty_ratelimit = wb->dirty_ratelimit;
		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
							RATELIMIT_CALC_SHIFT;
		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
		min_pause = wb_min_pause(wb, max_pause,
					 task_ratelimit, dirty_ratelimit,
					 &nr_dirtied_pause);

		if (unlikely(task_ratelimit == 0)) {
			period = max_pause;
			pause = max_pause;
			goto pause;
		}
		period = HZ * pages_dirtied / task_ratelimit;
		pause = period;
		/* Credit the task's "think time" since its last pause. */
		if (current->dirty_paused_when)
			pause -= now - current->dirty_paused_when;
		/*
		 * For less than 1s think time (ext3/4 may block the dirtier
		 * for up to 800ms from time to time on 1-HDD; so does xfs,
		 * however at much less frequency), try to compensate it in
		 * future periods by updating the virtual time; otherwise just
		 * do a reset, as it may be a light dirtier.
		 */
		if (pause < min_pause) {
			trace_balance_dirty_pages(wb,
						  sdtc->thresh,
						  sdtc->bg_thresh,
						  sdtc->dirty,
						  sdtc->wb_thresh,
						  sdtc->wb_dirty,
						  dirty_ratelimit,
						  task_ratelimit,
						  pages_dirtied,
						  period,
						  min(pause, 0L),
						  start_time);
			if (pause < -HZ) {
				current->dirty_paused_when = now;
				current->nr_dirtied = 0;
			} else if (period) {
				current->dirty_paused_when += period;
				current->nr_dirtied = 0;
			} else if (current->nr_dirtied_pause <= pages_dirtied)
				current->nr_dirtied_pause += pages_dirtied;
			break;
		}
		if (unlikely(pause > max_pause)) {
			/* for occasional dropped task_ratelimit */
			now += min(pause - max_pause, max_pause);
			pause = max_pause;
		}

pause:
		trace_balance_dirty_pages(wb,
					  sdtc->thresh,
					  sdtc->bg_thresh,
					  sdtc->dirty,
					  sdtc->wb_thresh,
					  sdtc->wb_dirty,
					  dirty_ratelimit,
					  task_ratelimit,
					  pages_dirtied,
					  period,
					  pause,
					  start_time);
		__set_current_state(TASK_KILLABLE);
		wb->dirty_sleep = now;
		io_schedule_timeout(pause);

		current->dirty_paused_when = now + pause;
		current->nr_dirtied = 0;
		current->nr_dirtied_pause = nr_dirtied_pause;

		/*
		 * This is typically equal to (dirty < thresh) and can also
		 * keep "1000+ dd on a slow USB stick" under control.
		 */
		if (task_ratelimit)
			break;

		/*
		 * In the case of an unresponding NFS server and the NFS dirty
		 * pages exceeds dirty_thresh, give the other good wb's a pipe
		 * to go through, so that tasks on them still remain responsive.
		 *
		 * In theory 1 page is enough to keep the consumer-producer
		 * pipe going: the flusher cleans 1 page => the task dirties 1
		 * more page. However wb_dirty has accounting errors.  So use
		 * the larger and more IO friendly wb_stat_error.
		 */
		if (sdtc->wb_dirty <= wb_stat_error(wb))
			break;

		if (fatal_signal_pending(current))
			break;
	}

	if (!dirty_exceeded && wb->dirty_exceeded)
		wb->dirty_exceeded = 0;

	if (writeback_in_progress(wb))
		return;

	/*
	 * In laptop mode, we wait until hitting the higher threshold before
	 * starting background writeout, and then write out all the way down
	 * to the lower threshold.  So slow writers cause minimal disk activity.
	 *
	 * In normal mode, we start background writeout at the lower
	 * background_thresh, to keep the amount of dirty memory low.
	 */
	if (laptop_mode)
		return;

	if (nr_reclaimable > gdtc->bg_thresh)
		wb_start_background_writeback(wb);
}
/* Per-CPU count of pages dirtied since the last balance_dirty_pages() call. */
static DEFINE_PER_CPU(int, bdp_ratelimits);

/*
 * Normal tasks are throttled by
 *	loop {
 *		dirty tsk->nr_dirtied_pause pages;
 *		take a snap in balance_dirty_pages();
 *	}
 * However there is a worst case. If every task exit immediately when dirtied
 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
 * called to throttle the page dirties. The solution is to save the not yet
 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
 * randomly into the running tasks. This works well for the above worst case,
 * as the new task will pick up and accumulate the old task's leaked dirty
 * count and eventually get throttled.
 */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, get_writeback_state is expensive, so try to avoid
 * calling it too often (ratelimiting).  But once we're over the dirty memory
 * limit we decrease the ratelimiting by a lot, to prevent individual processes
 * from overshooting the limit by (ratelimit_pages) each.
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;
	int ratelimit;
	int *p;

	if (!bdi_cap_account_dirty(bdi))
		return;

	/* Charge the memcg wb when cgroup writeback is enabled. */
	if (inode_cgwb_enabled(inode))
		wb = wb_get_create_current(bdi, GFP_KERNEL);
	if (!wb)
		wb = &bdi->wb;

	ratelimit = current->nr_dirtied_pause;
	/* Once over the limit, poll much more often (every 8 pages). */
	if (wb->dirty_exceeded)
		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

	preempt_disable();
	/*
	 * This prevents one CPU to accumulate too many dirtied pages without
	 * calling into balance_dirty_pages(), which can happen when there are
	 * 1000+ tasks, all of them start dirtying pages at exactly the same
	 * time, hence all honoured too large initial task->nr_dirtied_pause.
	 */
	p = this_cpu_ptr(&bdp_ratelimits);
	if (unlikely(current->nr_dirtied >= ratelimit))
		*p = 0;
	else if (unlikely(*p >= ratelimit_pages)) {
		*p = 0;
		ratelimit = 0;
	}
	/*
	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
	 * the dirty throttling and livelock other long-run dirtiers.
	 */
	p = this_cpu_ptr(&dirty_throttle_leaks);
	if (*p > 0 && current->nr_dirtied < ratelimit) {
		unsigned long nr_pages_dirtied;
		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
		*p -= nr_pages_dirtied;
		current->nr_dirtied += nr_pages_dirtied;
	}
	preempt_enable();

	if (unlikely(current->nr_dirtied >= ratelimit))
		balance_dirty_pages(mapping, wb, current->nr_dirtied);

	wb_put(wb);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
1da177e4c Linux-2.6.12-rc2 |
1835 |
|
/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
 * clean enough.  Returns %true if writeback should continue.
 */
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
	struct dirty_throttle_control * const gdtc = &gdtc_stor;
	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
						     &mdtc_stor : NULL;

	/*
	 * Similar to balance_dirty_pages() but ignores pages being written
	 * as we're trying to decide whether to put more under writeback.
	 */
	gdtc->avail = global_dirtyable_memory();
	gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
		      global_node_page_state(NR_UNSTABLE_NFS);
	domain_dirty_limits(gdtc);

	/* Over the global background threshold? */
	if (gdtc->dirty > gdtc->bg_thresh)
		return true;

	/* Or over this wb's share of it? */
	if (wb_stat(wb, WB_RECLAIMABLE) >
	    wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
		return true;

	if (mdtc) {
		unsigned long filepages, headroom, writeback;

		/* Repeat both checks for the memcg domain. */
		mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
				    &writeback);
		mdtc_calc_avail(mdtc, filepages, headroom);
		domain_dirty_limits(mdtc);	/* ditto, ignore writeback */

		if (mdtc->dirty > mdtc->bg_thresh)
			return true;

		if (wb_stat(wb, WB_RECLAIMABLE) >
		    wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
			return true;
	}

	return false;
}
1da177e4c Linux-2.6.12-rc2 |
1881 |
/* |
1da177e4c Linux-2.6.12-rc2 |
1882 1883 |
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ |
cccad5b98 mm: convert use o... |
1884 |
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, |
8d65af789 sysctl: remove "s... |
1885 |
void __user *buffer, size_t *length, loff_t *ppos) |
1da177e4c Linux-2.6.12-rc2 |
1886 |
{ |
8d65af789 sysctl: remove "s... |
1887 |
proc_dointvec(table, write, buffer, length, ppos); |
1da177e4c Linux-2.6.12-rc2 |
1888 1889 |
return 0; } |
#ifdef CONFIG_BLOCK
/*
 * laptop_mode_timer_fn - laptop-mode writeback timer callback.
 * @data: the struct request_queue the timer was armed for, cast to
 *	  unsigned long (timer-callback calling convention).
 *
 * Fires some time after disk activity (armed by laptop_io_completion())
 * and starts writeback of the full dirty+unstable page count on every
 * bdi_writeback attached to the queue's backing device.
 */
void laptop_mode_timer_fn(unsigned long data)
{
	struct request_queue *q = (struct request_queue *)data;
	int nr_pages = global_node_page_state(NR_FILE_DIRTY) +
		global_node_page_state(NR_UNSTABLE_NFS);
	struct bdi_writeback *wb;

	/*
	 * We want to write everything out, not just down to the dirty
	 * threshold
	 */
	if (!bdi_has_dirty_io(q->backing_dev_info))
		return;

	/* wb_list is walked under RCU; entries may appear/vanish meanwhile */
	rcu_read_lock();
	list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node)
		if (wb_has_dirty_io(wb))
			wb_start_writeback(wb, nr_pages, true,
					   WB_REASON_LAPTOP_TIMER);
	rcu_read_unlock();
}

/*
 * We've spun up the disk and we're in laptop mode: schedule writeback
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 */
/*
 * laptop_io_completion - (re)arm the laptop-mode flush timer for @info.
 *
 * Pushes the per-bdi writeback timer laptop_mode jiffies into the future,
 * so the bulk flush only happens once the disk has gone idle again.
 */
void laptop_io_completion(struct backing_dev_info *info)
{
	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}

/*
 * We're in laptop mode and we've just synced. The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion.
 * Nothing needs to be written back anymore, so we unschedule the writeback.
 */
void laptop_sync_completion(void)
{
	struct backing_dev_info *bdi;

	/* bdi_list is RCU-protected; deleting a pending timer is safe here */
	rcu_read_lock();

	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
		del_timer(&bdi->laptop_mode_wb_timer);

	rcu_read_unlock();
}
#endif
/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 * If it is too low then SMP machines will call the (expensive)
 * get_writeback_state too often.
 *
 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 * thresholds.
 */
void writeback_set_ratelimit(void)
{
	struct wb_domain *dom = &global_wb_domain;
	unsigned long background_thresh;
	unsigned long dirty_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);
	/* publish the recomputed hard limit in the global writeback domain */
	dom->dirty_limit = dirty_thresh;
	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
	/* enforce a floor so tiny thresholds don't make batching useless */
	if (ratelimit_pages < 16)
		ratelimit_pages = 16;
}
/*
 * CPU hotplug callback (registered for both online and dead transitions
 * in page_writeback_init()): ratelimit_pages depends on
 * num_online_cpus(), so recompute it whenever that count changes.
 */
static int page_writeback_cpu_online(unsigned int cpu)
{
	writeback_set_ratelimit();
	return 0;
}
/*
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory
 * related to pages that could be allocated for buffers (by
 * comparing nr_free_buffer_pages() to vm_total_pages.
 *
 * However, that was when we used "dirty_ratio" to scale with
 * all memory, and we don't do that any more. "dirty_ratio"
 * is now applied to total non-HIGHPAGE memory (by subtracting
 * totalhigh_pages from vm_total_pages), and as such we can't
 * get into the old insane situation any more where we had
 * large amounts of dirty pages compared to a small amount of
 * non-HIGHMEM memory.
 *
 * But we might still want to scale the dirty_ratio by how
 * much memory the box has..
 */
void __init page_writeback_init(void)
{
	/* the global writeback domain must exist; init only fails on OOM */
	BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));

	/* keep ratelimit_pages in sync with the set of online CPUs */
	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
			  page_writeback_cpu_online, NULL);
	cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
			  page_writeback_cpu_online);
}
/**
 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
 * that write_cache_pages (or whoever calls this function) will then use
 * TOWRITE tag to identify pages eligible for writeback. This mechanism is
 * used to avoid livelocking of writeback by a process steadily creating new
 * dirty pages in the file (thus it is important for this function to be quick
 * so that it can tag pages faster than a dirtying process can create them).
 */
/*
 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
 */
void tag_pages_for_writeback(struct address_space *mapping,
			     pgoff_t start, pgoff_t end)
{
#define WRITEBACK_TAG_BATCH 4096
	unsigned long tagged = 0;
	struct radix_tree_iter iter;
	void **slot;

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start,
							PAGECACHE_TAG_DIRTY) {
		if (iter.index > end)
			break;
		radix_tree_iter_tag_set(&mapping->page_tree, &iter,
							PAGECACHE_TAG_TOWRITE);
		tagged++;
		if ((tagged % WRITEBACK_TAG_BATCH) != 0)
			continue;
		/*
		 * Batch boundary: drop the irq-disabling lock and reschedule
		 * so we don't hold off interrupts/preemption for a huge tree.
		 */
		slot = radix_tree_iter_resume(slot, &iter);
		spin_unlock_irq(&mapping->tree_lock);
		cond_resched();
		spin_lock_irq(&mapping->tree_lock);
	}
	spin_unlock_irq(&mapping->tree_lock);
}
EXPORT_SYMBOL(tag_pages_for_writeback);

/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them. If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * To avoid livelocks (when other process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them. For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	int ret = 0;
	int done = 0;
	int error;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t uninitialized_var(writeback_index);
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int cycled;
	int range_whole = 0;
	int tag;

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		if (index == 0)
			cycled = 1;
		else
			cycled = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		cycled = 1; /* ignore range_cyclic tests */
	}
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping. However, page->index will not change
			 * because we have a reference on the page.
			 */
			if (page->index > end) {
				/*
				 * can't be range_cyclic (1st pass) because
				 * end == -1 in that case.
				 */
				done = 1;
				break;
			}

			done_index = page->index;

			lock_page(page);

			/*
			 * Page truncated or invalidated. We can freely skip it
			 * then, even for data integrity operations: the page
			 * has disappeared concurrently, so there could be no
			 * real expectation of this data integrity operation
			 * even if there is now a new, dirty page at the same
			 * pagecache address.
			 */
			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
			error = (*writepage)(page, wbc, data);
			if (unlikely(error)) {
				/*
				 * Handle errors according to the type of
				 * writeback. There's no need to continue for
				 * background writeback. Just push done_index
				 * past this page so media errors won't choke
				 * writeout for the entire file. For integrity
				 * writeback, we must process the entire dirty
				 * set regardless of errors because the fs may
				 * still have state to clear for each page. In
				 * that case we continue processing and return
				 * the first error.
				 */
				if (error == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					error = 0;
				} else if (wbc->sync_mode != WB_SYNC_ALL) {
					ret = error;
					done_index = page->index + 1;
					done = 1;
					break;
				}
				if (!ret)
					ret = error;
			}

			/*
			 * We stop writing back only if we are not doing
			 * integrity sync. In case of integrity sync we have to
			 * keep going until we have written all the pages
			 * we tagged for writeback prior to entering this loop.
			 */
			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!cycled && !done) {
		/*
		 * range_cyclic:
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
EXPORT_SYMBOL(write_cache_pages);

/*
 * Function used by generic_writepages to call the real writepage
 * function and set the mapping flags on error
 */
static int __writepage(struct page *page, struct writeback_control *wbc,
		       void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);
	/* record AS_EIO/AS_ENOSPC on the mapping for later fsync reporting */
	mapping_set_error(mapping, ret);
	return ret;
}

/**
 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 */
int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret;

	/* deal with chardevs and other special file */
	if (!mapping->a_ops->writepage)
		return 0;

	/* plug so the per-page submissions can be merged into larger I/Os */
	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
	blk_finish_plug(&plug);
	return ret;
}

EXPORT_SYMBOL(generic_writepages);
/*
 * do_writepages - start writeback of @mapping's dirty pages as per @wbc.
 *
 * Dispatches to the filesystem's ->writepages() when one is provided,
 * otherwise falls back to generic_writepages().  For data-integrity
 * writeback (WB_SYNC_ALL) an -ENOMEM result is retried after a short
 * congestion wait instead of being returned, since integrity callers
 * must not lose writes to a transient allocation failure.
 */
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;

	if (wbc->nr_to_write <= 0)
		return 0;
	while (1) {
		if (mapping->a_ops->writepages)
			ret = mapping->a_ops->writepages(mapping, wbc);
		else
			ret = generic_writepages(mapping, wbc);
		if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
			break;
		cond_resched();
		congestion_wait(BLK_RW_ASYNC, HZ/50);
	}
	return ret;
}

/**
 * write_one_page - write out a single page and wait on I/O
 * @page: the page to write
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
 * function returns.
 */
int write_one_page(struct page *page)
{
	struct address_space *mapping = page->mapping;
	int ret = 0;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 1,
	};

	BUG_ON(!PageLocked(page));

	/* wait out any previous writeback before (re)submitting the page */
	wait_on_page_writeback(page);

	if (clear_page_dirty_for_io(page)) {
		/* extra ref keeps the page alive across ->writepage() */
		get_page(page);
		ret = mapping->a_ops->writepage(page, &wbc);
		if (ret == 0)
			wait_on_page_writeback(page);
		put_page(page);
	} else {
		/* page was clean; ->writepage() would have unlocked it */
		unlock_page(page);
	}

	if (!ret)
		ret = filemap_check_errors(mapping);
	return ret;
}
EXPORT_SYMBOL(write_one_page);

/*
 * For address_spaces which do not use buffers nor write back.
 *
 * Returns 1 only when this call transitioned the page from clean to
 * dirty; no accounting or radix-tree tagging is done.
 */
int __set_page_dirty_no_writeback(struct page *page)
{
	if (!PageDirty(page))
		return !TestSetPageDirty(page);
	return 0;
}

/*
 * Helper function for set_page_dirty family.
 *
 * Bumps the dirty-page statistics (lruvec/zone/node counters, the
 * bdi_writeback counters, the task I/O accounting and the per-task
 * dirty-throttling counters) for a page that has just been dirtied.
 *
 * Caller must hold lock_page_memcg().
 *
 * NOTE: This relies on being atomic wrt interrupts.
 */
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
	struct inode *inode = mapping->host;

	trace_writeback_dirty_page(page, mapping);

	if (mapping_cap_account_dirty(mapping)) {
		struct bdi_writeback *wb;

		/* make sure the inode has a wb before charging stats to it */
		inode_attach_wb(inode, page);
		wb = inode_to_wb(inode);

		__inc_lruvec_page_state(page, NR_FILE_DIRTY);
		__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		__inc_node_page_state(page, NR_DIRTIED);
		inc_wb_stat(wb, WB_RECLAIMABLE);
		inc_wb_stat(wb, WB_DIRTIED);
		task_io_account_write(PAGE_SIZE);
		/* feed the per-task dirty throttling rate limiter */
		current->nr_dirtied++;
		this_cpu_inc(bdp_ratelimits);
	}
}
EXPORT_SYMBOL(account_page_dirtied);
/*
 * Helper function for deaccounting dirty page without writeback.
 *
 * Reverses the counter updates of account_page_dirtied() for a page whose
 * dirty bit is being cancelled (the NR_DIRTIED/WB_DIRTIED history is
 * intentionally left alone; only the reclaimable state is undone here).
 *
 * Caller must hold lock_page_memcg().
 */
void account_page_cleaned(struct page *page, struct address_space *mapping,
			  struct bdi_writeback *wb)
{
	if (mapping_cap_account_dirty(mapping)) {
		dec_lruvec_page_state(page, NR_FILE_DIRTY);
		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		dec_wb_stat(wb, WB_RECLAIMABLE);
		task_io_account_cancelled_write(PAGE_SIZE);
	}
}
/*
 * For address_spaces which do not use buffers.  Just tag the page as dirty in
 * its radix tree.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 *
 * The caller must ensure this doesn't race with truncation.  Most will simply
 * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
 * the pte lock held, which also locks out truncation.
 */
int __set_page_dirty_nobuffers(struct page *page)
{
	lock_page_memcg(page);
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);
		unsigned long flags;

		if (!mapping) {
			/* e.g. anon page: dirty bit set, nothing to account */
			unlock_page_memcg(page);
			return 1;
		}

		spin_lock_irqsave(&mapping->tree_lock, flags);
		BUG_ON(page_mapping(page) != mapping);
		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
		account_page_dirtied(page, mapping);
		radix_tree_tag_set(&mapping->page_tree, page_index(page),
				   PAGECACHE_TAG_DIRTY);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		unlock_page_memcg(page);

		if (mapping->host) {
			/* !PageAnon && !swapper_space */
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		}
		return 1;
	}
	unlock_page_memcg(page);
	return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);

/*
 * Call this whenever redirtying a page, to de-account the dirty counters
 * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
 * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
 * systematic errors in balanced_dirty_ratelimit and the dirty pages position
 * control.
 */
void account_page_redirty(struct page *page)
{
	struct address_space *mapping = page->mapping;

	if (mapping && mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		/* pin the inode's wb association while we touch its stats */
		wb = unlocked_inode_to_wb_begin(inode, &cookie);
		current->nr_dirtied--;
		dec_node_page_state(page, NR_DIRTIED);
		dec_wb_stat(wb, WB_DIRTIED);
		unlocked_inode_to_wb_end(inode, &cookie);
	}
}
EXPORT_SYMBOL(account_page_redirty);

/*
 * When a writepage implementation decides that it doesn't want to write this
 * page for some reason, it should redirty the locked page via
 * redirty_page_for_writepage() and it should then unlock the page and return 0
 */
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
	int ret;

	wbc->pages_skipped++;
	ret = __set_page_dirty_nobuffers(page);
	/* undo the dirtied-page accounting so throttling stats stay honest */
	account_page_redirty(page);
	return ret;
}
EXPORT_SYMBOL(redirty_page_for_writepage);

/*
 * Dirty a page.
 *
 * For pages with a mapping this should be done under the page lock
 * for the benefit of asynchronous memory errors who prefer a consistent
 * dirty state. This rule can be broken in some special cases,
 * but should be better not to.
 *
 * If the mapping doesn't provide a set_page_dirty a_op, then
 * just fall through and assume that it wants buffer_heads.
 */
int set_page_dirty(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	/* the dirty state lives on the head page of a compound page */
	page = compound_head(page);
	if (likely(mapping)) {
		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
		/*
		 * readahead/lru_deactivate_page could remain
		 * PG_readahead/PG_reclaim due to race with end_page_writeback
		 * About readahead, if the page is written, the flags would be
		 * reset. So no problem.
		 * About lru_deactivate_page, if the page is redirty, the flag
		 * will be reset. So no problem. but if the page is used by readahead
		 * it will confuse readahead and make it restart the size rampup
		 * process. But it's a trivial problem.
		 */
		if (PageReclaim(page))
			ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
		if (!spd)
			spd = __set_page_dirty_buffers;
#endif
		return (*spd)(page);
	}
	if (!PageDirty(page)) {
		if (!TestSetPageDirty(page))
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL(set_page_dirty);

/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
int set_page_dirty_lock(struct page *page)
{
	int ret;

	lock_page(page);
	ret = set_page_dirty(page);
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL(set_page_dirty_lock);

/*
 * This cancels just the dirty bit on the kernel page itself, it does NOT
 * actually remove dirty bits on any mmap's that may be around. It also
 * leaves the page tagged dirty, so any sync activity will still find it on
 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
 * look at the dirty bits in the VM.
 *
 * Doing this should *normally* only ever be done when a page is truncated,
 * and is not actually mapped anywhere at all. However, fs/buffer.c does
 * this when it notices that somebody has cleaned out all the buffers on a
 * page without actually doing it through the VM. Can you say "ext3 is
 * horribly ugly"? Thought you could.
 */
void cancel_dirty_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		lock_page_memcg(page);
		wb = unlocked_inode_to_wb_begin(inode, &cookie);

		/* only deaccount if we actually cleared the dirty bit */
		if (TestClearPageDirty(page))
			account_page_cleaned(page, mapping, wb);

		unlocked_inode_to_wb_end(inode, &cookie);
		unlock_page_memcg(page);
	} else {
		/* no dirty accounting for this mapping: just drop the bit */
		ClearPageDirty(page);
	}
}
EXPORT_SYMBOL(cancel_dirty_page);

/*
/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout.  We leave the page
 * tagged as dirty in the radix tree so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
 * at which stage we bring the page's dirty flag and radix-tree dirty tag
 * back into sync.
 *
 * This incoherency between the page's dirty flag and radix-tree tag is
 * unfortunate, but it only exists while the page is locked.
 */
int clear_page_dirty_for_io(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret = 0;

	/* Callers must hold the page lock; see the exclusion note below. */
	BUG_ON(!PageLocked(page));

	if (mapping && mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		/*
		 * Yes, Virginia, this is indeed insane.
		 *
		 * We use this sequence to make sure that
		 *  (a) we account for dirty stats properly
		 *  (b) we tell the low-level filesystem to
		 *      mark the whole page dirty if it was
		 *      dirty in a pagetable. Only to then
		 *  (c) clean the page again and return 1 to
		 *      cause the writeback.
		 *
		 * This way we avoid all nasty races with the
		 * dirty bit in multiple places and clearing
		 * them concurrently from different threads.
		 *
		 * Note! Normally the "set_page_dirty(page)"
		 * has no effect on the actual dirty bit - since
		 * that will already usually be set. But we
		 * need the side effects, and it can help us
		 * avoid races.
		 *
		 * We basically use the page "master dirty bit"
		 * as a serialization point for all the different
		 * threads doing their things.
		 */
		if (page_mkclean(page))
			set_page_dirty(page);
		/*
		 * We carefully synchronise fault handlers against
		 * installing a dirty pte and marking the page dirty
		 * at this point.  We do this by having them hold the
		 * page lock while dirtying the page, and pages are
		 * always locked coming in here, so we get the desired
		 * exclusion.
		 */
		wb = unlocked_inode_to_wb_begin(inode, &cookie);
		if (TestClearPageDirty(page)) {
			/* Roll back all three dirty counters together. */
			dec_lruvec_page_state(page, NR_FILE_DIRTY);
			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
			dec_wb_stat(wb, WB_RECLAIMABLE);
			ret = 1;
		}
		unlocked_inode_to_wb_end(inode, &cookie);
		return ret;
	}
	/* Unaccounted mapping (or none): just clear and report the old bit. */
	return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
/*
 * Clear PG_writeback on @page, updating radix-tree tags, bdi and vmstat
 * accounting as needed.  Returns the old writeback state (non-zero if the
 * page was under writeback).
 */
int test_clear_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	int ret;

	/*
	 * Pin the memcg and lruvec before clearing the flag: once
	 * PG_writeback is clear the page may be freed (see NOTE below),
	 * so everything we need must be captured up front.
	 */
	memcg = lock_page_memcg(page);
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	if (mapping && mapping_use_writeback_tags(mapping)) {
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		/* tree_lock guards both the flag and the radix-tree tag. */
		spin_lock_irqsave(&mapping->tree_lock, flags);
		ret = TestClearPageWriteback(page);
		if (ret) {
			radix_tree_tag_clear(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi)) {
				struct bdi_writeback *wb = inode_to_wb(inode);

				dec_wb_stat(wb, WB_WRITEBACK);
				__wb_writeout_inc(wb);
			}
		}

		/* Last writeback page gone: drop the inode from the sb list. */
		if (mapping->host && !mapping_tagged(mapping,
						     PAGECACHE_TAG_WRITEBACK))
			sb_clear_inode_writeback(mapping->host);

		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	} else {
		ret = TestClearPageWriteback(page);
	}
	/*
	 * NOTE: Page might be free now! Writeback doesn't hold a page
	 * reference on its own, it relies on truncation to wait for
	 * the clearing of PG_writeback. The below can only access
	 * page state that is static across allocation cycles.
	 */
	if (ret) {
		dec_lruvec_state(lruvec, NR_WRITEBACK);
		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		inc_node_page_state(page, NR_WRITTEN);
	}
	__unlock_page_memcg(memcg);
	return ret;
}
/*
 * Set PG_writeback on @page, updating radix-tree tags, bdi stats and the
 * per-sb inode writeback list.  Returns the old writeback state (non-zero
 * if the page was already under writeback).
 *
 * @keep_write: when true, the PAGECACHE_TAG_TOWRITE tag is preserved so a
 * tagged write_cache_pages() sweep will still revisit the page.
 */
int __test_set_page_writeback(struct page *page, bool keep_write)
{
	struct address_space *mapping = page_mapping(page);
	int ret;

	lock_page_memcg(page);
	if (mapping && mapping_use_writeback_tags(mapping)) {
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		/* tree_lock guards both the flag and the radix-tree tags. */
		spin_lock_irqsave(&mapping->tree_lock, flags);
		ret = TestSetPageWriteback(page);
		if (!ret) {
			bool on_wblist;

			/* Sample before tagging: were we already on the list? */
			on_wblist = mapping_tagged(mapping,
						   PAGECACHE_TAG_WRITEBACK);

			radix_tree_tag_set(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi))
				inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);

			/*
			 * We can come through here when swapping anonymous
			 * pages, so we don't necessarily have an inode to track
			 * for sync.
			 */
			if (mapping->host && !on_wblist)
				sb_mark_inode_writeback(mapping->host);
		}
		/* Page is clean again as far as the radix tree is concerned. */
		if (!PageDirty(page))
			radix_tree_tag_clear(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_DIRTY);
		if (!keep_write)
			radix_tree_tag_clear(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_TOWRITE);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	} else {
		ret = TestSetPageWriteback(page);
	}
	/* Account only on the 0 -> 1 transition. */
	if (!ret) {
		inc_lruvec_page_state(page, NR_WRITEBACK);
		inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
	}
	unlock_page_memcg(page);
	return ret;
}
EXPORT_SYMBOL(__test_set_page_writeback);
1da177e4c Linux-2.6.12-rc2 |
2720 2721 |
/* |
001281881 mm: use lockless ... |
2722 |
* Return true if any of the pages in the mapping are marked with the |
1da177e4c Linux-2.6.12-rc2 |
2723 2724 2725 2726 |
* passed tag. */ int mapping_tagged(struct address_space *mapping, int tag) { |
72c478321 mm: remove useles... |
2727 |
return radix_tree_tagged(&mapping->page_tree, tag); |
1da177e4c Linux-2.6.12-rc2 |
2728 2729 |
} EXPORT_SYMBOL(mapping_tagged); |
1d1d1a767 mm: only enforce ... |
2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 |
/** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. * * This function determines if the given page is related to a backing device * that requires page contents to be held stable during writeback. If so, then * it will wait for any pending writeback to complete. */ void wait_for_stable_page(struct page *page) { |
de1414a65 fs: export inode_... |
2741 2742 |
if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) wait_on_page_writeback(page); |
1d1d1a767 mm: only enforce ... |
2743 2744 |
} EXPORT_SYMBOL_GPL(wait_for_stable_page); |