Blame view
mm/page-writeback.c
84.7 KB
457c89965 treewide: Add SPD... |
1 |
// SPDX-License-Identifier: GPL-2.0-only |
1da177e4c Linux-2.6.12-rc2 |
2 |
/* |
f30c22695 fix file specific... |
3 |
* mm/page-writeback.c |
1da177e4c Linux-2.6.12-rc2 |
4 5 |
* * Copyright (C) 2002, Linus Torvalds. |
90eec103b treewide: Remove ... |
6 |
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra |
1da177e4c Linux-2.6.12-rc2 |
7 8 9 10 |
* * Contains functions related to writing back dirty pages at the * address_space level. * |
e1f8e8744 Remove Andrew Mor... |
11 |
* 10Apr2002 Andrew Morton |
1da177e4c Linux-2.6.12-rc2 |
12 13 14 15 |
* Initial version */ #include <linux/kernel.h> |
b95f1b31b mm: Map most file... |
16 |
#include <linux/export.h> |
1da177e4c Linux-2.6.12-rc2 |
17 18 19 20 21 22 23 24 25 |
#include <linux/spinlock.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/init.h> #include <linux/backing-dev.h> |
55e829af0 [PATCH] io-accoun... |
26 |
#include <linux/task_io_accounting_ops.h> |
1da177e4c Linux-2.6.12-rc2 |
27 28 |
#include <linux/blkdev.h> #include <linux/mpage.h> |
d08b3851d [PATCH] mm: track... |
29 |
#include <linux/rmap.h> |
1da177e4c Linux-2.6.12-rc2 |
30 |
#include <linux/percpu.h> |
1da177e4c Linux-2.6.12-rc2 |
31 32 33 34 |
#include <linux/smp.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> |
ff01bb483 fs: move code out... |
35 |
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ |
811d736f9 [PATCH] BLOCK: Di... |
36 |
#include <linux/pagevec.h> |
eb608e3a3 block: Convert BD... |
37 |
#include <linux/timer.h> |
8bd75c77b sched/rt: Move rt... |
38 |
#include <linux/sched/rt.h> |
f361bf4a6 sched/headers: Pr... |
39 |
#include <linux/sched/signal.h> |
6e543d578 mm: vmscan: fix d... |
40 |
#include <linux/mm_inline.h> |
028c2dd18 writeback: Add tr... |
41 |
#include <trace/events/writeback.h> |
1da177e4c Linux-2.6.12-rc2 |
42 |
|
6e543d578 mm: vmscan: fix d... |
43 |
#include "internal.h" |
1da177e4c Linux-2.6.12-rc2 |
44 |
/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 */
#define MAX_PAUSE		max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising pause time to max_pause when falls below it.
 */
#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL	max(HZ/5, 1)

/* Fixed-point shift used by the dirty position/rate control arithmetic. */
#define RATELIMIT_CALC_SHIFT	10

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 */
int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

/* The global dirty throttling domain covering all system memory. */
struct wb_domain global_wb_domain;

/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
	struct wb_domain	*dom;
	struct dirty_throttle_control *gdtc;	/* only set in memcg dtc's */
#endif
	struct bdi_writeback	*wb;
	struct fprop_local_percpu *wb_completions;

	unsigned long		avail;		/* dirtyable */
	unsigned long		dirty;		/* file_dirty + write + nfs */
	unsigned long		thresh;		/* dirty threshold */
	unsigned long		bg_thresh;	/* dirty background threshold */

	unsigned long		wb_dirty;	/* per-wb counterparts */
	unsigned long		wb_thresh;
	unsigned long		wb_bg_thresh;

	/* dirty position ratio, in RATELIMIT_CALC_SHIFT fixed point */
	unsigned long		pos_ratio;
};

/*
 * Length of period for aging writeout fractions of bdis. This is an
 * arbitrarily chosen number. The longer the period, the slower fractions will
 * reflect changes in current writeout rate.
 */
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
04fbfdc14 mm: per device di... |
148 |
|
#ifdef CONFIG_CGROUP_WRITEBACK

/* initializers for a global-domain dirty_throttle_control */
#define GDTC_INIT(__wb)		.wb = (__wb),				\
				.dom = &global_wb_domain,		\
				.wb_completions = &(__wb)->completions

#define GDTC_INIT_NO_WB		.dom = &global_wb_domain

/* initializer for a memcg-domain dirty_throttle_control */
#define MDTC_INIT(__wb, __gdtc)	.wb = (__wb),				\
				.dom = mem_cgroup_wb_domain(__wb),	\
				.wb_completions = &(__wb)->memcg_completions, \
				.gdtc = __gdtc

/* a memcg dtc is valid iff it has been attached to a wb_domain */
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
	return dtc->dom;
}

static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
	return dtc->dom;
}

/* the paired global dtc of a memcg dtc, NULL for a global dtc */
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
	return mdtc->gdtc;
}

static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
	return &wb->memcg_completions;
}

static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{
	unsigned long this_bw = wb->avg_write_bandwidth;
	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
	unsigned long long min = wb->bdi->min_ratio;
	unsigned long long max = wb->bdi->max_ratio;

	/*
	 * @wb may already be clean by the time control reaches here and
	 * the total may not include its bw.
	 */
	if (this_bw < tot_bw) {
		if (min) {
			min *= this_bw;
			do_div(min, tot_bw);
		}
		if (max < 100) {
			max *= this_bw;
			do_div(max, tot_bw);
		}
	}

	*minp = min;
	*maxp = max;
}

#else	/* CONFIG_CGROUP_WRITEBACK */

#define GDTC_INIT(__wb)		.wb = (__wb),				\
				.wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)

static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
	return false;
}

static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
	return &global_wb_domain;
}

static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
	return NULL;
}

static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
	return NULL;
}

/* without cgroup writeback, a wb's min/max ratios are just the bdi's */
static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{
	*minp = wb->bdi->min_ratio;
	*maxp = wb->bdi->max_ratio;
}

#endif	/* CONFIG_CGROUP_WRITEBACK */
04fbfdc14 mm: per device di... |
235 |
/*
 * In a memory zone, there is a certain amount of pages we consider
 * available for the page cache, which is essentially the number of
 * free and reclaimable pages, minus some zone reserves to protect
 * lowmem and the ability to uphold the zone's watermarks without
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value of which the
 * user-configurable dirty ratio is the effective number of pages that
 * are allowed to be actually dirtied.  Per individual zone, or
 * globally by using the sum of dirtyable pages over all zones.
 *
 * Because the user is allowed to specify the dirty limit globally as
 * an absolute number of bytes, calculating the per-zone dirty limit can
 * require translating the configured limit into a percentage of
 * global dirtyable memory first.
 */
/**
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
 * Return: the node's number of pages potentially available for dirty
 * page cache. This is the base value for the per-node dirty limits.
 */
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
	unsigned long nr_pages = 0;
	int z;

	/* sum free pages over all populated zones of the node */
	for (z = 0; z < MAX_NR_ZONES; z++) {
		struct zone *zone = pgdat->node_zones + z;

		if (!populated_zone(zone))
			continue;

		nr_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/*
	 * Pages reserved for the kernel should not be considered
	 * dirtyable, to prevent a situation where reclaim has to
	 * clean pages in order to balance the zones.
	 */
	nr_pages -= min(nr_pages, pgdat->totalreserve_pages);

	/* file LRU pages are reclaimable once clean, hence dirtyable */
	nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
	nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);

	return nr_pages;
}
/*
 * highmem_dirtyable_memory - portion of @total that resides in highmem
 *
 * With CONFIG_HIGHMEM, walk every highmem zone of every N_HIGH_MEMORY
 * node and sum its free plus file LRU pages above the high watermark;
 * without CONFIG_HIGHMEM this is always 0.  The result is clamped to
 * @total so the caller can safely subtract it.
 */
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
	int node;
	unsigned long x = 0;
	int i;

	for_each_node_state(node, N_HIGH_MEMORY) {
		for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
			struct zone *z;
			unsigned long nr_pages;

			if (!is_highmem_idx(i))
				continue;

			z = &NODE_DATA(node)->node_zones[i];
			if (!populated_zone(z))
				continue;

			nr_pages = zone_page_state(z, NR_FREE_PAGES);
			/* watch for underflows */
			nr_pages -= min(nr_pages, high_wmark_pages(z));
			nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
			nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
			x += nr_pages;
		}
	}

	/*
	 * Unreclaimable memory (kernel memory or anonymous memory
	 * without swap) can bring down the dirtyable pages below
	 * the zone's dirty balance reserve and the above calculation
	 * will underflow.  However we still want to add in nodes
	 * which are below threshold (negative values) to get a more
	 * accurate calculation but make sure that the total never
	 * underflows.
	 */
	if ((long)x < 0)
		x = 0;

	/*
	 * Make sure that the number of highmem pages is never larger
	 * than the number of the total dirtyable memory. This can only
	 * occur in very strange VM situations but we want to make sure
	 * that this does not occur.
	 */
	return min(x, total);
#else
	return 0;
#endif
}
/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Return: the global number of pages potentially available for dirty
 * page cache. This is the base value for the global dirty limits.
 */
static unsigned long global_dirtyable_memory(void)
{
	unsigned long x;

	x = global_zone_page_state(NR_FREE_PAGES);
	/*
	 * Pages reserved for the kernel should not be considered
	 * dirtyable, to prevent a situation where reclaim has to
	 * clean pages in order to balance the zones.
	 */
	x -= min(x, totalreserve_pages);

	/* clean file LRU pages are reclaimable, hence dirtyable */
	x += global_node_page_state(NR_INACTIVE_FILE);
	x += global_node_page_state(NR_ACTIVE_FILE);

	/* optionally exclude highmem from the dirtyable base */
	if (!vm_highmem_is_dirtyable)
		x -= highmem_dirtyable_memory(x);

	return x + 1;	/* Ensure that we never return 0 */
}
9fc3a43e1 writeback: separa... |
362 363 364 |
/** * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain * @dtc: dirty_throttle_control of interest |
ccafa2879 mm: writeback: cl... |
365 |
* |
9fc3a43e1 writeback: separa... |
366 367 368 369 |
* Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and |
ccafa2879 mm: writeback: cl... |
370 371 |
* real-time tasks. */ |
9fc3a43e1 writeback: separa... |
372 |
static void domain_dirty_limits(struct dirty_throttle_control *dtc) |
ccafa2879 mm: writeback: cl... |
373 |
{ |
9fc3a43e1 writeback: separa... |
374 375 376 377 |
const unsigned long available_memory = dtc->avail; struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; |
62a584fe0 writeback: use hi... |
378 379 380 |
/* convert ratios to per-PAGE_SIZE for higher precision */ unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; |
9fc3a43e1 writeback: separa... |
381 382 |
unsigned long thresh; unsigned long bg_thresh; |
ccafa2879 mm: writeback: cl... |
383 |
struct task_struct *tsk; |
9fc3a43e1 writeback: separa... |
384 385 386 387 388 389 390 |
/* gdtc is !NULL iff @dtc is for memcg domain */ if (gdtc) { unsigned long global_avail = gdtc->avail; /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against |
62a584fe0 writeback: use hi... |
391 392 393 |
* globally available memory. As the ratios are in * per-PAGE_SIZE, they can be obtained by dividing bytes by * number of pages. |
9fc3a43e1 writeback: separa... |
394 395 |
*/ if (bytes) |
62a584fe0 writeback: use hi... |
396 397 |
ratio = min(DIV_ROUND_UP(bytes, global_avail), PAGE_SIZE); |
9fc3a43e1 writeback: separa... |
398 |
if (bg_bytes) |
62a584fe0 writeback: use hi... |
399 400 |
bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), PAGE_SIZE); |
9fc3a43e1 writeback: separa... |
401 402 403 404 405 |
bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); |
ccafa2879 mm: writeback: cl... |
406 |
else |
62a584fe0 writeback: use hi... |
407 |
thresh = (ratio * available_memory) / PAGE_SIZE; |
ccafa2879 mm: writeback: cl... |
408 |
|
9fc3a43e1 writeback: separa... |
409 410 |
if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); |
ccafa2879 mm: writeback: cl... |
411 |
else |
62a584fe0 writeback: use hi... |
412 |
bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; |
ccafa2879 mm: writeback: cl... |
413 |
|
90daf3062 Revert "mm/page-w... |
414 |
if (bg_thresh >= thresh) |
9fc3a43e1 writeback: separa... |
415 |
bg_thresh = thresh / 2; |
ccafa2879 mm: writeback: cl... |
416 417 |
tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { |
a53eaff8c MM: increase safe... |
418 419 |
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; |
ccafa2879 mm: writeback: cl... |
420 |
} |
9fc3a43e1 writeback: separa... |
421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 |
dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; /* we should eventually report the domain in the TP */ if (!gdtc) trace_global_dirty_state(bg_thresh, thresh); } /** * global_dirty_limits - background-writeback and dirty-throttling thresholds * @pbackground: out parameter for bg_thresh * @pdirty: out parameter for thresh * * Calculate bg_thresh and thresh for global_wb_domain. See * domain_dirty_limits() for details. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; gdtc.avail = global_dirtyable_memory(); domain_dirty_limits(&gdtc); *pbackground = gdtc.bg_thresh; *pdirty = gdtc.thresh; |
ccafa2879 mm: writeback: cl... |
446 |
} |
/**
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
 * Return: the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
 */
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
	unsigned long node_memory = node_dirtyable_memory(pgdat);
	struct task_struct *tsk = current;
	unsigned long dirty;

	/* scale the global byte limit by this node's share of memory */
	if (vm_dirty_bytes)
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
			node_memory / global_dirtyable_memory();
	else
		dirty = vm_dirty_ratio * node_memory / 100;

	/* lift the limit by 1/4 for less-throttled and real-time tasks */
	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
		dirty += dirty / 4;

	return dirty;
}
/**
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
 * Return: %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */
bool node_dirty_ok(struct pglist_data *pgdat)
{
	unsigned long limit = node_dirty_limit(pgdat);
	unsigned long nr_pages = 0;

	/* "dirty" here includes writeback and NFS unstable pages */
	nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
	nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
	nr_pages += node_page_state(pgdat, NR_WRITEBACK);

	return nr_pages <= limit;
}
/*
 * sysctl handlers for the vm.dirty_* knobs.  The ratio and bytes forms of
 * each setting are mutually exclusive: successfully writing one zeroes its
 * counterpart so only the most recently set form is in effect.
 */
int dirty_background_ratio_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_bytes = 0;
	return ret;
}

int dirty_background_bytes_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_ratio = 0;
	return ret;
}

int dirty_ratio_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int old_ratio = vm_dirty_ratio;
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
		/* the per-CPU dirtying ratelimit depends on the threshold */
		writeback_set_ratelimit();
		vm_dirty_bytes = 0;
	}
	return ret;
}

int dirty_bytes_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	unsigned long old_bytes = vm_dirty_bytes;
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
		/* the per-CPU dirtying ratelimit depends on the threshold */
		writeback_set_ratelimit();
		vm_dirty_ratio = 0;
	}
	return ret;
}
eb608e3a3 block: Convert BD... |
536 537 538 539 540 541 542 543 |
static unsigned long wp_next_time(unsigned long cur_time) { cur_time += VM_COMPLETIONS_PERIOD_LEN; /* 0 has a special meaning... */ if (!cur_time) return 1; return cur_time; } |
/*
 * Record one writeout completion against @completions in @dom's flex
 * proportions, and (re)arm the domain's aging timer if it was stopped.
 */
static void wb_domain_writeout_inc(struct wb_domain *dom,
				   struct fprop_local_percpu *completions,
				   unsigned int max_prop_frac)
{
	__fprop_inc_percpu_max(&dom->completions, completions,
			       max_prop_frac);
	/* First event after period switching was turned off? */
	if (unlikely(!dom->period_time)) {
		/*
		 * We can race with other __bdi_writeout_inc calls here but
		 * it does not cause any harm since the resulting time when
		 * timer will fire and what is in writeout_period_time will be
		 * roughly the same.
		 */
		dom->period_time = wp_next_time(jiffies);
		mod_timer(&dom->period_timer, dom->period_time);
	}
}
/*
 * Increment @wb's writeout completion count and the global writeout
 * completion count. Called from test_clear_page_writeback().
 */
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
	struct wb_domain *cgdom;

	inc_wb_stat(wb, WB_WRITTEN);
	wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
			       wb->bdi->max_prop_frac);

	/* also credit the memcg domain, if @wb belongs to one */
	cgdom = mem_cgroup_wb_domain(wb);
	if (cgdom)
		wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
				       wb->bdi->max_prop_frac);
}
/* IRQ-safe wrapper around __wb_writeout_inc() for external callers. */
void wb_writeout_inc(struct bdi_writeback *wb)
{
	unsigned long flags;

	local_irq_save(flags);
	__wb_writeout_inc(wb);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);
04fbfdc14 mm: per device di... |
589 |
|
/*
 * On idle system, we can be called long after we scheduled because we use
 * deferred timers so count with missed periods.
 */
static void writeout_period(struct timer_list *t)
{
	struct wb_domain *dom = from_timer(dom, t, period_timer);
	int miss_periods = (jiffies - dom->period_time) /
				 VM_COMPLETIONS_PERIOD_LEN;

	if (fprop_new_period(&dom->completions, miss_periods + 1)) {
		/* fractions still non-zero: schedule the next aging period */
		dom->period_time = wp_next_time(dom->period_time +
				miss_periods * VM_COMPLETIONS_PERIOD_LEN);
		mod_timer(&dom->period_timer, dom->period_time);
	} else {
		/*
		 * Aging has zeroed all fractions. Stop wasting CPU on period
		 * updates.
		 */
		dom->period_time = 0;
	}
}
/*
 * Initialize @dom: zero all fields, set up the lock and the deferrable
 * aging timer, and stamp the dirty limit time.  Returns the result of
 * fprop_global_init() (0 on success).
 */
int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
{
	memset(dom, 0, sizeof(*dom));

	spin_lock_init(&dom->lock);

	timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);

	dom->dirty_limit_tstamp = jiffies;

	return fprop_global_init(&dom->completions, gfp);
}
#ifdef CONFIG_CGROUP_WRITEBACK
/* Tear down a wb_domain: stop the aging timer and free the proportions. */
void wb_domain_exit(struct wb_domain *dom)
{
	del_timer_sync(&dom->period_timer);
	fprop_global_destroy(&dom->completions);
}
#endif
/*
 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
 * registered backing devices, which, for obvious reasons, can not
 * exceed 100%.
 */
static unsigned int bdi_min_ratio;

/*
 * Set @bdi's minimum share of the dirty limit.  Fails with -EINVAL if the
 * new minimum exceeds the bdi's max_ratio or would push the system-wide
 * sum of minimums to 100% or more.
 */
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
	int ret = 0;

	spin_lock_bh(&bdi_lock);
	if (min_ratio > bdi->max_ratio) {
		ret = -EINVAL;
	} else {
		/* delta from the current setting; unsigned wrap is benign */
		min_ratio -= bdi->min_ratio;
		if (bdi_min_ratio + min_ratio < 100) {
			bdi_min_ratio += min_ratio;
			bdi->min_ratio += min_ratio;
		} else {
			ret = -EINVAL;
		}
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}

/*
 * Set @bdi's maximum share of the dirty limit (percent, <= 100) and the
 * matching cap on its flex-proportion fraction.
 */
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
	int ret = 0;

	if (max_ratio > 100)
		return -EINVAL;

	spin_lock_bh(&bdi_lock);
	if (bdi->min_ratio > max_ratio) {
		ret = -EINVAL;
	} else {
		bdi->max_ratio = max_ratio;
		bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}
EXPORT_SYMBOL(bdi_set_max_ratio);
189d3c4a9 mm: bdi: allow se... |
674 |
|
/*
 * The freerun ceiling is the midpoint between the background and the
 * hard dirty thresholds; below it tasks dirty pages without throttling.
 */
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{
	unsigned long sum = thresh + bg_thresh;

	return sum / 2;
}
/* Clamp @thresh from below by @dom's current domain-wide dirty limit. */
static unsigned long hard_dirty_limit(struct wb_domain *dom,
				      unsigned long thresh)
{
	return max(thresh, dom->dirty_limit);
}
/*
 * Memory which can be further allocated to a memcg domain is capped by
 * system-wide clean memory excluding the amount being used in the domain.
 */
static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
			    unsigned long filepages, unsigned long headroom)
{
	struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
	/* the min()s guard each subtraction against unsigned underflow */
	unsigned long clean = filepages - min(filepages, mdtc->dirty);
	unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
	unsigned long other_clean = global_clean - min(global_clean, clean);

	mdtc->avail = filepages + min(headroom, other_clean);
}
6f7186562 writeback: add bd... |
699 |
/** |
b1cbc6d40 writeback: make _... |
700 701 |
* __wb_calc_thresh - @wb's share of dirty throttling threshold * @dtc: dirty_throttle_context of interest |
1babe1838 writeback: add co... |
702 |
* |
aed21ad28 writeback: commen... |
703 704 705 706 707 |
* Note that balance_dirty_pages() will only seriously take it as a hard limit * when sleeping max_pause per page is not enough to keep the dirty pages under * control. For example, when the device is completely stalled due to some error * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks |
a88a341a7 writeback: move b... |
708 |
* more (rather than completely block them) when the wb dirty pages go high. |
1babe1838 writeback: add co... |
709 |
* |
6f7186562 writeback: add bd... |
710 |
* It allocates high/low dirty limits to fast/slow devices, in order to prevent |
1babe1838 writeback: add co... |
711 712 713 |
* - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * |
a88a341a7 writeback: move b... |
714 |
* The wb's share of dirty limit will be adapting to its throughput and |
1babe1838 writeback: add co... |
715 |
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. |
a862f68a8 docs/core-api/mm:... |
716 717 718 |
* * Return: @wb's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. |
1babe1838 writeback: add co... |
719 |
*/ |
b1cbc6d40 writeback: make _... |
720 |
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) |
16c4042f0 writeback: avoid ... |
721 |
{ |
e9f07dfd7 writeback: add di... |
722 |
struct wb_domain *dom = dtc_dom(dtc); |
b1cbc6d40 writeback: make _... |
723 |
unsigned long thresh = dtc->thresh; |
0d960a383 writeback: clean ... |
724 |
u64 wb_thresh; |
16c4042f0 writeback: avoid ... |
725 |
long numerator, denominator; |
693108a8a writeback: make b... |
726 |
unsigned long wb_min_ratio, wb_max_ratio; |
04fbfdc14 mm: per device di... |
727 |
|
16c4042f0 writeback: avoid ... |
728 |
/* |
0d960a383 writeback: clean ... |
729 |
* Calculate this BDI's share of the thresh ratio. |
16c4042f0 writeback: avoid ... |
730 |
*/ |
e9770b348 writeback: add di... |
731 |
fprop_fraction_percpu(&dom->completions, dtc->wb_completions, |
380c27ca3 writeback: implem... |
732 |
&numerator, &denominator); |
04fbfdc14 mm: per device di... |
733 |
|
0d960a383 writeback: clean ... |
734 735 736 |
wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; wb_thresh *= numerator; do_div(wb_thresh, denominator); |
04fbfdc14 mm: per device di... |
737 |
|
b1cbc6d40 writeback: make _... |
738 |
wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); |
04fbfdc14 mm: per device di... |
739 |
|
0d960a383 writeback: clean ... |
740 741 742 |
wb_thresh += (thresh * wb_min_ratio) / 100; if (wb_thresh > (thresh * wb_max_ratio) / 100) wb_thresh = thresh * wb_max_ratio / 100; |
16c4042f0 writeback: avoid ... |
743 |
|
0d960a383 writeback: clean ... |
744 |
return wb_thresh; |
1da177e4c Linux-2.6.12-rc2 |
745 |
} |
b1cbc6d40 writeback: make _... |
746 747 748 749 750 |
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb), .thresh = thresh }; return __wb_calc_thresh(&gdtc); |
1da177e4c Linux-2.6.12-rc2 |
751 |
} |
6c14ae1e9 writeback: dirty ... |
752 |
/* |
5a5374856 mm/page-writeback... |
753 754 755 756 757 758 759 760 761 762 763 764 765 |
* setpoint - dirty 3 * f(dirty) := 1.0 + (----------------) * limit - setpoint * * it's a 3rd order polynomial that subjects to * * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast * (2) f(setpoint) = 1.0 => the balance point * (3) f(limit) = 0 => the hard limit * (4) df/dx <= 0 => negative feedback control * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ |
d5c9fde3d mm/page-writeback... |
766 |
static long long pos_ratio_polynom(unsigned long setpoint, |
5a5374856 mm/page-writeback... |
767 768 769 770 771 |
unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; |
d5c9fde3d mm/page-writeback... |
772 |
x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, |
464d1387a writeback: use |1... |
773 |
(limit - setpoint) | 1); |
5a5374856 mm/page-writeback... |
774 775 776 777 778 779 780 781 782 |
pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio += 1 << RATELIMIT_CALC_SHIFT; return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); } /* |
6c14ae1e9 writeback: dirty ... |
783 784 785 786 |
* Dirty position control. * * (o) global/bdi setpoints * |
de1fff37b writeback: s/bdi/... |
787 |
* We want the dirty pages be balanced around the global/wb setpoints. |
6c14ae1e9 writeback: dirty ... |
788 789 790 791 792 793 794 795 796 |
* When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. * * pos_ratio = 1 << RATELIMIT_CALC_SHIFT * * if (dirty < setpoint) scale up pos_ratio * if (dirty > setpoint) scale down pos_ratio * |
de1fff37b writeback: s/bdi/... |
797 798 |
* if (wb_dirty < wb_setpoint) scale up pos_ratio * if (wb_dirty > wb_setpoint) scale down pos_ratio |
6c14ae1e9 writeback: dirty ... |
799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 |
* * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT * * (o) global control line * * ^ pos_ratio * | * | |<===== global dirty control scope ======>| * 2.0 .............* * | .* * | . * * | . * * | . * * | . * * | . * * 1.0 ................................* * | . . * * | . . * * | . . * * | . . * * | . . * * 0 +------------.------------------.----------------------*-------------> * freerun^ setpoint^ limit^ dirty pages * |
de1fff37b writeback: s/bdi/... |
823 |
* (o) wb control line |
6c14ae1e9 writeback: dirty ... |
824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 |
* * ^ pos_ratio * | * | * * | * * | * * | * * | * |<=========== span ============>| * 1.0 .......................* * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * 1/4 ...............................................* * * * * * * * * * * * * | . . * | . . * | . . * 0 +----------------------.-------------------------------.-------------> |
de1fff37b writeback: s/bdi/... |
849 |
* wb_setpoint^ x_intercept^ |
6c14ae1e9 writeback: dirty ... |
850 |
* |
de1fff37b writeback: s/bdi/... |
851 |
* The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can |
6c14ae1e9 writeback: dirty ... |
852 853 |
* be smoothly throttled down to normal if it starts high in situations like * - start writing to a slow SD card and a fast disk at the same time. The SD |
de1fff37b writeback: s/bdi/... |
854 855 |
* card's wb_dirty may rush to many times higher than wb_setpoint. * - the wb dirty thresh drops quickly due to change of JBOD workload |
6c14ae1e9 writeback: dirty ... |
856 |
*/ |
daddfa3cb writeback: add di... |
857 |
static void wb_position_ratio(struct dirty_throttle_control *dtc) |
6c14ae1e9 writeback: dirty ... |
858 |
{ |
2bc00aef0 writeback: consol... |
859 |
struct bdi_writeback *wb = dtc->wb; |
a88a341a7 writeback: move b... |
860 |
unsigned long write_bw = wb->avg_write_bandwidth; |
2bc00aef0 writeback: consol... |
861 |
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); |
c7981433e writeback: make _... |
862 |
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); |
2bc00aef0 writeback: consol... |
863 |
unsigned long wb_thresh = dtc->wb_thresh; |
6c14ae1e9 writeback: dirty ... |
864 865 |
unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ |
de1fff37b writeback: s/bdi/... |
866 |
unsigned long wb_setpoint; |
6c14ae1e9 writeback: dirty ... |
867 868 869 |
unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; |
daddfa3cb writeback: add di... |
870 |
dtc->pos_ratio = 0; |
2bc00aef0 writeback: consol... |
871 |
if (unlikely(dtc->dirty >= limit)) |
daddfa3cb writeback: add di... |
872 |
return; |
6c14ae1e9 writeback: dirty ... |
873 874 875 876 |
/* * global setpoint * |
5a5374856 mm/page-writeback... |
877 878 879 |
* See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; |
2bc00aef0 writeback: consol... |
880 |
pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); |
5a5374856 mm/page-writeback... |
881 882 883 884 |
/* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For |
de1fff37b writeback: s/bdi/... |
885 886 |
* such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". |
5a5374856 mm/page-writeback... |
887 888 889 890 |
* This is especially important for fuse which sets bdi->max_ratio to * 1% by default. Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". |
6c14ae1e9 writeback: dirty ... |
891 |
* |
a88a341a7 writeback: move b... |
892 |
* Here, in wb_position_ratio(), we calculate pos_ratio based on |
de1fff37b writeback: s/bdi/... |
893 |
* two values: wb_dirty and wb_thresh. Let's consider an example: |
5a5374856 mm/page-writeback... |
894 895 |
* total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). |
de1fff37b writeback: s/bdi/... |
896 |
* Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. |
0d960a383 writeback: clean ... |
897 |
* wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is |
de1fff37b writeback: s/bdi/... |
898 |
* about ~6K pages (as the average of background and throttle wb |
5a5374856 mm/page-writeback... |
899 |
* limits). The 3rd order polynomial will provide positive feedback if |
de1fff37b writeback: s/bdi/... |
900 |
* wb_dirty is under wb_setpoint and vice versa. |
6c14ae1e9 writeback: dirty ... |
901 |
* |
5a5374856 mm/page-writeback... |
902 |
* Note, that we cannot use global counters in these calculations |
de1fff37b writeback: s/bdi/... |
903 |
* because we want to throttle process writing to a strictlimit wb |
5a5374856 mm/page-writeback... |
904 905 |
* much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). |
6c14ae1e9 writeback: dirty ... |
906 |
*/ |
a88a341a7 writeback: move b... |
907 |
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { |
de1fff37b writeback: s/bdi/... |
908 |
long long wb_pos_ratio; |
5a5374856 mm/page-writeback... |
909 |
|
daddfa3cb writeback: add di... |
910 911 912 913 914 |
if (dtc->wb_dirty < 8) { dtc->pos_ratio = min_t(long long, pos_ratio * 2, 2 << RATELIMIT_CALC_SHIFT); return; } |
5a5374856 mm/page-writeback... |
915 |
|
2bc00aef0 writeback: consol... |
916 |
if (dtc->wb_dirty >= wb_thresh) |
daddfa3cb writeback: add di... |
917 |
return; |
5a5374856 mm/page-writeback... |
918 |
|
970fb01ad writeback: add di... |
919 920 |
wb_setpoint = dirty_freerun_ceiling(wb_thresh, dtc->wb_bg_thresh); |
5a5374856 mm/page-writeback... |
921 |
|
de1fff37b writeback: s/bdi/... |
922 |
if (wb_setpoint == 0 || wb_setpoint == wb_thresh) |
daddfa3cb writeback: add di... |
923 |
return; |
5a5374856 mm/page-writeback... |
924 |
|
2bc00aef0 writeback: consol... |
925 |
wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, |
de1fff37b writeback: s/bdi/... |
926 |
wb_thresh); |
5a5374856 mm/page-writeback... |
927 928 |
/* |
de1fff37b writeback: s/bdi/... |
929 930 |
* Typically, for strictlimit case, wb_setpoint << setpoint * and pos_ratio >> wb_pos_ratio. In the other words global |
5a5374856 mm/page-writeback... |
931 |
* state ("dirty") is not limiting factor and we have to |
de1fff37b writeback: s/bdi/... |
932 |
* make decision based on wb counters. But there is an |
5a5374856 mm/page-writeback... |
933 934 |
* important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other |
de1fff37b writeback: s/bdi/... |
935 |
* wb's) while given strictlimit wb is below limit. |
5a5374856 mm/page-writeback... |
936 |
* |
de1fff37b writeback: s/bdi/... |
937 |
* "pos_ratio * wb_pos_ratio" would work for the case above, |
5a5374856 mm/page-writeback... |
938 |
* but it would look too non-natural for the case of all |
de1fff37b writeback: s/bdi/... |
939 |
* activity in the system coming from a single strictlimit wb |
5a5374856 mm/page-writeback... |
940 941 942 943 |
* with bdi->max_ratio == 100%. * * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 |
de1fff37b writeback: s/bdi/... |
944 |
* (when globally we are at freerun and wb is well below wb |
5a5374856 mm/page-writeback... |
945 946 947 948 |
* setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ |
daddfa3cb writeback: add di... |
949 950 |
dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); return; |
5a5374856 mm/page-writeback... |
951 |
} |
6c14ae1e9 writeback: dirty ... |
952 953 954 |
/* * We have computed basic pos_ratio above based on global situation. If |
de1fff37b writeback: s/bdi/... |
955 |
* the wb is over/under its share of dirty pages, we want to scale |
6c14ae1e9 writeback: dirty ... |
956 957 958 959 |
* pos_ratio further down/up. That is done by the following mechanism. */ /* |
de1fff37b writeback: s/bdi/... |
960 |
* wb setpoint |
6c14ae1e9 writeback: dirty ... |
961 |
* |
de1fff37b writeback: s/bdi/... |
962 |
* f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) |
6c14ae1e9 writeback: dirty ... |
963 |
* |
de1fff37b writeback: s/bdi/... |
964 |
* x_intercept - wb_dirty |
6c14ae1e9 writeback: dirty ... |
965 |
* := -------------------------- |
de1fff37b writeback: s/bdi/... |
966 |
* x_intercept - wb_setpoint |
6c14ae1e9 writeback: dirty ... |
967 |
* |
de1fff37b writeback: s/bdi/... |
968 |
* The main wb control line is a linear function that subjects to |
6c14ae1e9 writeback: dirty ... |
969 |
* |
de1fff37b writeback: s/bdi/... |
970 971 972 |
* (1) f(wb_setpoint) = 1.0 * (2) k = - 1 / (8 * write_bw) (in single wb case) * or equally: x_intercept = wb_setpoint + 8 * write_bw |
6c14ae1e9 writeback: dirty ... |
973 |
* |
de1fff37b writeback: s/bdi/... |
974 |
* For single wb case, the dirty pages are observed to fluctuate |
6c14ae1e9 writeback: dirty ... |
975 |
* regularly within range |
de1fff37b writeback: s/bdi/... |
976 |
* [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] |
6c14ae1e9 writeback: dirty ... |
977 978 979 |
* for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * |
de1fff37b writeback: s/bdi/... |
980 |
* For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its |
6c14ae1e9 writeback: dirty ... |
981 |
* own size, so move the slope over accordingly and choose a slope that |
de1fff37b writeback: s/bdi/... |
982 |
* yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. |
6c14ae1e9 writeback: dirty ... |
983 |
*/ |
2bc00aef0 writeback: consol... |
984 985 |
if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; |
aed21ad28 writeback: commen... |
986 |
/* |
de1fff37b writeback: s/bdi/... |
987 |
* It's very possible that wb_thresh is close to 0 not because the |
aed21ad28 writeback: commen... |
988 989 990 991 992 |
* device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ |
2bc00aef0 writeback: consol... |
993 |
wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); |
6c14ae1e9 writeback: dirty ... |
994 |
/* |
de1fff37b writeback: s/bdi/... |
995 996 |
* scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh |
6c14ae1e9 writeback: dirty ... |
997 |
*/ |
e4bc13adf Merge branch 'for... |
998 |
x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); |
de1fff37b writeback: s/bdi/... |
999 |
wb_setpoint = setpoint * (u64)x >> 16; |
6c14ae1e9 writeback: dirty ... |
1000 |
/* |
de1fff37b writeback: s/bdi/... |
1001 1002 |
* Use span=(8*write_bw) in single wb case as indicated by * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. |
6c14ae1e9 writeback: dirty ... |
1003 |
* |
de1fff37b writeback: s/bdi/... |
1004 1005 1006 |
* wb_thresh thresh - wb_thresh * span = --------- * (8 * write_bw) + ------------------ * wb_thresh * thresh thresh |
6c14ae1e9 writeback: dirty ... |
1007 |
*/ |
2bc00aef0 writeback: consol... |
1008 |
span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; |
de1fff37b writeback: s/bdi/... |
1009 |
x_intercept = wb_setpoint + span; |
6c14ae1e9 writeback: dirty ... |
1010 |
|
2bc00aef0 writeback: consol... |
1011 1012 |
if (dtc->wb_dirty < x_intercept - span / 4) { pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), |
e4bc13adf Merge branch 'for... |
1013 |
(x_intercept - wb_setpoint) | 1); |
6c14ae1e9 writeback: dirty ... |
1014 1015 |
} else pos_ratio /= 4; |
8927f66c4 writeback: dirty ... |
1016 |
/* |
de1fff37b writeback: s/bdi/... |
1017 |
* wb reserve area, safeguard against dirty pool underrun and disk idle |
8927f66c4 writeback: dirty ... |
1018 1019 1020 |
* It may push the desired control point of global dirty pages higher * than setpoint. */ |
de1fff37b writeback: s/bdi/... |
1021 |
x_intercept = wb_thresh / 2; |
2bc00aef0 writeback: consol... |
1022 1023 1024 1025 |
if (dtc->wb_dirty < x_intercept) { if (dtc->wb_dirty > x_intercept / 8) pos_ratio = div_u64(pos_ratio * x_intercept, dtc->wb_dirty); |
50657fc4d writeback: fix pp... |
1026 |
else |
8927f66c4 writeback: dirty ... |
1027 1028 |
pos_ratio *= 8; } |
daddfa3cb writeback: add di... |
1029 |
dtc->pos_ratio = pos_ratio; |
6c14ae1e9 writeback: dirty ... |
1030 |
} |
a88a341a7 writeback: move b... |
1031 1032 1033 |
static void wb_update_write_bandwidth(struct bdi_writeback *wb, unsigned long elapsed, unsigned long written) |
e98be2d59 writeback: bdi wr... |
1034 1035 |
{ const unsigned long period = roundup_pow_of_two(3 * HZ); |
a88a341a7 writeback: move b... |
1036 1037 |
unsigned long avg = wb->avg_write_bandwidth; unsigned long old = wb->write_bandwidth; |
e98be2d59 writeback: bdi wr... |
1038 1039 1040 1041 1042 1043 1044 1045 |
u64 bw; /* * bw = written * HZ / elapsed * * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period |
c72efb658 writeback: fix po... |
1046 1047 1048 |
* * @written may have decreased due to account_page_redirty(). * Avoid underflowing @bw calculation. |
e98be2d59 writeback: bdi wr... |
1049 |
*/ |
a88a341a7 writeback: move b... |
1050 |
bw = written - min(written, wb->written_stamp); |
e98be2d59 writeback: bdi wr... |
1051 1052 1053 1054 1055 1056 |
bw *= HZ; if (unlikely(elapsed > period)) { do_div(bw, elapsed); avg = bw; goto out; } |
a88a341a7 writeback: move b... |
1057 |
bw += (u64)wb->write_bandwidth * (period - elapsed); |
e98be2d59 writeback: bdi wr... |
1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 |
bw >>= ilog2(period); /* * one more level of smoothing, for filtering out sudden spikes */ if (avg > old && old >= (unsigned long)bw) avg -= (avg - old) >> 3; if (avg < old && old <= (unsigned long)bw) avg += (old - avg) >> 3; out: |
95a46c65e writeback: make b... |
1070 1071 1072 1073 1074 1075 1076 |
/* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ avg = max(avg, 1LU); if (wb_has_dirty_io(wb)) { long delta = avg - wb->avg_write_bandwidth; WARN_ON_ONCE(atomic_long_add_return(delta, &wb->bdi->tot_write_bandwidth) <= 0); } |
a88a341a7 writeback: move b... |
1077 1078 |
wb->write_bandwidth = bw; wb->avg_write_bandwidth = avg; |
e98be2d59 writeback: bdi wr... |
1079 |
} |
2bc00aef0 writeback: consol... |
1080 |
static void update_dirty_limit(struct dirty_throttle_control *dtc) |
c42843f2f writeback: introd... |
1081 |
{ |
e9f07dfd7 writeback: add di... |
1082 |
struct wb_domain *dom = dtc_dom(dtc); |
2bc00aef0 writeback: consol... |
1083 |
unsigned long thresh = dtc->thresh; |
dcc25ae76 writeback: move g... |
1084 |
unsigned long limit = dom->dirty_limit; |
c42843f2f writeback: introd... |
1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 |
/* * Follow up in one step. */ if (limit < thresh) { limit = thresh; goto update; } /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce |
dcc25ae76 writeback: move g... |
1097 |
* dom->dirty_limit which is guaranteed to lie above the dirty pages. |
c42843f2f writeback: introd... |
1098 |
*/ |
2bc00aef0 writeback: consol... |
1099 |
thresh = max(thresh, dtc->dirty); |
c42843f2f writeback: introd... |
1100 1101 1102 1103 1104 1105 |
if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: |
dcc25ae76 writeback: move g... |
1106 |
dom->dirty_limit = limit; |
c42843f2f writeback: introd... |
1107 |
} |
e9f07dfd7 writeback: add di... |
1108 |
static void domain_update_bandwidth(struct dirty_throttle_control *dtc, |
c42843f2f writeback: introd... |
1109 1110 |
unsigned long now) { |
e9f07dfd7 writeback: add di... |
1111 |
struct wb_domain *dom = dtc_dom(dtc); |
c42843f2f writeback: introd... |
1112 1113 1114 1115 |
/* * check locklessly first to optimize away locking for the most time */ |
dcc25ae76 writeback: move g... |
1116 |
if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) |
c42843f2f writeback: introd... |
1117 |
return; |
dcc25ae76 writeback: move g... |
1118 1119 |
spin_lock(&dom->lock); if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { |
2bc00aef0 writeback: consol... |
1120 |
update_dirty_limit(dtc); |
dcc25ae76 writeback: move g... |
1121 |
dom->dirty_limit_tstamp = now; |
c42843f2f writeback: introd... |
1122 |
} |
dcc25ae76 writeback: move g... |
1123 |
spin_unlock(&dom->lock); |
c42843f2f writeback: introd... |
1124 |
} |
be3ffa276 writeback: dirty ... |
1125 |
/* |
de1fff37b writeback: s/bdi/... |
1126 |
* Maintain wb->dirty_ratelimit, the base dirty throttle rate. |
be3ffa276 writeback: dirty ... |
1127 |
* |
de1fff37b writeback: s/bdi/... |
1128 |
* Normal wb tasks will be curbed at or below it in long term. |
be3ffa276 writeback: dirty ... |
1129 1130 |
* Obviously it should be around (write_bw / N) when there are N dd tasks. */ |
2bc00aef0 writeback: consol... |
1131 |
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, |
a88a341a7 writeback: move b... |
1132 1133 |
unsigned long dirtied, unsigned long elapsed) |
be3ffa276 writeback: dirty ... |
1134 |
{ |
2bc00aef0 writeback: consol... |
1135 1136 1137 |
struct bdi_writeback *wb = dtc->wb; unsigned long dirty = dtc->dirty; unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); |
c7981433e writeback: make _... |
1138 |
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); |
7381131cb writeback: stabil... |
1139 |
unsigned long setpoint = (freerun + limit) / 2; |
a88a341a7 writeback: move b... |
1140 1141 |
unsigned long write_bw = wb->avg_write_bandwidth; unsigned long dirty_ratelimit = wb->dirty_ratelimit; |
be3ffa276 writeback: dirty ... |
1142 1143 1144 |
unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; |
7381131cb writeback: stabil... |
1145 1146 |
unsigned long step; unsigned long x; |
d59b1087a mm/page-writeback... |
1147 |
unsigned long shift; |
be3ffa276 writeback: dirty ... |
1148 1149 1150 1151 1152 |
/* * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. */ |
a88a341a7 writeback: move b... |
1153 |
dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; |
be3ffa276 writeback: dirty ... |
1154 |
|
be3ffa276 writeback: dirty ... |
1155 1156 1157 1158 |
/* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ task_ratelimit = (u64)dirty_ratelimit * |
daddfa3cb writeback: add di... |
1159 |
dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; |
be3ffa276 writeback: dirty ... |
1160 1161 1162 1163 |
task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ /* * A linear estimation of the "balanced" throttle rate. The theory is, |
de1fff37b writeback: s/bdi/... |
1164 |
* if there are N dd tasks, each throttled at task_ratelimit, the wb's |
be3ffa276 writeback: dirty ... |
1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 |
* dirty_rate will be measured to be (N * task_ratelimit). So the below * formula will yield the balanced rate limit (write_bw / N). * * Note that the expanded form is not a pure rate feedback: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) * but also takes pos_ratio into account: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) * * (1) is not realistic because pos_ratio also takes part in balancing * the dirty rate. Consider the state * pos_ratio = 0.5 (3) * rate = 2 * (write_bw / N) (4) * If (1) is used, it will stuck in that state! Because each dd will * be throttled at * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) * yielding * dirty_rate = N * task_ratelimit = write_bw (6) * put (6) into (1) we get * rate_(i+1) = rate_(i) (7) * * So we end up using (2) to always keep * rate_(i+1) ~= (write_bw / N) (8) * regardless of the value of pos_ratio. As long as (8) is satisfied, * pos_ratio is able to drive itself to 1.0, which is not only where * the dirty count meet the setpoint, but also where the slope of * pos_ratio is most flat and hence task_ratelimit is least fluctuated. */ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, dirty_rate | 1); |
bdaac4902 writeback: balanc... |
1194 1195 1196 1197 1198 |
/* * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw */ if (unlikely(balanced_dirty_ratelimit > write_bw)) balanced_dirty_ratelimit = write_bw; |
be3ffa276 writeback: dirty ... |
1199 |
|
7381131cb writeback: stabil... |
1200 1201 1202 |
/* * We could safely do this and return immediately: * |
de1fff37b writeback: s/bdi/... |
1203 |
* wb->dirty_ratelimit = balanced_dirty_ratelimit; |
7381131cb writeback: stabil... |
1204 1205 |
* * However to get a more stable dirty_ratelimit, the below elaborated |
331cbdeed writeback: Fix so... |
1206 |
* code makes use of task_ratelimit to filter out singular points and |
7381131cb writeback: stabil... |
1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 |
* limit the step size. * * The below code essentially only uses the relative value of * * task_ratelimit - dirty_ratelimit * = (pos_ratio - 1) * dirty_ratelimit * * which reflects the direction and size of dirty position error. */ /* * dirty_ratelimit will follow balanced_dirty_ratelimit iff * task_ratelimit is on the same side of dirty_ratelimit, too. * For example, when * - dirty_ratelimit > balanced_dirty_ratelimit * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) * lowering dirty_ratelimit will help meet both the position and rate * control targets. Otherwise, don't update dirty_ratelimit if it will * only help meet the rate target. After all, what the users ultimately * feel and care are stable dirty rate and small position error. * * |task_ratelimit - dirty_ratelimit| is used to limit the step size |
331cbdeed writeback: Fix so... |
1229 |
* and filter out the singular points of balanced_dirty_ratelimit. Which |
7381131cb writeback: stabil... |
1230 1231 1232 1233 1234 |
* keeps jumping around randomly and can even leap far away at times * due to the small 200ms estimation period of dirty_rate (we want to * keep that period small to reduce time lags). */ step = 0; |
5a5374856 mm/page-writeback... |
1235 1236 |
/* |
de1fff37b writeback: s/bdi/... |
1237 |
* For strictlimit case, calculations above were based on wb counters |
a88a341a7 writeback: move b... |
1238 |
* and limits (starting from pos_ratio = wb_position_ratio() and up to |
5a5374856 mm/page-writeback... |
1239 |
* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). |
de1fff37b writeback: s/bdi/... |
1240 1241 |
* Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". |
5a5374856 mm/page-writeback... |
1242 |
* |
de1fff37b writeback: s/bdi/... |
1243 1244 |
* We rampup dirty_ratelimit forcibly if wb_dirty is low because * it's possible that wb_thresh is close to zero due to inactivity |
970fb01ad writeback: add di... |
1245 |
* of backing device. |
5a5374856 mm/page-writeback... |
1246 |
*/ |
a88a341a7 writeback: move b... |
1247 |
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { |
2bc00aef0 writeback: consol... |
1248 1249 1250 |
dirty = dtc->wb_dirty; if (dtc->wb_dirty < 8) setpoint = dtc->wb_dirty + 1; |
5a5374856 mm/page-writeback... |
1251 |
else |
970fb01ad writeback: add di... |
1252 |
setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; |
5a5374856 mm/page-writeback... |
1253 |
} |
7381131cb writeback: stabil... |
1254 |
if (dirty < setpoint) { |
a88a341a7 writeback: move b... |
1255 |
x = min3(wb->balanced_dirty_ratelimit, |
7c809968f mm/page-writeback... |
1256 |
balanced_dirty_ratelimit, task_ratelimit); |
7381131cb writeback: stabil... |
1257 1258 1259 |
if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { |
a88a341a7 writeback: move b... |
1260 |
x = max3(wb->balanced_dirty_ratelimit, |
7c809968f mm/page-writeback... |
1261 |
balanced_dirty_ratelimit, task_ratelimit); |
7381131cb writeback: stabil... |
1262 1263 1264 1265 1266 1267 1268 1269 1270 |
if (dirty_ratelimit > x) step = dirty_ratelimit - x; } /* * Don't pursue 100% rate matching. It's impossible since the balanced * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ |
d59b1087a mm/page-writeback... |
1271 1272 1273 1274 1275 |
shift = dirty_ratelimit / (2 * step + 1); if (shift < BITS_PER_LONG) step = DIV_ROUND_UP(step >> shift, 8); else step = 0; |
7381131cb writeback: stabil... |
1276 1277 1278 1279 1280 |
if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; else dirty_ratelimit -= step; |
a88a341a7 writeback: move b... |
1281 1282 |
wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; |
b48c104d2 writeback: trace ... |
1283 |
|
5634cc2aa writeback: update... |
1284 |
trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); |
be3ffa276 writeback: dirty ... |
1285 |
} |
c2aa723a6 writeback: implem... |
1286 1287 |
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, |
8a7317995 writeback: reorga... |
1288 1289 |
unsigned long start_time, bool update_ratelimit) |
e98be2d59 writeback: bdi wr... |
1290 |
{ |
c2aa723a6 writeback: implem... |
1291 |
struct bdi_writeback *wb = gdtc->wb; |
e98be2d59 writeback: bdi wr... |
1292 |
unsigned long now = jiffies; |
a88a341a7 writeback: move b... |
1293 |
unsigned long elapsed = now - wb->bw_time_stamp; |
be3ffa276 writeback: dirty ... |
1294 |
unsigned long dirtied; |
e98be2d59 writeback: bdi wr... |
1295 |
unsigned long written; |
8a7317995 writeback: reorga... |
1296 |
lockdep_assert_held(&wb->list_lock); |
e98be2d59 writeback: bdi wr... |
1297 1298 1299 1300 1301 |
/* * rate-limit, only update once every 200ms. */ if (elapsed < BANDWIDTH_INTERVAL) return; |
a88a341a7 writeback: move b... |
1302 1303 |
dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); |
e98be2d59 writeback: bdi wr... |
1304 1305 1306 1307 1308 |
/* * Skip quiet periods when disk bandwidth is under-utilized. * (at least 1s idle time between two flusher runs) */ |
a88a341a7 writeback: move b... |
1309 |
if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) |
e98be2d59 writeback: bdi wr... |
1310 |
goto snapshot; |
8a7317995 writeback: reorga... |
1311 |
if (update_ratelimit) { |
c2aa723a6 writeback: implem... |
1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 |
domain_update_bandwidth(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); /* * @mdtc is always NULL if !CGROUP_WRITEBACK but the * compiler has no way to figure that out. Help it. */ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { domain_update_bandwidth(mdtc, now); wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); } |
be3ffa276 writeback: dirty ... |
1323 |
} |
a88a341a7 writeback: move b... |
1324 |
wb_update_write_bandwidth(wb, elapsed, written); |
e98be2d59 writeback: bdi wr... |
1325 1326 |
snapshot: |
a88a341a7 writeback: move b... |
1327 1328 1329 |
wb->dirtied_stamp = dirtied; wb->written_stamp = written; wb->bw_time_stamp = now; |
e98be2d59 writeback: bdi wr... |
1330 |
} |
8a7317995 writeback: reorga... |
1331 |
void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) |
e98be2d59 writeback: bdi wr... |
1332 |
{ |
2bc00aef0 writeback: consol... |
1333 |
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; |
c2aa723a6 writeback: implem... |
1334 |
__wb_update_bandwidth(&gdtc, NULL, start_time, false); |
e98be2d59 writeback: bdi wr... |
1335 |
} |
1da177e4c Linux-2.6.12-rc2 |
1336 |
/* |
d0e1d66b5 writeback: remove... |
1337 |
* After a task dirtied this many pages, balance_dirty_pages_ratelimited() |
9d823e8f6 writeback: per ta... |
1338 1339 1340 |
* will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive |
c41f012ad mm: rename global... |
1341 |
* global_zone_page_state() too often. So scale it near-sqrt to the safety margin |
9d823e8f6 writeback: per ta... |
1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 |
* (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh) { if (thresh > dirty) return 1UL << (ilog2(thresh - dirty) >> 1); return 1; } |
a88a341a7 writeback: move b... |
1352 |
static unsigned long wb_max_pause(struct bdi_writeback *wb, |
de1fff37b writeback: s/bdi/... |
1353 |
unsigned long wb_dirty) |
c8462cc9d writeback: limit ... |
1354 |
{ |
a88a341a7 writeback: move b... |
1355 |
unsigned long bw = wb->avg_write_bandwidth; |
e3b6c655b writeback: fix ne... |
1356 |
unsigned long t; |
c8462cc9d writeback: limit ... |
1357 |
|
7ccb9ad53 writeback: max, m... |
1358 1359 1360 1361 1362 1363 1364 |
/* * Limit pause time for small memory systems. If sleeping for too long * time, a small pool of dirty/writeback pages may go empty and disk go * idle. * * 8 serves as the safety ratio. */ |
de1fff37b writeback: s/bdi/... |
1365 |
t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); |
7ccb9ad53 writeback: max, m... |
1366 |
t++; |
e3b6c655b writeback: fix ne... |
1367 |
return min_t(unsigned long, t, MAX_PAUSE); |
7ccb9ad53 writeback: max, m... |
1368 |
} |
a88a341a7 writeback: move b... |
1369 1370 1371 1372 1373 |
static long wb_min_pause(struct bdi_writeback *wb, long max_pause, unsigned long task_ratelimit, unsigned long dirty_ratelimit, int *nr_dirtied_pause) |
c8462cc9d writeback: limit ... |
1374 |
{ |
a88a341a7 writeback: move b... |
1375 1376 |
long hi = ilog2(wb->avg_write_bandwidth); long lo = ilog2(wb->dirty_ratelimit); |
7ccb9ad53 writeback: max, m... |
1377 1378 1379 |
long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ |
c8462cc9d writeback: limit ... |
1380 |
|
7ccb9ad53 writeback: max, m... |
1381 1382 |
/* target for 10ms pause on 1-dd case */ t = max(1, HZ / 100); |
c8462cc9d writeback: limit ... |
1383 1384 1385 1386 1387 |
/* * Scale up pause time for concurrent dirtiers in order to reduce CPU * overheads. * |
7ccb9ad53 writeback: max, m... |
1388 |
* (N * 10ms) on 2^N concurrent tasks. |
c8462cc9d writeback: limit ... |
1389 1390 |
*/ if (hi > lo) |
7ccb9ad53 writeback: max, m... |
1391 |
t += (hi - lo) * (10 * HZ) / 1024; |
c8462cc9d writeback: limit ... |
1392 1393 |
/* |
7ccb9ad53 writeback: max, m... |
1394 1395 1396 1397 1398 1399 1400 1401 |
* This is a bit convoluted. We try to base the next nr_dirtied_pause * on the much more stable dirty_ratelimit. However the next pause time * will be computed based on task_ratelimit and the two rate limits may * depart considerably at some time. Especially if task_ratelimit goes * below dirty_ratelimit/2 and the target pause is max_pause, the next * pause time will be max_pause*2 _trimmed down_ to max_pause. As a * result task_ratelimit won't be executed faithfully, which could * eventually bring down dirty_ratelimit. |
c8462cc9d writeback: limit ... |
1402 |
* |
7ccb9ad53 writeback: max, m... |
1403 1404 1405 1406 1407 1408 1409 |
* We apply two rules to fix it up: * 1) try to estimate the next pause time and if necessary, use a lower * nr_dirtied_pause so as not to exceed max_pause. When this happens, * nr_dirtied_pause will be "dancing" with task_ratelimit. * 2) limit the target pause time to max_pause/2, so that the normal * small fluctuations of task_ratelimit won't trigger rule (1) and * nr_dirtied_pause will remain as stable as dirty_ratelimit. |
c8462cc9d writeback: limit ... |
1410 |
*/ |
7ccb9ad53 writeback: max, m... |
1411 1412 |
t = min(t, 1 + max_pause / 2); pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); |
c8462cc9d writeback: limit ... |
1413 1414 |
/* |
5b9b35743 writeback: avoid ... |
1415 1416 1417 1418 1419 1420 |
* Tiny nr_dirtied_pause is found to hurt I/O performance in the test * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. * When the 16 consecutive reads are often interrupted by some dirty * throttling pause during the async writes, cfq will go into idles * (deadline is fine). So push nr_dirtied_pause as high as possible * until reaches DIRTY_POLL_THRESH=32 pages. |
c8462cc9d writeback: limit ... |
1421 |
*/ |
5b9b35743 writeback: avoid ... |
1422 1423 1424 1425 1426 1427 1428 1429 |
if (pages < DIRTY_POLL_THRESH) { t = max_pause; pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); if (pages > DIRTY_POLL_THRESH) { pages = DIRTY_POLL_THRESH; t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; } } |
7ccb9ad53 writeback: max, m... |
1430 1431 1432 1433 1434 |
pause = HZ * pages / (task_ratelimit + 1); if (pause > max_pause) { t = max_pause; pages = task_ratelimit * t / roundup_pow_of_two(HZ); } |
c8462cc9d writeback: limit ... |
1435 |
|
7ccb9ad53 writeback: max, m... |
1436 |
*nr_dirtied_pause = pages; |
c8462cc9d writeback: limit ... |
1437 |
/* |
7ccb9ad53 writeback: max, m... |
1438 |
* The minimal pause time will normally be half the target pause time. |
c8462cc9d writeback: limit ... |
1439 |
*/ |
5b9b35743 writeback: avoid ... |
1440 |
return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; |
c8462cc9d writeback: limit ... |
1441 |
} |
970fb01ad writeback: add di... |
1442 |
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) |
5a5374856 mm/page-writeback... |
1443 |
{ |
2bc00aef0 writeback: consol... |
1444 |
struct bdi_writeback *wb = dtc->wb; |
93f78d882 writeback: move b... |
1445 |
unsigned long wb_reclaimable; |
5a5374856 mm/page-writeback... |
1446 1447 |
/* |
de1fff37b writeback: s/bdi/... |
1448 |
* wb_thresh is not treated as some limiting factor as |
5a5374856 mm/page-writeback... |
1449 |
* dirty_thresh, due to reasons |
de1fff37b writeback: s/bdi/... |
1450 |
* - in JBOD setup, wb_thresh can fluctuate a lot |
5a5374856 mm/page-writeback... |
1451 |
* - in a system with HDD and USB key, the USB key may somehow |
de1fff37b writeback: s/bdi/... |
1452 1453 |
* go into state (wb_dirty >> wb_thresh) either because * wb_dirty starts high, or because wb_thresh drops low. |
5a5374856 mm/page-writeback... |
1454 |
* In this case we don't want to hard throttle the USB key |
de1fff37b writeback: s/bdi/... |
1455 1456 |
* dirtiers for 100 seconds until wb_dirty drops under * wb_thresh. Instead the auxiliary wb control line in |
a88a341a7 writeback: move b... |
1457 |
* wb_position_ratio() will let the dirtier task progress |
de1fff37b writeback: s/bdi/... |
1458 |
* at some rate <= (write_bw / 2) for bringing down wb_dirty. |
5a5374856 mm/page-writeback... |
1459 |
*/ |
b1cbc6d40 writeback: make _... |
1460 |
dtc->wb_thresh = __wb_calc_thresh(dtc); |
970fb01ad writeback: add di... |
1461 1462 |
dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; |
5a5374856 mm/page-writeback... |
1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 |
/* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. */ |
2bce774e8 writeback: remove... |
1474 |
if (dtc->wb_thresh < 2 * wb_stat_error()) { |
93f78d882 writeback: move b... |
1475 |
wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); |
2bc00aef0 writeback: consol... |
1476 |
dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); |
5a5374856 mm/page-writeback... |
1477 |
} else { |
93f78d882 writeback: move b... |
1478 |
wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); |
2bc00aef0 writeback: consol... |
1479 |
dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); |
5a5374856 mm/page-writeback... |
1480 1481 |
} } |
9d823e8f6 writeback: per ta... |
1482 |
/* |
1da177e4c Linux-2.6.12-rc2 |
1483 1484 |
* balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force |
143dfe861 writeback: IO-les... |
1485 |
* the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. |
5b0830cb9 writeback: get ri... |
1486 1487 |
* If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. |
1da177e4c Linux-2.6.12-rc2 |
1488 |
*/ |
4c578dce5 mm/page-writeback... |
1489 |
static void balance_dirty_pages(struct bdi_writeback *wb, |
143dfe861 writeback: IO-les... |
1490 |
unsigned long pages_dirtied) |
1da177e4c Linux-2.6.12-rc2 |
1491 |
{ |
2bc00aef0 writeback: consol... |
1492 |
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; |
c2aa723a6 writeback: implem... |
1493 |
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; |
2bc00aef0 writeback: consol... |
1494 |
struct dirty_throttle_control * const gdtc = &gdtc_stor; |
c2aa723a6 writeback: implem... |
1495 1496 1497 |
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; |
143dfe861 writeback: IO-les... |
1498 |
unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ |
83712358b writeback: dirty ... |
1499 |
long period; |
7ccb9ad53 writeback: max, m... |
1500 1501 1502 1503 |
long pause; long max_pause; long min_pause; int nr_dirtied_pause; |
e50e37201 writeback: balanc... |
1504 |
bool dirty_exceeded = false; |
143dfe861 writeback: IO-les... |
1505 |
unsigned long task_ratelimit; |
7ccb9ad53 writeback: max, m... |
1506 |
unsigned long dirty_ratelimit; |
dfb8ae567 writeback: let ba... |
1507 |
struct backing_dev_info *bdi = wb->bdi; |
5a5374856 mm/page-writeback... |
1508 |
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; |
e98be2d59 writeback: bdi wr... |
1509 |
unsigned long start_time = jiffies; |
1da177e4c Linux-2.6.12-rc2 |
1510 1511 |
for (;;) { |
83712358b writeback: dirty ... |
1512 |
unsigned long now = jiffies; |
2bc00aef0 writeback: consol... |
1513 |
unsigned long dirty, thresh, bg_thresh; |
50e55bf62 mm/page-writeback... |
1514 1515 1516 |
unsigned long m_dirty = 0; /* stop bogus uninit warnings */ unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; |
83712358b writeback: dirty ... |
1517 |
|
143dfe861 writeback: IO-les... |
1518 1519 1520 1521 1522 1523 |
/* * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been * written to the server's write cache, but has not yet * been flushed to permanent storage. */ |
11fb99898 mm: move most fil... |
1524 1525 |
nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + global_node_page_state(NR_UNSTABLE_NFS); |
9fc3a43e1 writeback: separa... |
1526 |
gdtc->avail = global_dirtyable_memory(); |
11fb99898 mm: move most fil... |
1527 |
gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); |
5fce25a9d mm: speed up writ... |
1528 |
|
9fc3a43e1 writeback: separa... |
1529 |
domain_dirty_limits(gdtc); |
16c4042f0 writeback: avoid ... |
1530 |
|
5a5374856 mm/page-writeback... |
1531 |
if (unlikely(strictlimit)) { |
970fb01ad writeback: add di... |
1532 |
wb_dirty_limits(gdtc); |
5a5374856 mm/page-writeback... |
1533 |
|
2bc00aef0 writeback: consol... |
1534 1535 |
dirty = gdtc->wb_dirty; thresh = gdtc->wb_thresh; |
970fb01ad writeback: add di... |
1536 |
bg_thresh = gdtc->wb_bg_thresh; |
5a5374856 mm/page-writeback... |
1537 |
} else { |
2bc00aef0 writeback: consol... |
1538 1539 1540 |
dirty = gdtc->dirty; thresh = gdtc->thresh; bg_thresh = gdtc->bg_thresh; |
5a5374856 mm/page-writeback... |
1541 |
} |
c2aa723a6 writeback: implem... |
1542 |
if (mdtc) { |
c5edf9cdc writeback: fix in... |
1543 |
unsigned long filepages, headroom, writeback; |
c2aa723a6 writeback: implem... |
1544 1545 1546 1547 1548 |
/* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ |
c5edf9cdc writeback: fix in... |
1549 1550 |
mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, &writeback); |
c2aa723a6 writeback: implem... |
1551 |
mdtc->dirty += writeback; |
c5edf9cdc writeback: fix in... |
1552 |
mdtc_calc_avail(mdtc, filepages, headroom); |
c2aa723a6 writeback: implem... |
1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 |
domain_dirty_limits(mdtc); if (unlikely(strictlimit)) { wb_dirty_limits(mdtc); m_dirty = mdtc->wb_dirty; m_thresh = mdtc->wb_thresh; m_bg_thresh = mdtc->wb_bg_thresh; } else { m_dirty = mdtc->dirty; m_thresh = mdtc->thresh; m_bg_thresh = mdtc->bg_thresh; } |
5a5374856 mm/page-writeback... |
1566 |
} |
16c4042f0 writeback: avoid ... |
1567 1568 1569 |
/* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts |
de1fff37b writeback: s/bdi/... |
1570 |
* when the wb limits are ramping up in case of !strictlimit. |
5a5374856 mm/page-writeback... |
1571 |
* |
de1fff37b writeback: s/bdi/... |
1572 1573 |
* In strictlimit case make decision based on the wb counters * and limits. Small writeouts when the wb limits are ramping |
5a5374856 mm/page-writeback... |
1574 |
* up are the price we consciously pay for strictlimit-ing. |
c2aa723a6 writeback: implem... |
1575 1576 1577 |
* * If memcg domain is in effect, @dirty should be under * both global and memcg freerun ceilings. |
16c4042f0 writeback: avoid ... |
1578 |
*/ |
c2aa723a6 writeback: implem... |
1579 1580 1581 1582 1583 |
if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { unsigned long intv = dirty_poll_interval(dirty, thresh); unsigned long m_intv = ULONG_MAX; |
83712358b writeback: dirty ... |
1584 1585 |
current->dirty_paused_when = now; current->nr_dirtied = 0; |
c2aa723a6 writeback: implem... |
1586 1587 1588 |
if (mdtc) m_intv = dirty_poll_interval(m_dirty, m_thresh); current->nr_dirtied_pause = min(intv, m_intv); |
16c4042f0 writeback: avoid ... |
1589 |
break; |
83712358b writeback: dirty ... |
1590 |
} |
16c4042f0 writeback: avoid ... |
1591 |
|
bc05873dc writeback: make w... |
1592 |
if (unlikely(!writeback_in_progress(wb))) |
9ecf4866c writeback: make b... |
1593 |
wb_start_background_writeback(wb); |
143dfe861 writeback: IO-les... |
1594 |
|
97b27821b writeback, memcg:... |
1595 |
mem_cgroup_flush_foreign(wb); |
c2aa723a6 writeback: implem... |
1596 1597 1598 1599 |
/* * Calculate global domain's pos_ratio and select the * global dtc by default. */ |
5a5374856 mm/page-writeback... |
1600 |
if (!strictlimit) |
970fb01ad writeback: add di... |
1601 |
wb_dirty_limits(gdtc); |
5fce25a9d mm: speed up writ... |
1602 |
|
2bc00aef0 writeback: consol... |
1603 1604 |
dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); |
daddfa3cb writeback: add di... |
1605 1606 |
wb_position_ratio(gdtc); |
c2aa723a6 writeback: implem... |
1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 |
sdtc = gdtc; if (mdtc) { /* * If memcg domain is in effect, calculate its * pos_ratio. @wb should satisfy constraints from * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ if (!strictlimit) wb_dirty_limits(mdtc); dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); wb_position_ratio(mdtc); if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; } |
daddfa3cb writeback: add di... |
1626 |
|
a88a341a7 writeback: move b... |
1627 1628 |
if (dirty_exceeded && !wb->dirty_exceeded) wb->dirty_exceeded = 1; |
1da177e4c Linux-2.6.12-rc2 |
1629 |
|
8a7317995 writeback: reorga... |
1630 1631 1632 |
if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) { spin_lock(&wb->list_lock); |
c2aa723a6 writeback: implem... |
1633 |
__wb_update_bandwidth(gdtc, mdtc, start_time, true); |
8a7317995 writeback: reorga... |
1634 1635 |
spin_unlock(&wb->list_lock); } |
e98be2d59 writeback: bdi wr... |
1636 |
|
c2aa723a6 writeback: implem... |
1637 |
/* throttle according to the chosen dtc */ |
a88a341a7 writeback: move b... |
1638 |
dirty_ratelimit = wb->dirty_ratelimit; |
c2aa723a6 writeback: implem... |
1639 |
task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> |
3a73dbbc9 writeback: fix un... |
1640 |
RATELIMIT_CALC_SHIFT; |
c2aa723a6 writeback: implem... |
1641 |
max_pause = wb_max_pause(wb, sdtc->wb_dirty); |
a88a341a7 writeback: move b... |
1642 1643 1644 |
min_pause = wb_min_pause(wb, max_pause, task_ratelimit, dirty_ratelimit, &nr_dirtied_pause); |
7ccb9ad53 writeback: max, m... |
1645 |
|
3a73dbbc9 writeback: fix un... |
1646 |
if (unlikely(task_ratelimit == 0)) { |
83712358b writeback: dirty ... |
1647 |
period = max_pause; |
c8462cc9d writeback: limit ... |
1648 |
pause = max_pause; |
143dfe861 writeback: IO-les... |
1649 |
goto pause; |
04fbfdc14 mm: per device di... |
1650 |
} |
83712358b writeback: dirty ... |
1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 |
period = HZ * pages_dirtied / task_ratelimit; pause = period; if (current->dirty_paused_when) pause -= now - current->dirty_paused_when; /* * For less than 1s think time (ext3/4 may block the dirtier * for up to 800ms from time to time on 1-HDD; so does xfs, * however at much less frequency), try to compensate it in * future periods by updating the virtual time; otherwise just * do a reset, as it may be a light dirtier. */ |
7ccb9ad53 writeback: max, m... |
1662 |
if (pause < min_pause) { |
5634cc2aa writeback: update... |
1663 |
trace_balance_dirty_pages(wb, |
c2aa723a6 writeback: implem... |
1664 1665 1666 1667 1668 |
sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, sdtc->wb_thresh, sdtc->wb_dirty, |
ece13ac31 writeback: trace ... |
1669 1670 1671 |
dirty_ratelimit, task_ratelimit, pages_dirtied, |
83712358b writeback: dirty ... |
1672 |
period, |
7ccb9ad53 writeback: max, m... |
1673 |
min(pause, 0L), |
ece13ac31 writeback: trace ... |
1674 |
start_time); |
83712358b writeback: dirty ... |
1675 1676 1677 1678 1679 1680 |
if (pause < -HZ) { current->dirty_paused_when = now; current->nr_dirtied = 0; } else if (period) { current->dirty_paused_when += period; current->nr_dirtied = 0; |
7ccb9ad53 writeback: max, m... |
1681 1682 |
} else if (current->nr_dirtied_pause <= pages_dirtied) current->nr_dirtied_pause += pages_dirtied; |
57fc978cf writeback: contro... |
1683 |
break; |
04fbfdc14 mm: per device di... |
1684 |
} |
7ccb9ad53 writeback: max, m... |
1685 1686 1687 1688 1689 |
if (unlikely(pause > max_pause)) { /* for occasional dropped task_ratelimit */ now += min(pause - max_pause, max_pause); pause = max_pause; } |
143dfe861 writeback: IO-les... |
1690 1691 |
pause: |
5634cc2aa writeback: update... |
1692 |
trace_balance_dirty_pages(wb, |
c2aa723a6 writeback: implem... |
1693 1694 1695 1696 1697 |
sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, sdtc->wb_thresh, sdtc->wb_dirty, |
ece13ac31 writeback: trace ... |
1698 1699 1700 |
dirty_ratelimit, task_ratelimit, pages_dirtied, |
83712358b writeback: dirty ... |
1701 |
period, |
ece13ac31 writeback: trace ... |
1702 1703 |
pause, start_time); |
499d05ecf mm: Make task in ... |
1704 |
__set_current_state(TASK_KILLABLE); |
b57d74aff writeback: track ... |
1705 |
wb->dirty_sleep = now; |
d25105e89 writeback: accoun... |
1706 |
io_schedule_timeout(pause); |
87c6a9b25 writeback: make b... |
1707 |
|
83712358b writeback: dirty ... |
1708 1709 |
current->dirty_paused_when = now + pause; current->nr_dirtied = 0; |
7ccb9ad53 writeback: max, m... |
1710 |
current->nr_dirtied_pause = nr_dirtied_pause; |
83712358b writeback: dirty ... |
1711 |
|
ffd1f609a writeback: introd... |
1712 |
/* |
2bc00aef0 writeback: consol... |
1713 1714 |
* This is typically equal to (dirty < thresh) and can also * keep "1000+ dd on a slow USB stick" under control. |
ffd1f609a writeback: introd... |
1715 |
*/ |
1df647197 writeback: hard t... |
1716 |
if (task_ratelimit) |
ffd1f609a writeback: introd... |
1717 |
break; |
499d05ecf mm: Make task in ... |
1718 |
|
c5c6343c4 writeback: permit... |
1719 1720 |
/* * In the case of an unresponding NFS server and the NFS dirty |
de1fff37b writeback: s/bdi/... |
1721 |
* pages exceeds dirty_thresh, give the other good wb's a pipe |
c5c6343c4 writeback: permit... |
1722 1723 |
* to go through, so that tasks on them still remain responsive. * |
3f8b6fb7f scripts/spelling.... |
1724 |
* In theory 1 page is enough to keep the consumer-producer |
c5c6343c4 writeback: permit... |
1725 |
* pipe going: the flusher cleans 1 page => the task dirties 1 |
de1fff37b writeback: s/bdi/... |
1726 |
* more page. However wb_dirty has accounting errors. So use |
93f78d882 writeback: move b... |
1727 |
* the larger and more IO friendly wb_stat_error. |
c5c6343c4 writeback: permit... |
1728 |
*/ |
2bce774e8 writeback: remove... |
1729 |
if (sdtc->wb_dirty <= wb_stat_error()) |
c5c6343c4 writeback: permit... |
1730 |
break; |
499d05ecf mm: Make task in ... |
1731 1732 |
if (fatal_signal_pending(current)) break; |
1da177e4c Linux-2.6.12-rc2 |
1733 |
} |
a88a341a7 writeback: move b... |
1734 1735 |
if (!dirty_exceeded && wb->dirty_exceeded) wb->dirty_exceeded = 0; |
1da177e4c Linux-2.6.12-rc2 |
1736 |
|
bc05873dc writeback: make w... |
1737 |
if (writeback_in_progress(wb)) |
5b0830cb9 writeback: get ri... |
1738 |
return; |
1da177e4c Linux-2.6.12-rc2 |
1739 1740 1741 1742 1743 1744 1745 1746 1747 |
/* * In laptop mode, we wait until hitting the higher threshold before * starting background writeout, and then write out all the way down * to the lower threshold. So slow writers cause minimal disk activity. * * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ |
143dfe861 writeback: IO-les... |
1748 1749 |
if (laptop_mode) return; |
2bc00aef0 writeback: consol... |
1750 |
if (nr_reclaimable > gdtc->bg_thresh) |
9ecf4866c writeback: make b... |
1751 |
wb_start_background_writeback(wb); |
1da177e4c Linux-2.6.12-rc2 |
1752 |
} |
9d823e8f6 writeback: per ta... |
1753 |
static DEFINE_PER_CPU(int, bdp_ratelimits); |
245b2e70e percpu: clean up ... |
1754 |
|
54848d73f writeback: charge... |
1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 |
/* * Normal tasks are throttled by * loop { * dirty tsk->nr_dirtied_pause pages; * take a snap in balance_dirty_pages(); * } * However there is a worst case. If every task exit immediately when dirtied * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be * called to throttle the page dirties. The solution is to save the not yet * throttled page dirties in dirty_throttle_leaks on task exit and charge them * randomly into the running tasks. This works well for the above worst case, * as the new task will pick up and accumulate the old task's leaked dirty * count and eventually get throttled. */ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; |
1da177e4c Linux-2.6.12-rc2 |
1770 |
/** |
d0e1d66b5 writeback: remove... |
1771 |
* balance_dirty_pages_ratelimited - balance dirty memory state |
67be2dd1b [PATCH] DocBook: ... |
1772 |
* @mapping: address_space which was dirtied |
1da177e4c Linux-2.6.12-rc2 |
1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 |
* * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * On really big machines, get_writeback_state is expensive, so try to avoid * calling it too often (ratelimiting). But once we're over the dirty memory * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ |
d0e1d66b5 writeback: remove... |
1783 |
void balance_dirty_pages_ratelimited(struct address_space *mapping) |
1da177e4c Linux-2.6.12-rc2 |
1784 |
{ |
dfb8ae567 writeback: let ba... |
1785 1786 1787 |
struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; |
9d823e8f6 writeback: per ta... |
1788 1789 |
int ratelimit; int *p; |
1da177e4c Linux-2.6.12-rc2 |
1790 |
|
36715cef0 writeback: skip t... |
1791 1792 |
if (!bdi_cap_account_dirty(bdi)) return; |
dfb8ae567 writeback: let ba... |
1793 1794 1795 1796 |
if (inode_cgwb_enabled(inode)) wb = wb_get_create_current(bdi, GFP_KERNEL); if (!wb) wb = &bdi->wb; |
9d823e8f6 writeback: per ta... |
1797 |
ratelimit = current->nr_dirtied_pause; |
a88a341a7 writeback: move b... |
1798 |
if (wb->dirty_exceeded) |
9d823e8f6 writeback: per ta... |
1799 |
ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); |
9d823e8f6 writeback: per ta... |
1800 |
preempt_disable(); |
1da177e4c Linux-2.6.12-rc2 |
1801 |
/* |
9d823e8f6 writeback: per ta... |
1802 1803 1804 1805 |
* This prevents one CPU to accumulate too many dirtied pages without * calling into balance_dirty_pages(), which can happen when there are * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. |
1da177e4c Linux-2.6.12-rc2 |
1806 |
*/ |
7c8e0181e mm: replace __get... |
1807 |
p = this_cpu_ptr(&bdp_ratelimits); |
9d823e8f6 writeback: per ta... |
1808 |
if (unlikely(current->nr_dirtied >= ratelimit)) |
fa5a734e4 [PATCH] balance_d... |
1809 |
*p = 0; |
d3bc1fef9 writeback: fix di... |
1810 1811 1812 |
else if (unlikely(*p >= ratelimit_pages)) { *p = 0; ratelimit = 0; |
1da177e4c Linux-2.6.12-rc2 |
1813 |
} |
54848d73f writeback: charge... |
1814 1815 1816 1817 1818 |
/* * Pick up the dirtied pages by the exited tasks. This avoids lots of * short-lived tasks (eg. gcc invocations in a kernel build) escaping * the dirty throttling and livelock other long-run dirtiers. */ |
7c8e0181e mm: replace __get... |
1819 |
p = this_cpu_ptr(&dirty_throttle_leaks); |
54848d73f writeback: charge... |
1820 |
if (*p > 0 && current->nr_dirtied < ratelimit) { |
d0e1d66b5 writeback: remove... |
1821 |
unsigned long nr_pages_dirtied; |
54848d73f writeback: charge... |
1822 1823 1824 |
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); *p -= nr_pages_dirtied; current->nr_dirtied += nr_pages_dirtied; |
1da177e4c Linux-2.6.12-rc2 |
1825 |
} |
fa5a734e4 [PATCH] balance_d... |
1826 |
preempt_enable(); |
9d823e8f6 writeback: per ta... |
1827 1828 |
if (unlikely(current->nr_dirtied >= ratelimit)) |
4c578dce5 mm/page-writeback... |
1829 |
balance_dirty_pages(wb, current->nr_dirtied); |
dfb8ae567 writeback: let ba... |
1830 1831 |
wb_put(wb); |
1da177e4c Linux-2.6.12-rc2 |
1832 |
} |
d0e1d66b5 writeback: remove... |
1833 |
EXPORT_SYMBOL(balance_dirty_pages_ratelimited); |
1da177e4c Linux-2.6.12-rc2 |
1834 |
|
aa661bbe1 writeback: move o... |
1835 1836 1837 1838 1839 |
/** * wb_over_bg_thresh - does @wb need to be written back? * @wb: bdi_writeback of interest * * Determines whether background writeback should keep writing @wb or it's |
a862f68a8 docs/core-api/mm:... |
1840 1841 1842 |
* clean enough. * * Return: %true if writeback should continue. |
aa661bbe1 writeback: move o... |
1843 1844 1845 |
*/ bool wb_over_bg_thresh(struct bdi_writeback *wb) { |
947e9762a writeback: update... |
1846 |
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; |
c2aa723a6 writeback: implem... |
1847 |
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; |
947e9762a writeback: update... |
1848 |
struct dirty_throttle_control * const gdtc = &gdtc_stor; |
c2aa723a6 writeback: implem... |
1849 1850 |
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; |
aa661bbe1 writeback: move o... |
1851 |
|
947e9762a writeback: update... |
1852 1853 1854 1855 1856 |
/* * Similar to balance_dirty_pages() but ignores pages being written * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); |
11fb99898 mm: move most fil... |
1857 1858 |
gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + global_node_page_state(NR_UNSTABLE_NFS); |
947e9762a writeback: update... |
1859 |
domain_dirty_limits(gdtc); |
aa661bbe1 writeback: move o... |
1860 |
|
947e9762a writeback: update... |
1861 |
if (gdtc->dirty > gdtc->bg_thresh) |
aa661bbe1 writeback: move o... |
1862 |
return true; |
74d369443 writeback: Fix pe... |
1863 1864 |
if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) |
aa661bbe1 writeback: move o... |
1865 |
return true; |
c2aa723a6 writeback: implem... |
1866 |
if (mdtc) { |
c5edf9cdc writeback: fix in... |
1867 |
unsigned long filepages, headroom, writeback; |
c2aa723a6 writeback: implem... |
1868 |
|
c5edf9cdc writeback: fix in... |
1869 1870 1871 |
mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, &writeback); mdtc_calc_avail(mdtc, filepages, headroom); |
c2aa723a6 writeback: implem... |
1872 1873 1874 1875 |
domain_dirty_limits(mdtc); /* ditto, ignore writeback */ if (mdtc->dirty > mdtc->bg_thresh) return true; |
74d369443 writeback: Fix pe... |
1876 1877 |
if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) |
c2aa723a6 writeback: implem... |
1878 1879 |
return true; } |
aa661bbe1 writeback: move o... |
1880 1881 |
return false; } |
1da177e4c Linux-2.6.12-rc2 |
1882 |
/* |
1da177e4c Linux-2.6.12-rc2 |
1883 1884 |
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ |
cccad5b98 mm: convert use o... |
1885 |
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, |
8d65af789 sysctl: remove "s... |
1886 |
void __user *buffer, size_t *length, loff_t *ppos) |
1da177e4c Linux-2.6.12-rc2 |
1887 |
{ |
94af58469 writeback: schedu... |
1888 1889 1890 1891 |
unsigned int old_interval = dirty_writeback_interval; int ret; ret = proc_dointvec(table, write, buffer, length, ppos); |
515c24c13 mm/page-writeback... |
1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 |
/* * Writing 0 to dirty_writeback_interval will disable periodic writeback * and a different non-zero value will wakeup the writeback threads. * wb_wakeup_delayed() would be more appropriate, but it's a pain to * iterate over all bdis and wbs. * The reason we do this is to make the change take effect immediately. */ if (!ret && write && dirty_writeback_interval && dirty_writeback_interval != old_interval) |
94af58469 writeback: schedu... |
1902 1903 1904 |
wakeup_flusher_threads(WB_REASON_PERIODIC); return ret; |
1da177e4c Linux-2.6.12-rc2 |
1905 |
} |
c2c4986ed writeback: fix pr... |
1906 |
#ifdef CONFIG_BLOCK
/*
 * Laptop-mode writeback timer has expired: kick the flusher threads for
 * the bdi that owns this timer so its dirty data gets written while the
 * disk is already spun up.
 */
void laptop_mode_timer_fn(struct timer_list *t)
{
	/* Recover the enclosing bdi from the embedded timer. */
	struct backing_dev_info *backing_dev_info =
		from_timer(backing_dev_info, t, laptop_mode_wb_timer);

	wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}

/*
 * We've spun up the disk and we're in laptop mode: schedule writeback
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 */
31373d09d laptop-mode: Make... |
1920 |
void laptop_io_completion(struct backing_dev_info *info)
{
	/* (Re)arm the per-bdi timer: flush fires 'laptop_mode' jiffies from now. */
	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}

/*
 * We're in laptop mode and we've just synced. The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion.
 * Nothing needs to be written back anymore, so we unschedule the writeback.
 */
void laptop_sync_completion(void)
{
	struct backing_dev_info *bdi;

	/* RCU protects the bdi_list walk; timers are simply cancelled. */
	rcu_read_lock();

	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
		del_timer(&bdi->laptop_mode_wb_timer);

	rcu_read_unlock();
}
c2c4986ed writeback: fix pr... |
1941 |
#endif |
1da177e4c Linux-2.6.12-rc2 |
1942 1943 1944 1945 1946 1947 1948 1949 1950 |
/* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. * If it is too low then SMP machines will call the (expensive) * get_writeback_state too often. * * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory |
9d823e8f6 writeback: per ta... |
1951 |
* thresholds. |
1da177e4c Linux-2.6.12-rc2 |
1952 |
*/ |
2d1d43f6a [PATCH] call mm/p... |
1953 |
void writeback_set_ratelimit(void) |
1da177e4c Linux-2.6.12-rc2 |
1954 |
{ |
dcc25ae76 writeback: move g... |
1955 |
struct wb_domain *dom = &global_wb_domain; |
9d823e8f6 writeback: per ta... |
1956 1957 |
unsigned long background_thresh; unsigned long dirty_thresh; |
dcc25ae76 writeback: move g... |
1958 |
|
9d823e8f6 writeback: per ta... |
1959 |
global_dirty_limits(&background_thresh, &dirty_thresh); |
dcc25ae76 writeback: move g... |
1960 |
dom->dirty_limit = dirty_thresh; |
9d823e8f6 writeback: per ta... |
1961 |
ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); |
1da177e4c Linux-2.6.12-rc2 |
1962 1963 |
if (ratelimit_pages < 16) ratelimit_pages = 16; |
1da177e4c Linux-2.6.12-rc2 |
1964 |
} |
1d7ac6aec mm/writeback: Con... |
1965 |
/*
 * CPU hotplug callback: recompute ratelimit_pages whenever the set of
 * online CPUs changes (registered for both the online and dead states
 * in page_writeback_init()).  Always succeeds.
 */
static int page_writeback_cpu_online(unsigned int cpu)
{
	writeback_set_ratelimit();
	return 0;
}
1da177e4c Linux-2.6.12-rc2 |
1970 |
/*
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory
 * related to pages that could be allocated for buffers (by
 * comparing nr_free_buffer_pages() to vm_total_pages.
 *
 * However, that was when we used "dirty_ratio" to scale with
 * all memory, and we don't do that any more. "dirty_ratio"
 * is now applied to total non-HIGHPAGE memory (by subtracting
 * totalhigh_pages from vm_total_pages), and as such we can't
 * get into the old insane situation any more where we had
 * large amounts of dirty pages compared to a small amount of
 * non-HIGHMEM memory.
 *
 * But we might still want to scale the dirty_ratio by how
 * much memory the box has..
 */
void __init page_writeback_init(void)
{
	/* The global writeback domain must exist before any dirtying. */
	BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));

	/* Keep ratelimit_pages in sync with the number of online CPUs. */
	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
			  page_writeback_cpu_online, NULL);
	cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
			  page_writeback_cpu_online);
}
811d736f9 [PATCH] BLOCK: Di... |
1996 |
/** |
f446daaea mm: implement wri... |
1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 |
* tag_pages_for_writeback - tag pages to be written by write_cache_pages * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is * that write_cache_pages (or whoever calls this function) will then use * TOWRITE tag to identify pages eligible for writeback. This mechanism is * used to avoid livelocking of writeback by a process steadily creating new * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ |
f446daaea mm: implement wri... |
2010 2011 2012 |
void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { |
ff9c745b8 mm: Convert page-... |
2013 2014 2015 |
XA_STATE(xas, &mapping->i_pages, start); unsigned int tagged = 0; void *page; |
268f42de7 radix-tree: delet... |
2016 |
|
ff9c745b8 mm: Convert page-... |
2017 2018 2019 2020 |
xas_lock_irq(&xas); xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); if (++tagged % XA_CHECK_SCHED) |
268f42de7 radix-tree: delet... |
2021 |
continue; |
ff9c745b8 mm: Convert page-... |
2022 2023 2024 |
xas_pause(&xas); xas_unlock_irq(&xas); |
f446daaea mm: implement wri... |
2025 |
cond_resched(); |
ff9c745b8 mm: Convert page-... |
2026 |
xas_lock_irq(&xas); |
268f42de7 radix-tree: delet... |
2027 |
} |
ff9c745b8 mm: Convert page-... |
2028 |
xas_unlock_irq(&xas); |
f446daaea mm: implement wri... |
2029 2030 2031 2032 |
} EXPORT_SYMBOL(tag_pages_for_writeback); /** |
0ea971801 consolidate gener... |
2033 |
/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * To avoid livelocks (when other process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them. For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 *
 * To avoid deadlocks between range_cyclic writeback and callers that hold
 * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
 * we do not loop back to the start of the file. Doing so causes a page
 * lock/page writeback access order inversion - we should only ever lock
 * multiple pages in ascending page->index order, and looping back to the start
 * of the file violates that rule and causes deadlocks.
 *
 * Return: %0 on success, negative error code otherwise
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	int ret = 0;
	int done = 0;
	int error;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t uninitialized_var(writeback_index);
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int range_whole = 0;
	xa_mark_t tag;

	pagevec_init(&pvec);
	if (wbc->range_cyclic) {
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
	}
	/* Integrity sync walks the TOWRITE tag to avoid livelock (see above). */
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
				tag);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			done_index = page->index;

			lock_page(page);

			/*
			 * Page truncated or invalidated. We can freely skip it
			 * then, even for data integrity operations: the page
			 * has disappeared concurrently, so there could be no
			 * real expectation of this data interity operation
			 * even if there is now a new, dirty page at the same
			 * pagecache address.
			 */
			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
			error = (*writepage)(page, wbc, data);
			if (unlikely(error)) {
				/*
				 * Handle errors according to the type of
				 * writeback. There's no need to continue for
				 * background writeback. Just push done_index
				 * past this page so media errors won't choke
				 * writeout for the entire file. For integrity
				 * writeback, we must process the entire dirty
				 * set regardless of errors because the fs may
				 * still have state to clear for each page. In
				 * that case we continue processing and return
				 * the first error.
				 */
				if (error == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					error = 0;
				} else if (wbc->sync_mode != WB_SYNC_ALL) {
					ret = error;
					done_index = page->index + 1;
					done = 1;
					break;
				}
				if (!ret)
					ret = error;
			}

			/*
			 * We stop writing back only if we are not doing
			 * integrity sync. In case of integrity sync we have to
			 * keep going until we have written all the pages
			 * we tagged for writeback prior to entering this loop.
			 */
			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/*
	 * If we hit the last page and there is more work to be done: wrap
	 * back the index back to the start of the file for the next
	 * time we are called.
	 */
	if (wbc->range_cyclic && !done)
		done_index = 0;
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
0ea971801 consolidate gener... |
2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 |
EXPORT_SYMBOL(write_cache_pages);

/*
 * Function used by generic_writepages to call the real writepage
 * function and set the mapping flags on error
 */
static int __writepage(struct page *page, struct writeback_control *wbc,
		       void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);

	/* Record AS_EIO/AS_ENOSPC on the mapping so fsync() can see it. */
	mapping_set_error(mapping, ret);
	return ret;
}

/**
 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 *
 * Return: %0 on success, negative error code otherwise
 */
int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret;

	/* deal with chardevs and other special file */
	if (!mapping->a_ops->writepage)
		return 0;

	/* Plug the block layer so per-page submissions get batched. */
	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
	blk_finish_plug(&plug);
	return ret;
}
EXPORT_SYMBOL(generic_writepages);
1da177e4c Linux-2.6.12-rc2 |
2234 2235 |
int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { |
22905f775 identify multipag... |
2236 |
int ret; |
1da177e4c Linux-2.6.12-rc2 |
2237 2238 |
if (wbc->nr_to_write <= 0) return 0; |
80a2ea9f8 mm: retry writepa... |
2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 |
while (1) { if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); else ret = generic_writepages(mapping, wbc); if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL)) break; cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); } |
22905f775 identify multipag... |
2249 |
return ret; |
1da177e4c Linux-2.6.12-rc2 |
2250 2251 2252 |
} /** |
2b69c8280 mm: drop "wait" p... |
2253 |
/**
 * write_one_page - write out a single page and wait on I/O
 * @page: the page to write
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
 * function returns.
 *
 * Return: %0 on success, negative error code otherwise
 */
int write_one_page(struct page *page)
{
	struct address_space *mapping = page->mapping;
	int ret = 0;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 1,
	};

	BUG_ON(!PageLocked(page));

	/* Wait for any writeback already in flight before (re)writing. */
	wait_on_page_writeback(page);

	if (clear_page_dirty_for_io(page)) {
		/* Pin the page across ->writepage(), which unlocks it. */
		get_page(page);
		ret = mapping->a_ops->writepage(page, &wbc);
		if (ret == 0)
			wait_on_page_writeback(page);
		put_page(page);
	} else {
		/* Nothing to write; honour the unlock-on-return contract. */
		unlock_page(page);
	}

	if (!ret)
		ret = filemap_check_errors(mapping);
	return ret;
}
EXPORT_SYMBOL(write_one_page);
767193253 [PATCH] simplify ... |
2292 2293 2294 2295 2296 |
/*
 * For address_spaces which do not use buffers nor write back.
 *
 * Returns 1 if this call transitioned the page from clean to dirty,
 * 0 if it was already dirty (including losing a race to dirty it).
 */
int __set_page_dirty_no_writeback(struct page *page)
{
	if (PageDirty(page))
		return 0;
	return !TestSetPageDirty(page);
}
e3a7cca1e vfs: add/use acco... |
2302 |
/*
 * Helper function for set_page_dirty family.
 *
 * Caller must hold lock_page_memcg().
 *
 * NOTE: This relies on being atomic wrt interrupts.
 */
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
	struct inode *inode = mapping->host;

	trace_writeback_dirty_page(page, mapping);

	if (mapping_cap_account_dirty(mapping)) {
		struct bdi_writeback *wb;

		inode_attach_wb(inode, page);
		wb = inode_to_wb(inode);

		/* Bump the per-lruvec, per-zone, per-node and per-wb counters. */
		__inc_lruvec_page_state(page, NR_FILE_DIRTY);
		__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		__inc_node_page_state(page, NR_DIRTIED);
		inc_wb_stat(wb, WB_RECLAIMABLE);
		inc_wb_stat(wb, WB_DIRTIED);
		task_io_account_write(PAGE_SIZE);
		/* Per-task dirty counters feed balance_dirty_pages() ratelimiting. */
		current->nr_dirtied++;
		this_cpu_inc(bdp_ratelimits);

		mem_cgroup_track_foreign_dirty(page, wb);
	}
}
b9ea25152 page_writeback: c... |
2332 2333 |
/*
 * Helper function for deaccounting dirty page without writeback.
 *
 * Caller must hold lock_page_memcg().
 */
void account_page_cleaned(struct page *page, struct address_space *mapping,
			  struct bdi_writeback *wb)
{
	if (mapping_cap_account_dirty(mapping)) {
		/* Undo the counters bumped by account_page_dirtied(). */
		dec_lruvec_page_state(page, NR_FILE_DIRTY);
		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		dec_wb_stat(wb, WB_RECLAIMABLE);
		task_io_account_cancelled_write(PAGE_SIZE);
	}
}
b9ea25152 page_writeback: c... |
2346 2347 |
/*
 * For address_spaces which do not use buffers. Just tag the page as dirty in
 * the xarray.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 *
 * The caller must ensure this doesn't race with truncation.  Most will simply
 * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
 * the pte lock held, which also locks out truncation.
 */
int __set_page_dirty_nobuffers(struct page *page)
{
	lock_page_memcg(page);
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);
		unsigned long flags;

		/* Anon/swapper pages have no mapping: flag set is enough. */
		if (!mapping) {
			unlock_page_memcg(page);
			return 1;
		}

		xa_lock_irqsave(&mapping->i_pages, flags);
		BUG_ON(page_mapping(page) != mapping);
		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
		account_page_dirtied(page, mapping);
		__xa_set_mark(&mapping->i_pages, page_index(page),
				   PAGECACHE_TAG_DIRTY);
		xa_unlock_irqrestore(&mapping->i_pages, flags);
		unlock_page_memcg(page);

		if (mapping->host) {
			/* !PageAnon && !swapper_space */
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		}
		return 1;
	}
	unlock_page_memcg(page);
	return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
2f800fbd7 writeback: fix di... |
2392 |
/*
 * Call this whenever redirtying a page, to de-account the dirty counters
 * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
 * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
 * systematic errors in balanced_dirty_ratelimit and the dirty pages position
 * control.
 */
void account_page_redirty(struct page *page)
{
	struct address_space *mapping = page->mapping;

	if (mapping && mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		/* Pin the wb association while touching its stats. */
		wb = unlocked_inode_to_wb_begin(inode, &cookie);
		current->nr_dirtied--;
		dec_node_page_state(page, NR_DIRTIED);
		dec_wb_stat(wb, WB_DIRTIED);
		unlocked_inode_to_wb_end(inode, &cookie);
	}
}
EXPORT_SYMBOL(account_page_redirty);
1da177e4c Linux-2.6.12-rc2 |
2417 2418 2419 2420 2421 2422 |
/*
 * When a writepage implementation decides that it doesn't want to write this
 * page for some reason, it should redirty the locked page via
 * redirty_page_for_writepage() and it should then unlock the page and return 0
 */
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
	int ret;

	wbc->pages_skipped++;
	ret = __set_page_dirty_nobuffers(page);
	/* Undo the dirtied accounting: this is a re-dirty, not new dirty data. */
	account_page_redirty(page);
	return ret;
}
EXPORT_SYMBOL(redirty_page_for_writepage);
6746aff74 HWPOISON: shmem: ... |
2432 2433 2434 2435 2436 2437 2438 |
/*
 * Dirty a page.
 *
 * For pages with a mapping this should be done under the page lock
 * for the benefit of asynchronous memory errors who prefer a consistent
 * dirty state. This rule can be broken in some special cases,
 * but should be better not to.
 *
 * If the mapping doesn't provide a set_page_dirty a_op, then
 * just fall through and assume that it wants buffer_heads.
 */
int set_page_dirty(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	/* Dirtiness is tracked on the head page of a compound page. */
	page = compound_head(page);
	if (likely(mapping)) {
		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
		/*
		 * readahead/lru_deactivate_page could remain
		 * PG_readahead/PG_reclaim due to race with end_page_writeback
		 * About readahead, if the page is written, the flags would be
		 * reset. So no problem.
		 * About lru_deactivate_page, if the page is redirty, the flag
		 * will be reset. So no problem. but if the page is used by readahead
		 * it will confuse readahead and make it restart the size rampup
		 * process. But it's a trivial problem.
		 */
		if (PageReclaim(page))
			ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
		if (!spd)
			spd = __set_page_dirty_buffers;
#endif
		return (*spd)(page);
	}
	/* No mapping: just set the flag; return 1 only on a clean->dirty edge. */
	if (!PageDirty(page)) {
		if (!TestSetPageDirty(page))
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL(set_page_dirty);

/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
int set_page_dirty_lock(struct page *page)
{
	int ret;

	lock_page(page);
	ret = set_page_dirty(page);
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL(set_page_dirty_lock);
11f81becc page_writeback: r... |
2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 |
/*
 * This cancels just the dirty bit on the kernel page itself, it does NOT
 * actually remove dirty bits on any mmap's that may be around. It also
 * leaves the page tagged dirty, so any sync activity will still find it on
 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
 * look at the dirty bits in the VM.
 *
 * Doing this should *normally* only ever be done when a page is truncated,
 * and is not actually mapped anywhere at all. However, fs/buffer.c does
 * this when it notices that somebody has cleaned out all the buffers on a
 * page without actually doing it through the VM. Can you say "ext3 is
 * horribly ugly"? Thought you could.
 */
void __cancel_dirty_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		lock_page_memcg(page);
		wb = unlocked_inode_to_wb_begin(inode, &cookie);

		/* Only de-account if we actually cleared the flag. */
		if (TestClearPageDirty(page))
			account_page_cleaned(page, mapping, wb);

		unlocked_inode_to_wb_end(inode, &cookie);
		unlock_page_memcg(page);
	} else {
		/* Unaccounted mapping (or none): just drop the flag. */
		ClearPageDirty(page);
	}
}
EXPORT_SYMBOL(__cancel_dirty_page);
/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout.  We leave the page
 * tagged as dirty in the xarray so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
 * at which stage we bring the page's dirty flag and xarray dirty tag
 * back into sync.
 *
 * This incoherency between the page's dirty flag and xarray tag is
 * unfortunate, but it only exists while the page is locked.
 */
int clear_page_dirty_for_io(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret = 0;

	/* The page lock is what excludes concurrent re-dirtying; see below. */
	BUG_ON(!PageLocked(page));

	if (mapping && mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		/*
		 * Yes, Virginia, this is indeed insane.
		 *
		 * We use this sequence to make sure that
		 *  (a) we account for dirty stats properly
		 *  (b) we tell the low-level filesystem to
		 *      mark the whole page dirty if it was
		 *      dirty in a pagetable. Only to then
		 *  (c) clean the page again and return 1 to
		 *      cause the writeback.
		 *
		 * This way we avoid all nasty races with the
		 * dirty bit in multiple places and clearing
		 * them concurrently from different threads.
		 *
		 * Note! Normally the "set_page_dirty(page)"
		 * has no effect on the actual dirty bit - since
		 * that will already usually be set. But we
		 * need the side effects, and it can help us
		 * avoid races.
		 *
		 * We basically use the page "master dirty bit"
		 * as a serialization point for all the different
		 * threads doing their things.
		 */
		if (page_mkclean(page))
			set_page_dirty(page);
		/*
		 * We carefully synchronise fault handlers against
		 * installing a dirty pte and marking the page dirty
		 * at this point. We do this by having them hold the
		 * page lock while dirtying the page, and pages are
		 * always locked coming in here, so we get the desired
		 * exclusion.
		 */
		wb = unlocked_inode_to_wb_begin(inode, &cookie);
		if (TestClearPageDirty(page)) {
			/* Reverse the accounting done when the page was dirtied. */
			dec_lruvec_page_state(page, NR_FILE_DIRTY);
			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
			dec_wb_stat(wb, WB_RECLAIMABLE);
			ret = 1;
		}
		unlocked_inode_to_wb_end(inode, &cookie);
		return ret;
	}
	/* No dirty accounting for this mapping: just test-and-clear the bit. */
	return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
/*
 * Clear a page's writeback flag.  For mappings that use writeback tags,
 * also clear PAGECACHE_TAG_WRITEBACK in the xarray and update the
 * per-wb / per-node writeback statistics.
 *
 * Returns the old value of PG_writeback (non-zero if the page was under
 * writeback when we got here).
 */
int test_clear_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	int ret;

	/*
	 * Capture the memcg/lruvec binding up front: once PG_writeback is
	 * cleared the page may be freed (see the NOTE below), so they must
	 * be resolved while the page is still guaranteed alive.
	 */
	memcg = lock_page_memcg(page);
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	if (mapping && mapping_use_writeback_tags(mapping)) {
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		xa_lock_irqsave(&mapping->i_pages, flags);
		ret = TestClearPageWriteback(page);
		if (ret) {
			__xa_clear_mark(&mapping->i_pages, page_index(page),
						PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi)) {
				struct bdi_writeback *wb = inode_to_wb(inode);

				dec_wb_stat(wb, WB_WRITEBACK);
				__wb_writeout_inc(wb);
			}
		}

		/*
		 * If no page of this mapping is tagged for writeback any
		 * more, let the sb-level inode-writeback tracking know.
		 */
		if (mapping->host && !mapping_tagged(mapping,
						     PAGECACHE_TAG_WRITEBACK))
			sb_clear_inode_writeback(mapping->host);

		xa_unlock_irqrestore(&mapping->i_pages, flags);
	} else {
		ret = TestClearPageWriteback(page);
	}
	/*
	 * NOTE: Page might be free now! Writeback doesn't hold a page
	 * reference on its own, it relies on truncation to wait for
	 * the clearing of PG_writeback. The below can only access
	 * page state that is static across allocation cycles.
	 */
	if (ret) {
		dec_lruvec_state(lruvec, NR_WRITEBACK);
		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		inc_node_page_state(page, NR_WRITTEN);
	}
	__unlock_page_memcg(memcg);
	return ret;
}
/*
 * Mark a page as under writeback.  For mappings that use writeback tags,
 * also set PAGECACHE_TAG_WRITEBACK in the xarray, drop the dirty tag if
 * the page itself is no longer dirty and, unless @keep_write is true,
 * drop the to-write tag as well.
 *
 * Returns the old value of PG_writeback (non-zero if the page was already
 * under writeback).
 */
int __test_set_page_writeback(struct page *page, bool keep_write)
{
	struct address_space *mapping = page_mapping(page);
	int ret;

	lock_page_memcg(page);
	if (mapping && mapping_use_writeback_tags(mapping)) {
		XA_STATE(xas, &mapping->i_pages, page_index(page));
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		xas_lock_irqsave(&xas, flags);
		/* Walk xas to the page's slot before manipulating its marks. */
		xas_load(&xas);
		ret = TestSetPageWriteback(page);
		if (!ret) {
			bool on_wblist;

			/* Was any page of this mapping already tagged? */
			on_wblist = mapping_tagged(mapping,
						   PAGECACHE_TAG_WRITEBACK);

			xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi))
				inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);

			/*
			 * We can come through here when swapping anonymous
			 * pages, so we don't necessarily have an inode to track
			 * for sync.
			 */
			if (mapping->host && !on_wblist)
				sb_mark_inode_writeback(mapping->host);
		}
		if (!PageDirty(page))
			xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		if (!keep_write)
			xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		xas_unlock_irqrestore(&xas, flags);
	} else {
		ret = TestSetPageWriteback(page);
	}
	if (!ret) {
		/* Page newly entered writeback: account it. */
		inc_lruvec_page_state(page, NR_WRITEBACK);
		inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
	}
	unlock_page_memcg(page);
	return ret;
}
EXPORT_SYMBOL(__test_set_page_writeback);
1da177e4c Linux-2.6.12-rc2 |
2698 |
|
19343b5bd mm/page-writeback... |
2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 |
/* * Wait for a page to complete writeback */ void wait_on_page_writeback(struct page *page) { if (PageWriteback(page)) { trace_wait_on_page_writeback(page, page_mapping(page)); wait_on_page_bit(page, PG_writeback); } } EXPORT_SYMBOL_GPL(wait_on_page_writeback); |
1d1d1a767 mm: only enforce ... |
2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 |
/** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. * * This function determines if the given page is related to a backing device * that requires page contents to be held stable during writeback. If so, then * it will wait for any pending writeback to complete. */ void wait_for_stable_page(struct page *page) { |
de1414a65 fs: export inode_... |
2720 2721 |
if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) wait_on_page_writeback(page); |
1d1d1a767 mm: only enforce ... |
2722 2723 |
} EXPORT_SYMBOL_GPL(wait_for_stable_page); |