Blame view
mm/page-writeback.c
85 KB
457c89965 treewide: Add SPD... |
1 |
// SPDX-License-Identifier: GPL-2.0-only |
1da177e4c Linux-2.6.12-rc2 |
2 |
/* |
f30c22695 fix file specific... |
3 |
* mm/page-writeback.c |
1da177e4c Linux-2.6.12-rc2 |
4 5 |
* * Copyright (C) 2002, Linus Torvalds. |
90eec103b treewide: Remove ... |
6 |
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra |
1da177e4c Linux-2.6.12-rc2 |
7 8 9 10 |
* * Contains functions related to writing back dirty pages at the * address_space level. * |
e1f8e8744 Remove Andrew Mor... |
11 |
* 10Apr2002 Andrew Morton |
1da177e4c Linux-2.6.12-rc2 |
12 13 14 15 |
* Initial version */ #include <linux/kernel.h> |
b95f1b31b mm: Map most file... |
16 |
#include <linux/export.h> |
1da177e4c Linux-2.6.12-rc2 |
17 18 19 20 21 22 23 24 25 |
#include <linux/spinlock.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/init.h> #include <linux/backing-dev.h> |
55e829af0 [PATCH] io-accoun... |
26 |
#include <linux/task_io_accounting_ops.h> |
1da177e4c Linux-2.6.12-rc2 |
27 28 |
#include <linux/blkdev.h> #include <linux/mpage.h> |
d08b3851d [PATCH] mm: track... |
29 |
#include <linux/rmap.h> |
1da177e4c Linux-2.6.12-rc2 |
30 |
#include <linux/percpu.h> |
1da177e4c Linux-2.6.12-rc2 |
31 32 33 34 |
#include <linux/smp.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> |
ff01bb483 fs: move code out... |
35 |
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ |
811d736f9 [PATCH] BLOCK: Di... |
36 |
#include <linux/pagevec.h> |
eb608e3a3 block: Convert BD... |
37 |
#include <linux/timer.h> |
8bd75c77b sched/rt: Move rt... |
38 |
#include <linux/sched/rt.h> |
f361bf4a6 sched/headers: Pr... |
39 |
#include <linux/sched/signal.h> |
6e543d578 mm: vmscan: fix d... |
40 |
#include <linux/mm_inline.h> |
028c2dd18 writeback: Add tr... |
41 |
#include <trace/events/writeback.h> |
1da177e4c Linux-2.6.12-rc2 |
42 |
|
6e543d578 mm: vmscan: fix d... |
43 |
#include "internal.h" |
1da177e4c Linux-2.6.12-rc2 |
44 |
/* |
ffd1f609a writeback: introd... |
45 46 47 48 49 |
* Sleep at most 200ms at a time in balance_dirty_pages(). */ #define MAX_PAUSE max(HZ/5, 1) /* |
5b9b35743 writeback: avoid ... |
50 51 52 53 54 55 |
* Try to keep balance_dirty_pages() call intervals higher than this many pages * by raising pause time to max_pause when falls below it. */ #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) /* |
e98be2d59 writeback: bdi wr... |
56 57 58 |
* Estimate write bandwidth at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) |
6c14ae1e9 writeback: dirty ... |
59 |
#define RATELIMIT_CALC_SHIFT 10 |
e98be2d59 writeback: bdi wr... |
60 |
/* |
1da177e4c Linux-2.6.12-rc2 |
61 62 63 64 |
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. */ static long ratelimit_pages = 32; |
1da177e4c Linux-2.6.12-rc2 |
65 66 67 |
/* The following parameters are exported via /proc/sys/vm */ /* |
5b0830cb9 writeback: get ri... |
68 |
* Start background writeback (via writeback threads) at this percentage |
1da177e4c Linux-2.6.12-rc2 |
69 |
*/ |
1b5e62b42 writeback: double... |
70 |
int dirty_background_ratio = 10; |
1da177e4c Linux-2.6.12-rc2 |
71 72 |
/* |
2da02997e mm: add dirty_bac... |
73 74 75 76 77 78 |
* dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ unsigned long dirty_background_bytes; /* |
195cf453d mm/page-writeback... |
79 80 81 82 83 84 |
* free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ int vm_highmem_is_dirtyable; /* |
1da177e4c Linux-2.6.12-rc2 |
85 86 |
* The generator of dirty data starts writeback at this percentage */ |
1b5e62b42 writeback: double... |
87 |
int vm_dirty_ratio = 20; |
1da177e4c Linux-2.6.12-rc2 |
88 89 |
/* |
2da02997e mm: add dirty_bac... |
90 91 92 93 94 95 |
* vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ unsigned long vm_dirty_bytes; /* |
704503d83 mm: fix proc_doin... |
96 |
* The interval between `kupdate'-style writebacks |
1da177e4c Linux-2.6.12-rc2 |
97 |
*/ |
22ef37eed page-writeback: f... |
98 |
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ |
1da177e4c Linux-2.6.12-rc2 |
99 |
|
91913a294 mm: export dirty_... |
100 |
EXPORT_SYMBOL_GPL(dirty_writeback_interval); |
1da177e4c Linux-2.6.12-rc2 |
101 |
/* |
704503d83 mm: fix proc_doin... |
102 |
* The longest time for which data is allowed to remain dirty |
1da177e4c Linux-2.6.12-rc2 |
103 |
*/ |
22ef37eed page-writeback: f... |
104 |
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ |
1da177e4c Linux-2.6.12-rc2 |
105 106 107 108 109 110 111 |
/* * Flag that makes the machine dump writes/reads and block dirtyings. */ int block_dump; /* |
ed5b43f15 [PATCH] Represent... |
112 113 |
* Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: * a full sync is triggered after this time elapses without any disk activity. |
1da177e4c Linux-2.6.12-rc2 |
114 115 116 117 118 119 |
*/ int laptop_mode; EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ |
dcc25ae76 writeback: move g... |
120 |
struct wb_domain global_wb_domain; |
1da177e4c Linux-2.6.12-rc2 |
121 |
|
/*
 * Consolidated parameter block for balance_dirty_pages() and its helpers;
 * one instance describes one dirty-throttling domain (global or memcg).
 */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
	struct wb_domain	*dom;
	struct dirty_throttle_control *gdtc;	/* only set in memcg dtc's */
#endif
	struct bdi_writeback	*wb;
	struct fprop_local_percpu *wb_completions;

	unsigned long		avail;		/* dirtyable */
	unsigned long		dirty;		/* file_dirty + write + nfs */
	unsigned long		thresh;		/* dirty threshold */
	unsigned long		bg_thresh;	/* dirty background threshold */

	unsigned long		wb_dirty;	/* per-wb counterparts */
	unsigned long		wb_thresh;
	unsigned long		wb_bg_thresh;

	unsigned long		pos_ratio;
};

/*
 * Period over which the writeout completion fractions of the bdis are
 * aged.  The value is an arbitrary choice: a longer period makes the
 * fractions track changes in the current writeout rate more slowly.
 */
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
04fbfdc14 mm: per device di... |
148 |
|
693108a8a writeback: make b... |
149 |
#ifdef CONFIG_CGROUP_WRITEBACK |
d60d1bddd writeback: memcg ... |
150 151 152 |
#define GDTC_INIT(__wb) .wb = (__wb), \ .dom = &global_wb_domain, \ .wb_completions = &(__wb)->completions |
9fc3a43e1 writeback: separa... |
153 |
#define GDTC_INIT_NO_WB .dom = &global_wb_domain |
d60d1bddd writeback: memcg ... |
154 155 156 157 158 |
#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ .dom = mem_cgroup_wb_domain(__wb), \ .wb_completions = &(__wb)->memcg_completions, \ .gdtc = __gdtc |
c2aa723a6 writeback: implem... |
159 160 161 162 163 |
static bool mdtc_valid(struct dirty_throttle_control *dtc) { return dtc->dom; } |
e9f07dfd7 writeback: add di... |
164 165 166 167 168 |
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return dtc->dom; } |
9fc3a43e1 writeback: separa... |
169 170 171 172 |
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return mdtc->gdtc; } |
841710aa6 writeback: implem... |
173 174 175 176 |
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return &wb->memcg_completions; } |
693108a8a writeback: make b... |
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { unsigned long this_bw = wb->avg_write_bandwidth; unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long max = wb->bdi->max_ratio; /* * @wb may already be clean by the time control reaches here and * the total may not include its bw. */ if (this_bw < tot_bw) { if (min) { min *= this_bw; |
6d9e8c651 mm/page-writeback... |
192 |
min = div64_ul(min, tot_bw); |
693108a8a writeback: make b... |
193 194 195 |
} if (max < 100) { max *= this_bw; |
6d9e8c651 mm/page-writeback... |
196 |
max = div64_ul(max, tot_bw); |
693108a8a writeback: make b... |
197 198 199 200 201 202 203 204 |
} } *minp = min; *maxp = max; } #else /* CONFIG_CGROUP_WRITEBACK */ |
d60d1bddd writeback: memcg ... |
205 206 |
#define GDTC_INIT(__wb) .wb = (__wb), \ .wb_completions = &(__wb)->completions |
9fc3a43e1 writeback: separa... |
207 |
#define GDTC_INIT_NO_WB |
c2aa723a6 writeback: implem... |
208 209 210 211 212 213 |
#define MDTC_INIT(__wb, __gdtc) static bool mdtc_valid(struct dirty_throttle_control *dtc) { return false; } |
e9f07dfd7 writeback: add di... |
214 215 216 217 218 |
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return &global_wb_domain; } |
9fc3a43e1 writeback: separa... |
219 220 221 222 |
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return NULL; } |
841710aa6 writeback: implem... |
223 224 225 226 |
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return NULL; } |
693108a8a writeback: make b... |
227 228 229 230 231 232 233 234 |
static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { *minp = wb->bdi->min_ratio; *maxp = wb->bdi->max_ratio; } #endif /* CONFIG_CGROUP_WRITEBACK */ |
04fbfdc14 mm: per device di... |
235 |
/* |
a756cf590 mm: try to distri... |
236 237 238 239 240 241 242 |
* In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect * lowmem and the ability to uphold the zone's watermarks without * requiring writeback. * * This number of dirtyable pages is the base value of which the |
e0857cf5a mm/page-writeback... |
243 |
* user-configurable dirty ratio is the effective number of pages that |
a756cf590 mm: try to distri... |
244 245 246 247 248 249 250 251 |
* are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * * Because the user is allowed to specify the dirty limit globally as * absolute number of bytes, calculating the per-zone dirty limit can * require translating the configured limit into a percentage of * global dirtyable memory first. */ |
a804552b9 mm/page-writeback... |
252 |
/** |
281e37265 mm, page_alloc: c... |
253 254 |
* node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node |
a804552b9 mm/page-writeback... |
255 |
* |
a862f68a8 docs/core-api/mm:... |
256 |
* Return: the node's number of pages potentially available for dirty |
281e37265 mm, page_alloc: c... |
257 |
* page cache. This is the base value for the per-node dirty limits. |
a804552b9 mm/page-writeback... |
258 |
*/ |
281e37265 mm, page_alloc: c... |
259 |
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) |
a804552b9 mm/page-writeback... |
260 |
{ |
281e37265 mm, page_alloc: c... |
261 262 263 264 265 266 267 268 269 270 271 |
unsigned long nr_pages = 0; int z; for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = pgdat->node_zones + z; if (!populated_zone(zone)) continue; nr_pages += zone_page_state(zone, NR_FREE_PAGES); } |
a804552b9 mm/page-writeback... |
272 |
|
a8d014373 mm: page_alloc: g... |
273 274 275 276 277 |
/* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ |
281e37265 mm, page_alloc: c... |
278 |
nr_pages -= min(nr_pages, pgdat->totalreserve_pages); |
a804552b9 mm/page-writeback... |
279 |
|
281e37265 mm, page_alloc: c... |
280 281 |
nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); |
a804552b9 mm/page-writeback... |
282 283 284 |
return nr_pages; } |
/*
 * Number of dirtyable pages that live in highmem zones, capped at @total.
 * Always 0 when the kernel has no highmem.
 */
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
	unsigned long highmem = 0;
	int node;
	int i;

	for_each_node_state(node, N_HIGH_MEMORY) {
		for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
			struct zone *zone;
			unsigned long nr_pages;

			if (!is_highmem_idx(i))
				continue;

			zone = &NODE_DATA(node)->node_zones[i];
			if (!populated_zone(zone))
				continue;

			nr_pages = zone_page_state(zone, NR_FREE_PAGES);
			/* watch for underflows */
			nr_pages -= min(nr_pages, high_wmark_pages(zone));
			nr_pages += zone_page_state(zone, NR_ZONE_INACTIVE_FILE);
			nr_pages += zone_page_state(zone, NR_ZONE_ACTIVE_FILE);
			highmem += nr_pages;
		}
	}

	/*
	 * Unreclaimable memory (kernel memory or anonymous memory
	 * without swap) can bring down the dirtyable pages below
	 * the zone's dirty balance reserve and the above calculation
	 * will underflow.  However we still want to add in nodes
	 * which are below threshold (negative values) to get a more
	 * accurate calculation but make sure that the total never
	 * underflows.
	 */
	if ((long)highmem < 0)
		highmem = 0;

	/*
	 * Make sure that the number of highmem pages is never larger
	 * than the number of the total dirtyable memory. This can only
	 * occur in very strange VM situations but we want to make sure
	 * that this does not occur.
	 */
	return min(highmem, total);
#else
	return 0;
#endif
}
ccafa2879 mm: writeback: cl... |
338 |
* global_dirtyable_memory - number of globally dirtyable pages |
1edf22348 mm/page-writeback... |
339 |
* |
a862f68a8 docs/core-api/mm:... |
340 |
* Return: the global number of pages potentially available for dirty |
ccafa2879 mm: writeback: cl... |
341 |
* page cache. This is the base value for the global dirty limits. |
1edf22348 mm/page-writeback... |
342 |
*/ |
18cf8cf8b mm: page-writebac... |
343 |
static unsigned long global_dirtyable_memory(void) |
1edf22348 mm/page-writeback... |
344 345 |
{ unsigned long x; |
c41f012ad mm: rename global... |
346 |
x = global_zone_page_state(NR_FREE_PAGES); |
a8d014373 mm: page_alloc: g... |
347 348 349 350 351 352 |
/* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ x -= min(x, totalreserve_pages); |
1edf22348 mm/page-writeback... |
353 |
|
599d0c954 mm, vmscan: move ... |
354 355 |
x += global_node_page_state(NR_INACTIVE_FILE); x += global_node_page_state(NR_ACTIVE_FILE); |
a804552b9 mm/page-writeback... |
356 |
|
1edf22348 mm/page-writeback... |
357 358 359 360 361 |
if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); return x + 1; /* Ensure that we never return 0 */ } |
9fc3a43e1 writeback: separa... |
362 363 364 |
/** * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain * @dtc: dirty_throttle_control of interest |
ccafa2879 mm: writeback: cl... |
365 |
* |
9fc3a43e1 writeback: separa... |
366 367 368 |
* Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The |
a37b0715d mm/writeback: rep... |
369 |
* dirty limits will be lifted by 1/4 for real-time tasks. |
ccafa2879 mm: writeback: cl... |
370 |
*/ |
9fc3a43e1 writeback: separa... |
371 |
static void domain_dirty_limits(struct dirty_throttle_control *dtc) |
ccafa2879 mm: writeback: cl... |
372 |
{ |
9fc3a43e1 writeback: separa... |
373 374 375 376 |
const unsigned long available_memory = dtc->avail; struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; |
62a584fe0 writeback: use hi... |
377 378 379 |
/* convert ratios to per-PAGE_SIZE for higher precision */ unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; |
9fc3a43e1 writeback: separa... |
380 381 |
unsigned long thresh; unsigned long bg_thresh; |
ccafa2879 mm: writeback: cl... |
382 |
struct task_struct *tsk; |
9fc3a43e1 writeback: separa... |
383 384 385 386 387 388 389 |
/* gdtc is !NULL iff @dtc is for memcg domain */ if (gdtc) { unsigned long global_avail = gdtc->avail; /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against |
62a584fe0 writeback: use hi... |
390 391 392 |
* globally available memory. As the ratios are in * per-PAGE_SIZE, they can be obtained by dividing bytes by * number of pages. |
9fc3a43e1 writeback: separa... |
393 394 |
*/ if (bytes) |
62a584fe0 writeback: use hi... |
395 396 |
ratio = min(DIV_ROUND_UP(bytes, global_avail), PAGE_SIZE); |
9fc3a43e1 writeback: separa... |
397 |
if (bg_bytes) |
62a584fe0 writeback: use hi... |
398 399 |
bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), PAGE_SIZE); |
9fc3a43e1 writeback: separa... |
400 401 402 403 404 |
bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); |
ccafa2879 mm: writeback: cl... |
405 |
else |
62a584fe0 writeback: use hi... |
406 |
thresh = (ratio * available_memory) / PAGE_SIZE; |
ccafa2879 mm: writeback: cl... |
407 |
|
9fc3a43e1 writeback: separa... |
408 409 |
if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); |
ccafa2879 mm: writeback: cl... |
410 |
else |
62a584fe0 writeback: use hi... |
411 |
bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; |
ccafa2879 mm: writeback: cl... |
412 |
|
90daf3062 Revert "mm/page-w... |
413 |
if (bg_thresh >= thresh) |
9fc3a43e1 writeback: separa... |
414 |
bg_thresh = thresh / 2; |
ccafa2879 mm: writeback: cl... |
415 |
tsk = current; |
a37b0715d mm/writeback: rep... |
416 |
if (rt_task(tsk)) { |
a53eaff8c MM: increase safe... |
417 418 |
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; |
ccafa2879 mm: writeback: cl... |
419 |
} |
9fc3a43e1 writeback: separa... |
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 |
dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; /* we should eventually report the domain in the TP */ if (!gdtc) trace_global_dirty_state(bg_thresh, thresh); } /** * global_dirty_limits - background-writeback and dirty-throttling thresholds * @pbackground: out parameter for bg_thresh * @pdirty: out parameter for thresh * * Calculate bg_thresh and thresh for global_wb_domain. See * domain_dirty_limits() for details. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; gdtc.avail = global_dirtyable_memory(); domain_dirty_limits(&gdtc); *pbackground = gdtc.bg_thresh; *pdirty = gdtc.thresh; |
ccafa2879 mm: writeback: cl... |
445 |
} |
a756cf590 mm: try to distri... |
446 |
/** |
281e37265 mm, page_alloc: c... |
447 448 |
* node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node |
a756cf590 mm: try to distri... |
449 |
* |
a862f68a8 docs/core-api/mm:... |
450 |
* Return: the maximum number of dirty pages allowed in a node, based |
281e37265 mm, page_alloc: c... |
451 |
* on the node's dirtyable memory. |
a756cf590 mm: try to distri... |
452 |
*/ |
281e37265 mm, page_alloc: c... |
453 |
static unsigned long node_dirty_limit(struct pglist_data *pgdat) |
a756cf590 mm: try to distri... |
454 |
{ |
281e37265 mm, page_alloc: c... |
455 |
unsigned long node_memory = node_dirtyable_memory(pgdat); |
a756cf590 mm: try to distri... |
456 457 458 459 460 |
struct task_struct *tsk = current; unsigned long dirty; if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * |
281e37265 mm, page_alloc: c... |
461 |
node_memory / global_dirtyable_memory(); |
a756cf590 mm: try to distri... |
462 |
else |
281e37265 mm, page_alloc: c... |
463 |
dirty = vm_dirty_ratio * node_memory / 100; |
a756cf590 mm: try to distri... |
464 |
|
a37b0715d mm/writeback: rep... |
465 |
if (rt_task(tsk)) |
a756cf590 mm: try to distri... |
466 467 468 469 470 471 |
dirty += dirty / 4; return dirty; } /** |
281e37265 mm, page_alloc: c... |
472 473 |
* node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check |
a756cf590 mm: try to distri... |
474 |
* |
a862f68a8 docs/core-api/mm:... |
475 |
* Return: %true when the dirty pages in @pgdat are within the node's |
a756cf590 mm: try to distri... |
476 477 |
* dirty limit, %false if the limit is exceeded. */ |
281e37265 mm, page_alloc: c... |
478 |
bool node_dirty_ok(struct pglist_data *pgdat) |
a756cf590 mm: try to distri... |
479 |
{ |
281e37265 mm, page_alloc: c... |
480 481 |
unsigned long limit = node_dirty_limit(pgdat); unsigned long nr_pages = 0; |
11fb99898 mm: move most fil... |
482 |
nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); |
11fb99898 mm: move most fil... |
483 |
nr_pages += node_page_state(pgdat, NR_WRITEBACK); |
a756cf590 mm: try to distri... |
484 |
|
281e37265 mm, page_alloc: c... |
485 |
return nr_pages <= limit; |
a756cf590 mm: try to distri... |
486 |
} |
/* sysctl handler: writing dirty_background_ratio clears the bytes form. */
int dirty_background_ratio_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	const int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret == 0 && write)
		dirty_background_bytes = 0;
	return ret;
}

/* sysctl handler: writing dirty_background_bytes clears the ratio form. */
int dirty_background_bytes_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	const int ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

	if (ret == 0 && write)
		dirty_background_ratio = 0;
	return ret;
}
32927393d sysctl: pass kern... |
506 507 |
int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) |
04fbfdc14 mm: per device di... |
508 509 |
{ int old_ratio = vm_dirty_ratio; |
2da02997e mm: add dirty_bac... |
510 |
int ret; |
8d65af789 sysctl: remove "s... |
511 |
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
04fbfdc14 mm: per device di... |
512 |
if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
eb608e3a3 block: Convert BD... |
513 |
writeback_set_ratelimit(); |
2da02997e mm: add dirty_bac... |
514 515 516 517 |
vm_dirty_bytes = 0; } return ret; } |
2da02997e mm: add dirty_bac... |
518 |
int dirty_bytes_handler(struct ctl_table *table, int write, |
32927393d sysctl: pass kern... |
519 |
void *buffer, size_t *lenp, loff_t *ppos) |
2da02997e mm: add dirty_bac... |
520 |
{ |
fc3501d41 mm: fix dirty_byt... |
521 |
unsigned long old_bytes = vm_dirty_bytes; |
2da02997e mm: add dirty_bac... |
522 |
int ret; |
8d65af789 sysctl: remove "s... |
523 |
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
2da02997e mm: add dirty_bac... |
524 |
if (ret == 0 && write && vm_dirty_bytes != old_bytes) { |
eb608e3a3 block: Convert BD... |
525 |
writeback_set_ratelimit(); |
2da02997e mm: add dirty_bac... |
526 |
vm_dirty_ratio = 0; |
04fbfdc14 mm: per device di... |
527 528 529 |
} return ret; } |
eb608e3a3 block: Convert BD... |
530 531 532 533 534 535 536 537 |
static unsigned long wp_next_time(unsigned long cur_time) { cur_time += VM_COMPLETIONS_PERIOD_LEN; /* 0 has a special meaning... */ if (!cur_time) return 1; return cur_time; } |
c7981433e writeback: make _... |
538 539 540 |
static void wb_domain_writeout_inc(struct wb_domain *dom, struct fprop_local_percpu *completions, unsigned int max_prop_frac) |
04fbfdc14 mm: per device di... |
541 |
{ |
c7981433e writeback: make _... |
542 543 |
__fprop_inc_percpu_max(&dom->completions, completions, max_prop_frac); |
eb608e3a3 block: Convert BD... |
544 |
/* First event after period switching was turned off? */ |
517663edd mm/page-writeback... |
545 |
if (unlikely(!dom->period_time)) { |
eb608e3a3 block: Convert BD... |
546 547 548 549 550 551 |
/* * We can race with other __bdi_writeout_inc calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. */ |
380c27ca3 writeback: implem... |
552 553 |
dom->period_time = wp_next_time(jiffies); mod_timer(&dom->period_timer, dom->period_time); |
eb608e3a3 block: Convert BD... |
554 |
} |
04fbfdc14 mm: per device di... |
555 |
} |
c7981433e writeback: make _... |
556 557 558 559 560 |
/* * Increment @wb's writeout completion count and the global writeout * completion count. Called from test_clear_page_writeback(). */ static inline void __wb_writeout_inc(struct bdi_writeback *wb) |
dd5656e59 mm: bdi: export b... |
561 |
{ |
841710aa6 writeback: implem... |
562 |
struct wb_domain *cgdom; |
dd5656e59 mm: bdi: export b... |
563 |
|
3e8f399da writeback: rework... |
564 |
inc_wb_stat(wb, WB_WRITTEN); |
c7981433e writeback: make _... |
565 566 |
wb_domain_writeout_inc(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac); |
841710aa6 writeback: implem... |
567 568 569 570 571 |
cgdom = mem_cgroup_wb_domain(wb); if (cgdom) wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), wb->bdi->max_prop_frac); |
dd5656e59 mm: bdi: export b... |
572 |
} |
/* Irq-safe wrapper around __wb_writeout_inc(). */
void wb_writeout_inc(struct bdi_writeback *wb)
{
	unsigned long irq_flags;

	local_irq_save(irq_flags);
	__wb_writeout_inc(wb);
	local_irq_restore(irq_flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);
04fbfdc14 mm: per device di... |
583 |
|
04fbfdc14 mm: per device di... |
584 |
/* |
eb608e3a3 block: Convert BD... |
585 586 587 |
* On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. */ |
9823e51bf mm/page-writeback... |
588 |
static void writeout_period(struct timer_list *t) |
eb608e3a3 block: Convert BD... |
589 |
{ |
9823e51bf mm/page-writeback... |
590 |
struct wb_domain *dom = from_timer(dom, t, period_timer); |
380c27ca3 writeback: implem... |
591 |
int miss_periods = (jiffies - dom->period_time) / |
eb608e3a3 block: Convert BD... |
592 |
VM_COMPLETIONS_PERIOD_LEN; |
380c27ca3 writeback: implem... |
593 594 |
if (fprop_new_period(&dom->completions, miss_periods + 1)) { dom->period_time = wp_next_time(dom->period_time + |
eb608e3a3 block: Convert BD... |
595 |
miss_periods * VM_COMPLETIONS_PERIOD_LEN); |
380c27ca3 writeback: implem... |
596 |
mod_timer(&dom->period_timer, dom->period_time); |
eb608e3a3 block: Convert BD... |
597 598 599 600 601 |
} else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ |
380c27ca3 writeback: implem... |
602 |
dom->period_time = 0; |
eb608e3a3 block: Convert BD... |
603 604 |
} } |
380c27ca3 writeback: implem... |
605 606 607 |
int wb_domain_init(struct wb_domain *dom, gfp_t gfp) { memset(dom, 0, sizeof(*dom)); |
dcc25ae76 writeback: move g... |
608 609 |
spin_lock_init(&dom->lock); |
9823e51bf mm/page-writeback... |
610 |
timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); |
dcc25ae76 writeback: move g... |
611 612 |
dom->dirty_limit_tstamp = jiffies; |
380c27ca3 writeback: implem... |
613 614 |
return fprop_global_init(&dom->completions, gfp); } |
#ifdef CONFIG_CGROUP_WRITEBACK
/* Tear down a writeback domain created by wb_domain_init(). */
void wb_domain_exit(struct wb_domain *dom)
{
	del_timer_sync(&dom->period_timer);
	fprop_global_destroy(&dom->completions);
}
#endif
eb608e3a3 block: Convert BD... |
622 |
/* |
d08c429b0 mm/page-writeback... |
623 624 625 |
* bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. |
189d3c4a9 mm: bdi: allow se... |
626 |
*/ |
189d3c4a9 mm: bdi: allow se... |
627 628 629 630 631 |
static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; |
189d3c4a9 mm: bdi: allow se... |
632 |
|
cfc4ba536 writeback: use RC... |
633 |
spin_lock_bh(&bdi_lock); |
a42dde041 mm: bdi: allow se... |
634 |
if (min_ratio > bdi->max_ratio) { |
189d3c4a9 mm: bdi: allow se... |
635 |
ret = -EINVAL; |
a42dde041 mm: bdi: allow se... |
636 637 638 639 640 641 642 643 644 |
} else { min_ratio -= bdi->min_ratio; if (bdi_min_ratio + min_ratio < 100) { bdi_min_ratio += min_ratio; bdi->min_ratio += min_ratio; } else { ret = -EINVAL; } } |
cfc4ba536 writeback: use RC... |
645 |
spin_unlock_bh(&bdi_lock); |
a42dde041 mm: bdi: allow se... |
646 647 648 649 650 651 |
return ret; } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) { |
a42dde041 mm: bdi: allow se... |
652 653 654 655 |
int ret = 0; if (max_ratio > 100) return -EINVAL; |
cfc4ba536 writeback: use RC... |
656 |
spin_lock_bh(&bdi_lock); |
a42dde041 mm: bdi: allow se... |
657 658 659 660 |
if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; |
eb608e3a3 block: Convert BD... |
661 |
bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; |
a42dde041 mm: bdi: allow se... |
662 |
} |
cfc4ba536 writeback: use RC... |
663 |
spin_unlock_bh(&bdi_lock); |
189d3c4a9 mm: bdi: allow se... |
664 665 666 |
return ret; } |
a42dde041 mm: bdi: allow se... |
667 |
EXPORT_SYMBOL(bdi_set_max_ratio); |
189d3c4a9 mm: bdi: allow se... |
668 |
|
/*
 * Midpoint of the background and hard dirty thresholds.
 * NOTE(review): per the name this is presumably the ceiling below which
 * dirtiers are not throttled at all — confirm at the call sites.
 */
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{
	return (thresh + bg_thresh) / 2;
}
c7981433e writeback: make _... |
674 675 |
static unsigned long hard_dirty_limit(struct wb_domain *dom, unsigned long thresh) |
ffd1f609a writeback: introd... |
676 |
{ |
dcc25ae76 writeback: move g... |
677 |
return max(thresh, dom->dirty_limit); |
ffd1f609a writeback: introd... |
678 |
} |
c5edf9cdc writeback: fix in... |
679 680 681 682 683 684 |
/* * Memory which can be further allocated to a memcg domain is capped by * system-wide clean memory excluding the amount being used in the domain. */ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, unsigned long filepages, unsigned long headroom) |
c2aa723a6 writeback: implem... |
685 686 |
{ struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); |
c5edf9cdc writeback: fix in... |
687 688 689 |
unsigned long clean = filepages - min(filepages, mdtc->dirty); unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); unsigned long other_clean = global_clean - min(global_clean, clean); |
c2aa723a6 writeback: implem... |
690 |
|
c5edf9cdc writeback: fix in... |
691 |
mdtc->avail = filepages + min(headroom, other_clean); |
ffd1f609a writeback: introd... |
692 |
} |
6f7186562 writeback: add bd... |
693 |
/**
 * __wb_calc_thresh - @wb's share of dirty throttling threshold
 * @dtc: dirty_throttle_context of interest
 *
 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 * when sleeping max_pause per page is not enough to keep the dirty pages under
 * control. For example, when the device is completely stalled due to some error
 * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
 * In the other normal situations, it acts more gently by throttling the tasks
 * more (rather than completely block them) when the wb dirty pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take long time to sync) on slow devices
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 *
 * Return: @wb's dirty limit in pages. The term "dirty" in the context of
 * dirty balancing includes all PG_dirty and PG_writeback pages.
 */
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
	struct wb_domain *dom = dtc_dom(dtc);
	unsigned long thresh = dtc->thresh;
	u64 wb_thresh;
	unsigned long numerator, denominator;
	unsigned long wb_min_ratio, wb_max_ratio;

	/*
	 * Calculate this BDI's share of the thresh ratio.
	 * The fraction numerator/denominator is this wb's proportion of
	 * recent writeout completions across the domain.
	 */
	fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
			      &numerator, &denominator);

	/*
	 * Scale the part of @thresh not reserved for min ratios by this
	 * wb's completion fraction.
	 */
	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
	wb_thresh *= numerator;
	wb_thresh = div64_ul(wb_thresh, denominator);

	wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);

	/* Grant the per-wb minimum share, then clamp to the maximum share. */
	wb_thresh += (thresh * wb_min_ratio) / 100;
	if (wb_thresh > (thresh * wb_max_ratio) / 100)
		wb_thresh = thresh * wb_max_ratio / 100;

	return wb_thresh;
}
b1cbc6d40 writeback: make _... |
740 741 742 743 744 |
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
	/* Evaluate @wb's share of @thresh against the global wb domain. */
	struct dirty_throttle_control gdtc = { GDTC_INIT(wb), .thresh = thresh };

	return __wb_calc_thresh(&gdtc);
}
6c14ae1e9 writeback: dirty ... |
746 |
/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * it's a 3rd order polynomial that subjects to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0	 => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 */
static long long pos_ratio_polynom(unsigned long setpoint,
					  unsigned long dirty,
					  unsigned long limit)
{
	long long pos_ratio;
	long x;

	/*
	 * Fixed-point (RATELIMIT_CALC_SHIFT) distance of @dirty from
	 * @setpoint, normalized by the setpoint-to-limit span.  The "| 1"
	 * keeps the divisor non-zero when limit == setpoint.
	 */
	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
		      (limit - setpoint) | 1);
	/* Evaluate 1.0 + x^3 in fixed point. */
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;

	/* Clamp to [0, 2.0] per properties (1) and (3) above. */
	return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}

/*
 * Dirty position control.
 *
 * (o) global/bdi setpoints
 *
 * We want the dirty pages be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * (o) global control line
 *
 *     ^ pos_ratio
 *     |
 *     |            |<===== global dirty control scope ======>|
 * 2.0 .............*
 *     |            .*
 *     |            . *
 *     |            .   *
 *     |            .     *
 *     |            .        *
 *     |            .            *
 * 1.0 ................................*
 *     |            .                  .     *
 *     |            .                  .          *
 *     |            .                  .              *
 *     |            .                  .                 *
 *     |            .                  .                    *
 *   0 +------------.------------------.----------------------*------------->
 *           freerun^          setpoint^                 limit^   dirty pages
 *
 * (o) wb control line
 *
 *     ^ pos_ratio
 *     |
 *     |            *
 *     |              *
 *     |                *
 *     |                  *
 *     |                    * |<=========== span ============>|
 * 1.0 .......................*
 *     |                      . *
 *     |                      .   *
 *     |                      .     *
 *     |                      .       *
 *     |                      .         *
 *     |                      .           *
 *     |                      .             *
 *     |                      .               *
 *     |                      .                 *
 *     |                      .                   *
 *     |                      .                     *
 * 1/4 ...............................................* * * * * * * * * * * *
 *     |                      .                           .
 *     |                      .                             .
 *     |                      .                               .
 *   0 +----------------------.-------------------------------.------------->
 *                wb_setpoint^                    x_intercept^
 *
 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
 * be smoothly throttled down to normal if it starts high in situations like
 * - start writing to a slow SD card and a fast disk at the same time. The SD
 *   card's wb_dirty may rush to many times higher than wb_setpoint.
 * - the wb dirty thresh drops quickly due to change of JBOD workload
 */
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long write_bw = wb->avg_write_bandwidth;
	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
	unsigned long wb_thresh = dtc->wb_thresh;
	unsigned long x_intercept;
	unsigned long setpoint;		/* dirty pages' target balance point */
	unsigned long wb_setpoint;
	unsigned long span;
	long long pos_ratio;		/* for scaling up/down the rate limit */
	long x;

	/* At or above the hard limit: pos_ratio stays 0 (full throttle). */
	dtc->pos_ratio = 0;
	if (unlikely(dtc->dirty >= limit))
		return;

	/*
	 * global setpoint
	 *
	 * See comment for pos_ratio_polynom().
	 */
	setpoint = (freerun + limit) / 2;
	pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);

	/*
	 * The strictlimit feature is a tool preventing mistrusted filesystems
	 * from growing a large number of dirty pages before throttling. For
	 * such filesystems balance_dirty_pages always checks wb counters
	 * against wb limits. Even if global "nr_dirty" is under "freerun".
	 * This is especially important for fuse which sets bdi->max_ratio to
	 * 1% by default. Without strictlimit feature, fuse writeback may
	 * consume arbitrary amount of RAM because it is accounted in
	 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
	 *
	 * Here, in wb_position_ratio(), we calculate pos_ratio based on
	 * two values: wb_dirty and wb_thresh. Let's consider an example:
	 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
	 * limits are set by default to 10% and 20% (background and throttle).
	 * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
	 * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
	 * about ~6K pages (as the average of background and throttle wb
	 * limits). The 3rd order polynomial will provide positive feedback if
	 * wb_dirty is under wb_setpoint and vice versa.
	 *
	 * Note, that we cannot use global counters in these calculations
	 * because we want to throttle process writing to a strictlimit wb
	 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
	 * in the example above).
	 */
	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
		long long wb_pos_ratio;

		/* Nearly no dirty pages on this wb: ramp up aggressively. */
		if (dtc->wb_dirty < 8) {
			dtc->pos_ratio = min_t(long long, pos_ratio * 2,
					   2 << RATELIMIT_CALC_SHIFT);
			return;
		}

		if (dtc->wb_dirty >= wb_thresh)
			return;

		wb_setpoint = dirty_freerun_ceiling(wb_thresh,
						    dtc->wb_bg_thresh);

		if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
			return;

		wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
						 wb_thresh);

		/*
		 * Typically, for strictlimit case, wb_setpoint << setpoint
		 * and pos_ratio >> wb_pos_ratio. In the other words global
		 * state ("dirty") is not limiting factor and we have to
		 * make decision based on wb counters. But there is an
		 * important case when global pos_ratio should get precedence:
		 * global limits are exceeded (e.g. due to activities on other
		 * wb's) while given strictlimit wb is below limit.
		 *
		 * "pos_ratio * wb_pos_ratio" would work for the case above,
		 * but it would look too non-natural for the case of all
		 * activity in the system coming from a single strictlimit wb
		 * with bdi->max_ratio == 100%.
		 *
		 * Note that min() below somewhat changes the dynamics of the
		 * control system. Normally, pos_ratio value can be well over 3
		 * (when globally we are at freerun and wb is well below wb
		 * setpoint). Now the maximum pos_ratio in the same situation
		 * is 2. We might want to tweak this if we observe the control
		 * system is too slow to adapt.
		 */
		dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
		return;
	}

	/*
	 * We have computed basic pos_ratio above based on global situation. If
	 * the wb is over/under its share of dirty pages, we want to scale
	 * pos_ratio further down/up. That is done by the following mechanism.
	 */

	/*
	 * wb setpoint
	 *
	 *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
	 *
	 *                        x_intercept - wb_dirty
	 *                     := --------------------------
	 *                        x_intercept - wb_setpoint
	 *
	 * The main wb control line is a linear function that subjects to
	 *
	 * (1) f(wb_setpoint) = 1.0
	 * (2) k = - 1 / (8 * write_bw)  (in single wb case)
	 *     or equally: x_intercept = wb_setpoint + 8 * write_bw
	 *
	 * For single wb case, the dirty pages are observed to fluctuate
	 * regularly within range
	 *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
	 * for various filesystems, where (2) can yield in a reasonable 12.5%
	 * fluctuation range for pos_ratio.
	 *
	 * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
	 * own size, so move the slope over accordingly and choose a slope that
	 * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
	 */
	if (unlikely(wb_thresh > dtc->thresh))
		wb_thresh = dtc->thresh;
	/*
	 * It's very possible that wb_thresh is close to 0 not because the
	 * device is slow, but that it has remained inactive for long time.
	 * Honour such devices a reasonable good (hopefully IO efficient)
	 * threshold, so that the occasional writes won't be blocked and active
	 * writes can rampup the threshold quickly.
	 */
	wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
	/*
	 * scale global setpoint to wb's:
	 *	wb_setpoint = setpoint * wb_thresh / thresh
	 * ("| 1" keeps the divisor non-zero; x is a 16-bit fixed-point ratio)
	 */
	x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
	wb_setpoint = setpoint * (u64)x >> 16;
	/*
	 * Use span=(8*write_bw) in single wb case as indicated by
	 * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
	 *
	 *        wb_thresh                    thresh - wb_thresh
	 * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
	 *          thresh                          thresh
	 */
	span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
	x_intercept = wb_setpoint + span;

	/* Linear wb control line above; floor of 1/4 beyond 3/4 of the span. */
	if (dtc->wb_dirty < x_intercept - span / 4) {
		pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
				      (x_intercept - wb_setpoint) | 1);
	} else
		pos_ratio /= 4;

	/*
	 * wb reserve area, safeguard against dirty pool underrun and disk idle
	 * It may push the desired control point of global dirty pages higher
	 * than setpoint.
	 */
	x_intercept = wb_thresh / 2;
	if (dtc->wb_dirty < x_intercept) {
		if (dtc->wb_dirty > x_intercept / 8)
			pos_ratio = div_u64(pos_ratio * x_intercept,
					    dtc->wb_dirty);
		else
			pos_ratio *= 8;
	}

	dtc->pos_ratio = pos_ratio;
}
a88a341a7 writeback: move b... |
1025 1026 1027 |
/*
 * Re-estimate wb->write_bandwidth (instantaneous) and
 * wb->avg_write_bandwidth (smoothed) from the pages written during @elapsed.
 */
static void wb_update_write_bandwidth(struct bdi_writeback *wb,
				      unsigned long elapsed,
				      unsigned long written)
{
	const unsigned long period = roundup_pow_of_two(3 * HZ);
	unsigned long avg = wb->avg_write_bandwidth;
	unsigned long old = wb->write_bandwidth;
	u64 bw;

	/*
	 * bw = written * HZ / elapsed
	 *
	 *                   bw * elapsed + write_bandwidth * (period - elapsed)
	 * write_bandwidth = ---------------------------------------------------
	 *                                          period
	 *
	 * @written may have decreased due to account_page_redirty().
	 * Avoid underflowing @bw calculation.
	 */
	bw = written - min(written, wb->written_stamp);
	bw *= HZ;
	if (unlikely(elapsed > period)) {
		/* Sample older than a full period: take it as-is. */
		bw = div64_ul(bw, elapsed);
		avg = bw;
		goto out;
	}
	/* Blend the new sample with the old estimate; period is a power of 2. */
	bw += (u64)wb->write_bandwidth * (period - elapsed);
	bw >>= ilog2(period);

	/*
	 * one more level of smoothing, for filtering out sudden spikes
	 */
	if (avg > old && old >= (unsigned long)bw)
		avg -= (avg - old) >> 3;

	if (avg < old && old <= (unsigned long)bw)
		avg += (old - avg) >> 3;

out:
	/* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
	avg = max(avg, 1LU);
	if (wb_has_dirty_io(wb)) {
		/* Propagate the change into the bdi-wide bandwidth total. */
		long delta = avg - wb->avg_write_bandwidth;
		WARN_ON_ONCE(atomic_long_add_return(delta,
					&wb->bdi->tot_write_bandwidth) <= 0);
	}
	wb->write_bandwidth = bw;
	wb->avg_write_bandwidth = avg;
}
2bc00aef0 writeback: consol... |
1074 |
static void update_dirty_limit(struct dirty_throttle_control *dtc) |
c42843f2f writeback: introd... |
1075 |
{ |
e9f07dfd7 writeback: add di... |
1076 |
struct wb_domain *dom = dtc_dom(dtc); |
2bc00aef0 writeback: consol... |
1077 |
unsigned long thresh = dtc->thresh; |
dcc25ae76 writeback: move g... |
1078 |
unsigned long limit = dom->dirty_limit; |
c42843f2f writeback: introd... |
1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 |
/* * Follow up in one step. */ if (limit < thresh) { limit = thresh; goto update; } /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce |
dcc25ae76 writeback: move g... |
1091 |
* dom->dirty_limit which is guaranteed to lie above the dirty pages. |
c42843f2f writeback: introd... |
1092 |
*/ |
2bc00aef0 writeback: consol... |
1093 |
thresh = max(thresh, dtc->dirty); |
c42843f2f writeback: introd... |
1094 1095 1096 1097 1098 1099 |
if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: |
dcc25ae76 writeback: move g... |
1100 |
dom->dirty_limit = limit; |
c42843f2f writeback: introd... |
1101 |
} |
e9f07dfd7 writeback: add di... |
1102 |
/*
 * Refresh dom->dirty_limit at most once per BANDWIDTH_INTERVAL, with a
 * lockless pre-check to keep the common (not yet expired) path cheap.
 */
static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
				    unsigned long now)
{
	struct wb_domain *dom = dtc_dom(dtc);

	/*
	 * check locklessly first to optimize away locking for the most time
	 */
	if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
		return;

	spin_lock(&dom->lock);
	/* Re-check under the lock: another CPU may have updated meanwhile. */
	if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
		update_dirty_limit(dtc);
		dom->dirty_limit_tstamp = now;
	}
	spin_unlock(&dom->lock);
}
be3ffa276 writeback: dirty ... |
1119 |
/*
 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
 *
 * Normal wb tasks will be curbed at or below it in long term.
 * Obviously it should be around (write_bw / N) when there are N dd tasks.
 */
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
				      unsigned long dirtied,
				      unsigned long elapsed)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long dirty = dtc->dirty;
	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
	unsigned long setpoint = (freerun + limit) / 2;
	unsigned long write_bw = wb->avg_write_bandwidth;
	unsigned long dirty_ratelimit = wb->dirty_ratelimit;
	unsigned long dirty_rate;
	unsigned long task_ratelimit;
	unsigned long balanced_dirty_ratelimit;
	unsigned long step;
	unsigned long x;
	unsigned long shift;

	/*
	 * The dirty rate will match the writeout rate in long term, except
	 * when dirty pages are truncated by userspace or re-dirtied by FS.
	 */
	dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;

	/*
	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
	 */
	task_ratelimit = (u64)dirty_ratelimit *
					dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
	task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */

	/*
	 * A linear estimation of the "balanced" throttle rate. The theory is,
	 * if there are N dd tasks, each throttled at task_ratelimit, the wb's
	 * dirty_rate will be measured to be (N * task_ratelimit). So the below
	 * formula will yield the balanced rate limit (write_bw / N).
	 *
	 * Note that the expanded form is not a pure rate feedback:
	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate)		     (1)
	 * but also takes pos_ratio into account:
	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
	 *
	 * (1) is not realistic because pos_ratio also takes part in balancing
	 * the dirty rate.  Consider the state
	 *	pos_ratio = 0.5						     (3)
	 *	rate = 2 * (write_bw / N)				     (4)
	 * If (1) is used, it will stuck in that state! Because each dd will
	 * be throttled at
	 *	task_ratelimit = pos_ratio * rate = (write_bw / N)	     (5)
	 * yielding
	 *	dirty_rate = N * task_ratelimit = write_bw		     (6)
	 * put (6) into (1) we get
	 *	rate_(i+1) = rate_(i)					     (7)
	 *
	 * So we end up using (2) to always keep
	 *	rate_(i+1) ~= (write_bw / N)				     (8)
	 * regardless of the value of pos_ratio. As long as (8) is satisfied,
	 * pos_ratio is able to drive itself to 1.0, which is not only where
	 * the dirty count meet the setpoint, but also where the slope of
	 * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
	 */
	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
					   dirty_rate | 1);
	/*
	 * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
	 */
	if (unlikely(balanced_dirty_ratelimit > write_bw))
		balanced_dirty_ratelimit = write_bw;

	/*
	 * We could safely do this and return immediately:
	 *
	 *	wb->dirty_ratelimit = balanced_dirty_ratelimit;
	 *
	 * However to get a more stable dirty_ratelimit, the below elaborated
	 * code makes use of task_ratelimit to filter out singular points and
	 * limit the step size.
	 *
	 * The below code essentially only uses the relative value of
	 *
	 *	task_ratelimit - dirty_ratelimit
	 *	= (pos_ratio - 1) * dirty_ratelimit
	 *
	 * which reflects the direction and size of dirty position error.
	 */

	/*
	 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
	 * task_ratelimit is on the same side of dirty_ratelimit, too.
	 * For example, when
	 * - dirty_ratelimit > balanced_dirty_ratelimit
	 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
	 * lowering dirty_ratelimit will help meet both the position and rate
	 * control targets. Otherwise, don't update dirty_ratelimit if it will
	 * only help meet the rate target. After all, what the users ultimately
	 * feel and care are stable dirty rate and small position error.
	 *
	 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
	 * and filter out the singular points of balanced_dirty_ratelimit. Which
	 * keeps jumping around randomly and can even leap far away at times
	 * due to the small 200ms estimation period of dirty_rate (we want to
	 * keep that period small to reduce time lags).
	 */
	step = 0;

	/*
	 * For strictlimit case, calculations above were based on wb counters
	 * and limits (starting from pos_ratio = wb_position_ratio() and up to
	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
	 * Hence, to calculate "step" properly, we have to use wb_dirty as
	 * "dirty" and wb_setpoint as "setpoint".
	 *
	 * We rampup dirty_ratelimit forcibly if wb_dirty is low because
	 * it's possible that wb_thresh is close to zero due to inactivity
	 * of backing device.
	 */
	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
		dirty = dtc->wb_dirty;
		if (dtc->wb_dirty < 8)
			setpoint = dtc->wb_dirty + 1;
		else
			setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
	}

	if (dirty < setpoint) {
		x = min3(wb->balanced_dirty_ratelimit,
			 balanced_dirty_ratelimit, task_ratelimit);
		if (dirty_ratelimit < x)
			step = x - dirty_ratelimit;
	} else {
		x = max3(wb->balanced_dirty_ratelimit,
			 balanced_dirty_ratelimit, task_ratelimit);
		if (dirty_ratelimit > x)
			step = dirty_ratelimit - x;
	}

	/*
	 * Don't pursue 100% rate matching. It's impossible since the balanced
	 * rate itself is constantly fluctuating. So decrease the track speed
	 * when it gets close to the target. Helps eliminate pointless tremors.
	 */
	shift = dirty_ratelimit / (2 * step + 1);
	if (shift < BITS_PER_LONG)
		step = DIV_ROUND_UP(step >> shift, 8);
	else
		step = 0;

	if (dirty_ratelimit < balanced_dirty_ratelimit)
		dirty_ratelimit += step;
	else
		dirty_ratelimit -= step;

	/* Never let the base ratelimit drop to zero. */
	wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
	wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;

	trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}
c2aa723a6 writeback: implem... |
1280 1281 |
/*
 * Refresh @wb's write-bandwidth estimate and, when @update_ratelimit is set,
 * the dirty ratelimits of the global domain (@gdtc) and, if present, the
 * memcg domain (@mdtc).  Updates are rate-limited to once per
 * BANDWIDTH_INTERVAL; the dirtied/written counter snapshots are always
 * refreshed so the next interval measures a clean delta.
 *
 * @start_time is the start of the current writeback run; it is used to
 * detect idle periods (see the "Skip quiet periods" check below).
 */
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
				  struct dirty_throttle_control *mdtc,
				  unsigned long start_time,
				  bool update_ratelimit)
{
	struct bdi_writeback *wb = gdtc->wb;
	unsigned long now = jiffies;
	unsigned long elapsed = now - wb->bw_time_stamp;
	unsigned long dirtied;
	unsigned long written;

	lockdep_assert_held(&wb->list_lock);

	/*
	 * rate-limit, only update once every 200ms.
	 */
	if (elapsed < BANDWIDTH_INTERVAL)
		return;

	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

	/*
	 * Skip quiet periods when disk bandwidth is under-utilized.
	 * (at least 1s idle time between two flusher runs)
	 */
	if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
		goto snapshot;

	if (update_ratelimit) {
		domain_update_bandwidth(gdtc, now);
		wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);

		/*
		 * @mdtc is always NULL if !CGROUP_WRITEBACK but the
		 * compiler has no way to figure that out.  Help it.
		 */
		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
			domain_update_bandwidth(mdtc, now);
			wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
		}
	}
	wb_update_write_bandwidth(wb, elapsed, written);

snapshot:
	/* always re-arm the measurement window, even on the quiet path */
	wb->dirtied_stamp = dirtied;
	wb->written_stamp = written;
	wb->bw_time_stamp = now;
}
8a7317995 writeback: reorga... |
1325 |
void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) |
e98be2d59 writeback: bdi wr... |
1326 |
{ |
2bc00aef0 writeback: consol... |
1327 |
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; |
c2aa723a6 writeback: implem... |
1328 |
__wb_update_bandwidth(&gdtc, NULL, start_time, false); |
e98be2d59 writeback: bdi wr... |
1329 |
} |
1da177e4c Linux-2.6.12-rc2 |
1330 |
/*
 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
 * will look to see if it needs to start dirty throttling.
 *
 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
 * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
 * (the number of pages we may dirty without exceeding the dirty limits).
 */
static unsigned long dirty_poll_interval(unsigned long dirty,
					 unsigned long thresh)
{
	unsigned long margin;

	/* already at/over the limit: poll on every page */
	if (dirty >= thresh)
		return 1;

	/* ~sqrt(margin): half the bit width of the remaining headroom */
	margin = thresh - dirty;
	return 1UL << (ilog2(margin) >> 1);
}
a88a341a7 writeback: move b... |
1346 |
static unsigned long wb_max_pause(struct bdi_writeback *wb, |
de1fff37b writeback: s/bdi/... |
1347 |
unsigned long wb_dirty) |
c8462cc9d writeback: limit ... |
1348 |
{ |
a88a341a7 writeback: move b... |
1349 |
unsigned long bw = wb->avg_write_bandwidth; |
e3b6c655b writeback: fix ne... |
1350 |
unsigned long t; |
c8462cc9d writeback: limit ... |
1351 |
|
7ccb9ad53 writeback: max, m... |
1352 1353 1354 1355 1356 1357 1358 |
/* * Limit pause time for small memory systems. If sleeping for too long * time, a small pool of dirty/writeback pages may go empty and disk go * idle. * * 8 serves as the safety ratio. */ |
de1fff37b writeback: s/bdi/... |
1359 |
t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); |
7ccb9ad53 writeback: max, m... |
1360 |
t++; |
e3b6c655b writeback: fix ne... |
1361 |
return min_t(unsigned long, t, MAX_PAUSE); |
7ccb9ad53 writeback: max, m... |
1362 |
} |
a88a341a7 writeback: move b... |
1363 1364 1365 1366 1367 |
/*
 * Compute the minimal pause time (in jiffies) for the current dirtier and,
 * via @nr_dirtied_pause, the number of pages it may dirty before pausing
 * again.
 *
 * @max_pause:		upper bound computed by wb_max_pause()
 * @task_ratelimit:	this task's throttle rate (dirty_ratelimit scaled by
 *			pos_ratio in the caller)
 * @dirty_ratelimit:	the wb's balanced base rate; presumably pages/second
 *			given the /HZ scaling below — see the pages
 *			computations
 * @nr_dirtied_pause:	out parameter: target tsk->nr_dirtied_pause
 *
 * The caller only sleeps when the estimated pause exceeds the returned
 * minimum, so small fluctuations of task_ratelimit don't cause a storm of
 * tiny sleeps.
 */
static long wb_min_pause(struct bdi_writeback *wb,
			 long max_pause,
			 unsigned long task_ratelimit,
			 unsigned long dirty_ratelimit,
			 int *nr_dirtied_pause)
{
	long hi = ilog2(wb->avg_write_bandwidth);
	long lo = ilog2(wb->dirty_ratelimit);
	long t;		/* target pause */
	long pause;	/* estimated next pause */
	int pages;	/* target nr_dirtied_pause */

	/* target for 10ms pause on 1-dd case */
	t = max(1, HZ / 100);

	/*
	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
	 * overheads.
	 *
	 * (N * 10ms) on 2^N concurrent tasks.
	 */
	if (hi > lo)
		t += (hi - lo) * (10 * HZ) / 1024;

	/*
	 * This is a bit convoluted. We try to base the next nr_dirtied_pause
	 * on the much more stable dirty_ratelimit. However the next pause time
	 * will be computed based on task_ratelimit and the two rate limits may
	 * depart considerably at some time. Especially if task_ratelimit goes
	 * below dirty_ratelimit/2 and the target pause is max_pause, the next
	 * pause time will be max_pause*2 _trimmed down_ to max_pause.  As a
	 * result task_ratelimit won't be executed faithfully, which could
	 * eventually bring down dirty_ratelimit.
	 *
	 * We apply two rules to fix it up:
	 * 1) try to estimate the next pause time and if necessary, use a lower
	 *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
	 *    nr_dirtied_pause will be "dancing" with task_ratelimit.
	 * 2) limit the target pause time to max_pause/2, so that the normal
	 *    small fluctuations of task_ratelimit won't trigger rule (1) and
	 *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
	 */
	t = min(t, 1 + max_pause / 2);
	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);

	/*
	 * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
	 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
	 * When the 16 consecutive reads are often interrupted by some dirty
	 * throttling pause during the async writes, cfq will go into idles
	 * (deadline is fine). So push nr_dirtied_pause as high as possible
	 * until reaches DIRTY_POLL_THRESH=32 pages.
	 */
	if (pages < DIRTY_POLL_THRESH) {
		t = max_pause;
		pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
		if (pages > DIRTY_POLL_THRESH) {
			pages = DIRTY_POLL_THRESH;
			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
		}
	}

	/* rule (1): shrink the page budget if the estimated pause overshoots */
	pause = HZ * pages / (task_ratelimit + 1);
	if (pause > max_pause) {
		t = max_pause;
		pages = task_ratelimit * t / roundup_pow_of_two(HZ);
	}

	*nr_dirtied_pause = pages;
	/*
	 * The minimal pause time will normally be half the target pause time.
	 */
	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}
970fb01ad writeback: add di... |
1436 |
/*
 * Fill in the per-wb members of @dtc: wb_thresh, wb_bg_thresh (scaled from
 * the domain-wide thresholds) and wb_dirty (the wb's current reclaimable +
 * writeback page count, with extra accuracy near small thresholds).
 */
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long wb_reclaimable;

	/*
	 * wb_thresh is not treated as some limiting factor as
	 * dirty_thresh, due to reasons
	 * - in JBOD setup, wb_thresh can fluctuate a lot
	 * - in a system with HDD and USB key, the USB key may somehow
	 *   go into state (wb_dirty >> wb_thresh) either because
	 *   wb_dirty starts high, or because wb_thresh drops low.
	 *   In this case we don't want to hard throttle the USB key
	 *   dirtiers for 100 seconds until wb_dirty drops under
	 *   wb_thresh. Instead the auxiliary wb control line in
	 *   wb_position_ratio() will let the dirtier task progress
	 *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
	 */
	dtc->wb_thresh = __wb_calc_thresh(dtc);
	/* wb_bg_thresh keeps the same bg:fg ratio as the domain thresholds */
	dtc->wb_bg_thresh = dtc->thresh ?
		div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;

	/*
	 * In order to avoid the stacked BDI deadlock we need
	 * to ensure we accurately count the 'dirty' pages when
	 * the threshold is low.
	 *
	 * Otherwise it would be possible to get thresh+n pages
	 * reported dirty, even though there are thresh-m pages
	 * actually dirty; with m+n sitting in the percpu
	 * deltas.
	 */
	if (dtc->wb_thresh < 2 * wb_stat_error()) {
		/* exact (but expensive) percpu sum when precision matters */
		wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
		dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
	} else {
		/* cheap approximate read is good enough far from the limit */
		wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
		dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
	}
}
9d823e8f6 writeback: per ta... |
1476 |
/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 *
 * @wb:            the writeback context the pages were dirtied against
 * @pages_dirtied: pages dirtied by the caller since its previous pause
 */
static void balance_dirty_pages(struct bdi_writeback *wb,
				unsigned long pages_dirtied)
{
	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
	struct dirty_throttle_control * const gdtc = &gdtc_stor;
	/* mdtc is non-NULL only when @wb belongs to a !root memcg domain */
	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
						     &mdtc_stor : NULL;
	struct dirty_throttle_control *sdtc;	/* dtc chosen for throttling */
	unsigned long nr_reclaimable;	/* = file_dirty */
	long period;
	long pause;
	long max_pause;
	long min_pause;
	int nr_dirtied_pause;
	bool dirty_exceeded = false;
	unsigned long task_ratelimit;
	unsigned long dirty_ratelimit;
	struct backing_dev_info *bdi = wb->bdi;
	/* strictlimit: throttle against per-wb limits from the start */
	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
	unsigned long start_time = jiffies;

	/* loop: re-evaluate limits and sleep until we may dirty more pages */
	for (;;) {
		unsigned long now = jiffies;
		unsigned long dirty, thresh, bg_thresh;
		unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
		unsigned long m_thresh = 0;
		unsigned long m_bg_thresh = 0;

		nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
		gdtc->avail = global_dirtyable_memory();
		gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);

		domain_dirty_limits(gdtc);

		if (unlikely(strictlimit)) {
			wb_dirty_limits(gdtc);

			dirty = gdtc->wb_dirty;
			thresh = gdtc->wb_thresh;
			bg_thresh = gdtc->wb_bg_thresh;
		} else {
			dirty = gdtc->dirty;
			thresh = gdtc->thresh;
			bg_thresh = gdtc->bg_thresh;
		}

		if (mdtc) {
			unsigned long filepages, headroom, writeback;

			/*
			 * If @wb belongs to !root memcg, repeat the same
			 * basic calculations for the memcg domain.
			 */
			mem_cgroup_wb_stats(wb, &filepages, &headroom,
					    &mdtc->dirty, &writeback);
			mdtc->dirty += writeback;
			mdtc_calc_avail(mdtc, filepages, headroom);

			domain_dirty_limits(mdtc);

			if (unlikely(strictlimit)) {
				wb_dirty_limits(mdtc);
				m_dirty = mdtc->wb_dirty;
				m_thresh = mdtc->wb_thresh;
				m_bg_thresh = mdtc->wb_bg_thresh;
			} else {
				m_dirty = mdtc->dirty;
				m_thresh = mdtc->thresh;
				m_bg_thresh = mdtc->bg_thresh;
			}
		}

		/*
		 * Throttle it only when the background writeback cannot
		 * catch-up. This avoids (excessively) small writeouts
		 * when the wb limits are ramping up in case of !strictlimit.
		 *
		 * In strictlimit case make decision based on the wb counters
		 * and limits. Small writeouts when the wb limits are ramping
		 * up are the price we consciously pay for strictlimit-ing.
		 *
		 * If memcg domain is in effect, @dirty should be under
		 * both global and memcg freerun ceilings.
		 */
		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
		    (!mdtc ||
		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
			unsigned long intv;
			unsigned long m_intv;

free_running:
			/* below the freerun ceiling: no throttling needed */
			intv = dirty_poll_interval(dirty, thresh);
			m_intv = ULONG_MAX;

			current->dirty_paused_when = now;
			current->nr_dirtied = 0;
			if (mdtc)
				m_intv = dirty_poll_interval(m_dirty, m_thresh);
			current->nr_dirtied_pause = min(intv, m_intv);
			break;
		}

		if (unlikely(!writeback_in_progress(wb)))
			wb_start_background_writeback(wb);

		mem_cgroup_flush_foreign(wb);

		/*
		 * Calculate global domain's pos_ratio and select the
		 * global dtc by default.
		 */
		if (!strictlimit) {
			wb_dirty_limits(gdtc);

			if ((current->flags & PF_LOCAL_THROTTLE) &&
			    gdtc->wb_dirty <
			    dirty_freerun_ceiling(gdtc->wb_thresh,
						  gdtc->wb_bg_thresh))
				/*
				 * LOCAL_THROTTLE tasks must not be throttled
				 * when below the per-wb freerun ceiling.
				 */
				goto free_running;
		}

		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
			((gdtc->dirty > gdtc->thresh) || strictlimit);

		wb_position_ratio(gdtc);
		sdtc = gdtc;

		if (mdtc) {
			/*
			 * If memcg domain is in effect, calculate its
			 * pos_ratio.  @wb should satisfy constraints from
			 * both global and memcg domains.  Choose the one
			 * w/ lower pos_ratio.
			 */
			if (!strictlimit) {
				wb_dirty_limits(mdtc);

				if ((current->flags & PF_LOCAL_THROTTLE) &&
				    mdtc->wb_dirty <
				    dirty_freerun_ceiling(mdtc->wb_thresh,
							  mdtc->wb_bg_thresh))
					/*
					 * LOCAL_THROTTLE tasks must not be
					 * throttled when below the per-wb
					 * freerun ceiling.
					 */
					goto free_running;
			}
			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
				((mdtc->dirty > mdtc->thresh) || strictlimit);

			wb_position_ratio(mdtc);
			if (mdtc->pos_ratio < gdtc->pos_ratio)
				sdtc = mdtc;
		}

		if (dirty_exceeded && !wb->dirty_exceeded)
			wb->dirty_exceeded = 1;

		if (time_is_before_jiffies(wb->bw_time_stamp +
					   BANDWIDTH_INTERVAL)) {
			spin_lock(&wb->list_lock);
			__wb_update_bandwidth(gdtc, mdtc, start_time, true);
			spin_unlock(&wb->list_lock);
		}

		/* throttle according to the chosen dtc */
		dirty_ratelimit = wb->dirty_ratelimit;
		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
							RATELIMIT_CALC_SHIFT;
		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
		min_pause = wb_min_pause(wb, max_pause,
					 task_ratelimit, dirty_ratelimit,
					 &nr_dirtied_pause);

		if (unlikely(task_ratelimit == 0)) {
			period = max_pause;
			pause = max_pause;
			goto pause;
		}
		period = HZ * pages_dirtied / task_ratelimit;
		pause = period;
		if (current->dirty_paused_when)
			pause -= now - current->dirty_paused_when;
		/*
		 * For less than 1s think time (ext3/4 may block the dirtier
		 * for up to 800ms from time to time on 1-HDD; so does xfs,
		 * however at much less frequency), try to compensate it in
		 * future periods by updating the virtual time; otherwise just
		 * do a reset, as it may be a light dirtier.
		 */
		if (pause < min_pause) {
			trace_balance_dirty_pages(wb,
						  sdtc->thresh,
						  sdtc->bg_thresh,
						  sdtc->dirty,
						  sdtc->wb_thresh,
						  sdtc->wb_dirty,
						  dirty_ratelimit,
						  task_ratelimit,
						  pages_dirtied,
						  period,
						  min(pause, 0L),
						  start_time);
			if (pause < -HZ) {
				current->dirty_paused_when = now;
				current->nr_dirtied = 0;
			} else if (period) {
				current->dirty_paused_when += period;
				current->nr_dirtied = 0;
			} else if (current->nr_dirtied_pause <= pages_dirtied)
				current->nr_dirtied_pause += pages_dirtied;
			break;
		}
		if (unlikely(pause > max_pause)) {
			/* for occasional dropped task_ratelimit */
			now += min(pause - max_pause, max_pause);
			pause = max_pause;
		}

pause:
		trace_balance_dirty_pages(wb,
					  sdtc->thresh,
					  sdtc->bg_thresh,
					  sdtc->dirty,
					  sdtc->wb_thresh,
					  sdtc->wb_dirty,
					  dirty_ratelimit,
					  task_ratelimit,
					  pages_dirtied,
					  period,
					  pause,
					  start_time);
		/* KILLABLE so an OOM-kill / fatal signal can end the sleep */
		__set_current_state(TASK_KILLABLE);
		wb->dirty_sleep = now;
		io_schedule_timeout(pause);

		current->dirty_paused_when = now + pause;
		current->nr_dirtied = 0;
		current->nr_dirtied_pause = nr_dirtied_pause;

		/*
		 * This is typically equal to (dirty < thresh) and can also
		 * keep "1000+ dd on a slow USB stick" under control.
		 */
		if (task_ratelimit)
			break;

		/*
		 * In the case of an unresponding NFS server and the NFS dirty
		 * pages exceeds dirty_thresh, give the other good wb's a pipe
		 * to go through, so that tasks on them still remain responsive.
		 *
		 * In theory 1 page is enough to keep the consumer-producer
		 * pipe going: the flusher cleans 1 page => the task dirties 1
		 * more page. However wb_dirty has accounting errors.  So use
		 * the larger and more IO friendly wb_stat_error.
		 */
		if (sdtc->wb_dirty <= wb_stat_error())
			break;

		if (fatal_signal_pending(current))
			break;
	}

	if (!dirty_exceeded && wb->dirty_exceeded)
		wb->dirty_exceeded = 0;

	if (writeback_in_progress(wb))
		return;

	/*
	 * In laptop mode, we wait until hitting the higher threshold before
	 * starting background writeout, and then write out all the way down
	 * to the lower threshold.  So slow writers cause minimal disk activity.
	 *
	 * In normal mode, we start background writeout at the lower
	 * background_thresh, to keep the amount of dirty memory low.
	 */
	if (laptop_mode)
		return;

	if (nr_reclaimable > gdtc->bg_thresh)
		wb_start_background_writeback(wb);
}
9d823e8f6 writeback: per ta... |
1765 |
/* per-CPU count of pages dirtied since the last balance_dirty_pages() call */
static DEFINE_PER_CPU(int, bdp_ratelimits);

/*
 * Normal tasks are throttled by
 *	loop {
 *		dirty tsk->nr_dirtied_pause pages;
 *		take a snap in balance_dirty_pages();
 *	}
 * However there is a worst case. If every task exit immediately when dirtied
 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
 * called to throttle the page dirties. The solution is to save the not yet
 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
 * randomly into the running tasks. This works well for the above worst case,
 * as the new task will pick up and accumulate the old task's leaked dirty
 * count and eventually get throttled.
 */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1da177e4c Linux-2.6.12-rc2 |
1782 |
/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, get_writeback_state is expensive, so try to avoid
 * calling it too often (ratelimiting).  But once we're over the dirty memory
 * limit we decrease the ratelimiting by a lot, to prevent individual processes
 * from overshooting the limit by (ratelimit_pages) each.
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;
	int ratelimit;
	int *p;

	if (!bdi_cap_account_dirty(bdi))
		return;

	/* pick the memcg wb when cgroup writeback is enabled for @inode */
	if (inode_cgwb_enabled(inode))
		wb = wb_get_create_current(bdi, GFP_KERNEL);
	if (!wb)
		wb = &bdi->wb;

	ratelimit = current->nr_dirtied_pause;
	if (wb->dirty_exceeded)
		/* over the limit: check every 32>>(PAGE_SHIFT-10) pages,
		 * i.e. roughly every 32KB worth of dirtying */
		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

	/* percpu pointers below are only stable with preemption off */
	preempt_disable();
	/*
	 * This prevents one CPU to accumulate too many dirtied pages without
	 * calling into balance_dirty_pages(), which can happen when there are
	 * 1000+ tasks, all of them start dirtying pages at exactly the same
	 * time, hence all honoured too large initial task->nr_dirtied_pause.
	 */
	p = this_cpu_ptr(&bdp_ratelimits);
	if (unlikely(current->nr_dirtied >= ratelimit))
		*p = 0;
	else if (unlikely(*p >= ratelimit_pages)) {
		*p = 0;
		ratelimit = 0;
	}
	/*
	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
	 * the dirty throttling and livelock other long-run dirtiers.
	 */
	p = this_cpu_ptr(&dirty_throttle_leaks);
	if (*p > 0 && current->nr_dirtied < ratelimit) {
		unsigned long nr_pages_dirtied;
		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
		*p -= nr_pages_dirtied;
		current->nr_dirtied += nr_pages_dirtied;
	}
	preempt_enable();

	if (unlikely(current->nr_dirtied >= ratelimit))
		balance_dirty_pages(wb, current->nr_dirtied);

	wb_put(wb);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
1da177e4c Linux-2.6.12-rc2 |
1846 |
|
aa661bbe1 writeback: move o... |
1847 1848 1849 1850 1851 |
/** * wb_over_bg_thresh - does @wb need to be written back? * @wb: bdi_writeback of interest * * Determines whether background writeback should keep writing @wb or it's |
a862f68a8 docs/core-api/mm:... |
1852 1853 1854 |
* clean enough. * * Return: %true if writeback should continue. |
aa661bbe1 writeback: move o... |
1855 1856 1857 |
*/ bool wb_over_bg_thresh(struct bdi_writeback *wb) { |
947e9762a writeback: update... |
1858 |
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; |
c2aa723a6 writeback: implem... |
1859 |
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; |
947e9762a writeback: update... |
1860 |
struct dirty_throttle_control * const gdtc = &gdtc_stor; |
c2aa723a6 writeback: implem... |
1861 1862 |
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; |
aa661bbe1 writeback: move o... |
1863 |
|
947e9762a writeback: update... |
1864 1865 1866 1867 1868 |
/* * Similar to balance_dirty_pages() but ignores pages being written * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); |
8d92890bd mm/writeback: dis... |
1869 |
gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); |
947e9762a writeback: update... |
1870 |
domain_dirty_limits(gdtc); |
aa661bbe1 writeback: move o... |
1871 |
|
947e9762a writeback: update... |
1872 |
if (gdtc->dirty > gdtc->bg_thresh) |
aa661bbe1 writeback: move o... |
1873 |
return true; |
74d369443 writeback: Fix pe... |
1874 1875 |
if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) |
aa661bbe1 writeback: move o... |
1876 |
return true; |
c2aa723a6 writeback: implem... |
1877 |
if (mdtc) { |
c5edf9cdc writeback: fix in... |
1878 |
unsigned long filepages, headroom, writeback; |
c2aa723a6 writeback: implem... |
1879 |
|
c5edf9cdc writeback: fix in... |
1880 1881 1882 |
mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, &writeback); mdtc_calc_avail(mdtc, filepages, headroom); |
c2aa723a6 writeback: implem... |
1883 1884 1885 1886 |
domain_dirty_limits(mdtc); /* ditto, ignore writeback */ if (mdtc->dirty > mdtc->bg_thresh) return true; |
74d369443 writeback: Fix pe... |
1887 1888 |
if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) |
c2aa723a6 writeback: implem... |
1889 1890 |
return true; } |
aa661bbe1 writeback: move o... |
1891 1892 |
return false; } |
1da177e4c Linux-2.6.12-rc2 |
1893 |
/* |
1da177e4c Linux-2.6.12-rc2 |
1894 1895 |
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ |
cccad5b98 mm: convert use o... |
1896 |
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, |
32927393d sysctl: pass kern... |
1897 |
void *buffer, size_t *length, loff_t *ppos) |
1da177e4c Linux-2.6.12-rc2 |
1898 |
{ |
94af58469 writeback: schedu... |
1899 1900 1901 1902 |
unsigned int old_interval = dirty_writeback_interval; int ret; ret = proc_dointvec(table, write, buffer, length, ppos); |
515c24c13 mm/page-writeback... |
1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 |
/* * Writing 0 to dirty_writeback_interval will disable periodic writeback * and a different non-zero value will wakeup the writeback threads. * wb_wakeup_delayed() would be more appropriate, but it's a pain to * iterate over all bdis and wbs. * The reason we do this is to make the change take effect immediately. */ if (!ret && write && dirty_writeback_interval && dirty_writeback_interval != old_interval) |
94af58469 writeback: schedu... |
1913 1914 1915 |
wakeup_flusher_threads(WB_REASON_PERIODIC); return ret; |
1da177e4c Linux-2.6.12-rc2 |
1916 |
} |
c2c4986ed writeback: fix pr... |
1917 |
#ifdef CONFIG_BLOCK |
bca237a52 block/laptop_mode... |
1918 |
void laptop_mode_timer_fn(struct timer_list *t) |
1da177e4c Linux-2.6.12-rc2 |
1919 |
{ |
bca237a52 block/laptop_mode... |
1920 1921 |
struct backing_dev_info *backing_dev_info = from_timer(backing_dev_info, t, laptop_mode_wb_timer); |
1da177e4c Linux-2.6.12-rc2 |
1922 |
|
bca237a52 block/laptop_mode... |
1923 |
wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); |
1da177e4c Linux-2.6.12-rc2 |
1924 1925 1926 1927 1928 1929 1930 |
} /* * We've spun up the disk and we're in laptop mode: schedule writeback * of all dirty data a few seconds from now. If the flush is already scheduled * then push it back - the user is still using the disk. */ |
31373d09d laptop-mode: Make... |
1931 |
void laptop_io_completion(struct backing_dev_info *info) |
1da177e4c Linux-2.6.12-rc2 |
1932 |
{ |
31373d09d laptop-mode: Make... |
1933 |
mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); |
1da177e4c Linux-2.6.12-rc2 |
1934 1935 1936 1937 1938 1939 1940 1941 1942 |
} /* * We're in laptop mode and we've just synced. The sync's writes will have * caused another writeback to be scheduled by laptop_io_completion. * Nothing needs to be written back anymore, so we unschedule the writeback. */ void laptop_sync_completion(void) { |
31373d09d laptop-mode: Make... |
1943 1944 1945 1946 1947 1948 1949 1950 |
struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) del_timer(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); |
1da177e4c Linux-2.6.12-rc2 |
1951 |
} |
c2c4986ed writeback: fix pr... |
1952 |
#endif |
1da177e4c Linux-2.6.12-rc2 |
1953 1954 1955 1956 1957 1958 1959 1960 1961 |
/* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. * If it is too low then SMP machines will call the (expensive) * get_writeback_state too often. * * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory |
9d823e8f6 writeback: per ta... |
1962 |
* thresholds. |
1da177e4c Linux-2.6.12-rc2 |
1963 |
*/ |
2d1d43f6a [PATCH] call mm/p... |
1964 |
void writeback_set_ratelimit(void) |
1da177e4c Linux-2.6.12-rc2 |
1965 |
{ |
dcc25ae76 writeback: move g... |
1966 |
struct wb_domain *dom = &global_wb_domain; |
9d823e8f6 writeback: per ta... |
1967 1968 |
unsigned long background_thresh; unsigned long dirty_thresh; |
dcc25ae76 writeback: move g... |
1969 |
|
9d823e8f6 writeback: per ta... |
1970 |
global_dirty_limits(&background_thresh, &dirty_thresh); |
dcc25ae76 writeback: move g... |
1971 |
dom->dirty_limit = dirty_thresh; |
9d823e8f6 writeback: per ta... |
1972 |
ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); |
1da177e4c Linux-2.6.12-rc2 |
1973 1974 |
if (ratelimit_pages < 16) ratelimit_pages = 16; |
1da177e4c Linux-2.6.12-rc2 |
1975 |
} |
1d7ac6aec mm/writeback: Con... |
1976 |
static int page_writeback_cpu_online(unsigned int cpu) |
1da177e4c Linux-2.6.12-rc2 |
1977 |
{ |
1d7ac6aec mm/writeback: Con... |
1978 1979 |
writeback_set_ratelimit(); return 0; |
1da177e4c Linux-2.6.12-rc2 |
1980 |
} |
1da177e4c Linux-2.6.12-rc2 |
1981 |
/* |
dc6e29da9 Fix balance_dirty... |
1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 |
* Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory * related to pages that could be allocated for buffers (by * comparing nr_free_buffer_pages() to vm_total_pages. * * However, that was when we used "dirty_ratio" to scale with * all memory, and we don't do that any more. "dirty_ratio" * is now applied to total non-HIGHPAGE memory (by subtracting * totalhigh_pages from vm_total_pages), and as such we can't * get into the old insane situation any more where we had * large amounts of dirty pages compared to a small amount of * non-HIGHMEM memory. * * But we might still want to scale the dirty_ratio by how * much memory the box has.. |
1da177e4c Linux-2.6.12-rc2 |
1998 1999 2000 |
*/ void __init page_writeback_init(void) { |
a50fcb512 writeback: fix in... |
2001 |
BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); |
1d7ac6aec mm/writeback: Con... |
2002 2003 2004 2005 |
cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online", page_writeback_cpu_online, NULL); cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, page_writeback_cpu_online); |
1da177e4c Linux-2.6.12-rc2 |
2006 |
} |
811d736f9 [PATCH] BLOCK: Di... |
2007 |
/** |
f446daaea mm: implement wri... |
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 |
* tag_pages_for_writeback - tag pages to be written by write_cache_pages * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is * that write_cache_pages (or whoever calls this function) will then use * TOWRITE tag to identify pages eligible for writeback. This mechanism is * used to avoid livelocking of writeback by a process steadily creating new * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ |
f446daaea mm: implement wri... |
2021 2022 2023 |
void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { |
ff9c745b8 mm: Convert page-... |
2024 2025 2026 |
XA_STATE(xas, &mapping->i_pages, start); unsigned int tagged = 0; void *page; |
268f42de7 radix-tree: delet... |
2027 |
|
ff9c745b8 mm: Convert page-... |
2028 2029 2030 2031 |
xas_lock_irq(&xas); xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); if (++tagged % XA_CHECK_SCHED) |
268f42de7 radix-tree: delet... |
2032 |
continue; |
ff9c745b8 mm: Convert page-... |
2033 2034 2035 |
xas_pause(&xas); xas_unlock_irq(&xas); |
f446daaea mm: implement wri... |
2036 |
cond_resched(); |
ff9c745b8 mm: Convert page-... |
2037 |
xas_lock_irq(&xas); |
268f42de7 radix-tree: delet... |
2038 |
} |
ff9c745b8 mm: Convert page-... |
2039 |
xas_unlock_irq(&xas); |
f446daaea mm: implement wri... |
2040 2041 2042 2043 |
} EXPORT_SYMBOL(tag_pages_for_writeback); /** |
0ea971801 consolidate gener... |
2044 |
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them. |
811d736f9 [PATCH] BLOCK: Di... |
2045 2046 |
* @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write |
0ea971801 consolidate gener... |
2047 2048 |
* @writepage: function called for each page * @data: data passed to writepage function |
811d736f9 [PATCH] BLOCK: Di... |
2049 |
* |
0ea971801 consolidate gener... |
2050 |
* If a page is already under I/O, write_cache_pages() skips it, even |
811d736f9 [PATCH] BLOCK: Di... |
2051 2052 2053 2054 2055 2056 |
* if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. |
f446daaea mm: implement wri... |
2057 2058 2059 2060 2061 2062 2063 |
* * To avoid livelocks (when other process dirties new pages), we first tag * pages which should be written back with TOWRITE tag and only then start * writing them. For data-integrity sync we have to be careful so that we do * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). |
64081362e mm/page-writeback... |
2064 2065 2066 2067 2068 2069 2070 |
* * To avoid deadlocks between range_cyclic writeback and callers that hold * pages in PageWriteback to aggregate IO until write_cache_pages() returns, * we do not loop back to the start of the file. Doing so causes a page * lock/page writeback access order inversion - we should only ever lock * multiple pages in ascending page->index order, and looping back to the start * of the file violates that rule and causes deadlocks. |
a862f68a8 docs/core-api/mm:... |
2071 2072 |
* * Return: %0 on success, negative error code otherwise |
811d736f9 [PATCH] BLOCK: Di... |
2073 |
*/ |
0ea971801 consolidate gener... |
2074 2075 2076 |
int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) |
811d736f9 [PATCH] BLOCK: Di... |
2077 |
{ |
811d736f9 [PATCH] BLOCK: Di... |
2078 2079 |
int ret = 0; int done = 0; |
3fa750dcf mm/page-writeback... |
2080 |
int error; |
811d736f9 [PATCH] BLOCK: Di... |
2081 2082 2083 2084 |
struct pagevec pvec; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ |
bd19e012f mm: write_cache_p... |
2085 |
pgoff_t done_index; |
811d736f9 [PATCH] BLOCK: Di... |
2086 |
int range_whole = 0; |
ff9c745b8 mm: Convert page-... |
2087 |
xa_mark_t tag; |
811d736f9 [PATCH] BLOCK: Di... |
2088 |
|
866798201 mm, pagevec: remo... |
2089 |
pagevec_init(&pvec); |
811d736f9 [PATCH] BLOCK: Di... |
2090 |
if (wbc->range_cyclic) { |
28659cc8c mm/page-writeback... |
2091 |
index = mapping->writeback_index; /* prev offset */ |
811d736f9 [PATCH] BLOCK: Di... |
2092 2093 |
end = -1; } else { |
09cbfeaf1 mm, fs: get rid o... |
2094 2095 |
index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; |
811d736f9 [PATCH] BLOCK: Di... |
2096 2097 |
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; |
811d736f9 [PATCH] BLOCK: Di... |
2098 |
} |
cc7b8f624 mm/page-writeback... |
2099 2100 |
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { tag_pages_for_writeback(mapping, index, end); |
f446daaea mm: implement wri... |
2101 |
tag = PAGECACHE_TAG_TOWRITE; |
cc7b8f624 mm/page-writeback... |
2102 |
} else { |
f446daaea mm: implement wri... |
2103 |
tag = PAGECACHE_TAG_DIRTY; |
cc7b8f624 mm/page-writeback... |
2104 |
} |
bd19e012f mm: write_cache_p... |
2105 |
done_index = index; |
5a3d5c981 mm: write_cache_p... |
2106 2107 |
while (!done && (index <= end)) { int i; |
2b9775ae4 mm: use pagevec_l... |
2108 |
nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, |
67fd707f4 mm: remove nr_pag... |
2109 |
tag); |
5a3d5c981 mm: write_cache_p... |
2110 2111 |
if (nr_pages == 0) break; |
811d736f9 [PATCH] BLOCK: Di... |
2112 |
|
811d736f9 [PATCH] BLOCK: Di... |
2113 2114 |
for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; |
cf15b07cf writeback: make m... |
2115 |
done_index = page->index; |
d5482cdf8 mm: write_cache_p... |
2116 |
|
811d736f9 [PATCH] BLOCK: Di... |
2117 |
lock_page(page); |
5a3d5c981 mm: write_cache_p... |
2118 2119 2120 2121 2122 2123 2124 2125 |
/* * Page truncated or invalidated. We can freely skip it * then, even for data integrity operations: the page * has disappeared concurrently, so there could be no * real expectation of this data interity operation * even if there is now a new, dirty page at the same * pagecache address. */ |
811d736f9 [PATCH] BLOCK: Di... |
2126 |
if (unlikely(page->mapping != mapping)) { |
5a3d5c981 mm: write_cache_p... |
2127 |
continue_unlock: |
811d736f9 [PATCH] BLOCK: Di... |
2128 2129 2130 |
unlock_page(page); continue; } |
515f4a037 mm: write_cache_p... |
2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 |
if (!PageDirty(page)) { /* someone wrote it for us */ goto continue_unlock; } if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); else goto continue_unlock; } |
811d736f9 [PATCH] BLOCK: Di... |
2142 |
|
515f4a037 mm: write_cache_p... |
2143 2144 |
BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) |
5a3d5c981 mm: write_cache_p... |
2145 |
goto continue_unlock; |
811d736f9 [PATCH] BLOCK: Di... |
2146 |
|
de1414a65 fs: export inode_... |
2147 |
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); |
3fa750dcf mm/page-writeback... |
2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 |
error = (*writepage)(page, wbc, data); if (unlikely(error)) { /* * Handle errors according to the type of * writeback. There's no need to continue for * background writeback. Just push done_index * past this page so media errors won't choke * writeout for the entire file. For integrity * writeback, we must process the entire dirty * set regardless of errors because the fs may * still have state to clear for each page. In * that case we continue processing and return * the first error. */ if (error == AOP_WRITEPAGE_ACTIVATE) { |
00266770b mm: write_cache_p... |
2163 |
unlock_page(page); |
3fa750dcf mm/page-writeback... |
2164 2165 2166 |
error = 0; } else if (wbc->sync_mode != WB_SYNC_ALL) { ret = error; |
cf15b07cf writeback: make m... |
2167 |
done_index = page->index + 1; |
00266770b mm: write_cache_p... |
2168 2169 2170 |
done = 1; break; } |
3fa750dcf mm/page-writeback... |
2171 2172 |
if (!ret) ret = error; |
0b5649278 writeback: pay at... |
2173 |
} |
00266770b mm: write_cache_p... |
2174 |
|
546a19242 writeback: write_... |
2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 |
/* * We stop writing back only if we are not doing * integrity sync. In case of integrity sync we have to * keep going until we have written all the pages * we tagged for writeback prior to entering this loop. */ if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; |
05fe478dd mm: write_cache_p... |
2185 |
} |
811d736f9 [PATCH] BLOCK: Di... |
2186 2187 2188 2189 |
} pagevec_release(&pvec); cond_resched(); } |
64081362e mm/page-writeback... |
2190 2191 2192 2193 2194 2195 2196 2197 |
/* * If we hit the last page and there is more work to be done: wrap * back the index back to the start of the file for the next * time we are called. */ if (wbc->range_cyclic && !done) done_index = 0; |
0b5649278 writeback: pay at... |
2198 2199 |
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; |
06d6cf695 mm: Add range_con... |
2200 |
|
811d736f9 [PATCH] BLOCK: Di... |
2201 2202 |
return ret; } |
0ea971801 consolidate gener... |
2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 |
EXPORT_SYMBOL(write_cache_pages); /* * Function used by generic_writepages to call the real writepage * function and set the mapping flags on error */ static int __writepage(struct page *page, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; int ret = mapping->a_ops->writepage(page, wbc); mapping_set_error(mapping, ret); return ret; } /** * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * * This is a library function, which implements the writepages() * address_space_operation. |
a862f68a8 docs/core-api/mm:... |
2225 2226 |
* * Return: %0 on success, negative error code otherwise |
0ea971801 consolidate gener... |
2227 2228 2229 2230 |
*/ int generic_writepages(struct address_space *mapping, struct writeback_control *wbc) { |
9b6096a65 mm: make generic_... |
2231 2232 |
struct blk_plug plug; int ret; |
0ea971801 consolidate gener... |
2233 2234 2235 |
/* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; |
9b6096a65 mm: make generic_... |
2236 2237 2238 2239 |
blk_start_plug(&plug); ret = write_cache_pages(mapping, wbc, __writepage, mapping); blk_finish_plug(&plug); return ret; |
0ea971801 consolidate gener... |
2240 |
} |
811d736f9 [PATCH] BLOCK: Di... |
2241 2242 |
EXPORT_SYMBOL(generic_writepages); |
1da177e4c Linux-2.6.12-rc2 |
2243 2244 |
int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { |
22905f775 identify multipag... |
2245 |
int ret; |
1da177e4c Linux-2.6.12-rc2 |
2246 2247 |
if (wbc->nr_to_write <= 0) return 0; |
80a2ea9f8 mm: retry writepa... |
2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 |
while (1) { if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); else ret = generic_writepages(mapping, wbc); if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL)) break; cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); } |
22905f775 identify multipag... |
2258 |
return ret; |
1da177e4c Linux-2.6.12-rc2 |
2259 2260 2261 |
} /** |
2b69c8280 mm: drop "wait" p... |
2262 |
* write_one_page - write out a single page and wait on I/O |
67be2dd1b [PATCH] DocBook: ... |
2263 |
* @page: the page to write |
1da177e4c Linux-2.6.12-rc2 |
2264 2265 2266 |
* * The page must be locked by the caller and will be unlocked upon return. * |
37e51a764 mm: clean up erro... |
2267 2268 |
* Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this * function returns. |
a862f68a8 docs/core-api/mm:... |
2269 2270 |
* * Return: %0 on success, negative error code otherwise |
1da177e4c Linux-2.6.12-rc2 |
2271 |
*/ |
2b69c8280 mm: drop "wait" p... |
2272 |
int write_one_page(struct page *page) |
1da177e4c Linux-2.6.12-rc2 |
2273 2274 2275 2276 2277 2278 2279 2280 2281 |
{ struct address_space *mapping = page->mapping; int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 1, }; BUG_ON(!PageLocked(page)); |
2b69c8280 mm: drop "wait" p... |
2282 |
wait_on_page_writeback(page); |
1da177e4c Linux-2.6.12-rc2 |
2283 2284 |
if (clear_page_dirty_for_io(page)) { |
09cbfeaf1 mm, fs: get rid o... |
2285 |
get_page(page); |
1da177e4c Linux-2.6.12-rc2 |
2286 |
ret = mapping->a_ops->writepage(page, &wbc); |
37e51a764 mm: clean up erro... |
2287 |
if (ret == 0) |
1da177e4c Linux-2.6.12-rc2 |
2288 |
wait_on_page_writeback(page); |
09cbfeaf1 mm, fs: get rid o... |
2289 |
put_page(page); |
1da177e4c Linux-2.6.12-rc2 |
2290 2291 2292 |
} else { unlock_page(page); } |
37e51a764 mm: clean up erro... |
2293 2294 2295 |
if (!ret) ret = filemap_check_errors(mapping); |
1da177e4c Linux-2.6.12-rc2 |
2296 2297 2298 2299 2300 |
return ret; } EXPORT_SYMBOL(write_one_page); /* |
767193253 [PATCH] simplify ... |
2301 2302 2303 2304 2305 |
* For address_spaces which do not use buffers nor write back. */ int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) |
c3f0da631 mm/page-writeback... |
2306 |
return !TestSetPageDirty(page); |
767193253 [PATCH] simplify ... |
2307 2308 2309 2310 |
return 0; } /* |
e3a7cca1e vfs: add/use acco... |
2311 |
* Helper function for set_page_dirty family. |
c4843a759 memcg: add per cg... |
2312 |
* |
81f8c3a46 mm: memcontrol: g... |
2313 |
* Caller must hold lock_page_memcg(). |
c4843a759 memcg: add per cg... |
2314 |
* |
e3a7cca1e vfs: add/use acco... |
2315 2316 |
* NOTE: This relies on being atomic wrt interrupts. */ |
62cccb8c8 mm: simplify lock... |
2317 |
void account_page_dirtied(struct page *page, struct address_space *mapping) |
e3a7cca1e vfs: add/use acco... |
2318 |
{ |
52ebea749 writeback: make b... |
2319 |
struct inode *inode = mapping->host; |
9fb0a7da0 writeback: add mo... |
2320 |
trace_writeback_dirty_page(page, mapping); |
e3a7cca1e vfs: add/use acco... |
2321 |
if (mapping_cap_account_dirty(mapping)) { |
52ebea749 writeback: make b... |
2322 |
struct bdi_writeback *wb; |
de1414a65 fs: export inode_... |
2323 |
|
52ebea749 writeback: make b... |
2324 2325 |
inode_attach_wb(inode, page); wb = inode_to_wb(inode); |
de1414a65 fs: export inode_... |
2326 |
|
00f3ca2c2 mm: memcontrol: p... |
2327 |
__inc_lruvec_page_state(page, NR_FILE_DIRTY); |
5a1c84b40 mm: remove reclai... |
2328 |
__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
c4a25635b mm: move vmscan w... |
2329 |
__inc_node_page_state(page, NR_DIRTIED); |
3e8f399da writeback: rework... |
2330 2331 |
inc_wb_stat(wb, WB_RECLAIMABLE); inc_wb_stat(wb, WB_DIRTIED); |
09cbfeaf1 mm, fs: get rid o... |
2332 |
task_io_account_write(PAGE_SIZE); |
d3bc1fef9 writeback: fix di... |
2333 2334 |
current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); |
97b27821b writeback, memcg:... |
2335 2336 |
mem_cgroup_track_foreign_dirty(page, wb); |
e3a7cca1e vfs: add/use acco... |
2337 2338 2339 2340 |
} } /* |
b9ea25152 page_writeback: c... |
2341 2342 |
* Helper function for deaccounting dirty page without writeback. * |
81f8c3a46 mm: memcontrol: g... |
2343 |
* Caller must hold lock_page_memcg(). |
b9ea25152 page_writeback: c... |
2344 |
*/ |
c4843a759 memcg: add per cg... |
2345 |
void account_page_cleaned(struct page *page, struct address_space *mapping, |
62cccb8c8 mm: simplify lock... |
2346 |
struct bdi_writeback *wb) |
b9ea25152 page_writeback: c... |
2347 2348 |
{ if (mapping_cap_account_dirty(mapping)) { |
00f3ca2c2 mm: memcontrol: p... |
2349 |
dec_lruvec_page_state(page, NR_FILE_DIRTY); |
5a1c84b40 mm: remove reclai... |
2350 |
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
682aa8e1a writeback: implem... |
2351 |
dec_wb_stat(wb, WB_RECLAIMABLE); |
09cbfeaf1 mm, fs: get rid o... |
2352 |
task_io_account_cancelled_write(PAGE_SIZE); |
b9ea25152 page_writeback: c... |
2353 2354 |
} } |
b9ea25152 page_writeback: c... |
2355 2356 |
/* |
1da177e4c Linux-2.6.12-rc2 |
2357 |
* For address_spaces which do not use buffers. Just tag the page as dirty in |
ff9c745b8 mm: Convert page-... |
2358 |
* the xarray. |
1da177e4c Linux-2.6.12-rc2 |
2359 2360 2361 2362 2363 |
* * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. * |
2d6d7f982 mm: protect set_p... |
2364 2365 2366 |
* The caller must ensure this doesn't race with truncation. Most will simply * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and * the pte lock held, which also locks out truncation. |
1da177e4c Linux-2.6.12-rc2 |
2367 2368 2369 |
*/ int __set_page_dirty_nobuffers(struct page *page) { |
62cccb8c8 mm: simplify lock... |
2370 |
lock_page_memcg(page); |
1da177e4c Linux-2.6.12-rc2 |
2371 2372 |
if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); |
a85d9df1e mm: __set_page_di... |
2373 |
unsigned long flags; |
1da177e4c Linux-2.6.12-rc2 |
2374 |
|
c4843a759 memcg: add per cg... |
2375 |
if (!mapping) { |
62cccb8c8 mm: simplify lock... |
2376 |
unlock_page_memcg(page); |
8c08540f8 [PATCH] clean up ... |
2377 |
return 1; |
c4843a759 memcg: add per cg... |
2378 |
} |
8c08540f8 [PATCH] clean up ... |
2379 |
|
b93b01631 page cache: use x... |
2380 |
xa_lock_irqsave(&mapping->i_pages, flags); |
2d6d7f982 mm: protect set_p... |
2381 2382 |
BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); |
62cccb8c8 mm: simplify lock... |
2383 |
account_page_dirtied(page, mapping); |
ff9c745b8 mm: Convert page-... |
2384 |
__xa_set_mark(&mapping->i_pages, page_index(page), |
2d6d7f982 mm: protect set_p... |
2385 |
PAGECACHE_TAG_DIRTY); |
b93b01631 page cache: use x... |
2386 |
xa_unlock_irqrestore(&mapping->i_pages, flags); |
62cccb8c8 mm: simplify lock... |
2387 |
unlock_page_memcg(page); |
c4843a759 memcg: add per cg... |
2388 |
|
8c08540f8 [PATCH] clean up ... |
2389 2390 2391 |
if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
1da177e4c Linux-2.6.12-rc2 |
2392 |
} |
4741c9fd3 [PATCH] set_page_... |
2393 |
return 1; |
1da177e4c Linux-2.6.12-rc2 |
2394 |
} |
62cccb8c8 mm: simplify lock... |
2395 |
unlock_page_memcg(page); |
4741c9fd3 [PATCH] set_page_... |
2396 |
return 0; |
1da177e4c Linux-2.6.12-rc2 |
2397 2398 2399 2400 |
} EXPORT_SYMBOL(__set_page_dirty_nobuffers); /* |
2f800fbd7 writeback: fix di... |
2401 |
* Call this whenever redirtying a page, to de-account the dirty counters |
dcfe4df3d mm/page-writeback... |
2402 2403 |
* (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to |
2f800fbd7 writeback: fix di... |
2404 2405 2406 2407 2408 2409 |
* systematic errors in balanced_dirty_ratelimit and the dirty pages position * control. */ void account_page_redirty(struct page *page) { struct address_space *mapping = page->mapping; |
910181343 writeback: attrib... |
2410 |
|
2f800fbd7 writeback: fix di... |
2411 |
if (mapping && mapping_cap_account_dirty(mapping)) { |
682aa8e1a writeback: implem... |
2412 2413 |
struct inode *inode = mapping->host; struct bdi_writeback *wb; |
2e898e4c0 writeback: safer ... |
2414 |
struct wb_lock_cookie cookie = {}; |
910181343 writeback: attrib... |
2415 |
|
2e898e4c0 writeback: safer ... |
2416 |
wb = unlocked_inode_to_wb_begin(inode, &cookie); |
2f800fbd7 writeback: fix di... |
2417 |
current->nr_dirtied--; |
c4a25635b mm: move vmscan w... |
2418 |
dec_node_page_state(page, NR_DIRTIED); |
910181343 writeback: attrib... |
2419 |
dec_wb_stat(wb, WB_DIRTIED); |
2e898e4c0 writeback: safer ... |
2420 |
unlocked_inode_to_wb_end(inode, &cookie); |
2f800fbd7 writeback: fix di... |
2421 2422 2423 2424 2425 |
} } EXPORT_SYMBOL(account_page_redirty); /* |
1da177e4c Linux-2.6.12-rc2 |
2426 2427 2428 2429 2430 2431 |
* When a writepage implementation decides that it doesn't want to write this * page for some reason, it should redirty the locked page via * redirty_page_for_writepage() and it should then unlock the page and return 0 */ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { |
8d38633c3 page_writeback: p... |
2432 |
int ret; |
1da177e4c Linux-2.6.12-rc2 |
2433 |
wbc->pages_skipped++; |
8d38633c3 page_writeback: p... |
2434 |
ret = __set_page_dirty_nobuffers(page); |
2f800fbd7 writeback: fix di... |
2435 |
account_page_redirty(page); |
8d38633c3 page_writeback: p... |
2436 |
return ret; |
1da177e4c Linux-2.6.12-rc2 |
2437 2438 2439 2440 |
} EXPORT_SYMBOL(redirty_page_for_writepage); /* |
6746aff74 HWPOISON: shmem: ... |
2441 2442 2443 2444 2445 2446 2447 |
* Dirty a page. * * For pages with a mapping this should be done under the page lock * for the benefit of asynchronous memory errors who prefer a consistent * dirty state. This rule can be broken in some special cases, * but should be better not to. * |
1da177e4c Linux-2.6.12-rc2 |
2448 2449 2450 |
* If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ |
1cf6e7d83 mm: task dirty ac... |
2451 |
int set_page_dirty(struct page *page) |
1da177e4c Linux-2.6.12-rc2 |
2452 2453 |
{ struct address_space *mapping = page_mapping(page); |
800d8c63b shmem: add huge p... |
2454 |
page = compound_head(page); |
1da177e4c Linux-2.6.12-rc2 |
2455 2456 |
if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; |
278df9f45 mm: reclaim inval... |
2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 |
/* * readahead/lru_deactivate_page could remain * PG_readahead/PG_reclaim due to race with end_page_writeback * About readahead, if the page is written, the flags would be * reset. So no problem. * About lru_deactivate_page, if the page is redirty, the flag * will be reset. So no problem. but if the page is used by readahead * it will confuse readahead and make it restart the size rampup * process. But it's a trivial problem. */ |
a4bb3ecdc mm/page-writeback... |
2467 2468 |
if (PageReclaim(page)) ClearPageReclaim(page); |
9361401eb [PATCH] BLOCK: Ma... |
2469 2470 2471 2472 2473 |
#ifdef CONFIG_BLOCK if (!spd) spd = __set_page_dirty_buffers; #endif return (*spd)(page); |
1da177e4c Linux-2.6.12-rc2 |
2474 |
} |
4741c9fd3 [PATCH] set_page_... |
2475 2476 2477 2478 |
	/*
	 * Set the page's dirty flag if it is not already set.  Return 1
	 * only when this call performed the clear -> set transition
	 * (TestSetPageDirty saw the flag clear); otherwise return 0.
	 */
	if (!PageDirty(page)) {
		if (!TestSetPageDirty(page))
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL(set_page_dirty);

/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
int set_page_dirty_lock(struct page *page)
{
	int ret;

	/* Take the page lock to close the truncation race described above. */
	lock_page(page);
	ret = set_page_dirty(page);
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL(set_page_dirty_lock);

/*
 * This cancels just the dirty bit on the kernel page itself, it does NOT
 * actually remove dirty bits on any mmap's that may be around. It also
 * leaves the page tagged dirty, so any sync activity will still find it on
 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
 * look at the dirty bits in the VM.
 *
 * Doing this should *normally* only ever be done when a page is truncated,
 * and is not actually mapped anywhere at all.  However, fs/buffer.c does
 * this when it notices that somebody has cleaned out all the buffers on a
 * page without actually doing it through the VM.  Can you say "ext3 is
 * horribly ugly"?  Thought you could.
 */
void __cancel_dirty_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		/*
		 * Dirty-accounted mapping: clearing PG_dirty must be paired
		 * with undoing the dirty accounting, done under the memcg
		 * lock and with a stable inode-to-wb association.
		 */
		lock_page_memcg(page);
		wb = unlocked_inode_to_wb_begin(inode, &cookie);

		if (TestClearPageDirty(page))
			account_page_cleaned(page, mapping, wb);

		unlocked_inode_to_wb_end(inode, &cookie);
		unlock_page_memcg(page);
	} else {
		/* No dirty accounting for this mapping: just drop the flag. */
		ClearPageDirty(page);
	}
}
EXPORT_SYMBOL(__cancel_dirty_page);
/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout.  We leave the page
 * tagged as dirty in the xarray so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
 * at which stage we bring the page's dirty flag and xarray dirty tag
 * back into sync.
 *
 * This incoherency between the page's dirty flag and xarray tag is
 * unfortunate, but it only exists while the page is locked.
 */
int clear_page_dirty_for_io(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret = 0;

	/* Callers must hold the page lock; see the exclusion comment below. */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (mapping && mapping_cap_account_dirty(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		/*
		 * Yes, Virginia, this is indeed insane.
		 *
		 * We use this sequence to make sure that
		 *  (a) we account for dirty stats properly
		 *  (b) we tell the low-level filesystem to
		 *      mark the whole page dirty if it was
		 *      dirty in a pagetable. Only to then
		 *  (c) clean the page again and return 1 to
		 *      cause the writeback.
		 *
		 * This way we avoid all nasty races with the
		 * dirty bit in multiple places and clearing
		 * them concurrently from different threads.
		 *
		 * Note! Normally the "set_page_dirty(page)"
		 * has no effect on the actual dirty bit - since
		 * that will already usually be set. But we
		 * need the side effects, and it can help us
		 * avoid races.
		 *
		 * We basically use the page "master dirty bit"
		 * as a serialization point for all the different
		 * threads doing their things.
		 */
		if (page_mkclean(page))
			set_page_dirty(page);
		/*
		 * We carefully synchronise fault handlers against
		 * installing a dirty pte and marking the page dirty
		 * at this point.  We do this by having them hold the
		 * page lock while dirtying the page, and pages are
		 * always locked coming in here, so we get the desired
		 * exclusion.
		 */
		wb = unlocked_inode_to_wb_begin(inode, &cookie);
		if (TestClearPageDirty(page)) {
			/* Undo the accounting done when the page was dirtied. */
			dec_lruvec_page_state(page, NR_FILE_DIRTY);
			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
			dec_wb_stat(wb, WB_RECLAIMABLE);
			ret = 1;
		}
		unlocked_inode_to_wb_end(inode, &cookie);
		return ret;
	}
	/* Unaccounted mapping (or none): only the page flag needs clearing. */
	return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
/*
 * Clear PG_writeback on @page and perform the matching bookkeeping:
 * xarray writeback tag, bdi/wb stats, per-sb inode writeback tracking,
 * and the vmstat/memcg writeback counters.  Returns the old value of
 * PG_writeback.
 */
int test_clear_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	int ret;

	/* Pin the page's memcg so the stat updates below hit a stable lruvec. */
	memcg = lock_page_memcg(page);
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	if (mapping && mapping_use_writeback_tags(mapping)) {
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		xa_lock_irqsave(&mapping->i_pages, flags);
		ret = TestClearPageWriteback(page);
		if (ret) {
			/* Page really was under writeback: drop the tag. */
			__xa_clear_mark(&mapping->i_pages, page_index(page),
						PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi)) {
				struct bdi_writeback *wb = inode_to_wb(inode);

				dec_wb_stat(wb, WB_WRITEBACK);
				__wb_writeout_inc(wb);
			}
		}

		/*
		 * If that was the mapping's last writeback-tagged page, the
		 * inode no longer needs per-sb writeback tracking.
		 */
		if (mapping->host && !mapping_tagged(mapping,
						     PAGECACHE_TAG_WRITEBACK))
			sb_clear_inode_writeback(mapping->host);

		xa_unlock_irqrestore(&mapping->i_pages, flags);
	} else {
		ret = TestClearPageWriteback(page);
	}
	/*
	 * NOTE: Page might be free now! Writeback doesn't hold a page
	 * reference on its own, it relies on truncation to wait for
	 * the clearing of PG_writeback. The below can only access
	 * page state that is static across allocation cycles.
	 */
	if (ret) {
		dec_lruvec_state(lruvec, NR_WRITEBACK);
		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		inc_node_page_state(page, NR_WRITTEN);
	}
	__unlock_page_memcg(memcg);
	return ret;
}
/*
 * Mark @page as under writeback, returning the old value of PG_writeback.
 * With @keep_write the PAGECACHE_TAG_TOWRITE xarray tag is left in place
 * instead of being cleared.
 */
int __test_set_page_writeback(struct page *page, bool keep_write)
{
	struct address_space *mapping = page_mapping(page);
	int ret, access_ret;

	lock_page_memcg(page);
	if (mapping && mapping_use_writeback_tags(mapping)) {
		XA_STATE(xas, &mapping->i_pages, page_index(page));
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		xas_lock_irqsave(&xas, flags);
		/* Walk to the page's slot once; the tag ops below reuse it. */
		xas_load(&xas);
		ret = TestSetPageWriteback(page);
		if (!ret) {
			bool on_wblist;

			/* Remember whether sync was already tracking this inode. */
			on_wblist = mapping_tagged(mapping,
						   PAGECACHE_TAG_WRITEBACK);

			xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi))
				inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);

			/*
			 * We can come through here when swapping anonymous
			 * pages, so we don't necessarily have an inode to track
			 * for sync.
			 */
			if (mapping->host && !on_wblist)
				sb_mark_inode_writeback(mapping->host);
		}
		if (!PageDirty(page))
			xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		if (!keep_write)
			xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		xas_unlock_irqrestore(&xas, flags);
	} else {
		ret = TestSetPageWriteback(page);
	}
	if (!ret) {
		/* Clear -> set transition: account the page as under writeback. */
		inc_lruvec_page_state(page, NR_WRITEBACK);
		inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
	}
	unlock_page_memcg(page);
	access_ret = arch_make_page_accessible(page);
	/*
	 * If writeback has been triggered on a page that cannot be made
	 * accessible, it is too late to recover here.
	 */
	VM_BUG_ON_PAGE(access_ret != 0, page);

	return ret;
}
EXPORT_SYMBOL(__test_set_page_writeback);
1da177e4c Linux-2.6.12-rc2 |
2714 |
|
19343b5bd mm/page-writeback... |
2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 |
/* * Wait for a page to complete writeback */ void wait_on_page_writeback(struct page *page) { if (PageWriteback(page)) { trace_wait_on_page_writeback(page, page_mapping(page)); wait_on_page_bit(page, PG_writeback); } } EXPORT_SYMBOL_GPL(wait_on_page_writeback); |
1d1d1a767 mm: only enforce ... |
2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 |
/** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. * * This function determines if the given page is related to a backing device * that requires page contents to be held stable during writeback. If so, then * it will wait for any pending writeback to complete. */ void wait_for_stable_page(struct page *page) { |
de1414a65 fs: export inode_... |
2736 2737 |
if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) wait_on_page_writeback(page); |
1d1d1a767 mm: only enforce ... |
2738 2739 |
} EXPORT_SYMBOL_GPL(wait_for_stable_page); |