/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
        ra->ra_pages = mapping->backing_dev_info->ra_pages;
        ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

#define list_to_page(head) (list_entry((head)->prev, struct page, lru))

/*
 * see if a page needs releasing upon read_cache_pages() failure
 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
 *   before calling, such as the NFS fs marking pages that are cached locally
 *   on disk, thus we need to give the fs a chance to clean up in the event of
 *   an error
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
                                             struct page *page)
{
        if (page_has_private(page)) {
                if (!trylock_page(page))
                        BUG();
                page->mapping = mapping;
                do_invalidatepage(page, 0);
                page->mapping = NULL;
                unlock_page(page);
        }
        page_cache_release(page);
}

/*
 * release a list of pages, invalidating them first if need be
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
                                              struct list_head *pages)
{
        struct page *victim;

        while (!list_empty(pages)) {
                victim = list_to_page(pages);
                list_del(&victim->lru);
                read_cache_pages_invalidate_page(mapping, victim);
        }
}

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
                        int (*filler)(void *, struct page *), void *data)
{
        struct page *page;
        int ret = 0;

        while (!list_empty(pages)) {
                page = list_to_page(pages);
                list_del(&page->lru);
                if (add_to_page_cache_lru(page, mapping,
                                        page->index, GFP_KERNEL)) {
                        read_cache_pages_invalidate_page(mapping, page);
                        continue;
                }
                page_cache_release(page);

                ret = filler(data, page);
                if (unlikely(ret)) {
                        read_cache_pages_invalidate_pages(mapping, pages);
                        break;
                }
                task_io_account_read(PAGE_CACHE_SIZE);
        }
        return ret;
}
EXPORT_SYMBOL(read_cache_pages);
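
/*
 * Usage sketch (editor's addition, not part of the original file): a
 * filesystem's ->readpages() implementation can hand its page list to
 * read_cache_pages() together with a filler callback.  The "myfs_*" names
 * and the single-page read helper are hypothetical.
 */
#if 0
static int myfs_fill_page(void *data, struct page *page)
{
        struct file *file = data;

        /* start the read for this one page; I/O completion unlocks it */
        return myfs_read_one_page(file, page);
}

static int myfs_readpages(struct file *file, struct address_space *mapping,
                          struct list_head *pages, unsigned nr_pages)
{
        /* page-cache insertion and error cleanup happen inside */
        return read_cache_pages(mapping, pages, myfs_fill_page, file);
}
#endif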

static int read_pages(struct address_space *mapping, struct file *filp,
                struct list_head *pages, unsigned nr_pages)
{
        struct blk_plug plug;
        unsigned page_idx;
        int ret;

        blk_start_plug(&plug);

        if (mapping->a_ops->readpages) {
                ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
                /* Clean up the remaining pages */
                put_pages_list(pages);
                goto out;
        }

        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
                struct page *page = list_to_page(pages);
                list_del(&page->lru);
                if (!add_to_page_cache_lru(page, mapping,
                                        page->index, GFP_KERNEL)) {
                        mapping->a_ops->readpage(filp, page);
                }
                page_cache_release(page);
        }
        ret = 0;

out:
        blk_finish_plug(&plug);
        return ret;
}

/*
 * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates all
 * the pages first, then submits them all for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 */
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
                        pgoff_t offset, unsigned long nr_to_read,
                        unsigned long lookahead_size)
{
        struct inode *inode = mapping->host;
        struct page *page;
        unsigned long end_index;        /* The last page we want to read */
        LIST_HEAD(page_pool);
        int page_idx;
        int ret = 0;
        loff_t isize = i_size_read(inode);

        if (isize == 0)
                goto out;

        end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

        /*
         * Preallocate as many pages as we will need.
         */
        for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
                pgoff_t page_offset = offset + page_idx;

                if (page_offset > end_index)
                        break;

                rcu_read_lock();
                page = radix_tree_lookup(&mapping->page_tree, page_offset);
                rcu_read_unlock();
                if (page)
                        continue;

                page = page_cache_alloc_readahead(mapping);
                if (!page)
                        break;
                page->index = page_offset;
                list_add(&page->lru, &page_pool);
                if (page_idx == nr_to_read - lookahead_size)
                        SetPageReadahead(page);
                ret++;
        }

        /*
         * Now start the IO.  We ignore I/O errors - if the page is not
         * uptodate then the caller will launch readpage again, and
         * will then handle the error.
         */
        if (ret)
                read_pages(mapping, filp, &page_pool, ret);
        BUG_ON(!list_empty(&page_pool));
out:
        return ret;
}

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
                pgoff_t offset, unsigned long nr_to_read)
{
        int ret = 0;

        if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
                return -EINVAL;

        nr_to_read = max_sane_readahead(nr_to_read);
        while (nr_to_read) {
                int err;

                unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;

                if (this_chunk > nr_to_read)
                        this_chunk = nr_to_read;
                err = __do_page_cache_readahead(mapping, filp,
                                                offset, this_chunk, 0);
                if (err < 0) {
                        ret = err;
                        break;
                }
                ret += err;
                offset += this_chunk;
                nr_to_read -= this_chunk;
        }
        return ret;
}
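
/*
 * Worked example (editor's illustration): with 4k pages each chunk above
 * is (2 * 1024 * 1024) / 4096 = 512 pages, so a forced readahead of 3MB
 * (768 pages, assuming max_sane_readahead() does not trim it) is issued as
 * one 512-page __do_page_cache_readahead() call followed by a 256-page
 * one, keeping at most 2MB of pages pinned for I/O at a time.
 */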

/*
 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
 * sensible upper limit.
 */
unsigned long max_sane_readahead(unsigned long nr)
{
        return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
                + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
}
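
/*
 * Worked example (editor's illustration): on a node with 3000 inactive
 * file pages and 1000 free pages, readahead is capped at
 * (3000 + 1000) / 2 = 2000 pages, i.e. half of what could be used without
 * obvious memory pressure.
 */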

/*
 * Submit IO for the read-ahead request in file_ra_state.
 */
unsigned long ra_submit(struct file_ra_state *ra,
                       struct address_space *mapping, struct file *filp)
{
        int actual;

        actual = __do_page_cache_readahead(mapping, filp,
                                        ra->start, ra->size, ra->async_size);

        return actual;
}

/*
 * Set the initial window size: round the request up to the next power of
 * two, then scale it - x 4 for small requests, x 2 for medium ones, and
 * clamp it to max for large ones.  For a 128k (32 page) max readahead this
 * gives a 16k-64k initial window for reads of up to 8 pages, and the full
 * 128k for anything larger.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
        unsigned long newsize = roundup_pow_of_two(size);

        if (newsize <= max / 32)
                newsize = newsize * 4;
        else if (newsize <= max / 4)
                newsize = newsize * 2;
        else
                newsize = max;

        return newsize;
}
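
/*
 * Worked example (editor's illustration): with max = 32 pages (a 128k
 * window), a 4-page (16k) first read rounds up to 4, falls in the
 * "<= max / 4" band and gets an 8-page (32k) initial window, while a
 * 16-page (64k) first read exceeds max / 4 and starts with the full
 * 32 pages (128k) straight away.
 */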

/*
 *  Get the previous window size, ramp it up, and
 *  return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
                                                unsigned long max)
{
        unsigned long cur = ra->size;
        unsigned long newsize;

        if (cur < max / 16)
                newsize = 4 * cur;
        else
                newsize = 2 * cur;

        return min(newsize, max);
}
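
/*
 * Worked example (editor's illustration): with max = 32 pages, an 8-page
 * window is not below max / 16 = 2, so it doubles to 16 and then to 32,
 * where min(newsize, max) pins it; a 1-page window would first quadruple
 * to 4.
 */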

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * invalidate each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as a
 * readahead indicator. The flag won't be set on already cached pages, to
 * avoid the readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial
 * size based on the I/O request size and the maximum readahead size.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches the maximum readahead size.
 */
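
/*
 * Worked trace (editor's illustration, assuming a 32-page ra_pages value
 * that max_sane_readahead() leaves untouched, and an application reading
 * 4 pages per call): the first read at offset 0 takes the initial window
 * path below, leaving start=0, size=8, async_size=4 and a PG_readahead
 * mark on page 4.  When the reader reaches page 4 the marker fires,
 * ondemand_readahead() recognises the expected offset
 * (start + size - async_size), slides start to 8 and ramps size to 16;
 * the following hit ramps it to 32, where the window stays.
 */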

/*
 * Count contiguously cached pages from @offset-1 to @offset-@max,
 * this count is a conservative estimate of
 * 	- length of the sequential read sequence, or
 * 	- thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
                                   struct file_ra_state *ra,
                                   pgoff_t offset, unsigned long max)
{
        pgoff_t head;

        rcu_read_lock();
        head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
        rcu_read_unlock();

        return offset - 1 - head;
}

/*
 * page cache context based read-ahead
 */
static int try_context_readahead(struct address_space *mapping,
                                 struct file_ra_state *ra,
                                 pgoff_t offset,
                                 unsigned long req_size,
                                 unsigned long max)
{
        pgoff_t size;

        size = count_history_pages(mapping, ra, offset, max);

        /*
         * no history pages:
         * it could be a random read
         */
        if (!size)
                return 0;

        /*
         * starts from beginning of file:
         * it is a strong indication of long-run stream (or whole-file-read)
         */
        if (size >= offset)
                size *= 2;

        ra->start = offset;
        ra->size = get_init_ra_size(size + req_size, max);
        ra->async_size = ra->size;

        return 1;
}
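
/*
 * Illustration (editor's addition): if pages offset-1 .. offset-4 are
 * already cached and offset-5 is a hole, count_history_pages() returns 4;
 * try_context_readahead() then treats the stream as about 4 pages long
 * (doubled when the cached run reaches back to the start of the file) and
 * sizes the new window with get_init_ra_size(4 + req_size, max).
 */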

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
                   struct file_ra_state *ra, struct file *filp,
                   bool hit_readahead_marker, pgoff_t offset,
                   unsigned long req_size)
{
        unsigned long max = max_sane_readahead(ra->ra_pages);

        /*
         * start of file
         */
        if (!offset)
                goto initial_readahead;

        /*
         * It's the expected callback offset, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         */
        if ((offset == (ra->start + ra->size - ra->async_size) ||
             offset == (ra->start + ra->size))) {
                ra->start += ra->size;
                ra->size = get_next_ra_size(ra, max);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * Hit a marked page without valid readahead state.
         * E.g. interleaved reads.
         * Query the pagecache for async_size, which normally equals the
         * readahead size. Ramp it up and use it as the new readahead size.
         */
        if (hit_readahead_marker) {
                pgoff_t start;

                rcu_read_lock();
                start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
                rcu_read_unlock();

                if (!start || start - offset > max)
                        return 0;

                ra->start = start;
                ra->size = start - offset;      /* old async_size */
                ra->size += req_size;
                ra->size = get_next_ra_size(ra, max);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * oversize read
         */
        if (req_size > max)
                goto initial_readahead;

        /*
         * sequential cache miss
         */
        if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
                goto initial_readahead;

        /*
         * Query the page cache and look for the traces (cached history pages)
         * that a sequential stream would leave behind.
         */
        if (try_context_readahead(mapping, ra, offset, req_size, max))
                goto readit;

        /*
         * standalone, small random read
         * Read as is, and do not pollute the readahead state.
         */
        return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);

initial_readahead:
        ra->start = offset;
        ra->size = get_init_ra_size(req_size, max);
        ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
        /*
         * Will this read hit the readahead marker made by itself?
         * If so, trigger the readahead marker hit now, and merge
         * the resulting next readahead window into the current one.
         */
        if (offset == ra->start && ra->size == ra->async_size) {
                ra->async_size = get_next_ra_size(ra, max);
                ra->size += ra->async_size;
        }

        return ra_submit(ra, mapping, filp);
}

/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_sync_readahead() should be called when a cache miss happened:
 * it will submit the read.  The readahead logic may decide to piggyback more
 * pages onto the read request if access patterns suggest it will improve
 * performance.
 */
void page_cache_sync_readahead(struct address_space *mapping,
                               struct file_ra_state *ra, struct file *filp,
                               pgoff_t offset, unsigned long req_size)
{
        /* no read-ahead */
        if (!ra->ra_pages)
                return;

        /* be dumb */
        if (filp && (filp->f_mode & FMODE_RANDOM)) {
                force_page_cache_readahead(mapping, filp, offset, req_size);
                return;
        }

        /* do read-ahead */
        ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_sync_readahead);

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @page: the page at @offset which has the PG_readahead flag set
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_async_readahead() should be called when a page is used which
 * has the PG_readahead flag; this is a marker to suggest that the application
 * has used up enough of the readahead window that we should start pulling in
 * more pages.
 */
void
page_cache_async_readahead(struct address_space *mapping,
                           struct file_ra_state *ra, struct file *filp,
                           struct page *page, pgoff_t offset,
                           unsigned long req_size)
{
        /* no read-ahead */
        if (!ra->ra_pages)
                return;

        /*
         * Same bit is used for PG_readahead and PG_reclaim.
         */
        if (PageWriteback(page))
                return;

        ClearPageReadahead(page);

        /*
         * Defer asynchronous read-ahead on IO congestion.
         */
        if (bdi_read_congested(mapping->backing_dev_info))
                return;

        /* do read-ahead */
        ondemand_readahead(mapping, ra, filp, true, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
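
/*
 * Usage sketch (editor's addition, not part of the original file): roughly
 * how a buffered read path is expected to drive the two hooks above, in the
 * spirit of do_generic_file_read().  The wrapper function and its
 * "nr_pages" argument are hypothetical; only the readahead calls follow
 * the real API.
 */
#if 0
static void example_readahead_driver(struct file *filp, pgoff_t index,
                                     unsigned long nr_pages)
{
        struct address_space *mapping = filp->f_mapping;
        struct file_ra_state *ra = &filp->f_ra;
        struct page *page;

        page = find_get_page(mapping, index);
        if (!page) {
                /* cache miss: submit readahead around @index, then retry */
                page_cache_sync_readahead(mapping, ra, filp, index, nr_pages);
                page = find_get_page(mapping, index);
        } else if (PageReadahead(page)) {
                /* hit the PG_readahead marker: top up the window early */
                page_cache_async_readahead(mapping, ra, filp, page,
                                           index, nr_pages);
        }
        if (page)
                page_cache_release(page);
}
#endif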