Commit c743d96b6d2ff55a94df7b5ac7c74987bb9c343b
Committed by Linus Torvalds
1 parent: dc7868fcb9
Exists in master and in 20 other branches
readahead: remove the old algorithm
Remove the old readahead algorithm.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 3 changed files with 26 additions and 365 deletions
include/linux/fs.h
... | ... | @@ -701,14 +701,6 @@ |
701 | 701 | * file_ra_state.la_index .ra_index .lookahead_index .readahead_index |
702 | 702 | */ |
703 | 703 | struct file_ra_state { |
704 | - unsigned long start; /* Current window */ | |
705 | - unsigned long size; | |
706 | - unsigned long flags; /* ra flags RA_FLAG_xxx */ | 
707 | - unsigned long cache_hit; /* cache hit count */ | 
708 | - unsigned long prev_index; /* Cache last read() position */ | |
709 | - unsigned long ahead_start; /* Ahead window */ | |
710 | - unsigned long ahead_size; | |
711 | - | |
712 | 704 | pgoff_t la_index; /* enqueue time */ |
713 | 705 | pgoff_t ra_index; /* begin offset */ |
714 | 706 | pgoff_t lookahead_index; /* time to do next readahead */ |
715 | 707 | |
... | ... | @@ -717,10 +709,9 @@ |
717 | 709 | unsigned long ra_pages; /* Maximum readahead window */ |
718 | 710 | unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ |
719 | 711 | unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ |
712 | + unsigned long prev_index; /* Cache last read() position */ | |
720 | 713 | unsigned int prev_offset; /* Offset where last read() ended in a page */ |
721 | 714 | }; |
722 | -#define RA_FLAG_MISS 0x01 /* a cache miss occurred against this file */ | 
723 | -#define RA_FLAG_INCACHE 0x02 /* file is already in cache */ | |
724 | 715 | |
725 | 716 | /* |
726 | 717 | * Measuring read-ahead sizes. |
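For reference, the struct this commit leaves behind: a sketch of struct file_ra_state once the two hunks above are applied. This is a reconstruction, not part of the diff; readahead_index is inferred from the field list in the comment preceding the struct, and any members outside these hunks are elided.

    struct file_ra_state {
            pgoff_t la_index;               /* enqueue time */
            pgoff_t ra_index;               /* begin offset */
            pgoff_t lookahead_index;        /* time to do next readahead */
            pgoff_t readahead_index;        /* end offset (inferred, not shown above) */

            unsigned long ra_pages;         /* Maximum readahead window */
            unsigned long mmap_hit;         /* Cache hit stat for mmap accesses */
            unsigned long mmap_miss;        /* Cache miss stat for mmap accesses */
            unsigned long prev_index;       /* Cache last read() position */
            unsigned int prev_offset;       /* Offset where last read() ended in a page */
    };

Only the old-style window bookkeeping (start/size, ahead_start/ahead_size, flags, cache_hit) goes away; the mmap statistics and the new-style index fields survive.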
include/linux/mm.h
... | ... | @@ -1144,13 +1144,6 @@ |
1144 | 1144 | struct page *page, |
1145 | 1145 | pgoff_t offset, |
1146 | 1146 | unsigned long size); |
1147 | -unsigned long page_cache_readahead(struct address_space *mapping, | |
1148 | - struct file_ra_state *ra, | |
1149 | - struct file *filp, | |
1150 | - pgoff_t offset, | |
1151 | - unsigned long size); | |
1152 | -void handle_ra_miss(struct address_space *mapping, | |
1153 | - struct file_ra_state *ra, pgoff_t offset); | |
1154 | 1147 | unsigned long max_sane_readahead(unsigned long nr); |
1155 | 1148 | |
1156 | 1149 | /* Do stack extension */ |
mm/readahead.c
... | ... | @@ -49,82 +49,6 @@ |
49 | 49 | } |
50 | 50 | EXPORT_SYMBOL_GPL(file_ra_state_init); |
51 | 51 | |
52 | -/* | |
53 | - * Return max readahead size for this inode in number-of-pages. | |
54 | - */ | |
55 | -static inline unsigned long get_max_readahead(struct file_ra_state *ra) | |
56 | -{ | |
57 | - return ra->ra_pages; | |
58 | -} | |
59 | - | |
60 | -static inline unsigned long get_min_readahead(struct file_ra_state *ra) | |
61 | -{ | |
62 | - return MIN_RA_PAGES; | |
63 | -} | |
64 | - | |
65 | -static inline void reset_ahead_window(struct file_ra_state *ra) | |
66 | -{ | |
67 | - /* | |
68 | - * ... but preserve ahead_start + ahead_size value, | |
69 | - * see 'recheck:' label in page_cache_readahead(). | |
70 | - * Note: We never use ->ahead_size as rvalue without | |
71 | - * checking ->ahead_start != 0 first. | |
72 | - */ | |
73 | - ra->ahead_size += ra->ahead_start; | |
74 | - ra->ahead_start = 0; | |
75 | -} | |
76 | - | |
77 | -static inline void ra_off(struct file_ra_state *ra) | |
78 | -{ | |
79 | - ra->start = 0; | |
80 | - ra->flags = 0; | |
81 | - ra->size = 0; | |
82 | - reset_ahead_window(ra); | |
83 | - return; | |
84 | -} | |
85 | - | |
86 | -/* | |
87 | - * Set the initial window size, round to next power of 2 and square | |
88 | - * for small size, x 4 for medium, and x 2 for large | |
89 | - * for 128k (32 page) max ra | |
90 | - * 1-8 page = 32k initial, > 8 page = 128k initial | |
91 | - */ | |
92 | -static unsigned long get_init_ra_size(unsigned long size, unsigned long max) | |
93 | -{ | |
94 | - unsigned long newsize = roundup_pow_of_two(size); | |
95 | - | |
96 | - if (newsize <= max / 32) | |
97 | - newsize = newsize * 4; | |
98 | - else if (newsize <= max / 4) | |
99 | - newsize = newsize * 2; | |
100 | - else | |
101 | - newsize = max; | |
102 | - return newsize; | |
103 | -} | |
104 | - | |
105 | -/* | |
106 | - * Set the new window size; this is called only when I/O is to be submitted, | 
107 | - * not for each call to readahead. If a cache miss occurred, reduce next I/O | 
108 | - * size, else increase depending on how close to max we are. | |
109 | - */ | |
110 | -static inline unsigned long get_next_ra_size(struct file_ra_state *ra) | |
111 | -{ | |
112 | - unsigned long max = get_max_readahead(ra); | |
113 | - unsigned long min = get_min_readahead(ra); | |
114 | - unsigned long cur = ra->size; | |
115 | - unsigned long newsize; | |
116 | - | |
117 | - if (ra->flags & RA_FLAG_MISS) { | |
118 | - ra->flags &= ~RA_FLAG_MISS; | |
119 | - newsize = max((cur - 2), min); | |
120 | - } else if (cur < max / 16) { | |
121 | - newsize = 4 * cur; | |
122 | - } else { | |
123 | - newsize = 2 * cur; | |
124 | - } | |
125 | - return min(newsize, max); | |
126 | -} | |
127 | - | |
128 | 52 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) |
129 | 53 | |
130 | 54 | /** |
... | ... | @@ -201,66 +125,6 @@ |
201 | 125 | } |
202 | 126 | |
203 | 127 | /* |
204 | - * Readahead design. | |
205 | - * | |
206 | - * The fields in struct file_ra_state represent the most-recently-executed | |
207 | - * readahead attempt: | |
208 | - * | |
209 | - * start: Page index at which we started the readahead | |
210 | - * size: Number of pages in that read | |
211 | - * Together, these form the "current window", | 
212 | - * also known as the `readahead window'. | 
213 | - * prev_index: The page which the readahead algorithm most-recently inspected. | |
214 | - * It is mainly used to detect sequential file reading. | |
215 | - * If page_cache_readahead sees that it is again being called for | |
216 | - * a page which it just looked at, it can return immediately without | |
217 | - * making any state changes. | |
218 | - * offset: Offset in the prev_index where the last read ended - used for | |
219 | - * detection of sequential file reading. | |
220 | - * ahead_start, | |
221 | - * ahead_size: Together, these form the "ahead window". | |
222 | - * ra_pages: The externally controlled max readahead for this fd. | |
223 | - * | |
224 | - * When readahead is in the off state (size == 0), readahead is disabled. | |
225 | - * In this state, prev_index is used to detect the resumption of sequential I/O. | |
226 | - * | |
227 | - * The readahead code manages two windows - the "current" and the "ahead" | |
228 | - * windows. The intent is that while the application is walking the pages | |
229 | - * in the current window, I/O is underway on the ahead window. When the | |
230 | - * current window is fully traversed, it is replaced by the ahead window | |
231 | - * and the ahead window is invalidated. When this copying happens, the | |
232 | - * new current window's pages are probably still locked. So | |
233 | - * we submit a new batch of I/O immediately, creating a new ahead window. | |
234 | - * | |
235 | - * So: | |
236 | - * | |
237 | - * ----|----------------|----------------|----- | |
238 | - * ^start ^start+size | |
239 | - * ^ahead_start ^ahead_start+ahead_size | |
240 | - * | |
241 | - * ^ When this page is read, we submit I/O for the | |
242 | - * ahead window. | |
243 | - * | |
244 | - * A `readahead hit' occurs when a read request is made against a page which is | |
245 | - * the next sequential page. Ahead window calculations are done only when it | |
246 | - * is time to submit a new IO. The code ramps up the size aggressively at first, | 
247 | - * but slows down as it approaches max_readahead. | 
248 | - * | |
249 | - * Any seek/random IO will result in readahead being turned off. It will resume | 
250 | - * at the first sequential access. | |
251 | - * | |
252 | - * There is a special case: if the first page which the application tries to | 
253 | - * read happens to be the first page of the file, it is assumed that a linear | |
254 | - * read is about to happen and the window is immediately set to the initial size | |
255 | - * based on I/O request size and the max_readahead. | |
256 | - * | |
257 | - * This function is to be called for every read request, rather than when | |
258 | - * it is time to perform readahead. It is called only once for the entire I/O | |
259 | - * regardless of size unless readahead is unable to start enough I/O to satisfy | |
260 | - * the request (I/O request > max_readahead). | |
261 | - */ | |
262 | - | |
263 | -/* | |
264 | 128 | * do_page_cache_readahead actually reads a chunk of disk. It allocates all |
265 | 129 | * the pages first, then submits them all for I/O. This avoids the very bad |
266 | 130 | * behaviour which would occur if page allocations are causing VM writeback. |
... | ... | @@ -295,7 +159,7 @@ |
295 | 159 | read_lock_irq(&mapping->tree_lock); |
296 | 160 | for (page_idx = 0; page_idx < nr_to_read; page_idx++) { |
297 | 161 | pgoff_t page_offset = offset + page_idx; |
298 | - | |
162 | + | |
299 | 163 | if (page_offset > end_index) |
300 | 164 | break; |
301 | 165 | |
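The design documentation deleted in this hunk described a dual-window pipeline: while the application walks the current window, I/O is already in flight for the ahead window, and crossing into the ahead window promotes it and immediately opens a new one. A minimal standalone sketch of that bookkeeping (toy types, and a simple doubling ramp for illustration; the real code sized windows with get_next_ra_size() and submitted the I/O at the marked point):

    /* Toy model of the removed current/ahead window scheme. */
    struct toy_ra {
            unsigned long start, size;              /* current window */
            unsigned long ahead_start, ahead_size;  /* ahead window */
    };

    /*
     * Called when the reader crosses into the ahead window: promote it
     * to the current window and open a new ahead window behind it.
     */
    static void toy_shift_windows(struct toy_ra *ra, unsigned long max)
    {
            ra->start = ra->ahead_start;
            ra->size = ra->ahead_size;
            ra->ahead_start = ra->start + ra->size;
            ra->ahead_size = (2 * ra->size < max) ? 2 * ra->size : max;
            /* the real code submitted I/O for the new ahead window here */
    }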
... | ... | @@ -361,28 +225,6 @@ |
361 | 225 | } |
362 | 226 | |
363 | 227 | /* |
364 | - * Check how effective readahead is being. If the amount of started IO is | |
365 | - * less than expected then the file is partly or fully in pagecache and | |
366 | - * readahead isn't helping. | |
367 | - * | |
368 | - */ | |
369 | -static inline int check_ra_success(struct file_ra_state *ra, | |
370 | - unsigned long nr_to_read, unsigned long actual) | |
371 | -{ | |
372 | - if (actual == 0) { | |
373 | - ra->cache_hit += nr_to_read; | |
374 | - if (ra->cache_hit >= VM_MAX_CACHE_HIT) { | |
375 | - ra_off(ra); | |
376 | - ra->flags |= RA_FLAG_INCACHE; | |
377 | - return 0; | |
378 | - } | |
379 | - } else { | |
380 | - ra->cache_hit = 0; | 
381 | - } | |
382 | - return 1; | |
383 | -} | |
384 | - | |
385 | -/* | |
386 | 228 | * This version skips the IO if the queue is read-congested, and will tell the |
387 | 229 | * block layer to abandon the readahead if request allocation would block. |
388 | 230 | * |
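The deleted check_ra_success() amounts to a saturating counter: every readahead that starts zero actual I/O credits nr_to_read pages to cache_hit, and once the count reaches VM_MAX_CACHE_HIT the file is declared in-cache and readahead shuts off. A condensed restatement (toy code, not the kernel's; the threshold of 256 pages, i.e. 1MB with 4k pages, is an assumption about VM_MAX_CACHE_HIT in kernels of this era):

    static int toy_ra_still_useful(unsigned long *cache_hit,
                                   unsigned long nr_to_read,
                                   unsigned long actual_io_started)
    {
            if (actual_io_started > 0) {
                    *cache_hit = 0;         /* readahead is doing real work */
                    return 1;
            }
            *cache_hit += nr_to_read;       /* every page was already cached */
            return *cache_hit < 256;        /* assumed VM_MAX_CACHE_HIT */
    }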
... | ... | @@ -399,191 +241,6 @@ |
399 | 241 | } |
400 | 242 | |
401 | 243 | /* |
402 | - * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' | |
403 | - * is set, wait until the read completes. Otherwise attempt to read without | 
404 | - * blocking. | 
405 | - * Returns 1, meaning 'success', if the read is successful without switching off | 
406 | - * readahead mode. Otherwise returns failure. | 
407 | - */ | |
408 | -static int | |
409 | -blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, | |
410 | - pgoff_t offset, unsigned long nr_to_read, | |
411 | - struct file_ra_state *ra, int block) | |
412 | -{ | |
413 | - int actual; | |
414 | - | |
415 | - if (!block && bdi_read_congested(mapping->backing_dev_info)) | |
416 | - return 0; | |
417 | - | |
418 | - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); | |
419 | - | |
420 | - return check_ra_success(ra, nr_to_read, actual); | |
421 | -} | |
422 | - | |
423 | -static int make_ahead_window(struct address_space *mapping, struct file *filp, | |
424 | - struct file_ra_state *ra, int force) | |
425 | -{ | |
426 | - int block, ret; | |
427 | - | |
428 | - ra->ahead_size = get_next_ra_size(ra); | |
429 | - ra->ahead_start = ra->start + ra->size; | |
430 | - | |
431 | - block = force || (ra->prev_index >= ra->ahead_start); | |
432 | - ret = blockable_page_cache_readahead(mapping, filp, | |
433 | - ra->ahead_start, ra->ahead_size, ra, block); | |
434 | - | |
435 | - if (!ret && !force) { | |
436 | - /* A read failure in blocking mode implies pages are | 
437 | - * all cached. So we can safely assume we have taken | |
438 | - * care of all the pages requested in this call. | |
439 | - * A read failure in non-blocking mode implies we are | 
440 | - * reading more pages than requested in this call. So | |
441 | - * we safely assume we have taken care of all the pages | |
442 | - * requested in this call. | |
443 | - * | |
444 | - * Just reset the ahead window in case we failed due to | |
445 | - * congestion. The ahead window will be closed anyway | 
446 | - * in case we failed due to excessive page cache hits. | |
447 | - */ | |
448 | - reset_ahead_window(ra); | |
449 | - } | |
450 | - | |
451 | - return ret; | |
452 | -} | |
453 | - | |
454 | -/** | |
455 | - * page_cache_readahead - generic adaptive readahead | |
456 | - * @mapping: address_space which holds the pagecache and I/O vectors | |
457 | - * @ra: file_ra_state which holds the readahead state | |
458 | - * @filp: passed on to ->readpage() and ->readpages() | |
459 | - * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units | |
460 | - * @req_size: hint: total size of the read which the caller is performing in | |
461 | - * PAGE_CACHE_SIZE units | |
462 | - * | |
463 | - * page_cache_readahead() is the main function. It performs the adaptive | |
464 | - * readahead window size management and submits the readahead I/O. | |
465 | - * | |
466 | - * Note that @filp is purely used for passing on to the ->readpage[s]() | |
467 | - * handler: it may refer to a different file from @mapping (so we may not use | |
468 | - * @filp->f_mapping or @filp->f_path.dentry->d_inode here). | |
469 | - * Also, @ra may not be equal to &@filp->f_ra. | |
470 | - * | |
471 | - */ | |
472 | -unsigned long | |
473 | -page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, | |
474 | - struct file *filp, pgoff_t offset, unsigned long req_size) | |
475 | -{ | |
476 | - unsigned long max, newsize; | |
477 | - int sequential; | |
478 | - | |
479 | - /* | |
480 | - * We avoid doing extra work and bogusly perturbing the readahead | |
481 | - * window expansion logic. | |
482 | - */ | |
483 | - if (offset == ra->prev_index && --req_size) | |
484 | - ++offset; | |
485 | - | |
486 | - /* Note that prev_index == -1 if it is a first read */ | |
487 | - sequential = (offset == ra->prev_index + 1); | |
488 | - ra->prev_index = offset; | |
489 | - ra->prev_offset = 0; | |
490 | - | |
491 | - max = get_max_readahead(ra); | |
492 | - newsize = min(req_size, max); | |
493 | - | |
494 | - /* No readahead or sub-page sized read or file already in cache */ | |
495 | - if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) | |
496 | - goto out; | |
497 | - | |
498 | - ra->prev_index += newsize - 1; | |
499 | - | |
500 | - /* | |
501 | - * Special case - first read at the start of the file, or the first | 
502 | - * detected sequential access. We'll assume a whole-file read and | 
503 | - * grow the window fast. | 
504 | - */ | |
505 | - if (sequential && ra->size == 0) { | |
506 | - ra->size = get_init_ra_size(newsize, max); | |
507 | - ra->start = offset; | |
508 | - if (!blockable_page_cache_readahead(mapping, filp, offset, | |
509 | - ra->size, ra, 1)) | |
510 | - goto out; | |
511 | - | |
512 | - /* | |
513 | - * If the request size is larger than our max readahead, we | |
514 | - * at least want to be sure that we get 2 IOs in flight and | |
515 | - * we know that we will definitely need the new I/O. | 
516 | - * Once we do this, subsequent calls should be able to overlap | 
517 | - * IOs, thus preventing stalls. So issue the ahead window | 
518 | - * immediately. | |
519 | - */ | |
520 | - if (req_size >= max) | |
521 | - make_ahead_window(mapping, filp, ra, 1); | |
522 | - | |
523 | - goto out; | |
524 | - } | |
525 | - | |
526 | - /* | |
527 | - * Now handle the random case: | |
528 | - * partial page reads and first access were handled above, | |
529 | - * so this must be the next page otherwise it is random | |
530 | - */ | |
531 | - if (!sequential) { | |
532 | - ra_off(ra); | |
533 | - blockable_page_cache_readahead(mapping, filp, offset, | |
534 | - newsize, ra, 1); | |
535 | - goto out; | |
536 | - } | |
537 | - | |
538 | - /* | |
539 | - * If we get here we are doing sequential IO and this was not the first | |
540 | - * occurrence (i.e. we have an existing window) | 
541 | - */ | |
542 | - if (ra->ahead_start == 0) { /* no ahead window yet */ | |
543 | - if (!make_ahead_window(mapping, filp, ra, 0)) | |
544 | - goto recheck; | |
545 | - } | |
546 | - | |
547 | - /* | |
548 | - * Already have an ahead window, check if we crossed into it. | |
549 | - * If so, shift windows and issue a new ahead window. | |
550 | - * Only return the #pages that are in the current window, so that | |
551 | - * we get called back on the first page of the ahead window which | |
552 | - * will allow us to submit more IO. | |
553 | - */ | |
554 | - if (ra->prev_index >= ra->ahead_start) { | |
555 | - ra->start = ra->ahead_start; | |
556 | - ra->size = ra->ahead_size; | |
557 | - make_ahead_window(mapping, filp, ra, 0); | |
558 | -recheck: | |
559 | - /* prev_index shouldn't overrun the ahead window */ | |
560 | - ra->prev_index = min(ra->prev_index, | |
561 | - ra->ahead_start + ra->ahead_size - 1); | |
562 | - } | |
563 | - | |
564 | -out: | |
565 | - return ra->prev_index + 1; | |
566 | -} | |
567 | -EXPORT_SYMBOL_GPL(page_cache_readahead); | |
568 | - | |
569 | -/* | |
570 | - * handle_ra_miss() is called when it is known that a page which should have | |
571 | - * been present in the pagecache (we just did some readahead there) was in fact | |
572 | - * not found. This will happen if it was evicted by the VM (readahead | |
573 | - * thrashing) | |
574 | - * | |
575 | - * Turn on the cache miss flag in the RA struct, this will cause the RA code | |
576 | - * to reduce the RA size on the next read. | |
577 | - */ | |
578 | -void handle_ra_miss(struct address_space *mapping, | |
579 | - struct file_ra_state *ra, pgoff_t offset) | |
580 | -{ | |
581 | - ra->flags |= RA_FLAG_MISS; | |
582 | - ra->flags &= ~RA_FLAG_INCACHE; | |
583 | - ra->cache_hit = 0; | |
584 | -} | |
585 | - | |
586 | -/* | |
587 | 244 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a |
588 | 245 | * sensible upper limit. |
589 | 246 | */ |
590 | 247 | |
591 | 248 | |
592 | 249 | |
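Everything removed in this hunk pivoted on a single sequentiality test: a request is sequential iff it starts at prev_index + 1, after first nudging offset forward when the caller re-reads the very page the algorithm just inspected. A condensed sketch of that classification (toy code, not the kernel's; prev_index starts out at -1, so the first read of page 0 counts as sequential):

    static int toy_is_sequential(unsigned long *prev_index,
                                 unsigned long offset,
                                 unsigned long req_size)
    {
            int sequential;

            /* Re-read of the page we just inspected: skip past it so
             * we don't perturb the window expansion logic. */
            if (offset == *prev_index && req_size > 1)
                    offset++;

            sequential = (offset == *prev_index + 1);
            *prev_index = offset;
            return sequential;
    }

A sequential hit grew or shifted the windows; anything else called ra_off() and fell back to a plain, correctly-sized one-off read.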
... | ... | @@ -613,19 +270,39 @@ |
613 | 270 | EXPORT_SYMBOL_GPL(ra_submit); |
614 | 271 | |
615 | 272 | /* |
273 | + * Set the initial window size, round to next power of 2 and square | |
274 | + * for small size, x 4 for medium, and x 2 for large | |
275 | + * for 128k (32 page) max ra | |
276 | + * 1-8 page = 32k initial, > 8 page = 128k initial | |
277 | + */ | |
278 | +static unsigned long get_init_ra_size(unsigned long size, unsigned long max) | |
279 | +{ | |
280 | + unsigned long newsize = roundup_pow_of_two(size); | |
281 | + | |
282 | + if (newsize <= max / 32) | |
283 | + newsize = newsize * 4; | |
284 | + else if (newsize <= max / 4) | |
285 | + newsize = newsize * 2; | |
286 | + else | |
287 | + newsize = max; | |
288 | + | |
289 | + return newsize; | |
290 | +} | |
291 | + | |
292 | +/* | |
616 | 293 | * Get the previous window size, ramp it up, and |
617 | 294 | * return it as the new window size. |
618 | 295 | */ |
619 | -static unsigned long get_next_ra_size2(struct file_ra_state *ra, | |
296 | +static unsigned long get_next_ra_size(struct file_ra_state *ra, | |
620 | 297 | unsigned long max) |
621 | 298 | { |
622 | 299 | unsigned long cur = ra->readahead_index - ra->ra_index; |
623 | 300 | unsigned long newsize; |
624 | 301 | |
625 | 302 | if (cur < max / 16) |
626 | - newsize = cur * 4; | |
303 | + newsize = 4 * cur; | |
627 | 304 | else |
628 | - newsize = cur * 2; | |
305 | + newsize = 2 * cur; | |
629 | 306 | |
630 | 307 | return min(newsize, max); |
631 | 308 | } |
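To see what the sizing comment above means in numbers: with 4k pages and the default 128k window (max = 32 pages), get_init_ra_size() yields initial windows of 4, 4, 8, 16, 32 and 32 pages for power-of-two request sizes 1 through 32. A small userspace harness to reproduce the table (a sketch; roundup_pow_of_two() is reimplemented here because the kernel helper is unavailable outside the tree):

    #include <stdio.h>

    /* Userspace stand-in for the kernel helper of the same name. */
    static unsigned long roundup_pow_of_two(unsigned long n)
    {
            unsigned long p = 1;

            while (p < n)
                    p <<= 1;
            return p;
    }

    /* Copied from the hunk above. */
    static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
    {
            unsigned long newsize = roundup_pow_of_two(size);

            if (newsize <= max / 32)
                    newsize = newsize * 4;
            else if (newsize <= max / 4)
                    newsize = newsize * 2;
            else
                    newsize = max;

            return newsize;
    }

    int main(void)
    {
            unsigned long max = 32;         /* 128k with 4k pages */
            unsigned long size;

            for (size = 1; size <= max; size *= 2)
                    printf("request %2lu pages -> initial window %2lu pages\n",
                           size, get_init_ra_size(size, max));
            return 0;
    }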
... | ... | @@ -701,7 +378,7 @@ |
701 | 378 | if (offset && (offset == ra->lookahead_index || |
702 | 379 | offset == ra->readahead_index)) { |
703 | 380 | ra_index = ra->readahead_index; |
704 | - ra_size = get_next_ra_size2(ra, max); | |
381 | + ra_size = get_next_ra_size(ra, max); | |
705 | 382 | la_size = ra_size; |
706 | 383 | goto fill_ra; |
707 | 384 | } |
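This last hunk shows the surviving caller: on-demand readahead fires when a read hits lookahead_index (the asynchronous trigger planted inside the previous window) or readahead_index (the previous window fully consumed), and the renamed get_next_ra_size() then sizes the next window starting at readahead_index. Worked through for max = 32 pages: a 1-page window quadruples while cur < max/16 (1 -> 4), then doubles (4 -> 8 -> 16 -> 32) and stays capped at 32 thereafter.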