Commit c743d96b6d2ff55a94df7b5ac7c74987bb9c343b

Authored by Fengguang Wu
Committed by Linus Torvalds
1 parent dc7868fcb9

readahead: remove the old algorithm

Remove the old readahead algorithm.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 26 additions and 365 deletions

... ... @@ -701,14 +701,6 @@
701 701 * file_ra_state.la_index .ra_index .lookahead_index .readahead_index
702 702 */
703 703 struct file_ra_state {
704   - unsigned long start; /* Current window */
705   - unsigned long size;
706   - unsigned long flags; /* ra flags RA_FLAG_xxx */
707   - unsigned long cache_hit; /* cache hit count */
708   - unsigned long prev_index; /* Cache last read() position */
709   - unsigned long ahead_start; /* Ahead window */
710   - unsigned long ahead_size;
711   -
712 704 pgoff_t la_index; /* enqueue time */
713 705 pgoff_t ra_index; /* begin offset */
714 706 pgoff_t lookahead_index; /* time to do next readahead */
715 707  
... ... @@ -717,10 +709,9 @@
717 709 unsigned long ra_pages; /* Maximum readahead window */
718 710 unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
719 711 unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
  712 + unsigned long prev_index; /* Cache last read() position */
720 713 unsigned int prev_offset; /* Offset where last read() ended in a page */
721 714 };
722   -#define RA_FLAG_MISS 0x01 /* a cache miss occurred against this file */
723   -#define RA_FLAG_INCACHE 0x02 /* file is already in cache */
724 715  
725 716 /*
726 717 * Measuring read-ahead sizes.
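For illustration, the reshaped readahead state can be modeled in plain C. This is a sketch, not kernel code: pgoff_t is stood in for by unsigned long, and the struct is trimmed to the index fields visible in the hunk above (readahead_index appears later in the diff).

    /* Minimal userspace model of the new file_ra_state indexes. */
    #include <stdio.h>

    struct ra_state_model {            /* illustrative stand-in */
        unsigned long la_index;        /* enqueue time */
        unsigned long ra_index;        /* begin offset */
        unsigned long lookahead_index; /* time to do next readahead */
        unsigned long readahead_index; /* one past the window end */
    };

    int main(void)
    {
        /* One readahead covering pages [100, 132) with the async
         * lookahead mark placed at page 124. */
        struct ra_state_model ra = {
            .la_index = 100, .ra_index = 100,
            .lookahead_index = 124, .readahead_index = 132,
        };

        /* The window geometry falls out of simple differences. */
        printf("window size: %lu pages\n", ra.readahead_index - ra.ra_index);
        printf("lookahead distance: %lu pages\n",
               ra.readahead_index - ra.lookahead_index);
        return 0;
    }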
... ... @@ -1144,13 +1144,6 @@
1144 1144 struct page *page,
1145 1145 pgoff_t offset,
1146 1146 unsigned long size);
1147   -unsigned long page_cache_readahead(struct address_space *mapping,
1148   - struct file_ra_state *ra,
1149   - struct file *filp,
1150   - pgoff_t offset,
1151   - unsigned long size);
1152   -void handle_ra_miss(struct address_space *mapping,
1153   - struct file_ra_state *ra, pgoff_t offset);
1154 1147 unsigned long max_sane_readahead(unsigned long nr);
1155 1148  
1156 1149 /* Do stack extension */
... ... @@ -49,82 +49,6 @@
49 49 }
50 50 EXPORT_SYMBOL_GPL(file_ra_state_init);
51 51  
52   -/*
53   - * Return max readahead size for this inode in number-of-pages.
54   - */
55   -static inline unsigned long get_max_readahead(struct file_ra_state *ra)
56   -{
57   - return ra->ra_pages;
58   -}
59   -
60   -static inline unsigned long get_min_readahead(struct file_ra_state *ra)
61   -{
62   - return MIN_RA_PAGES;
63   -}
64   -
65   -static inline void reset_ahead_window(struct file_ra_state *ra)
66   -{
67   - /*
68   - * ... but preserve ahead_start + ahead_size value,
69   - * see 'recheck:' label in page_cache_readahead().
70   - * Note: We never use ->ahead_size as rvalue without
71   - * checking ->ahead_start != 0 first.
72   - */
73   - ra->ahead_size += ra->ahead_start;
74   - ra->ahead_start = 0;
75   -}
76   -
77   -static inline void ra_off(struct file_ra_state *ra)
78   -{
79   - ra->start = 0;
80   - ra->flags = 0;
81   - ra->size = 0;
82   - reset_ahead_window(ra);
83   - return;
84   -}
85   -
86   -/*
87   - * Set the initial window size, round to next power of 2 and square
88   - * for small size, x 4 for medium, and x 2 for large
89   - * for 128k (32 page) max ra
90   - * 1-8 page = 32k initial, > 8 page = 128k initial
91   - */
92   -static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
93   -{
94   - unsigned long newsize = roundup_pow_of_two(size);
95   -
96   - if (newsize <= max / 32)
97   - newsize = newsize * 4;
98   - else if (newsize <= max / 4)
99   - newsize = newsize * 2;
100   - else
101   - newsize = max;
102   - return newsize;
103   -}
104   -
105   -/*
106   - * Set the new window size; this is called only when I/O is to be submitted,
107   - * not for each call to readahead. If a cache miss occurred, reduce the next
108   - * I/O size, otherwise increase it depending on how close to max we are.
109   - */
110   -static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
111   -{
112   - unsigned long max = get_max_readahead(ra);
113   - unsigned long min = get_min_readahead(ra);
114   - unsigned long cur = ra->size;
115   - unsigned long newsize;
116   -
117   - if (ra->flags & RA_FLAG_MISS) {
118   - ra->flags &= ~RA_FLAG_MISS;
119   - newsize = max((cur - 2), min);
120   - } else if (cur < max / 16) {
121   - newsize = 4 * cur;
122   - } else {
123   - newsize = 2 * cur;
124   - }
125   - return min(newsize, max);
126   -}
127   -
128 52 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
129 53  
130 54 /**
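The deleted ramp-up is easy to model in userspace. The sketch below mirrors the arithmetic of the removed get_next_ra_size(), including the shrink-by-2 path taken after a cache miss; the function and variable names, and the min/max values, are illustrative rather than taken from the tree.

    #include <stdio.h>

    #define RA_FLAG_MISS 0x01

    /* Model of the old next-window computation: shrink after a miss,
     * quadruple while small, double otherwise, always capped at max. */
    static unsigned long old_next_ra_size(unsigned long cur, unsigned long *flags,
                                          unsigned long min, unsigned long max)
    {
        unsigned long newsize;

        if (*flags & RA_FLAG_MISS) {
            *flags &= ~RA_FLAG_MISS;
            newsize = cur > min + 2 ? cur - 2 : min; /* max(cur - 2, min) */
        } else if (cur < max / 16) {
            newsize = 4 * cur;
        } else {
            newsize = 2 * cur;
        }
        return newsize < max ? newsize : max;
    }

    int main(void)
    {
        unsigned long flags = 0, size = 4;

        size = old_next_ra_size(size, &flags, 2, 128); /* 16: quadruple */
        size = old_next_ra_size(size, &flags, 2, 128); /* 32: double */
        flags |= RA_FLAG_MISS;                         /* simulate thrashing */
        size = old_next_ra_size(size, &flags, 2, 128); /* 30: shrink by 2 */
        printf("size after miss: %lu pages\n", size);
        return 0;
    }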
... ... @@ -201,66 +125,6 @@
201 125 }
202 126  
203 127 /*
204   - * Readahead design.
205   - *
206   - * The fields in struct file_ra_state represent the most-recently-executed
207   - * readahead attempt:
208   - *
209   - * start: Page index at which we started the readahead
210   - * size: Number of pages in that read
211   - * Together, these form the "current window" (also
212   - * called the `readahead window').
213   - * prev_index: The page which the readahead algorithm most-recently inspected.
214   - * It is mainly used to detect sequential file reading.
215   - * If page_cache_readahead sees that it is again being called for
216   - * a page which it just looked at, it can return immediately without
217   - * making any state changes.
218   - * offset: Offset in the prev_index where the last read ended - used for
219   - * detection of sequential file reading.
220   - * ahead_start,
221   - * ahead_size: Together, these form the "ahead window".
222   - * ra_pages: The externally controlled max readahead for this fd.
223   - *
224   - * When readahead is in the off state (size == 0), readahead is disabled.
225   - * In this state, prev_index is used to detect the resumption of sequential I/O.
226   - *
227   - * The readahead code manages two windows - the "current" and the "ahead"
228   - * windows. The intent is that while the application is walking the pages
229   - * in the current window, I/O is underway on the ahead window. When the
230   - * current window is fully traversed, it is replaced by the ahead window
231   - * and the ahead window is invalidated. When this copying happens, the
232   - * new current window's pages are probably still locked. So
233   - * we submit a new batch of I/O immediately, creating a new ahead window.
234   - *
235   - * So:
236   - *
237   - * ----|----------------|----------------|-----
238   - * ^start ^start+size
239   - * ^ahead_start ^ahead_start+ahead_size
240   - *
241   - * ^ When this page is read, we submit I/O for the
242   - * ahead window.
243   - *
244   - * A `readahead hit' occurs when a read request is made against a page which is
245   - * the next sequential page. Ahead window calculations are done only when it
246   - * is time to submit a new IO. The code ramps up the size aggressively at first,
247   - * but slows down as it approaches max_readahead.
248   - *
249   - * Any seek/random IO will result in readahead being turned off. It will resume
250   - * at the first sequential access.
251   - *
252   - * There is a special-case: if the first page which the application tries to
253   - * read happens to be the first page of the file, it is assumed that a linear
254   - * read is about to happen and the window is immediately set to the initial size
255   - * based on I/O request size and the max_readahead.
256   - *
257   - * This function is to be called for every read request, rather than when
258   - * it is time to perform readahead. It is called only once for the entire I/O
259   - * regardless of size unless readahead is unable to start enough I/O to satisfy
260   - * the request (I/O request > max_readahead).
261   - */
262   -
263   -/*
264 128 * do_page_cache_readahead actually reads a chunk of disk. It allocates all
265 129 * the pages first, then submits them all for I/O. This avoids the very bad
266 130 * behaviour which would occur if page allocations are causing VM writeback.
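The current/ahead window scheme described in the comment removed above can be simulated in a few lines. A toy sketch, assuming a fixed window size and ignoring congestion, thrashing, and cache-hit handling:

    #include <stdio.h>

    int main(void)
    {
        unsigned long start = 0, size = 8;             /* current window */
        unsigned long ahead_start = 8, ahead_size = 8; /* ahead window */
        unsigned long page;

        /* Walk 32 sequential pages; I/O for both initial windows is
         * assumed to be already in flight. */
        for (page = 0; page < 32; page++) {
            if (page >= ahead_start) {
                /* Crossed into the ahead window: promote it to be the
                 * current window and submit I/O for a new ahead window. */
                start = ahead_start;
                size = ahead_size;
                ahead_start = start + size;
                printf("page %2lu: promote, submit I/O for [%lu, %lu)\n",
                       page, ahead_start, ahead_start + ahead_size);
            }
        }
        return 0;
    }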
... ... @@ -295,7 +159,7 @@
295 159 read_lock_irq(&mapping->tree_lock);
296 160 for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
297 161 pgoff_t page_offset = offset + page_idx;
298   -
  162 +
299 163 if (page_offset > end_index)
300 164 break;
301 165  
... ... @@ -361,28 +225,6 @@
361 225 }
362 226  
363 227 /*
364   - * Check how effective readahead is being. If the amount of started IO is
365   - * less than expected then the file is partly or fully in pagecache and
366   - * readahead isn't helping.
367   - *
368   - */
369   -static inline int check_ra_success(struct file_ra_state *ra,
370   - unsigned long nr_to_read, unsigned long actual)
371   -{
372   - if (actual == 0) {
373   - ra->cache_hit += nr_to_read;
374   - if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
375   - ra_off(ra);
376   - ra->flags |= RA_FLAG_INCACHE;
377   - return 0;
378   - }
379   - } else {
380   - ra->cache_hit = 0;
381   - }
382   - return 1;
383   -}
384   -
385   -/*
386 228 * This version skips the IO if the queue is read-congested, and will tell the
387 229 * block layer to abandon the readahead if request allocation would block.
388 230 *
... ... @@ -399,191 +241,6 @@
399 241 }
400 242  
401 243 /*
402   - * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
403   - * is set wait till the read completes. Otherwise attempt to read without
404   - * blocking.
405   - * Returns 1, meaning 'success', if the read completes without switching
406   - * off readahead mode; otherwise returns 0, meaning 'failure'.
407   - */
408   -static int
409   -blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
410   - pgoff_t offset, unsigned long nr_to_read,
411   - struct file_ra_state *ra, int block)
412   -{
413   - int actual;
414   -
415   - if (!block && bdi_read_congested(mapping->backing_dev_info))
416   - return 0;
417   -
418   - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
419   -
420   - return check_ra_success(ra, nr_to_read, actual);
421   -}
422   -
423   -static int make_ahead_window(struct address_space *mapping, struct file *filp,
424   - struct file_ra_state *ra, int force)
425   -{
426   - int block, ret;
427   -
428   - ra->ahead_size = get_next_ra_size(ra);
429   - ra->ahead_start = ra->start + ra->size;
430   -
431   - block = force || (ra->prev_index >= ra->ahead_start);
432   - ret = blockable_page_cache_readahead(mapping, filp,
433   - ra->ahead_start, ra->ahead_size, ra, block);
434   -
435   - if (!ret && !force) {
436   - /* A read failure in blocking mode implies the pages are
437   - * all cached. So we can safely assume we have taken
438   - * care of all the pages requested in this call.
439   - * A read failure in non-blocking mode implies we are
440   - * reading more pages than requested in this call. So
441   - * we safely assume we have taken care of all the pages
442   - * requested in this call.
443   - *
444   - * Just reset the ahead window in case we failed due to
445   - * congestion. The ahead window will anyway be closed
446   - * in case we failed due to excessive page cache hits.
447   - */
448   - reset_ahead_window(ra);
449   - }
450   -
451   - return ret;
452   -}
453   -
454   -/**
455   - * page_cache_readahead - generic adaptive readahead
456   - * @mapping: address_space which holds the pagecache and I/O vectors
457   - * @ra: file_ra_state which holds the readahead state
458   - * @filp: passed on to ->readpage() and ->readpages()
459   - * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
460   - * @req_size: hint: total size of the read which the caller is performing in
461   - * PAGE_CACHE_SIZE units
462   - *
463   - * page_cache_readahead() is the main function. It performs the adaptive
464   - * readahead window size management and submits the readahead I/O.
465   - *
466   - * Note that @filp is purely used for passing on to the ->readpage[s]()
467   - * handler: it may refer to a different file from @mapping (so we may not use
468   - * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
469   - * Also, @ra may not be equal to &@filp->f_ra.
470   - *
471   - */
472   -unsigned long
473   -page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
474   - struct file *filp, pgoff_t offset, unsigned long req_size)
475   -{
476   - unsigned long max, newsize;
477   - int sequential;
478   -
479   - /*
480   - * We avoid doing extra work and bogusly perturbing the readahead
481   - * window expansion logic.
482   - */
483   - if (offset == ra->prev_index && --req_size)
484   - ++offset;
485   -
486   - /* Note that prev_index == -1 if it is a first read */
487   - sequential = (offset == ra->prev_index + 1);
488   - ra->prev_index = offset;
489   - ra->prev_offset = 0;
490   -
491   - max = get_max_readahead(ra);
492   - newsize = min(req_size, max);
493   -
494   - /* No readahead or sub-page sized read or file already in cache */
495   - if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
496   - goto out;
497   -
498   - ra->prev_index += newsize - 1;
499   -
500   - /*
501   - * Special case - first read at start of file. We'll assume it's
502   - * a whole-file read and grow the window fast. Or detect the first
503   - * sequential access.
504   - */
505   - if (sequential && ra->size == 0) {
506   - ra->size = get_init_ra_size(newsize, max);
507   - ra->start = offset;
508   - if (!blockable_page_cache_readahead(mapping, filp, offset,
509   - ra->size, ra, 1))
510   - goto out;
511   -
512   - /*
513   - * If the request size is larger than our max readahead, we
514   - * at least want to be sure that we get 2 IOs in flight and
515   - * we know that we will definitely need the new I/O.
516   - * Once we do this, subsequent calls should be able to overlap
517   - * IOs, thus preventing stalls. So issue the ahead window
518   - * immediately.
519   - */
520   - if (req_size >= max)
521   - make_ahead_window(mapping, filp, ra, 1);
522   -
523   - goto out;
524   - }
525   -
526   - /*
527   - * Now handle the random case:
528   - * partial page reads and first access were handled above,
529   - * so this must be the next page, otherwise it is random
530   - */
531   - if (!sequential) {
532   - ra_off(ra);
533   - blockable_page_cache_readahead(mapping, filp, offset,
534   - newsize, ra, 1);
535   - goto out;
536   - }
537   -
538   - /*
539   - * If we get here we are doing sequential IO and this was not the first
540   - * occurrence (i.e. we have an existing window)
541   - */
542   - if (ra->ahead_start == 0) { /* no ahead window yet */
543   - if (!make_ahead_window(mapping, filp, ra, 0))
544   - goto recheck;
545   - }
546   -
547   - /*
548   - * Already have an ahead window, check if we crossed into it.
549   - * If so, shift windows and issue a new ahead window.
550   - * Only return the #pages that are in the current window, so that
551   - * we get called back on the first page of the ahead window which
552   - * will allow us to submit more IO.
553   - */
554   - if (ra->prev_index >= ra->ahead_start) {
555   - ra->start = ra->ahead_start;
556   - ra->size = ra->ahead_size;
557   - make_ahead_window(mapping, filp, ra, 0);
558   -recheck:
559   - /* prev_index shouldn't overrun the ahead window */
560   - ra->prev_index = min(ra->prev_index,
561   - ra->ahead_start + ra->ahead_size - 1);
562   - }
563   -
564   -out:
565   - return ra->prev_index + 1;
566   -}
567   -EXPORT_SYMBOL_GPL(page_cache_readahead);
568   -
569   -/*
570   - * handle_ra_miss() is called when it is known that a page which should have
571   - * been present in the pagecache (we just did some readahead there) was in fact
572   - * not found. This will happen if it was evicted by the VM (readahead
573   - * thrashing).
574   - *
575   - * Turn on the cache miss flag in the RA struct, this will cause the RA code
576   - * to reduce the RA size on the next read.
577   - */
578   -void handle_ra_miss(struct address_space *mapping,
579   - struct file_ra_state *ra, pgoff_t offset)
580   -{
581   - ra->flags |= RA_FLAG_MISS;
582   - ra->flags &= ~RA_FLAG_INCACHE;
583   - ra->cache_hit = 0;
584   -}
585   -
586   -/*
587 244 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
588 245 * sensible upper limit.
589 246 */
590 247  
591 248  
592 249  
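The removed entry point reduced to a three-way classification of every read. A condensed userspace sketch of that decision, with all I/O submission stubbed out (the names are illustrative):

    #include <stdio.h>

    enum ra_action { RA_INIT_WINDOW, RA_RANDOM_OFF, RA_SEQUENTIAL };

    /* Mirror the branch structure of the deleted page_cache_readahead():
     * a first sequential access opens an initial window, a seek turns
     * readahead off, and an in-stream read advances the windows. */
    static enum ra_action classify(unsigned long offset,
                                   unsigned long prev_index,
                                   unsigned long cur_size)
    {
        int sequential = (offset == prev_index + 1);

        if (sequential && cur_size == 0)
            return RA_INIT_WINDOW;
        if (!sequential)
            return RA_RANDOM_OFF;
        return RA_SEQUENTIAL;
    }

    int main(void)
    {
        /* prev_index == -1 marks a first read, as in the removed code. */
        printf("%d %d %d\n",
               classify(0, (unsigned long)-1, 0), /* first read at offset 0 */
               classify(500, 3, 8),               /* seek */
               classify(4, 3, 8));                /* next sequential page */
        return 0;
    }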
... ... @@ -613,19 +270,39 @@
613 270 EXPORT_SYMBOL_GPL(ra_submit);
614 271  
615 272 /*
  273 + * Set the initial window size, round to next power of 2 and square
  274 + * for small size, x 4 for medium, and x 2 for large
  275 + * for 128k (32 page) max ra
  276 + * 1-8 page = 32k initial, > 8 page = 128k initial
  277 + */
  278 +static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
  279 +{
  280 + unsigned long newsize = roundup_pow_of_two(size);
  281 +
  282 + if (newsize <= max / 32)
  283 + newsize = newsize * 4;
  284 + else if (newsize <= max / 4)
  285 + newsize = newsize * 2;
  286 + else
  287 + newsize = max;
  288 +
  289 + return newsize;
  290 +}
  291 +
  292 +/*
616 293 * Get the previous window size, ramp it up, and
617 294 * return it as the new window size.
618 295 */
619   -static unsigned long get_next_ra_size2(struct file_ra_state *ra,
  296 +static unsigned long get_next_ra_size(struct file_ra_state *ra,
620 297 unsigned long max)
621 298 {
622 299 unsigned long cur = ra->readahead_index - ra->ra_index;
623 300 unsigned long newsize;
624 301  
625 302 if (cur < max / 16)
626   - newsize = cur * 4;
  303 + newsize = 4 * cur;
627 304 else
628   - newsize = cur * 2;
  305 + newsize = 2 * cur;
629 306  
630 307 return min(newsize, max);
631 308 }
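A quick userspace check of the two surviving helpers; roundup_pow2() is an illustrative stand-in for the kernel's roundup_pow_of_two(), and next_ra_size_model() takes the current size directly instead of deriving it from the ra indexes.

    #include <stdio.h>

    static unsigned long roundup_pow2(unsigned long n)
    {
        unsigned long p = 1;

        while (p < n)
            p <<= 1;
        return p;
    }

    /* Same arithmetic as the re-added get_init_ra_size(). */
    static unsigned long init_ra_size(unsigned long size, unsigned long max)
    {
        unsigned long newsize = roundup_pow2(size);

        if (newsize <= max / 32)
            newsize = newsize * 4;
        else if (newsize <= max / 4)
            newsize = newsize * 2;
        else
            newsize = max;
        return newsize;
    }

    /* Same arithmetic as the renamed get_next_ra_size(). */
    static unsigned long next_ra_size_model(unsigned long cur, unsigned long max)
    {
        unsigned long newsize = (cur < max / 16) ? 4 * cur : 2 * cur;

        return newsize < max ? newsize : max;
    }

    int main(void)
    {
        unsigned long max = 32; /* 128k max ra with 4k pages */
        unsigned long size = init_ra_size(5, max);

        printf("initial: %lu pages\n", size); /* 5 -> 8 -> 16 */
        while (size < max) {
            size = next_ra_size_model(size, max);
            printf("ramped: %lu pages\n", size); /* 32 */
        }
        return 0;
    }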
... ... @@ -701,7 +378,7 @@
701 378 if (offset && (offset == ra->lookahead_index ||
702 379 offset == ra->readahead_index)) {
703 380 ra_index = ra->readahead_index;
704   - ra_size = get_next_ra_size2(ra, max);
  381 + ra_size = get_next_ra_size(ra, max);
705 382 la_size = ra_size;
706 383 goto fill_ra;
707 384 }
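End to end, the surviving trigger can be sketched in userspace: a read hitting either the lookahead mark or the window end opens the next window at readahead_index with a ramped-up size, and (because la_size = ra_size above) puts the next lookahead mark at the start of the new window. Only the three index fields come from the diff; everything else is illustrative.

    #include <stdio.h>

    struct ra_model {
        unsigned long ra_index, lookahead_index, readahead_index;
    };

    int main(void)
    {
        struct ra_model ra = {
            .ra_index = 0, .lookahead_index = 4, .readahead_index = 8,
        };
        unsigned long offset, max = 32;

        for (offset = 1; offset < 40; offset++) {
            if (offset == ra.lookahead_index ||
                offset == ra.readahead_index) {
                unsigned long cur = ra.readahead_index - ra.ra_index;
                unsigned long size = (cur < max / 16) ? 4 * cur : 2 * cur;

                if (size > max)
                    size = max;
                /* Shift the window forward and re-arm the trigger. */
                ra.ra_index = ra.readahead_index;
                ra.readahead_index = ra.ra_index + size;
                ra.lookahead_index = ra.ra_index; /* la_size == ra_size */
                printf("offset %2lu: new window [%lu, %lu)\n",
                       offset, ra.ra_index, ra.readahead_index);
            }
        }
        return 0;
    }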