Commit c743d96b6d2ff55a94df7b5ac7c74987bb9c343b
Committed by Linus Torvalds
1 parent: dc7868fcb9
Exists in master and in 20 other branches
readahead: remove the old algorithm
Remove the old readahead algorithm.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 3 changed files with 26 additions and 365 deletions
include/linux/fs.h
... | ... | @@ -701,14 +701,6 @@ |
701 | 701 | * file_ra_state.la_index .ra_index .lookahead_index .readahead_index |
702 | 702 | */ |
703 | 703 | struct file_ra_state { |
704 | - unsigned long start; /* Current window */ | |
705 | - unsigned long size; | |
706 | - unsigned long flags; /* ra flags RA_FLAG_xxx */ | 
707 | - unsigned long cache_hit; /* cache hit count */ | 
708 | - unsigned long prev_index; /* Cache last read() position */ | |
709 | - unsigned long ahead_start; /* Ahead window */ | |
710 | - unsigned long ahead_size; | |
711 | - | |
712 | 704 | pgoff_t la_index; /* enqueue time */ |
713 | 705 | pgoff_t ra_index; /* begin offset */ |
714 | 706 | pgoff_t lookahead_index; /* time to do next readahead */ |
715 | 707 | |
... | ... | @@ -717,10 +709,9 @@ |
717 | 709 | unsigned long ra_pages; /* Maximum readahead window */ |
718 | 710 | unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ |
719 | 711 | unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ |
712 | + unsigned long prev_index; /* Cache last read() position */ | |
720 | 713 | unsigned int prev_offset; /* Offset where last read() ended in a page */ |
721 | 714 | }; |
722 | -#define RA_FLAG_MISS 0x01 /* a cache miss occurred against this file */ | 
723 | -#define RA_FLAG_INCACHE 0x02 /* file is already in cache */ | |
724 | 715 | |
725 | 716 | /* |
726 | 717 | * Measuring read-ahead sizes. |
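For reference, the struct this commit leaves behind: a sketch of struct file_ra_state once the two hunks above are applied. This is a reconstruction, not part of the diff; readahead_index is inferred from the field list in the comment preceding the struct, and any members outside these hunks are elided.

    struct file_ra_state {
            pgoff_t la_index;               /* enqueue time */
            pgoff_t ra_index;               /* begin offset */
            pgoff_t lookahead_index;        /* time to do next readahead */
            pgoff_t readahead_index;        /* end offset (inferred, not shown above) */

            unsigned long ra_pages;         /* Maximum readahead window */
            unsigned long mmap_hit;         /* Cache hit stat for mmap accesses */
            unsigned long mmap_miss;        /* Cache miss stat for mmap accesses */
            unsigned long prev_index;       /* Cache last read() position */
            unsigned int prev_offset;       /* Offset where last read() ended in a page */
    };

Only the old-style window bookkeeping (start/size, ahead_start/ahead_size, flags, cache_hit) goes away; the mmap statistics and the new-style index fields survive.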
include/linux/mm.h
... | ... | @@ -1144,13 +1144,6 @@ |
1144 | 1144 | struct page *page, |
1145 | 1145 | pgoff_t offset, |
1146 | 1146 | unsigned long size); |
1147 | -unsigned long page_cache_readahead(struct address_space *mapping, | |
1148 | - struct file_ra_state *ra, | |
1149 | - struct file *filp, | |
1150 | - pgoff_t offset, | |
1151 | - unsigned long size); | |
1152 | -void handle_ra_miss(struct address_space *mapping, | |
1153 | - struct file_ra_state *ra, pgoff_t offset); | |
1154 | 1147 | unsigned long max_sane_readahead(unsigned long nr); |
1155 | 1148 | |
1156 | 1149 | /* Do stack extension */ |
mm/readahead.c
... | ... | @@ -49,82 +49,6 @@ |
49 | 49 | } |
50 | 50 | EXPORT_SYMBOL_GPL(file_ra_state_init); |
51 | 51 | |
52 | -/* | |
53 | - * Return max readahead size for this inode in number-of-pages. | |
54 | - */ | |
55 | -static inline unsigned long get_max_readahead(struct file_ra_state *ra) | |
56 | -{ | |
57 | - return ra->ra_pages; | |
58 | -} | |
59 | - | |
60 | -static inline unsigned long get_min_readahead(struct file_ra_state *ra) | |
61 | -{ | |
62 | - return MIN_RA_PAGES; | |
63 | -} | |
64 | - | |
65 | -static inline void reset_ahead_window(struct file_ra_state *ra) | |
66 | -{ | |
67 | - /* | |
68 | - * ... but preserve ahead_start + ahead_size value, | |
69 | - * see 'recheck:' label in page_cache_readahead(). | |
70 | - * Note: We never use ->ahead_size as rvalue without | |
71 | - * checking ->ahead_start != 0 first. | |
72 | - */ | |
73 | - ra->ahead_size += ra->ahead_start; | |
74 | - ra->ahead_start = 0; | |
75 | -} | |
76 | - | |
77 | -static inline void ra_off(struct file_ra_state *ra) | |
78 | -{ | |
79 | - ra->start = 0; | |
80 | - ra->flags = 0; | |
81 | - ra->size = 0; | |
82 | - reset_ahead_window(ra); | |
83 | - return; | |
84 | -} | |
85 | - | |
86 | -/* | |
87 | - * Set the initial window size, round to next power of 2 and square | |
88 | - * for small size, x 4 for medium, and x 2 for large | |
89 | - * for 128k (32 page) max ra | |
90 | - * 1-8 page = 32k initial, > 8 page = 128k initial | |
91 | - */ | |
92 | -static unsigned long get_init_ra_size(unsigned long size, unsigned long max) | |
93 | -{ | |
94 | - unsigned long newsize = roundup_pow_of_two(size); | |
95 | - | |
96 | - if (newsize <= max / 32) | |
97 | - newsize = newsize * 4; | |
98 | - else if (newsize <= max / 4) | |
99 | - newsize = newsize * 2; | |
100 | - else | |
101 | - newsize = max; | |
102 | - return newsize; | |
103 | -} | |
104 | - | |
105 | -/* | |
106 | - * Set the new window size; this is called only when I/O is to be submitted, | 
107 | - * not for each call to readahead. If a cache miss occurred, reduce next I/O | 
108 | - * size, else increase depending on how close to max we are. | |
109 | - */ | |
110 | -static inline unsigned long get_next_ra_size(struct file_ra_state *ra) | |
111 | -{ | |
112 | - unsigned long max = get_max_readahead(ra); | |
113 | - unsigned long min = get_min_readahead(ra); | |
114 | - unsigned long cur = ra->size; | |
115 | - unsigned long newsize; | |
116 | - | |
117 | - if (ra->flags & RA_FLAG_MISS) { | |
118 | - ra->flags &= ~RA_FLAG_MISS; | |
119 | - newsize = max((cur - 2), min); | |
120 | - } else if (cur < max / 16) { | |
121 | - newsize = 4 * cur; | |
122 | - } else { | |
123 | - newsize = 2 * cur; | |
124 | - } | |
125 | - return min(newsize, max); | |
126 | -} | |
127 | - | |
128 | 52 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) |
129 | 53 | |
130 | 54 | /** |
... | ... | @@ -201,66 +125,6 @@ |
201 | 125 | } |
202 | 126 | |
203 | 127 | /* |
204 | - * Readahead design. | |
205 | - * | |
206 | - * The fields in struct file_ra_state represent the most-recently-executed | |
207 | - * readahead attempt: | |
208 | - * | |
209 | - * start: Page index at which we started the readahead | |
210 | - * size: Number of pages in that read | |
211 | - * Together, these form the "current window", | 
212 | - * also known as the `readahead window'. | 
213 | - * prev_index: The page which the readahead algorithm most-recently inspected. | |
214 | - * It is mainly used to detect sequential file reading. | |
215 | - * If page_cache_readahead sees that it is again being called for | |
216 | - * a page which it just looked at, it can return immediately without | |
217 | - * making any state changes. | |
218 | - * offset: Offset in the prev_index where the last read ended - used for | |
219 | - * detection of sequential file reading. | |
220 | - * ahead_start, | |
221 | - * ahead_size: Together, these form the "ahead window". | |
222 | - * ra_pages: The externally controlled max readahead for this fd. | |
223 | - * | |
224 | - * When readahead is in the off state (size == 0), readahead is disabled. | |
225 | - * In this state, prev_index is used to detect the resumption of sequential I/O. | |
226 | - * | |
227 | - * The readahead code manages two windows - the "current" and the "ahead" | |
228 | - * windows. The intent is that while the application is walking the pages | |
229 | - * in the current window, I/O is underway on the ahead window. When the | |
230 | - * current window is fully traversed, it is replaced by the ahead window | |
231 | - * and the ahead window is invalidated. When this copying happens, the | |
232 | - * new current window's pages are probably still locked. So | |
233 | - * we submit a new batch of I/O immediately, creating a new ahead window. | |
234 | - * | |
235 | - * So: | |
236 | - * | |
237 | - * ----|----------------|----------------|----- | |
238 | - * ^start ^start+size | |
239 | - * ^ahead_start ^ahead_start+ahead_size | |
240 | - * | |
241 | - * ^ When this page is read, we submit I/O for the | |
242 | - * ahead window. | |
243 | - * | |
244 | - * A `readahead hit' occurs when a read request is made against a page which is | |
245 | - * the next sequential page. Ahead window calculations are done only when it | |
246 | - * is time to submit a new IO. The code ramps up the size aggressively at first, | 
247 | - * but slows down as it approaches max_readahead. | 
248 | - * | |
249 | - * Any seek/random IO will result in readahead being turned off. It will resume | 
250 | - * at the first sequential access. | |
251 | - * | |
252 | - * There is a special case: if the first page which the application tries to | 
253 | - * read happens to be the first page of the file, it is assumed that a linear | |
254 | - * read is about to happen and the window is immediately set to the initial size | |
255 | - * based on I/O request size and the max_readahead. | |
256 | - * | |
257 | - * This function is to be called for every read request, rather than when | |
258 | - * it is time to perform readahead. It is called only once for the entire I/O | |
259 | - * regardless of size unless readahead is unable to start enough I/O to satisfy | |
260 | - * the request (I/O request > max_readahead). | |
261 | - */ | |
262 | - | |
263 | -/* | |
264 | 128 | * do_page_cache_readahead actually reads a chunk of disk. It allocates all |
265 | 129 | * the pages first, then submits them all for I/O. This avoids the very bad |
266 | 130 | * behaviour which would occur if page allocations are causing VM writeback. |
... | ... | @@ -295,7 +159,7 @@ |
295 | 159 | read_lock_irq(&mapping->tree_lock); |
296 | 160 | for (page_idx = 0; page_idx < nr_to_read; page_idx++) { |
297 | 161 | pgoff_t page_offset = offset + page_idx; |
298 | - | |
162 | + | |
299 | 163 | if (page_offset > end_index) |
300 | 164 | break; |
301 | 165 | |
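The design documentation deleted in this hunk described a dual-window pipeline: while the application walks the current window, I/O is already in flight for the ahead window, and crossing into the ahead window promotes it and immediately opens a new one. A minimal standalone sketch of that bookkeeping (toy types, and a simple doubling ramp for illustration; the real code sized windows with get_next_ra_size() and submitted the I/O at the marked point):

    /* Toy model of the removed current/ahead window scheme. */
    struct toy_ra {
            unsigned long start, size;              /* current window */
            unsigned long ahead_start, ahead_size;  /* ahead window */
    };

    /*
     * Called when the reader crosses into the ahead window: promote it
     * to the current window and open a new ahead window behind it.
     */
    static void toy_shift_windows(struct toy_ra *ra, unsigned long max)
    {
            ra->start = ra->ahead_start;
            ra->size = ra->ahead_size;
            ra->ahead_start = ra->start + ra->size;
            ra->ahead_size = (2 * ra->size < max) ? 2 * ra->size : max;
            /* the real code submitted I/O for the new ahead window here */
    }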
... | ... | @@ -361,28 +225,6 @@ |
361 | 225 | } |
362 | 226 | |
363 | 227 | /* |
364 | - * Check how effective readahead is being. If the amount of started IO is | |
365 | - * less than expected then the file is partly or fully in pagecache and | |
366 | - * readahead isn't helping. | |
367 | - * | |
368 | - */ | |
369 | -static inline int check_ra_success(struct file_ra_state *ra, | |
370 | - unsigned long nr_to_read, unsigned long actual) | |
371 | -{ | |
372 | - if (actual == 0) { | |
373 | - ra->cache_hit += nr_to_read; | |
374 | - if (ra->cache_hit >= VM_MAX_CACHE_HIT) { | |
375 | - ra_off(ra); | |
376 | - ra->flags |= RA_FLAG_INCACHE; | |
377 | - return 0; | |
378 | - } | |
379 | - } else { | |
380 | - ra->cache_hit = 0; | 
381 | - } | |
382 | - return 1; | |
383 | -} | |
384 | - | |
385 | -/* | |
386 | 228 | * This version skips the IO if the queue is read-congested, and will tell the |
387 | 229 | * block layer to abandon the readahead if request allocation would block. |
388 | 230 | * |
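The deleted check_ra_success() amounts to a saturating counter: every readahead that starts zero actual I/O credits nr_to_read pages to cache_hit, and once the count reaches VM_MAX_CACHE_HIT the file is declared in-cache and readahead shuts off. A condensed restatement (toy code, not the kernel's; the threshold of 256 pages, i.e. 1MB with 4k pages, is an assumption about VM_MAX_CACHE_HIT in kernels of this era):

    static int toy_ra_still_useful(unsigned long *cache_hit,
                                   unsigned long nr_to_read,
                                   unsigned long actual_io_started)
    {
            if (actual_io_started > 0) {
                    *cache_hit = 0;         /* readahead is doing real work */
                    return 1;
            }
            *cache_hit += nr_to_read;       /* every page was already cached */
            return *cache_hit < 256;        /* assumed VM_MAX_CACHE_HIT */
    }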
... | ... | @@ -399,191 +241,6 @@ |
399 | 241 | } |
400 | 242 | |
401 | 243 | /* |
402 | - * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' | |
403 | - * is set, wait until the read completes. Otherwise attempt to read without | 
404 | - * blocking. | 
405 | - * Returns 1, meaning 'success', if the read is successful without switching off | 
406 | - * readahead mode. Otherwise returns failure. | 
407 | - */ | |
408 | -static int | |
409 | -blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, | |
410 | - pgoff_t offset, unsigned long nr_to_read, | |
411 | - struct file_ra_state *ra, int block) | |
412 | -{ | |
413 | - int actual; | |
414 | - | |
415 | - if (!block && bdi_read_congested(mapping->backing_dev_info)) | |
416 | - return 0; | |
417 | - | |
418 | - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); | |
419 | - | |
420 | - return check_ra_success(ra, nr_to_read, actual); | |
421 | -} | |
422 | - | |
423 | -static int make_ahead_window(struct address_space *mapping, struct file *filp, | |
424 | - struct file_ra_state *ra, int force) | |
425 | -{ | |
426 | - int block, ret; | |
427 | - | |
428 | - ra->ahead_size = get_next_ra_size(ra); | |
429 | - ra->ahead_start = ra->start + ra->size; | |
430 | - | |
431 | - block = force || (ra->prev_index >= ra->ahead_start); | |
432 | - ret = blockable_page_cache_readahead(mapping, filp, | |
433 | - ra->ahead_start, ra->ahead_size, ra, block); | |
434 | - | |
435 | - if (!ret && !force) { | |
436 | - /* A read failure in blocking mode implies pages are | 
437 | - * all cached. So we can safely assume we have taken | |
438 | - * care of all the pages requested in this call. | |
439 | - * A read failure in non-blocking mode implies we are | 
440 | - * reading more pages than requested in this call. So | |
441 | - * we safely assume we have taken care of all the pages | |
442 | - * requested in this call. | |
443 | - * | |
444 | - * Just reset the ahead window in case we failed due to | |
445 | - * congestion. The ahead window will be closed anyway | 
446 | - * in case we failed due to excessive page cache hits. | |
447 | - */ | |
448 | - reset_ahead_window(ra); | |
449 | - } | |
450 | - | |
451 | - return ret; | |
452 | -} | |
453 | - | |
454 | -/** | |
455 | - * page_cache_readahead - generic adaptive readahead | |
456 | - * @mapping: address_space which holds the pagecache and I/O vectors | |
457 | - * @ra: file_ra_state which holds the readahead state | |
458 | - * @filp: passed on to ->readpage() and ->readpages() | |
459 | - * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units | |
460 | - * @req_size: hint: total size of the read which the caller is performing in | |
461 | - * PAGE_CACHE_SIZE units | |
462 | - * | |
463 | - * page_cache_readahead() is the main function. It performs the adaptive | |
464 | - * readahead window size management and submits the readahead I/O. | |
465 | - * | |
466 | - * Note that @filp is purely used for passing on to the ->readpage[s]() | |
467 | - * handler: it may refer to a different file from @mapping (so we may not use | |
468 | - * @filp->f_mapping or @filp->f_path.dentry->d_inode here). | |
469 | - * Also, @ra may not be equal to &@filp->f_ra. | |
470 | - * | |
471 | - */ | |
472 | -unsigned long | |
473 | -page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, | |
474 | - struct file *filp, pgoff_t offset, unsigned long req_size) | |
475 | -{ | |
476 | - unsigned long max, newsize; | |
477 | - int sequential; | |
478 | - | |
479 | - /* | |
480 | - * We avoid doing extra work and bogusly perturbing the readahead | |
481 | - * window expansion logic. | |
482 | - */ | |
483 | - if (offset == ra->prev_index && --req_size) | |
484 | - ++offset; | |
485 | - | |
486 | - /* Note that prev_index == -1 if it is a first read */ | |
487 | - sequential = (offset == ra->prev_index + 1); | |
488 | - ra->prev_index = offset; | |
489 | - ra->prev_offset = 0; | |
490 | - | |
491 | - max = get_max_readahead(ra); | |
492 | - newsize = min(req_size, max); | |
493 | - | |
494 | - /* No readahead or sub-page sized read or file already in cache */ | |
495 | - if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) | |
496 | - goto out; | |
497 | - | |
498 | - ra->prev_index += newsize - 1; | |
499 | - | |
500 | - /* | |
501 | - * Special case - first read at the start of the file, or the first | 
502 | - * detected sequential access. We'll assume a whole-file read and | 
503 | - * grow the window fast. | 
504 | - */ | |
505 | - if (sequential && ra->size == 0) { | |
506 | - ra->size = get_init_ra_size(newsize, max); | |
507 | - ra->start = offset; | |
508 | - if (!blockable_page_cache_readahead(mapping, filp, offset, | |
509 | - ra->size, ra, 1)) | |
510 | - goto out; | |
511 | - | |
512 | - /* | |
513 | - * If the request size is larger than our max readahead, we | |
514 | - * at least want to be sure that we get 2 IOs in flight and | |
515 | - * we know that we will definitely need the new I/O. | 
516 | - * Once we do this, subsequent calls should be able to overlap | 
517 | - * IOs, thus preventing stalls. So issue the ahead window | 
518 | - * immediately. | |
519 | - */ | |
520 | - if (req_size >= max) | |
521 | - make_ahead_window(mapping, filp, ra, 1); | |
522 | - | |
523 | - goto out; | |
524 | - } | |
525 | - | |
526 | - /* | |
527 | - * Now handle the random case: | |
528 | - * partial page reads and first access were handled above, | |
529 | - * so this must be the next page otherwise it is random | |
530 | - */ | |
531 | - if (!sequential) { | |
532 | - ra_off(ra); | |
533 | - blockable_page_cache_readahead(mapping, filp, offset, | |
534 | - newsize, ra, 1); | |
535 | - goto out; | |
536 | - } | |
537 | - | |
538 | - /* | |
539 | - * If we get here we are doing sequential IO and this was not the first | |
540 | - * occurrence (i.e. we have an existing window) | 
541 | - */ | |
542 | - if (ra->ahead_start == 0) { /* no ahead window yet */ | |
543 | - if (!make_ahead_window(mapping, filp, ra, 0)) | |
544 | - goto recheck; | |
545 | - } | |
546 | - | |
547 | - /* | |
548 | - * Already have an ahead window, check if we crossed into it. | |
549 | - * If so, shift windows and issue a new ahead window. | |
550 | - * Only return the #pages that are in the current window, so that | |
551 | - * we get called back on the first page of the ahead window which | |
552 | - * will allow us to submit more IO. | |
553 | - */ | |
554 | - if (ra->prev_index >= ra->ahead_start) { | |
555 | - ra->start = ra->ahead_start; | |
556 | - ra->size = ra->ahead_size; | |
557 | - make_ahead_window(mapping, filp, ra, 0); | |
558 | -recheck: | |
559 | - /* prev_index shouldn't overrun the ahead window */ | |
560 | - ra->prev_index = min(ra->prev_index, | |
561 | - ra->ahead_start + ra->ahead_size - 1); | |
562 | - } | |
563 | - | |
564 | -out: | |
565 | - return ra->prev_index + 1; | |
566 | -} | |
567 | -EXPORT_SYMBOL_GPL(page_cache_readahead); | |
568 | - | |
569 | -/* | |
570 | - * handle_ra_miss() is called when it is known that a page which should have | |
571 | - * been present in the pagecache (we just did some readahead there) was in fact | |
572 | - * not found. This will happen if it was evicted by the VM (readahead | |
573 | - * thrashing) | |
574 | - * | |
575 | - * Turn on the cache miss flag in the RA struct, this will cause the RA code | |
576 | - * to reduce the RA size on the next read. | |
577 | - */ | |
578 | -void handle_ra_miss(struct address_space *mapping, | |
579 | - struct file_ra_state *ra, pgoff_t offset) | |
580 | -{ | |
581 | - ra->flags |= RA_FLAG_MISS; | |
582 | - ra->flags &= ~RA_FLAG_INCACHE; | |
583 | - ra->cache_hit = 0; | |
584 | -} | |
585 | - | |
586 | -/* | |
587 | 244 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a |
588 | 245 | * sensible upper limit. |
589 | 246 | */ |
590 | 247 | |
591 | 248 | |
592 | 249 | |
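Everything removed in this hunk pivoted on a single sequentiality test: a request is sequential iff it starts at prev_index + 1, after first nudging offset forward when the caller re-reads the very page the algorithm just inspected. A condensed sketch of that classification (toy code, not the kernel's; prev_index starts out at -1, so the first read of page 0 counts as sequential):

    static int toy_is_sequential(unsigned long *prev_index,
                                 unsigned long offset,
                                 unsigned long req_size)
    {
            int sequential;

            /* Re-read of the page we just inspected: skip past it so
             * we don't perturb the window expansion logic. */
            if (offset == *prev_index && req_size > 1)
                    offset++;

            sequential = (offset == *prev_index + 1);
            *prev_index = offset;
            return sequential;
    }

A sequential hit grew or shifted the windows; anything else called ra_off() and fell back to a plain, correctly-sized one-off read.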
... | ... | @@ -613,19 +270,39 @@ |
613 | 270 | EXPORT_SYMBOL_GPL(ra_submit); |
614 | 271 | |
615 | 272 | /* |
273 | + * Set the initial window size, round to next power of 2 and square | |
274 | + * for small size, x 4 for medium, and x 2 for large | |
275 | + * for 128k (32 page) max ra | |
276 | + * 1-8 page = 32k initial, > 8 page = 128k initial | |
277 | + */ | |
278 | +static unsigned long get_init_ra_size(unsigned long size, unsigned long max) | |
279 | +{ | |
280 | + unsigned long newsize = roundup_pow_of_two(size); | |
281 | + | |
282 | + if (newsize <= max / 32) | |
283 | + newsize = newsize * 4; | |
284 | + else if (newsize <= max / 4) | |
285 | + newsize = newsize * 2; | |
286 | + else | |
287 | + newsize = max; | |
288 | + | |
289 | + return newsize; | |
290 | +} | |
291 | + | |
292 | +/* | |
616 | 293 | * Get the previous window size, ramp it up, and |
617 | 294 | * return it as the new window size. |
618 | 295 | */ |
619 | -static unsigned long get_next_ra_size2(struct file_ra_state *ra, | |
296 | +static unsigned long get_next_ra_size(struct file_ra_state *ra, | |
620 | 297 | unsigned long max) |
621 | 298 | { |
622 | 299 | unsigned long cur = ra->readahead_index - ra->ra_index; |
623 | 300 | unsigned long newsize; |
624 | 301 | |
625 | 302 | if (cur < max / 16) |
626 | - newsize = cur * 4; | |
303 | + newsize = 4 * cur; | |
627 | 304 | else |
628 | - newsize = cur * 2; | |
305 | + newsize = 2 * cur; | |
629 | 306 | |
630 | 307 | return min(newsize, max); |
631 | 308 | } |
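To see what the sizing comment above means in numbers: with 4k pages and the default 128k window (max = 32 pages), get_init_ra_size() yields initial windows of 4, 4, 8, 16, 32 and 32 pages for power-of-two request sizes 1 through 32. A small userspace harness to reproduce the table (a sketch; roundup_pow_of_two() is reimplemented here because the kernel helper is unavailable outside the tree):

    #include <stdio.h>

    /* Userspace stand-in for the kernel helper of the same name. */
    static unsigned long roundup_pow_of_two(unsigned long n)
    {
            unsigned long p = 1;

            while (p < n)
                    p <<= 1;
            return p;
    }

    /* Copied from the hunk above. */
    static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
    {
            unsigned long newsize = roundup_pow_of_two(size);

            if (newsize <= max / 32)
                    newsize = newsize * 4;
            else if (newsize <= max / 4)
                    newsize = newsize * 2;
            else
                    newsize = max;

            return newsize;
    }

    int main(void)
    {
            unsigned long max = 32;         /* 128k with 4k pages */
            unsigned long size;

            for (size = 1; size <= max; size *= 2)
                    printf("request %2lu pages -> initial window %2lu pages\n",
                           size, get_init_ra_size(size, max));
            return 0;
    }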
... | ... | @@ -701,7 +378,7 @@ |
701 | 378 | if (offset && (offset == ra->lookahead_index || |
702 | 379 | offset == ra->readahead_index)) { |
703 | 380 | ra_index = ra->readahead_index; |
704 | - ra_size = get_next_ra_size2(ra, max); | |
381 | + ra_size = get_next_ra_size(ra, max); | |
705 | 382 | la_size = ra_size; |
706 | 383 | goto fill_ra; |
707 | 384 | } |
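This last hunk shows the surviving caller: on-demand readahead fires when a read hits lookahead_index (the asynchronous trigger planted inside the previous window) or readahead_index (the previous window fully consumed), and the renamed get_next_ra_size() then sizes the next window starting at readahead_index. Worked through for max = 32 pages: a 1-page window quadruples while cur < max/16 (1 -> 4), then doubles (4 -> 8 -> 16 -> 32) and stays capped at 32 thereafter.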