Blame view

mm/filemap.c 98.3 KB
457c89965   Thomas Gleixner   treewide: Add SPD...
1
  // SPDX-License-Identifier: GPL-2.0-only
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2
3
4
5
6
7
8
9
10
11
12
  /*
   *	linux/mm/filemap.c
   *
   * Copyright (C) 1994-1999  Linus Torvalds
   */
  
  /*
   * This file handles the generic file mmap semantics used by
   * most "normal" filesystems (but you don't /have/ to use this:
   * the NFS filesystem used to do this differently, for example)
   */
b95f1b31b   Paul Gortmaker   mm: Map most file...
13
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
14
  #include <linux/compiler.h>
f9fe48bec   Ross Zwisler   dax: support dirt...
15
  #include <linux/dax.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
16
  #include <linux/fs.h>
3f07c0144   Ingo Molnar   sched/headers: Pr...
17
  #include <linux/sched/signal.h>
c22ce143d   Hiro Yoshioka   [PATCH] x86: cach...
18
  #include <linux/uaccess.h>
c59ede7b7   Randy.Dunlap   [PATCH] move capa...
19
  #include <linux/capability.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
20
  #include <linux/kernel_stat.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
21
  #include <linux/gfp.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
23
24
25
26
27
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/file.h>
  #include <linux/uio.h>
cfcbfb138   Josef Bacik   mm/filemap.c: ena...
28
  #include <linux/error-injection.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
29
30
  #include <linux/hash.h>
  #include <linux/writeback.h>
53253383f   Linus Torvalds   Include <linux/ba...
31
  #include <linux/backing-dev.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
32
33
34
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/security.h>
44110fe38   Paul Jackson   [PATCH] cpuset me...
35
  #include <linux/cpuset.h>
00501b531   Johannes Weiner   mm: memcontrol: r...
36
  #include <linux/hugetlb.h>
8a9f3ccd2   Balbir Singh   Memory controller...
37
  #include <linux/memcontrol.h>
c515e1fd3   Dan Magenheimer   mm/fs: add hooks ...
38
  #include <linux/cleancache.h>
c7df8ad29   Mel Gorman   mm, truncate: do ...
39
  #include <linux/shmem_fs.h>
f1820361f   Kirill A. Shutemov   mm: implement ->m...
40
  #include <linux/rmap.h>
b1d29ba82   Johannes Weiner   delayacct: track ...
41
  #include <linux/delayacct.h>
eb414681d   Johannes Weiner   psi: pressure sta...
42
  #include <linux/psi.h>
d0e6a5821   Ben Dooks   mm/filemap.c: inc...
43
  #include <linux/ramfs.h>
b9306a796   Yang Shi   mm: filemap: clea...
44
  #include <linux/page_idle.h>
0f8053a50   Nick Piggin   [PATCH] mm: make ...
45
  #include "internal.h"
fe0bfaaff   Robert Jarzmik   mm: trace filemap...
46
47
  #define CREATE_TRACE_POINTS
  #include <trace/events/filemap.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
48
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
49
50
   * FIXME: remove all knowledge of the buffer layer from the core VM
   */
148f948ba   Jan Kara   vfs: Introduce ne...
51
  #include <linux/buffer_head.h> /* for try_to_free_buffers */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
52

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
  #include <asm/mman.h>
  
  /*
   * Shared mappings implemented 30.11.1994. It's not fully working yet,
   * though.
   *
   * Shared mappings now work. 15.8.1995  Bruno.
   *
   * finished 'unifying' the page and buffer cache and SMP-threaded the
   * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
   *
   * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
   */
  
  /*
   * Lock ordering:
   *
c8c06efa8   Davidlohr Bueso   mm: convert i_mma...
70
   *  ->i_mmap_rwsem		(truncate_pagecache)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
71
   *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
5d337b919   Hugh Dickins   [PATCH] swap: swa...
72
   *      ->swap_lock		(exclusive_swap_page, others)
b93b01631   Matthew Wilcox   page cache: use x...
73
   *        ->i_pages lock
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
74
   *
1b1dcc1b5   Jes Sorensen   [PATCH] mutex sub...
75
   *  ->i_mutex
c8c06efa8   Davidlohr Bueso   mm: convert i_mma...
76
   *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
77
   *
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
78
   *  ->mmap_lock
c8c06efa8   Davidlohr Bueso   mm: convert i_mma...
79
   *    ->i_mmap_rwsem
b8072f099   Hugh Dickins   [PATCH] mm: updat...
80
   *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
b93b01631   Matthew Wilcox   page cache: use x...
81
   *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
82
   *
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
83
   *  ->mmap_lock
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
84
85
   *    ->lock_page		(access_process_vm)
   *
ccad23656   Al Viro   kill generic_file...
86
   *  ->i_mutex			(generic_perform_write)
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
87
   *    ->mmap_lock		(fault_in_pages_readable->do_page_fault)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
88
   *
f758eeabe   Christoph Hellwig   writeback: split ...
89
   *  bdi->wb.list_lock
a66979aba   Dave Chinner   fs: move i_wb_lis...
90
   *    sb_lock			(fs/fs-writeback.c)
b93b01631   Matthew Wilcox   page cache: use x...
91
   *    ->i_pages lock		(__sync_single_inode)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
92
   *
c8c06efa8   Davidlohr Bueso   mm: convert i_mma...
93
   *  ->i_mmap_rwsem
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
94
95
96
   *    ->anon_vma.lock		(vma_adjust)
   *
   *  ->anon_vma.lock
b8072f099   Hugh Dickins   [PATCH] mm: updat...
97
   *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
98
   *
b8072f099   Hugh Dickins   [PATCH] mm: updat...
99
   *  ->page_table_lock or pte_lock
5d337b919   Hugh Dickins   [PATCH] swap: swa...
100
   *    ->swap_lock		(try_to_unmap_one)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
101
   *    ->private_lock		(try_to_unmap_one)
b93b01631   Matthew Wilcox   page cache: use x...
102
   *    ->i_pages lock		(try_to_unmap_one)
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
103
104
   *    ->pgdat->lru_lock		(follow_page->mark_page_accessed)
   *    ->pgdat->lru_lock		(check_pte_range->isolate_lru_page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
105
   *    ->private_lock		(page_remove_rmap->set_page_dirty)
b93b01631   Matthew Wilcox   page cache: use x...
106
   *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
f758eeabe   Christoph Hellwig   writeback: split ...
107
   *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
250df6ed2   Dave Chinner   fs: protect inode...
108
   *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
81f8c3a46   Johannes Weiner   mm: memcontrol: g...
109
   *    ->memcg->move_lock	(page_remove_rmap->lock_page_memcg)
f758eeabe   Christoph Hellwig   writeback: split ...
110
   *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
250df6ed2   Dave Chinner   fs: protect inode...
111
   *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
112
113
   *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
   *
c8c06efa8   Davidlohr Bueso   mm: convert i_mma...
114
   * ->i_mmap_rwsem
9a3c531df   Andi Kleen   mm: update stale ...
115
   *   ->tasklist_lock            (memory_failure, collect_procs_ao)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
116
   */
5c024e6a4   Matthew Wilcox   page cache: Conve...
117
  static void page_cache_delete(struct address_space *mapping,
91b0abe36   Johannes Weiner   mm + fs: store sh...
118
119
  				   struct page *page, void *shadow)
  {
5c024e6a4   Matthew Wilcox   page cache: Conve...
120
121
  	XA_STATE(xas, &mapping->i_pages, page->index);
  	unsigned int nr = 1;
c70b647d3   Kirill A. Shutemov   mm/filemap.c: add...
122

5c024e6a4   Matthew Wilcox   page cache: Conve...
123
  	mapping_set_update(&xas, mapping);
c70b647d3   Kirill A. Shutemov   mm/filemap.c: add...
124

5c024e6a4   Matthew Wilcox   page cache: Conve...
125
126
127
  	/* hugetlb pages are represented by a single entry in the xarray */
  	if (!PageHuge(page)) {
  		xas_set_order(&xas, page->index, compound_order(page));
d8c6546b1   Matthew Wilcox (Oracle)   mm: introduce com...
128
  		nr = compound_nr(page);
5c024e6a4   Matthew Wilcox   page cache: Conve...
129
  	}
91b0abe36   Johannes Weiner   mm + fs: store sh...
130

83929372f   Kirill A. Shutemov   filemap: prepare ...
131
132
133
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  	VM_BUG_ON_PAGE(PageTail(page), page);
  	VM_BUG_ON_PAGE(nr != 1 && shadow, page);
449dd6984   Johannes Weiner   mm: keep page cac...
134

5c024e6a4   Matthew Wilcox   page cache: Conve...
135
136
  	xas_store(&xas, shadow);
  	xas_init_marks(&xas);
d3798ae8c   Johannes Weiner   mm: filemap: don'...
137

2300638b1   Jan Kara   mm: move clearing...
138
139
  	page->mapping = NULL;
  	/* Leave page->index set: truncation lookup relies upon it */
d3798ae8c   Johannes Weiner   mm: filemap: don'...
140
141
142
143
144
145
146
147
148
149
150
  	if (shadow) {
  		mapping->nrexceptional += nr;
  		/*
  		 * Make sure the nrexceptional update is committed before
  		 * the nrpages update so that final truncate racing
  		 * with reclaim does not see both counters 0 at the
  		 * same time and miss a shadow entry.
  		 */
  		smp_wmb();
  	}
  	mapping->nrpages -= nr;
91b0abe36   Johannes Weiner   mm + fs: store sh...
151
  }
5ecc4d852   Jan Kara   mm: factor out ch...
152
153
  static void unaccount_page_cache_page(struct address_space *mapping,
  				      struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
154
  {
5ecc4d852   Jan Kara   mm: factor out ch...
155
  	int nr;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
156

c515e1fd3   Dan Magenheimer   mm/fs: add hooks ...
157
158
159
160
161
162
163
164
  	/*
  	 * if we're uptodate, flush out into the cleancache, otherwise
  	 * invalidate any existing cleancache entries.  We can't leave
  	 * stale data around in the cleancache once our page is gone
  	 */
  	if (PageUptodate(page) && PageMappedToDisk(page))
  		cleancache_put_page(page);
  	else
3167760f8   Dan Magenheimer   mm: cleancache: s...
165
  		cleancache_invalidate_page(mapping, page);
c515e1fd3   Dan Magenheimer   mm/fs: add hooks ...
166

83929372f   Kirill A. Shutemov   filemap: prepare ...
167
  	VM_BUG_ON_PAGE(PageTail(page), page);
06b241f32   Hugh Dickins   mm: __delete_from...
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
  	VM_BUG_ON_PAGE(page_mapped(page), page);
  	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
  		int mapcount;
  
  		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx
  ",
  			 current->comm, page_to_pfn(page));
  		dump_page(page, "still mapped when deleted");
  		dump_stack();
  		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  
  		mapcount = page_mapcount(page);
  		if (mapping_exiting(mapping) &&
  		    page_count(page) >= mapcount + 2) {
  			/*
  			 * All vmas have already been torn down, so it's
  			 * a good bet that actually the page is unmapped,
  			 * and we'd prefer not to leak it: if we're wrong,
  			 * some other bad page check should catch it later.
  			 */
  			page_mapcount_reset(page);
6d061f9f6   Joonsoo Kim   mm/page_ref: use ...
189
  			page_ref_sub(page, mapcount);
06b241f32   Hugh Dickins   mm: __delete_from...
190
191
  		}
  	}
4165b9b46   Michal Hocko   hugetlb: do not a...
192
  	/* hugetlb pages do not participate in page cache accounting. */
5ecc4d852   Jan Kara   mm: factor out ch...
193
194
  	if (PageHuge(page))
  		return;
09612fa65   Naoya Horiguchi   mm: hugetlb: retu...
195

6c357848b   Matthew Wilcox (Oracle)   mm: replace hpage...
196
  	nr = thp_nr_pages(page);
5ecc4d852   Jan Kara   mm: factor out ch...
197

0d1c20722   Johannes Weiner   mm: memcontrol: s...
198
  	__mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
5ecc4d852   Jan Kara   mm: factor out ch...
199
  	if (PageSwapBacked(page)) {
0d1c20722   Johannes Weiner   mm: memcontrol: s...
200
  		__mod_lruvec_page_state(page, NR_SHMEM, -nr);
5ecc4d852   Jan Kara   mm: factor out ch...
201
202
  		if (PageTransHuge(page))
  			__dec_node_page_state(page, NR_SHMEM_THPS);
99cb0dbd4   Song Liu   mm,thp: add read-...
203
204
  	} else if (PageTransHuge(page)) {
  		__dec_node_page_state(page, NR_FILE_THPS);
09d91cda0   Song Liu   mm,thp: avoid wri...
205
  		filemap_nr_thps_dec(mapping);
800d8c63b   Kirill A. Shutemov   shmem: add huge p...
206
  	}
5ecc4d852   Jan Kara   mm: factor out ch...
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
  
  	/*
  	 * At this point page must be either written or cleaned by
  	 * truncate.  Dirty page here signals a bug and loss of
  	 * unwritten data.
  	 *
  	 * This fixes dirty accounting after removing the page entirely
  	 * but leaves PageDirty set: it has no effect for truncated
  	 * page and anyway will be cleared before returning page into
  	 * buddy allocator.
  	 */
  	if (WARN_ON_ONCE(PageDirty(page)))
  		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
  }
  
  /*
   * Delete a page from the page cache and free it. Caller has to make
   * sure the page is locked and that nobody else uses it - or that usage
b93b01631   Matthew Wilcox   page cache: use x...
225
   * is safe.  The caller must hold the i_pages lock.
5ecc4d852   Jan Kara   mm: factor out ch...
226
227
228
229
230
231
232
233
   */
  void __delete_from_page_cache(struct page *page, void *shadow)
  {
  	struct address_space *mapping = page->mapping;
  
  	trace_mm_filemap_delete_from_page_cache(page);
  
  	unaccount_page_cache_page(mapping, page);
5c024e6a4   Matthew Wilcox   page cache: Conve...
234
  	page_cache_delete(mapping, page, shadow);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
235
  }
59c66c5f8   Jan Kara   mm: factor out pa...
236
237
238
239
240
241
242
243
244
245
  static void page_cache_free_page(struct address_space *mapping,
  				struct page *page)
  {
  	void (*freepage)(struct page *);
  
  	freepage = mapping->a_ops->freepage;
  	if (freepage)
  		freepage(page);
  
  	if (PageTransHuge(page) && !PageHuge(page)) {
887b22c62   Matthew Wilcox (Oracle)   mm/filemap: fix p...
246
  		page_ref_sub(page, thp_nr_pages(page));
59c66c5f8   Jan Kara   mm: factor out pa...
247
248
249
250
251
  		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
  	} else {
  		put_page(page);
  	}
  }
702cfbf93   Minchan Kim   mm: goodbye remov...
252
253
254
255
256
257
258
259
260
  /**
   * delete_from_page_cache - delete page from page cache
   * @page: the page which the kernel is trying to remove from page cache
   *
   * This must be called only on pages that have been verified to be in the page
   * cache and locked.  It will never put the page into the free list, the caller
   * has a reference on the page.
   */
  void delete_from_page_cache(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
261
  {
83929372f   Kirill A. Shutemov   filemap: prepare ...
262
  	struct address_space *mapping = page_mapping(page);
c4843a759   Greg Thelen   memcg: add per cg...
263
  	unsigned long flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
264

cd7619d6b   Matt Mackall   [PATCH] Extermina...
265
  	BUG_ON(!PageLocked(page));
b93b01631   Matthew Wilcox   page cache: use x...
266
  	xa_lock_irqsave(&mapping->i_pages, flags);
62cccb8c8   Johannes Weiner   mm: simplify lock...
267
  	__delete_from_page_cache(page, NULL);
b93b01631   Matthew Wilcox   page cache: use x...
268
  	xa_unlock_irqrestore(&mapping->i_pages, flags);
6072d13c4   Linus Torvalds   Call the filesyst...
269

59c66c5f8   Jan Kara   mm: factor out pa...
270
  	page_cache_free_page(mapping, page);
97cecb5a2   Minchan Kim   mm: introduce del...
271
272
  }
  EXPORT_SYMBOL(delete_from_page_cache);
aa65c29ce   Jan Kara   mm: batch radix t...
273
  /*
ef8e5717d   Matthew Wilcox   page cache: Conve...
274
   * page_cache_delete_batch - delete several pages from page cache
aa65c29ce   Jan Kara   mm: batch radix t...
275
276
277
   * @mapping: the mapping to which pages belong
   * @pvec: pagevec with pages to delete
   *
b93b01631   Matthew Wilcox   page cache: use x...
278
   * The function walks over mapping->i_pages and removes pages passed in @pvec
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
279
280
   * from the mapping. The function expects @pvec to be sorted by page index
   * and is optimised for it to be dense.
b93b01631   Matthew Wilcox   page cache: use x...
281
   * It tolerates holes in @pvec (mapping entries at those indices are not
aa65c29ce   Jan Kara   mm: batch radix t...
282
   * modified). The function expects only THP head pages to be present in the
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
283
   * @pvec.
aa65c29ce   Jan Kara   mm: batch radix t...
284
   *
b93b01631   Matthew Wilcox   page cache: use x...
285
   * The function expects the i_pages lock to be held.
aa65c29ce   Jan Kara   mm: batch radix t...
286
   */
ef8e5717d   Matthew Wilcox   page cache: Conve...
287
  static void page_cache_delete_batch(struct address_space *mapping,
aa65c29ce   Jan Kara   mm: batch radix t...
288
289
  			     struct pagevec *pvec)
  {
ef8e5717d   Matthew Wilcox   page cache: Conve...
290
  	XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
aa65c29ce   Jan Kara   mm: batch radix t...
291
  	int total_pages = 0;
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
292
  	int i = 0;
aa65c29ce   Jan Kara   mm: batch radix t...
293
  	struct page *page;
aa65c29ce   Jan Kara   mm: batch radix t...
294

ef8e5717d   Matthew Wilcox   page cache: Conve...
295
296
  	mapping_set_update(&xas, mapping);
  	xas_for_each(&xas, page, ULONG_MAX) {
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
297
  		if (i >= pagevec_count(pvec))
aa65c29ce   Jan Kara   mm: batch radix t...
298
  			break;
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
299
300
  
  		/* A swap/dax/shadow entry got inserted? Skip it. */
3159f943a   Matthew Wilcox   xarray: Replace e...
301
  		if (xa_is_value(page))
aa65c29ce   Jan Kara   mm: batch radix t...
302
  			continue;
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
  		/*
  		 * A page got inserted in our range? Skip it. We have our
  		 * pages locked so they are protected from being removed.
  		 * If we see a page whose index is higher than ours, it
  		 * means our page has been removed, which shouldn't be
  		 * possible because we're holding the PageLock.
  		 */
  		if (page != pvec->pages[i]) {
  			VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
  					page);
  			continue;
  		}
  
  		WARN_ON_ONCE(!PageLocked(page));
  
  		if (page->index == xas.xa_index)
aa65c29ce   Jan Kara   mm: batch radix t...
319
  			page->mapping = NULL;
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
320
321
322
323
324
325
326
327
  		/* Leave page->index set: truncation lookup relies on it */
  
  		/*
  		 * Move to the next page in the vector if this is a regular
  		 * page or the index is of the last sub-page of this compound
  		 * page.
  		 */
  		if (page->index + compound_nr(page) - 1 == xas.xa_index)
aa65c29ce   Jan Kara   mm: batch radix t...
328
  			i++;
ef8e5717d   Matthew Wilcox   page cache: Conve...
329
  		xas_store(&xas, NULL);
aa65c29ce   Jan Kara   mm: batch radix t...
330
331
332
333
334
335
336
337
338
339
340
341
342
  		total_pages++;
  	}
  	mapping->nrpages -= total_pages;
  }
  
  void delete_from_page_cache_batch(struct address_space *mapping,
  				  struct pagevec *pvec)
  {
  	int i;
  	unsigned long flags;
  
  	if (!pagevec_count(pvec))
  		return;
b93b01631   Matthew Wilcox   page cache: use x...
343
  	xa_lock_irqsave(&mapping->i_pages, flags);
aa65c29ce   Jan Kara   mm: batch radix t...
344
345
346
347
348
  	for (i = 0; i < pagevec_count(pvec); i++) {
  		trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
  
  		unaccount_page_cache_page(mapping, pvec->pages[i]);
  	}
ef8e5717d   Matthew Wilcox   page cache: Conve...
349
  	page_cache_delete_batch(mapping, pvec);
b93b01631   Matthew Wilcox   page cache: use x...
350
  	xa_unlock_irqrestore(&mapping->i_pages, flags);
aa65c29ce   Jan Kara   mm: batch radix t...
351
352
353
354
  
  	for (i = 0; i < pagevec_count(pvec); i++)
  		page_cache_free_page(mapping, pvec->pages[i]);
  }
d72d9e2a5   Miklos Szeredi   mm: export filema...
355
  int filemap_check_errors(struct address_space *mapping)
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
356
357
358
  {
  	int ret = 0;
  	/* Check for outstanding write errors */
7fcbbaf18   Jens Axboe   mm/filemap.c: avo...
359
360
  	if (test_bit(AS_ENOSPC, &mapping->flags) &&
  	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
361
  		ret = -ENOSPC;
7fcbbaf18   Jens Axboe   mm/filemap.c: avo...
362
363
  	if (test_bit(AS_EIO, &mapping->flags) &&
  	    test_and_clear_bit(AS_EIO, &mapping->flags))
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
364
365
366
  		ret = -EIO;
  	return ret;
  }
d72d9e2a5   Miklos Szeredi   mm: export filema...
367
  EXPORT_SYMBOL(filemap_check_errors);
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
368

76341cabb   Jeff Layton   jbd2: don't clear...
369
370
371
372
373
374
375
376
377
  static int filemap_check_and_keep_errors(struct address_space *mapping)
  {
  	/* Check for outstanding write errors */
  	if (test_bit(AS_EIO, &mapping->flags))
  		return -EIO;
  	if (test_bit(AS_ENOSPC, &mapping->flags))
  		return -ENOSPC;
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
378
  /**
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
379
   * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
67be2dd1b   Martin Waitz   [PATCH] DocBook: ...
380
381
   * @mapping:	address space structure to write
   * @start:	offset in bytes where the range starts
469eb4d03   Andrew Morton   [PATCH] filemap_f...
382
   * @end:	offset in bytes where the range ends (inclusive)
67be2dd1b   Martin Waitz   [PATCH] DocBook: ...
383
   * @sync_mode:	enable synchronous operation
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
384
   *
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
385
386
387
   * Start writeback against all of a mapping's dirty pages that lie
   * within the byte offsets <start, end> inclusive.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
388
   * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
389
   * opposed to a regular memory cleansing writeback.  The difference between
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
390
391
   * these two operations is that if a dirty page/buffer is encountered, it must
   * be waited upon, and not just skipped over.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
392
393
   *
   * Return: %0 on success, negative error code otherwise.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
394
   */
ebcf28e1c   Andrew Morton   [PATCH] fadvise()...
395
396
  int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
  				loff_t end, int sync_mode)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
397
398
399
400
  {
  	int ret;
  	struct writeback_control wbc = {
  		.sync_mode = sync_mode,
05fe478dd   Nick Piggin   mm: write_cache_p...
401
  		.nr_to_write = LONG_MAX,
111ebb6e6   OGAWA Hirofumi   [PATCH] writeback...
402
403
  		.range_start = start,
  		.range_end = end,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
404
  	};
f56753ac2   Christoph Hellwig   bdi: replace BDI_...
405
  	if (!mapping_can_writeback(mapping) ||
c3aab9a0b   Konstantin Khlebnikov   mm/filemap.c: don...
406
  	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
407
  		return 0;
b16b1deb5   Tejun Heo   writeback: make w...
408
  	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
409
  	ret = do_writepages(mapping, &wbc);
b16b1deb5   Tejun Heo   writeback: make w...
410
  	wbc_detach_inode(&wbc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
411
412
413
414
415
416
  	return ret;
  }
  
  static inline int __filemap_fdatawrite(struct address_space *mapping,
  	int sync_mode)
  {
111ebb6e6   OGAWA Hirofumi   [PATCH] writeback...
417
  	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
418
419
420
421
422
423
424
  }
  
  int filemap_fdatawrite(struct address_space *mapping)
  {
  	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
  }
  EXPORT_SYMBOL(filemap_fdatawrite);
f4c0a0fdf   Jan Kara   vfs: export filem...
425
  int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
ebcf28e1c   Andrew Morton   [PATCH] fadvise()...
426
  				loff_t end)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
427
428
429
  {
  	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
  }
f4c0a0fdf   Jan Kara   vfs: export filem...
430
  EXPORT_SYMBOL(filemap_fdatawrite_range);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
431

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
432
433
434
435
  /**
   * filemap_flush - mostly a non-blocking flush
   * @mapping:	target address_space
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
436
437
   * This is a mostly non-blocking flush.  Not suitable for data-integrity
   * purposes - I/O may not be started against all dirty pages.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
438
439
   *
   * Return: %0 on success, negative error code otherwise.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
440
441
442
443
444
445
   */
  int filemap_flush(struct address_space *mapping)
  {
  	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
  }
  EXPORT_SYMBOL(filemap_flush);
7fc9e4722   Goldwyn Rodrigues   fs: Introduce fil...
446
447
448
449
450
451
452
453
  /**
   * filemap_range_has_page - check if a page exists in range.
   * @mapping:           address space within which to check
   * @start_byte:        offset in bytes where the range starts
   * @end_byte:          offset in bytes where the range ends (inclusive)
   *
   * Find at least one page in the range supplied, usually used to check if
   * direct writing in this range will trigger a writeback.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
454
455
456
   *
   * Return: %true if at least one page exists in the specified range,
   * %false otherwise.
7fc9e4722   Goldwyn Rodrigues   fs: Introduce fil...
457
458
459
460
   */
  bool filemap_range_has_page(struct address_space *mapping,
  			   loff_t start_byte, loff_t end_byte)
  {
f7b680468   Jan Kara   mm: use find_get_...
461
  	struct page *page;
8fa8e538e   Matthew Wilcox   page cache: Conve...
462
463
  	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
  	pgoff_t max = end_byte >> PAGE_SHIFT;
7fc9e4722   Goldwyn Rodrigues   fs: Introduce fil...
464
465
466
  
  	if (end_byte < start_byte)
  		return false;
8fa8e538e   Matthew Wilcox   page cache: Conve...
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
  	rcu_read_lock();
  	for (;;) {
  		page = xas_find(&xas, max);
  		if (xas_retry(&xas, page))
  			continue;
  		/* Shadow entries don't count */
  		if (xa_is_value(page))
  			continue;
  		/*
  		 * We don't need to try to pin this page; we're about to
  		 * release the RCU lock anyway.  It is enough to know that
  		 * there was a page here recently.
  		 */
  		break;
  	}
  	rcu_read_unlock();
7fc9e4722   Goldwyn Rodrigues   fs: Introduce fil...
483

8fa8e538e   Matthew Wilcox   page cache: Conve...
484
  	return page != NULL;
7fc9e4722   Goldwyn Rodrigues   fs: Introduce fil...
485
486
  }
  EXPORT_SYMBOL(filemap_range_has_page);
5e8fcc1a0   Jeff Layton   mm: don't TestCle...
487
  static void __filemap_fdatawait_range(struct address_space *mapping,
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
488
  				     loff_t start_byte, loff_t end_byte)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
489
  {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
490
491
  	pgoff_t index = start_byte >> PAGE_SHIFT;
  	pgoff_t end = end_byte >> PAGE_SHIFT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
492
493
  	struct pagevec pvec;
  	int nr_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
494

94004ed72   Christoph Hellwig   kill wait_on_page...
495
  	if (end_byte < start_byte)
5e8fcc1a0   Jeff Layton   mm: don't TestCle...
496
  		return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
497

866798201   Mel Gorman   mm, pagevec: remo...
498
  	pagevec_init(&pvec);
312e9d2f7   Jan Kara   mm: use pagevec_l...
499
  	while (index <= end) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
500
  		unsigned i;
312e9d2f7   Jan Kara   mm: use pagevec_l...
501
  		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
67fd707f4   Jan Kara   mm: remove nr_pag...
502
  				end, PAGECACHE_TAG_WRITEBACK);
312e9d2f7   Jan Kara   mm: use pagevec_l...
503
504
  		if (!nr_pages)
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
505
506
  		for (i = 0; i < nr_pages; i++) {
  			struct page *page = pvec.pages[i];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
507
  			wait_on_page_writeback(page);
5e8fcc1a0   Jeff Layton   mm: don't TestCle...
508
  			ClearPageError(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
509
510
511
512
  		}
  		pagevec_release(&pvec);
  		cond_resched();
  	}
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
  }
  
  /**
   * filemap_fdatawait_range - wait for writeback to complete
   * @mapping:		address space structure to wait for
   * @start_byte:		offset in bytes where the range starts
   * @end_byte:		offset in bytes where the range ends (inclusive)
   *
   * Walk the list of under-writeback pages of the given address space
   * in the given range and wait for all of them.  Check error status of
   * the address space and return it.
   *
   * Since the error status of the address space is cleared by this function,
   * callers are responsible for checking the return value and handling and/or
   * reporting the error.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
528
529
   *
   * Return: error status of the address space.
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
530
531
532
533
   */
  int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
  			    loff_t end_byte)
  {
5e8fcc1a0   Jeff Layton   mm: don't TestCle...
534
535
  	__filemap_fdatawait_range(mapping, start_byte, end_byte);
  	return filemap_check_errors(mapping);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
536
  }
d3bccb6f4   Jan Kara   vfs: Introduce fi...
537
538
539
  EXPORT_SYMBOL(filemap_fdatawait_range);
  
/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 * this function does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space (without clearing it).
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
		loff_t start_byte, loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
  
/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:		file pointing to address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	struct address_space *mapping = file->f_mapping;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	/* Per-file error reporting: advances file->f_wb_err, not the mapping. */
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);
d3bccb6f4   Jan Kara   vfs: Introduce fi...
585
586
  
/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	/* Whole-file wait: bytes 0 through LLONG_MAX. */
	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
606

875d91b11   Konstantin Khlebnikov   mm/filemap.c: rew...
607
  /* Returns true if writeback might be needed or already in progress. */
9326c9b20   Jeff Layton   mm: consolidate d...
608
  static bool mapping_needs_writeback(struct address_space *mapping)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
609
  {
875d91b11   Konstantin Khlebnikov   mm/filemap.c: rew...
610
611
612
613
  	if (dax_mapping(mapping))
  		return mapping->nrexceptional;
  
  	return mapping->nrpages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
614
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
615

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
616
617
618
619
620
621
/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO) {
			/* Keep the first error; the wait result is secondary. */
			int err2 = filemap_fdatawait_range(mapping,
						lstart, lend);
			if (!err)
				err = err2;
		} else {
			/* Clear any previously stored errors */
			filemap_check_errors(mapping);
		}
	} else {
		/* Nothing to write back, but still report any stored error. */
		err = filemap_check_errors(mapping);
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
658

5660e13d2   Jeff Layton   fs: new infrastru...
659
660
/**
 * __filemap_set_wb_err - record a writeback error in a mapping's wb_err
 * @mapping: the address_space the error occurred on
 * @err: the error to record
 *
 * Advance mapping->wb_err via errseq_set() so that later calls to
 * file_check_and_advance_wb_err() on files over this mapping observe it.
 */
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);
  
/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 * 				   and advance wb_err to current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
	int err = 0;
	errseq_t old = READ_ONCE(file->f_wb_err);
	struct address_space *mapping = file->f_mapping;

	/* Locklessly handle the common case where nothing has changed */
	if (errseq_check(&mapping->wb_err, old)) {
		/* Something changed, must use slow path */
		spin_lock(&file->f_lock);
		old = file->f_wb_err;
		err = errseq_check_and_advance(&mapping->wb_err,
						&file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, old);
		spin_unlock(&file->f_lock);
	}

	/*
	 * We're mostly using this function as a drop in replacement for
	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
	 * that the legacy code would have had on these flags.
	 */
	clear_bit(AS_EIO, &mapping->flags);
	clear_bit(AS_ENOSPC, &mapping->flags);
	return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);
  
/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:	file pointing to address_space with pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * After writing out and waiting on the data, we check and advance the
 * f_wb_err cursor to the latest value, and return any errors detected there.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
	int err = 0, err2;
	struct address_space *mapping = file->f_mapping;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	/* Errors are reported per-file here, unconditionally. */
	err2 = file_check_and_advance_wb_err(file);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(file_write_and_wait_range);
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
752
/**
 * replace_page_cache_page - replace a pagecache page with a new one
 * @old:	page to be replaced
 * @new:	page to replace with
 * @gfp_mask:	allocation mode
 *
 * This function replaces a page in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new page and
 * drops it for the old page.  Both the old and new pages must be
 * locked.  This function does not add the new page to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 *
 * Return: %0
 */
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
	struct address_space *mapping = old->mapping;
	void (*freepage)(struct page *) = mapping->a_ops->freepage;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);
	unsigned long flags;

	VM_BUG_ON_PAGE(!PageLocked(old), old);
	VM_BUG_ON_PAGE(!PageLocked(new), new);
	VM_BUG_ON_PAGE(new->mapping, new);

	/* The new page takes over the pagecache reference and identity. */
	get_page(new);
	new->mapping = mapping;
	new->index = offset;

	mem_cgroup_migrate(old, new);

	/* Swap the entries and fix up stats under the irq-safe xarray lock. */
	xas_lock_irqsave(&xas, flags);
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	if (!PageHuge(old))
		__dec_lruvec_page_state(old, NR_FILE_PAGES);
	if (!PageHuge(new))
		__inc_lruvec_page_state(new, NR_FILE_PAGES);
	if (PageSwapBacked(old))
		__dec_lruvec_page_state(old, NR_SHMEM);
	if (PageSwapBacked(new))
		__inc_lruvec_page_state(new, NR_SHMEM);
	xas_unlock_irqrestore(&xas, flags);

	/* Give the filesystem a chance to release the old page, then drop it. */
	if (freepage)
		freepage(old);
	put_page(old);

	return 0;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
16c0cc0ce   Andrew Morton   revert "mm/filema...
806
/*
 * Insert a locked page into the page cache at @offset, optionally returning
 * any replaced shadow entry via @shadowp.  Retries the xarray store with
 * xas_nomem() until allocation succeeds or a hard error occurs.
 * "noinline" keeps this a stable target for error injection (see below).
 */
noinline int __add_to_page_cache_locked(struct page *page,
					struct address_space *mapping,
					pgoff_t offset, gfp_t gfp,
					void **shadowp)
{
	XA_STATE(xas, &mapping->i_pages, offset);
	int huge = PageHuge(page);
	int error;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
	mapping_set_update(&xas, mapping);

	/* Page cache reference; identity set before the store is visible. */
	get_page(page);
	page->mapping = mapping;
	page->index = offset;

	if (!huge) {
		error = mem_cgroup_charge(page, current->mm, gfp);
		if (error)
			goto error;
	}

	gfp &= GFP_RECLAIM_MASK;

	do {
		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
		void *entry, *old = NULL;

		/* Pre-allocate split nodes outside the lock if needed. */
		if (order > thp_order(page))
			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
					order, gfp);
		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			/* Only value (shadow) entries may be replaced. */
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
		}

		if (old) {
			if (shadowp)
				*shadowp = old;
			/* entry may have been split before we acquired lock */
			order = xa_get_order(xas.xa, xas.xa_index);
			if (order > thp_order(page)) {
				xas_split(&xas, old, order);
				xas_reset(&xas);
			}
		}

		xas_store(&xas, page);
		if (xas_error(&xas))
			goto unlock;

		if (old)
			mapping->nrexceptional--;
		mapping->nrpages++;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge)
			__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		error = xas_error(&xas);
		goto error;
	}

	trace_mm_filemap_add_to_page_cache(page);
	return 0;
error:
	page->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	put_page(page);
	return error;
}
ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
a528910e1   Johannes Weiner   mm: thrash detect...
881
882
883
884
885
886
887
888
889
890
  
  /**
   * add_to_page_cache_locked - add a locked page to the pagecache
   * @page:	page to add
   * @mapping:	the page's address_space
   * @offset:	page index
   * @gfp_mask:	page allocation mode
   *
   * This function is used to add a page to the pagecache. It must be locked.
   * This function does not add the page to the LRU.  The caller must do that.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
891
892
   *
   * Return: %0 on success, negative error code otherwise.
a528910e1   Johannes Weiner   mm: thrash detect...
893
894
895
896
897
898
899
   */
  int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
  		pgoff_t offset, gfp_t gfp_mask)
  {
  	return __add_to_page_cache_locked(page, mapping, offset,
  					  gfp_mask, NULL);
  }
e286781d5   Nick Piggin   mm: speculative p...
900
  EXPORT_SYMBOL(add_to_page_cache_locked);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
901
902
  
/*
 * Add a (not yet locked) page to the page cache and the LRU, handling
 * workingset refault detection via any shadow entry that was replaced.
 */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, gfp_t gfp_mask)
{
	void *shadow = NULL;
	int ret;

	__SetPageLocked(page);
	ret = __add_to_page_cache_locked(page, mapping, offset,
					 gfp_mask, &shadow);
	if (unlikely(ret))
		__ClearPageLocked(page);
	else {
		/*
		 * The page might have been evicted from cache only
		 * recently, in which case it should be activated like
		 * any other repeatedly accessed page.
		 * The exception is pages getting rewritten; evicting other
		 * data from the working set, only to cache data that will
		 * get overwritten with something else, is a waste of memory.
		 */
		WARN_ON_ONCE(PageActive(page));
		if (!(gfp_mask & __GFP_WRITE) && shadow)
			workingset_refault(page, shadow);
		lru_cache_add(page);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
929

44110fe38   Paul Jackson   [PATCH] cpuset me...
930
  #ifdef CONFIG_NUMA
2ae88149a   Nick Piggin   [PATCH] mm: clean...
931
  struct page *__page_cache_alloc(gfp_t gfp)
44110fe38   Paul Jackson   [PATCH] cpuset me...
932
  {
c0ff7453b   Miao Xie   cpuset,mm: fix no...
933
934
  	int n;
  	struct page *page;
44110fe38   Paul Jackson   [PATCH] cpuset me...
935
  	if (cpuset_do_page_mem_spread()) {
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
936
937
  		unsigned int cpuset_mems_cookie;
  		do {
d26914d11   Mel Gorman   mm: optimize put_...
938
  			cpuset_mems_cookie = read_mems_allowed_begin();
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
939
  			n = cpuset_mem_spread_node();
96db800f5   Vlastimil Babka   mm: rename alloc_...
940
  			page = __alloc_pages_node(n, gfp, 0);
d26914d11   Mel Gorman   mm: optimize put_...
941
  		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
942

c0ff7453b   Miao Xie   cpuset,mm: fix no...
943
  		return page;
44110fe38   Paul Jackson   [PATCH] cpuset me...
944
  	}
2ae88149a   Nick Piggin   [PATCH] mm: clean...
945
  	return alloc_pages(gfp, 0);
44110fe38   Paul Jackson   [PATCH] cpuset me...
946
  }
2ae88149a   Nick Piggin   [PATCH] mm: clean...
947
  EXPORT_SYMBOL(__page_cache_alloc);
44110fe38   Paul Jackson   [PATCH] cpuset me...
948
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
949
950
951
952
953
954
955
956
957
958
/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

/* Hash a page pointer to its bucket in the shared wait table. */
static wait_queue_head_t *page_waitqueue(struct page *page)
{
	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
}

/* Boot-time setup: initialize every wait bucket, then writeback state. */
void __init pagecache_init(void)
{
	int i;

	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
		init_waitqueue_head(&page_wait_table[i]);

	page_writeback_init();
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
977

5ef64cc89   Linus Torvalds   mm: allow a contr...
978
979
/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *	We're just waiting for the bit to be released, and when a waker
 *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *	and remove it from the wait queue.
 *
 *	Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *	The waiter is waiting to get the lock, and only one waiter should
 *	be woken up to avoid any thundering herd behavior. We'll set the
 *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *	This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *	The waiter is waiting to get the bit, and additionally wants the
 *	lock to be transferred to it for fair lock behavior. If the lock
 *	cannot be taken, we stop walking the wait queue without waking
 *	the waiter.
 *
 *	This is the "fair lock handoff" case, and in addition to setting
 *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *	that it now has the lock.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	/* Entries for other pages/bits share this hashed queue; skip them. */
	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		if (test_bit(key->bit_nr, &key->page->flags))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			if (test_and_set_bit(key->bit_nr, &key->page->flags))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in wait_on_page_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
74d81bfae   Nicholas Piggin   mm: un-export wak...
1061
/* Wake waiters for @bit_nr on @page, using a bookmark to bound lock hold time. */
static void wake_up_page_bit(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	struct wait_page_key key;
	unsigned long flags;
	wait_queue_entry_t bookmark;

	key.page = page;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	bookmark.flags = 0;
	bookmark.private = NULL;
	bookmark.func = NULL;
	INIT_LIST_HEAD(&bookmark.entry);

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);

	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
		/*
		 * Take a breather from holding the lock,
		 * allow pages that finish wake up asynchronously
		 * to acquire the lock and remove themselves
		 * from wait queue
		 */
		spin_unlock_irqrestore(&q->lock, flags);
		cpu_relax();
		spin_lock_irqsave(&q->lock, flags);
		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
	}

	/*
	 * It is possible for other pages to have collided on the waitqueue
	 * hash, so in that case check for a page match. That prevents a long-
	 * term waiter
	 *
	 * It is still possible to miss a case here, when we woke page waiters
	 * and removed them from the waitqueue, but there are still other
	 * page waiters.
	 */
	if (!waitqueue_active(q) || !key.page_match) {
		ClearPageWaiters(page);
		/*
		 * It's possible to miss clearing Waiters here, when we woke
		 * our page waiters, but the hashed waitqueue has waiters for
		 * other pages on it.
		 *
		 * That's okay, it's a rare case. The next waker will clear it.
		 */
	}
	spin_unlock_irqrestore(&q->lock, flags);
}
74d81bfae   Nicholas Piggin   mm: un-export wak...
1111
1112
1113
1114
1115
1116
1117
  
  static void wake_up_page(struct page *page, int bit)
  {
  	if (!PageWaiters(page))
  		return;
  	wake_up_page_bit(page, bit);
  }
629060270   Nicholas Piggin   mm: add PageWaite...
1118

9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
  /*
   * A choice of three behaviors for wait_on_page_bit_common():
   */
  enum behavior {
  	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
  			 * __lock_page() waiting on then setting PG_locked.
  			 */
  	SHARED,		/* Hold ref to page and check the bit when woken, like
  			 * wait_on_page_writeback() waiting on PG_writeback.
  			 */
  	DROP,		/* Drop ref to page before wait, no check when woken,
  			 * like put_and_wait_on_page_locked() on PG_locked.
  			 */
  };
2a9127fcf   Linus Torvalds   mm: rewrite wait_...
1133
  /*
5ef64cc89   Linus Torvalds   mm: allow a contr...
1134
1135
   * Attempt to check (or get) the page bit, and mark us done
   * if successful.
2a9127fcf   Linus Torvalds   mm: rewrite wait_...
1136
1137
1138
1139
1140
1141
1142
1143
1144
   */
  static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
  					struct wait_queue_entry *wait)
  {
  	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
  		if (test_and_set_bit(bit_nr, &page->flags))
  			return false;
  	} else if (test_bit(bit_nr, &page->flags))
  		return false;
5ef64cc89   Linus Torvalds   mm: allow a contr...
1145
  	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
2a9127fcf   Linus Torvalds   mm: rewrite wait_...
1146
1147
  	return true;
  }
5ef64cc89   Linus Torvalds   mm: allow a contr...
1148
1149
  /* How many times do we accept lock stealing from under a waiter? */
  int sysctl_page_lock_unfairness = 5;
dec0fd4a0   Jimmy Shiu   ANDROID: attribut...
1150
/*
 * wait_on_page_bit_common() - sleep until @bit_nr of page->flags is clear
 * @q:		hashed waitqueue for @page (from page_waitqueue())
 * @page:	page whose flag bit is being waited on
 * @bit_nr:	bit in page->flags, e.g. PG_locked or PG_writeback
 * @state:	task state for the sleep (TASK_UNINTERRUPTIBLE/TASK_KILLABLE)
 * @behavior:	EXCLUSIVE (acquire the bit on wake), SHARED (just wait),
 *		or DROP (drop the page reference before sleeping)
 *
 * Return: 0 on success (for EXCLUSIVE the caller then owns the bit),
 * -EINTR if a signal ended a killable wait first.
 */
static inline __sched int wait_on_page_bit_common(wait_queue_head_t *q,
	struct page *page, int bit_nr, int state, enum behavior behavior)
{
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	bool delayacct = false;
	unsigned long pflags;

	/*
	 * Blocking on the lock of a not-uptodate workingset page means a
	 * refault is in flight: account the stall to PSI (and to delayacct
	 * for non-swap-backed, i.e. page-cache, pages).
	 */
	if (bit_nr == PG_locked &&
	    !PageUptodate(page) && PageWorkingset(page)) {
		if (!PageSwapBacked(page)) {
			delayacct_thrashing_start();
			delayacct = true;
		}
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.page = page;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		/* Out of "unfair" attempts: demand a fair, in-order handoff */
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the SetPageWaiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	SetPageWaiters(page);
	if (!trylock_page_bit_common(page, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the page.
	 */
	if (behavior == DROP)
		put_page(page);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the PageWaiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		if (delayacct)
			delayacct_thrashing_end();
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now figure out the final
	 * return value based on that state without races.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
	 */
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
dec0fd4a0   Jimmy Shiu   ANDROID: attribut...
1281
/*
 * Wait uninterruptibly for @bit_nr of @page to clear, keeping our page
 * reference across the sleep (SHARED behavior).
 */
__sched void wait_on_page_bit(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit);
dec0fd4a0   Jimmy Shiu   ANDROID: attribut...
1287
/*
 * As wait_on_page_bit(), but the sleep is TASK_KILLABLE.
 *
 * Return: 0 when the bit cleared, -EINTR if a fatal signal arrived first.
 */
__sched int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
cbbce8220   NeilBrown   SCHED: add some "...
1293

dd3e6d503   Jens Axboe   mm: add support f...
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
/*
 * Async (non-blocking) wait on PG_locked: queue the caller-supplied
 * @wait entry on the page's hashed waitqueue instead of sleeping.
 *
 * If @set, additionally try to take the lock (trylock_page());
 * otherwise merely test PageLocked().
 *
 * Return: 0 if the desired state was reached immediately (the entry is
 * removed again before the queue lock is dropped, so its callback can
 * never fire), or -EIOCBQUEUED if the entry was left queued and the
 * wake callback will notify the caller later.
 */
static int __wait_on_page_locked_async(struct page *page,
				       struct wait_page_queue *wait, bool set)
{
	struct wait_queue_head *q = page_waitqueue(page);
	int ret = 0;

	wait->page = page;
	wait->bit_nr = PG_locked;

	spin_lock_irq(&q->lock);
	__add_wait_queue_entry_tail(q, &wait->wait);
	SetPageWaiters(page);
	if (set)
		ret = !trylock_page(page);
	else
		ret = PageLocked(page);
	/*
	 * If we were successful now, we know we're still on the
	 * waitqueue as we're still under the lock. This means it's
	 * safe to remove and return success, we know the callback
	 * isn't going to trigger.
	 */
	if (!ret)
		__remove_wait_queue(q, &wait->wait);
	else
		ret = -EIOCBQUEUED;
	spin_unlock_irq(&q->lock);
	return ret;
}
1a0a7853b   Jens Axboe   mm: support async...
1323
1324
1325
1326
1327
1328
1329
  static int wait_on_page_locked_async(struct page *page,
  				     struct wait_page_queue *wait)
  {
  	if (!PageLocked(page))
  		return 0;
  	return __wait_on_page_locked_async(compound_head(page), wait, false);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1330
  /**
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
   * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
   * @page: The page to wait for.
   *
   * The caller should hold a reference on @page.  They expect the page to
   * become unlocked relatively soon, but do not wish to hold up migration
   * (for example) by holding the reference while waiting for the page to
   * come unlocked.  After this function returns, the caller should not
   * dereference @page.
   */
  void put_and_wait_on_page_locked(struct page *page)
  {
  	wait_queue_head_t *q;
  
  	page = compound_head(page);
  	q = page_waitqueue(page);
  	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
  }
  
  /**
385e1ca5f   David Howells   CacheFiles: Permi...
1350
   * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
697f619fc   Randy Dunlap   filemap: fix kern...
1351
1352
   * @page: Page defining the wait queue of interest
   * @waiter: Waiter to add to the queue
385e1ca5f   David Howells   CacheFiles: Permi...
1353
1354
1355
   *
   * Add an arbitrary @waiter to the wait queue for the nominated @page.
   */
ac6424b98   Ingo Molnar   sched/wait: Renam...
1356
/*
 * Append an arbitrary @waiter to @page's hashed wait queue, under the
 * queue lock.
 */
void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
	wait_queue_head_t *q = page_waitqueue(page);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue_entry_tail(q, waiter);
	/* Let the unlock fast path know the hashed queue must be checked */
	SetPageWaiters(page);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);
b91e1302a   Linus Torvalds   mm: optimize Page...
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
#ifndef clear_bit_unlock_is_negative_byte

/*
 * Generic fallback for architectures that do not provide a fused
 * "clear bit and test sign of the byte" operation.
 *
 * PG_waiters is the high bit in the same byte as PG_lock.
 *
 * On x86 (and on many other architectures), we can clear PG_lock and
 * test the sign bit at the same time. But if the architecture does
 * not support that special operation, we just do this all by hand
 * instead.
 *
 * The read of PG_waiters has to be after (or concurrently with) PG_locked
 * being cleared, but a memory barrier should be unnecessary since it is
 * in the same byte as PG_locked.
 */
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
{
	clear_bit_unlock(nr, mem);
	/* smp_mb__after_atomic(); */
	return test_bit(PG_waiters, mem);
}

#endif
385e1ca5f   David Howells   CacheFiles: Permi...
1389
  /**
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1390
   * unlock_page - unlock a locked page
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1391
1392
   * @page: the page
   *
0e9aa6755   Miaohe Lin   mm: fix some brok...
1393
   * Unlocks the page and wakes up sleepers in wait_on_page_locked().
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1394
   * Also wakes sleepers in wait_on_page_writeback() because the wakeup
da3dae54e   Masanari Iida   Documentation: Do...
1395
   * mechanism between PageLocked pages and PageWriteback pages is shared.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1396
1397
   * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
   *
b91e1302a   Linus Torvalds   mm: optimize Page...
1398
1399
1400
1401
1402
   * Note that this depends on PG_waiters being the sign bit in the byte
   * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
   * clear the PG_locked bit and test PG_waiters at the same time fairly
   * portably (architectures that do LL/SC can test any bit, while x86 can
   * test the sign bit).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1403
   */
920c7a5d0   Harvey Harrison   mm: remove fastca...
1404
void unlock_page(struct page *page)
{
	/* The negative-byte trick requires PG_waiters to be the sign bit */
	BUILD_BUG_ON(PG_waiters != 7);
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	/*
	 * Clear PG_locked and learn whether PG_waiters was set in one
	 * operation (where the arch allows); only then take the slow
	 * wakeup path.
	 */
	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
		wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1413
1414
1415
  /**
   * end_page_writeback - end writeback against a page
   * @page: the page
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1416
1417
1418
   */
void end_page_writeback(struct page *page)
{
	/*
	 * TestClearPageReclaim could be used here but it is an atomic
	 * operation and overkill in this particular case. Failing to
	 * shuffle a page marked for immediate reclaim is too mild to
	 * justify taking an atomic operation penalty at the end of
	 * every page writeback.
	 */
	if (PageReclaim(page)) {
		ClearPageReclaim(page);
		rotate_reclaimable_page(page);
	}

	/*
	 * Writeback does not hold a page reference of its own, relying
	 * on truncation to wait for the clearing of PG_writeback.
	 * But here we must make sure that the page is not freed and
	 * reused before the wake_up_page().
	 */
	get_page(page);
	if (!test_clear_page_writeback(page))
		BUG();

	/* Order the PG_writeback clear before the waiter check below */
	smp_mb__after_atomic();
	wake_up_page(page, PG_writeback);
	put_page(page);
}
EXPORT_SYMBOL(end_page_writeback);
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1445
1446
1447
1448
  /*
   * After completing I/O on a page, call this routine to update the page
   * flags appropriately
   */
c11f0c0b5   Jens Axboe   block/mm: make bd...
1449
  void page_endio(struct page *page, bool is_write, int err)
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1450
  {
c11f0c0b5   Jens Axboe   block/mm: make bd...
1451
  	if (!is_write) {
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1452
1453
1454
1455
1456
1457
1458
  		if (!err) {
  			SetPageUptodate(page);
  		} else {
  			ClearPageUptodate(page);
  			SetPageError(page);
  		}
  		unlock_page(page);
abf545484   Mike Christie   mm/block: convert...
1459
  	} else {
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1460
  		if (err) {
dd8416c47   Minchan Kim   mm: do not access...
1461
  			struct address_space *mapping;
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1462
  			SetPageError(page);
dd8416c47   Minchan Kim   mm: do not access...
1463
1464
1465
  			mapping = page_mapping(page);
  			if (mapping)
  				mapping_set_error(mapping, err);
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1466
1467
1468
1469
1470
  		}
  		end_page_writeback(page);
  	}
  }
  EXPORT_SYMBOL_GPL(page_endio);
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1471
1472
  /**
   * __lock_page - get a lock on the page, assuming we need to sleep to get it
870667553   Randy Dunlap   mm: fix filemap.c...
1473
   * @__page: the page to lock
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1474
   */
dec0fd4a0   Jimmy Shiu   ANDROID: attribut...
1475
__sched void __lock_page(struct page *__page)
{
	struct page *page = compound_head(__page);
	wait_queue_head_t *q = page_waitqueue(page);

	/* EXCLUSIVE: on return we own PG_locked of the head page */
	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
				EXCLUSIVE);
}
EXPORT_SYMBOL(__lock_page);
dec0fd4a0   Jimmy Shiu   ANDROID: attribut...
1483
/*
 * Lock the (head) page, sleeping in TASK_KILLABLE.
 *
 * Return: 0 with the page locked, or -EINTR on a fatal signal.
 */
__sched int __lock_page_killable(struct page *__page)
{
	struct page *page = compound_head(__page);
	wait_queue_head_t *q = page_waitqueue(page);

	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
					EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
2687a3569   Matthew Wilcox   Add lock_page_kil...
1491

dec0fd4a0   Jimmy Shiu   ANDROID: attribut...
1492
/*
 * Async lock attempt: 0 if the lock was taken immediately,
 * -EIOCBQUEUED if @wait was queued and the caller will be notified
 * when the page is unlocked.
 */
__sched int __lock_page_async(struct page *page, struct wait_page_queue *wait)
{
	return __wait_on_page_locked_async(page, wait, true);
}
9a95f3cf7   Paul Cassella   mm: describe mmap...
1496
1497
  /*
   * Return values:
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1498
   * 1 - page is locked; mmap_lock is still held.
9a95f3cf7   Paul Cassella   mm: describe mmap...
1499
   * 0 - page is not locked.
3e4e28c5a   Michel Lespinasse   mmap locking API:...
1500
   *     mmap_lock has been released (mmap_read_unlock(), unless flags had both
9a95f3cf7   Paul Cassella   mm: describe mmap...
1501
   *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1502
   *     which case mmap_lock is still held.
9a95f3cf7   Paul Cassella   mm: describe mmap...
1503
1504
   *
   * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1505
   * with the page locked and the mmap_lock unperturbed.
9a95f3cf7   Paul Cassella   mm: describe mmap...
1506
   */
dec0fd4a0   Jimmy Shiu   ANDROID: attribut...
1507
__sched int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
			 unsigned int flags)
{
	if (fault_flag_allow_retry_first(flags)) {
		/*
		 * CAUTION! In this case, mmap_lock is not released
		 * even though return 0.
		 */
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			return 0;

		/* Drop mmap_lock before the (possibly long) wait */
		mmap_read_unlock(mm);
		if (flags & FAULT_FLAG_KILLABLE)
			wait_on_page_locked_killable(page);
		else
			wait_on_page_locked(page);
		return 0;
	} else {
		if (flags & FAULT_FLAG_KILLABLE) {
			int ret;

			ret = __lock_page_killable(page);
			if (ret) {
				/* Fatal signal: release mmap_lock, caller retries */
				mmap_read_unlock(mm);
				return 0;
			}
		} else
			__lock_page(page);
		return 1;
	}
}
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1537
  /**
0d3f92966   Matthew Wilcox   page cache: Conve...
1538
1539
1540
1541
   * page_cache_next_miss() - Find the next gap in the page cache.
   * @mapping: Mapping.
   * @index: Index.
   * @max_scan: Maximum range to search.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1542
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1543
1544
   * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
   * gap with the lowest index.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1545
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1546
1547
1548
1549
1550
   * This function may be called under the rcu_read_lock.  However, this will
   * not atomically search a snapshot of the cache at a single point in time.
   * For example, if a gap is created at index 5, then subsequently a gap is
   * created at index 10, page_cache_next_miss covering both indices may
   * return 10 if called under the rcu_read_lock.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1551
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1552
1553
1554
   * Return: The index of the gap if found, otherwise an index outside the
   * range specified (in which case 'return - index >= max_scan' will be true).
   * In the rare case of index wrap-around, 0 will be returned.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1555
   */
0d3f92966   Matthew Wilcox   page cache: Conve...
1556
  pgoff_t page_cache_next_miss(struct address_space *mapping,
e7b563bb2   Johannes Weiner   mm: filemap: move...
1557
1558
  			     pgoff_t index, unsigned long max_scan)
  {
0d3f92966   Matthew Wilcox   page cache: Conve...
1559
  	XA_STATE(xas, &mapping->i_pages, index);
e7b563bb2   Johannes Weiner   mm: filemap: move...
1560

0d3f92966   Matthew Wilcox   page cache: Conve...
1561
1562
1563
  	while (max_scan--) {
  		void *entry = xas_next(&xas);
  		if (!entry || xa_is_value(entry))
e7b563bb2   Johannes Weiner   mm: filemap: move...
1564
  			break;
0d3f92966   Matthew Wilcox   page cache: Conve...
1565
  		if (xas.xa_index == 0)
e7b563bb2   Johannes Weiner   mm: filemap: move...
1566
1567
  			break;
  	}
0d3f92966   Matthew Wilcox   page cache: Conve...
1568
  	return xas.xa_index;
e7b563bb2   Johannes Weiner   mm: filemap: move...
1569
  }
0d3f92966   Matthew Wilcox   page cache: Conve...
1570
  EXPORT_SYMBOL(page_cache_next_miss);
e7b563bb2   Johannes Weiner   mm: filemap: move...
1571
1572
  
  /**
2346a5605   Laurent Dufour   mm/filemap.c: fix...
1573
   * page_cache_prev_miss() - Find the previous gap in the page cache.
0d3f92966   Matthew Wilcox   page cache: Conve...
1574
1575
1576
   * @mapping: Mapping.
   * @index: Index.
   * @max_scan: Maximum range to search.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1577
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1578
1579
   * Search the range [max(index - max_scan + 1, 0), index] for the
   * gap with the highest index.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1580
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1581
1582
1583
1584
1585
   * This function may be called under the rcu_read_lock.  However, this will
   * not atomically search a snapshot of the cache at a single point in time.
   * For example, if a gap is created at index 10, then subsequently a gap is
   * created at index 5, page_cache_prev_miss() covering both indices may
   * return 5 if called under the rcu_read_lock.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1586
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1587
1588
1589
   * Return: The index of the gap if found, otherwise an index outside the
   * range specified (in which case 'index - return >= max_scan' will be true).
   * In the rare case of wrap-around, ULONG_MAX will be returned.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1590
   */
0d3f92966   Matthew Wilcox   page cache: Conve...
1591
  pgoff_t page_cache_prev_miss(struct address_space *mapping,
e7b563bb2   Johannes Weiner   mm: filemap: move...
1592
1593
  			     pgoff_t index, unsigned long max_scan)
  {
0d3f92966   Matthew Wilcox   page cache: Conve...
1594
  	XA_STATE(xas, &mapping->i_pages, index);
e7b563bb2   Johannes Weiner   mm: filemap: move...
1595

0d3f92966   Matthew Wilcox   page cache: Conve...
1596
1597
1598
  	while (max_scan--) {
  		void *entry = xas_prev(&xas);
  		if (!entry || xa_is_value(entry))
e7b563bb2   Johannes Weiner   mm: filemap: move...
1599
  			break;
0d3f92966   Matthew Wilcox   page cache: Conve...
1600
  		if (xas.xa_index == ULONG_MAX)
e7b563bb2   Johannes Weiner   mm: filemap: move...
1601
1602
  			break;
  	}
0d3f92966   Matthew Wilcox   page cache: Conve...
1603
  	return xas.xa_index;
e7b563bb2   Johannes Weiner   mm: filemap: move...
1604
  }
0d3f92966   Matthew Wilcox   page cache: Conve...
1605
  EXPORT_SYMBOL(page_cache_prev_miss);
e7b563bb2   Johannes Weiner   mm: filemap: move...
1606
1607
  
  /**
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1608
   * find_get_entry - find and get a page cache entry
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1609
   * @mapping: the address_space to search
a6de4b487   Matthew Wilcox (Oracle)   mm: convert find_...
1610
   * @index: The page cache index.
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1611
1612
   *
   * Looks up the page cache slot at @mapping & @offset.  If there is a
a6de4b487   Matthew Wilcox (Oracle)   mm: convert find_...
1613
   * page cache page, the head page is returned with an increased refcount.
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1614
   *
139b6a6fb   Johannes Weiner   mm: filemap: upda...
1615
1616
   * If the slot holds a shadow entry of a previously evicted page, or a
   * swap entry from shmem/tmpfs, it is returned.
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1617
   *
a6de4b487   Matthew Wilcox (Oracle)   mm: convert find_...
1618
   * Return: The head page or shadow entry, %NULL if nothing is found.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1619
   */
a6de4b487   Matthew Wilcox (Oracle)   mm: convert find_...
1620
struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct page *page;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	page = xas_load(&xas);
	if (xas_retry(&xas, page))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.  Return it without attempting to raise page count.
	 */
	if (!page || xa_is_value(page))
		goto out;

	/* Speculative ref: fails if the page's refcount already hit zero */
	if (!page_cache_get_speculative(page))
		goto repeat;

	/*
	 * Has the page moved or been split?
	 * This is part of the lockless pagecache protocol. See
	 * include/linux/pagemap.h for details.
	 */
	if (unlikely(page != xas_reload(&xas))) {
		put_page(page);
		goto repeat;
	}
out:
	rcu_read_unlock();

	return page;
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1654

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1655
  /**
63ec1973d   Matthew Wilcox (Oracle)   mm/shmem: return ...
1656
1657
1658
   * find_lock_entry - Locate and lock a page cache entry.
   * @mapping: The address_space to search.
   * @index: The page cache index.
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1659
   *
63ec1973d   Matthew Wilcox (Oracle)   mm/shmem: return ...
1660
1661
   * Looks up the page at @mapping & @index.  If there is a page in the
   * cache, the head page is returned locked and with an increased refcount.
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1662
   *
139b6a6fb   Johannes Weiner   mm: filemap: upda...
1663
1664
   * If the slot holds a shadow entry of a previously evicted page, or a
   * swap entry from shmem/tmpfs, it is returned.
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1665
   *
63ec1973d   Matthew Wilcox (Oracle)   mm/shmem: return ...
1666
1667
   * Context: May sleep.
   * Return: The head page or shadow entry, %NULL if nothing is found.
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1668
   */
63ec1973d   Matthew Wilcox (Oracle)   mm/shmem: return ...
1669
struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, index);
	if (page && !xa_is_value(page)) {
		lock_page(page);
		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			/* Lost a truncation race: drop it and look again */
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
	}
	return page;
}
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1686
1687
  
/**
 * pagecache_get_page - Find and get a reference to a page.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the page is returned.
 * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * @fgp_flags can be zero or more of these flags:
 *
 * * %FGP_ACCESSED - The page will be marked accessed.
 * * %FGP_LOCK - The page is returned locked.
 * * %FGP_HEAD - If the page is present and a THP, return the head page
 *   rather than the exact page specified by the index.
 * * %FGP_CREAT - If no page is present then a new page is allocated using
 *   @gfp_mask and added to the page cache and the VM's LRU list.
 *   The page is returned locked and with an increased refcount.
 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
 *   page is already in cache.  If the page was allocated, unlock it before
 *   returning so the caller can do the same dance.
 * * %FGP_WRITE - The page will be written
 * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
 * * %FGP_NOWAIT - Don't get blocked by page lock
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
 *
 * Return: The found page or %NULL otherwise.
 */
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
		int fgp_flags, gfp_t gfp_mask)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, index);
	/* Shadow/swap value entries are treated as "not present" here. */
	if (xa_is_value(page))
		page = NULL;
	if (!page)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			/* Caller asked not to block: give up on contention. */
			if (!trylock_page(page)) {
				put_page(page);
				return NULL;
			}
		} else {
			lock_page(page);
		}

		/*
		 * Has the page been truncated?  The lookup above was
		 * lockless, so recheck the mapping now that we hold the
		 * page lock, and retry from scratch if it changed.
		 */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
	}

	if (fgp_flags & FGP_ACCESSED)
		mark_page_accessed(page);
	else if (fgp_flags & FGP_WRITE) {
		/* Clear idle flag for buffer write */
		if (page_is_idle(page))
			clear_page_idle(page);
	}
	/* Unless the caller wants the head page, hand back the exact subpage. */
	if (!(fgp_flags & FGP_HEAD))
		page = find_subpage(page, index);

no_page:
	if (!page && (fgp_flags & FGP_CREAT)) {
		int err;
		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
			gfp_mask |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp_mask &= ~__GFP_FS;

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;

		/*
		 * Creation without FGP_LOCK or FGP_FOR_MMAP is unexpected;
		 * warn and fall back to returning the new page locked.
		 */
		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
			fgp_flags |= FGP_LOCK;

		/* Init accessed so avoid atomic mark_page_accessed later */
		if (fgp_flags & FGP_ACCESSED)
			__SetPageReferenced(page);

		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
		if (unlikely(err)) {
			put_page(page);
			page = NULL;
			/* -EEXIST: someone else inserted a page first; retry. */
			if (err == -EEXIST)
				goto repeat;
		}

		/*
		 * add_to_page_cache_lru locks the page, and for mmap we expect
		 * an unlocked page.
		 */
		if (page && (fgp_flags & FGP_FOR_MMAP))
			unlock_page(page);
	}

	return page;
}
EXPORT_SYMBOL(pagecache_get_page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1796
1797
  
/**
 * find_get_entries - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page cache index
 * @nr_entries:	The maximum number of entries
 * @entries:	Where the resulting entries are placed
 * @indices:	The cache indices corresponding to the entries in @entries
 *
 * find_get_entries() will search for and return a group of up to
 * @nr_entries entries in the mapping.  The entries are placed at
 * @entries.  find_get_entries() takes a reference against any actual
 * pages it returns.
 *
 * The search returns a group of mapping-contiguous page cache entries
 * with ascending indexes.  There may be holes in the indices due to
 * not-present pages.
 *
 * Any shadow entries of evicted pages, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * If it finds a Transparent Huge Page, head or tail, find_get_entries()
 * stops at that page: the caller is likely to have a better way to handle
 * the compound page as a whole, and then skip its extent, than repeatedly
 * calling find_get_entries() to return all its tails.
 *
 * Return: the number of pages and shadow entries which were found.
 */
unsigned find_get_entries(struct address_space *mapping,
			  pgoff_t start, unsigned int nr_entries,
			  struct page **entries, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned int ret = 0;

	if (!nr_entries)
		return 0;

	/* Lockless walk: only RCU protects the xarray traversal below. */
	rcu_read_lock();
	xas_for_each(&xas, page, ULONG_MAX) {
		if (xas_retry(&xas, page))
			continue;
		/*
		 * A shadow entry of a recently evicted page, a swap
		 * entry from shmem/tmpfs or a DAX entry.  Return it
		 * without attempting to raise page count.
		 */
		if (xa_is_value(page))
			goto export;

		/* Couldn't take a reference; re-walk this slot from scratch. */
		if (!page_cache_get_speculative(page))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto put_page;

		/*
		 * Terminate early on finding a THP, to allow the caller to
		 * handle it all at once; but continue if this is hugetlbfs.
		 */
		if (PageTransHuge(page) && !PageHuge(page)) {
			page = find_subpage(page, xas.xa_index);
			nr_entries = ret + 1;
		}
export:
		indices[ret] = xas.xa_index;
		entries[ret] = page;
		if (++ret == nr_entries)
			break;
		continue;
put_page:
		/* Entry changed under us: drop the speculative ref and retry. */
		put_page(page);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
	return ret;
}
  
/**
 * find_get_pages_range - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @end:	The final page index (inclusive)
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages_range() will search for and return a group of up to @nr_pages
 * pages in the mapping starting at index @start and up to index @end
 * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
 * a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 * We also update @start to index the next page for the traversal.
 *
 * Return: the number of pages which were found. If this number is
 * smaller than @nr_pages, the end of specified range has been
 * reached.
 */
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
			      pgoff_t end, unsigned int nr_pages,
			      struct page **pages)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct page *page;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	/* RCU-protected lockless xarray walk. */
	rcu_read_lock();
	xas_for_each(&xas, page, end) {
		if (xas_retry(&xas, page))
			continue;
		/* Skip over shadow, swap and DAX entries */
		if (xa_is_value(page))
			continue;

		/* Couldn't take a reference; re-walk this slot from scratch. */
		if (!page_cache_get_speculative(page))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto put_page;

		pages[ret] = find_subpage(page, xas.xa_index);
		if (++ret == nr_pages) {
			/* Filled the array: resume after the last page found. */
			*start = xas.xa_index + 1;
			goto out;
		}
		continue;
put_page:
		put_page(page);
retry:
		xas_reset(&xas);
	}

	/*
	 * We come here when there is no page beyond @end. We take care to not
	 * overflow the index @start as it confuses some of the callers. This
	 * breaks the iteration when there is a page at index -1 but that is
	 * already broken anyway.
	 */
	if (end == (pgoff_t)-1)
		*start = (pgoff_t)-1;
	else
		*start = end + 1;
out:
	rcu_read_unlock();

	return ret;
}
ebf43500e   Jens Axboe   [PATCH] Add find_...
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping:	The address_space to search
 * @index:	The starting page index
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned number of pages are guaranteed to be contiguous.
 *
 * Return: the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct page *page;
	unsigned int ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	/* RCU-protected lockless xarray walk. */
	rcu_read_lock();
	/*
	 * Unlike xas_for_each(), stepping with xas_next() stops at the
	 * first hole, which is what gives the contiguity guarantee.
	 */
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;
		/*
		 * If the entry has been swapped out, we can stop looking.
		 * No current caller is looking for DAX entries.
		 */
		if (xa_is_value(page))
			break;

		/* Couldn't take a reference; re-walk this slot from scratch. */
		if (!page_cache_get_speculative(page))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto put_page;

		pages[ret] = find_subpage(page, xas.xa_index);
		if (++ret == nr_pages)
			break;
		continue;
put_page:
		put_page(page);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);
ebf43500e   Jens Axboe   [PATCH] Add find_...
2003

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2004
/**
 * find_get_pages_range_tag - find and return pages in given range matching @tag
 * @mapping:	the address_space to search
 * @index:	the starting page index
 * @end:	The final page index (inclusive)
 * @tag:	the tag index
 * @nr_pages:	the maximum number of pages
 * @pages:	where the resulting pages are placed
 *
 * Like find_get_pages, except we only return pages which are tagged with
 * @tag.   We update @index to index the next page for the traversal.
 *
 * Return: the number of pages which were found.
 */
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
			pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
			struct page **pages)
{
	XA_STATE(xas, &mapping->i_pages, *index);
	struct page *page;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	/* RCU-protected lockless walk over entries marked with @tag. */
	rcu_read_lock();
	xas_for_each_marked(&xas, page, end, tag) {
		if (xas_retry(&xas, page))
			continue;
		/*
		 * Shadow entries should never be tagged, but this iteration
		 * is lockless so there is a window for page reclaim to evict
		 * a page we saw tagged.  Skip over it.
		 */
		if (xa_is_value(page))
			continue;

		/* Couldn't take a reference; re-walk this slot from scratch. */
		if (!page_cache_get_speculative(page))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto put_page;

		pages[ret] = find_subpage(page, xas.xa_index);
		if (++ret == nr_pages) {
			/* Filled the array: resume after the last page found. */
			*index = xas.xa_index + 1;
			goto out;
		}
		continue;
put_page:
		put_page(page);
retry:
		xas_reset(&xas);
	}

	/*
	 * We come here when we got to @end. We take care to not overflow the
	 * index @index as it confuses some of the callers. This breaks the
	 * iteration when there is a page at index -1 but that is already
	 * broken anyway.
	 */
	if (end == (pgoff_t)-1)
		*index = (pgoff_t)-1;
	else
		*index = end + 1;
out:
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(find_get_pages_range_tag);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2076

76d42bd96   Wu Fengguang   [PATCH] readahead...
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
  /*
   * CD/DVDs are error prone. When a medium error occurs, the driver may fail
   * a _large_ part of the i/o request. Imagine the worst scenario:
   *
   *      ---R__________________________________________B__________
   *         ^ reading here                             ^ bad block(assume 4k)
   *
   * read(R) => miss => readahead(R...B) => media error => frustrating retries
   * => failing the whole request => read(R) => read(R+1) =>
   * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
   * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
   * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
   *
   * It is going insane. Fix it by quickly scaling down the readahead size.
   */
0f8e2db4e   Souptick Joarder   mm/filemap.c: rem...
2092
  static void shrink_readahead_size_eio(struct file_ra_state *ra)
76d42bd96   Wu Fengguang   [PATCH] readahead...
2093
  {
76d42bd96   Wu Fengguang   [PATCH] readahead...
2094
  	ra->ra_pages /= 4;
76d42bd96   Wu Fengguang   [PATCH] readahead...
2095
  }
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2096
  /**
47c27bc46   Christoph Hellwig   fs: pass iocb to ...
2097
2098
   * generic_file_buffered_read - generic file read routine
   * @iocb:	the iocb to read
6e58e79db   Al Viro   introduce copy_pa...
2099
2100
   * @iter:	data destination
   * @written:	already copied
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2101
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2102
   * This is a generic file read routine, and uses the
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2103
   * mapping->a_ops->readpage() function for the actual low-level stuff.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2104
2105
2106
   *
   * This is really ugly. But the goto's actually try to clarify some
   * of the logic when it comes to error handling etc.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2107
2108
2109
2110
   *
   * Return:
   * * total number of bytes copied, including those the were already @written
   * * negative error code if nothing was copied
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2111
   */
d85dc2e11   Goldwyn Rodrigues   fs: export generi...
2112
  ssize_t generic_file_buffered_read(struct kiocb *iocb,
6e58e79db   Al Viro   introduce copy_pa...
2113
  		struct iov_iter *iter, ssize_t written)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2114
  {
47c27bc46   Christoph Hellwig   fs: pass iocb to ...
2115
  	struct file *filp = iocb->ki_filp;
36e789144   Christoph Hellwig   kill do_generic_m...
2116
  	struct address_space *mapping = filp->f_mapping;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2117
  	struct inode *inode = mapping->host;
36e789144   Christoph Hellwig   kill do_generic_m...
2118
  	struct file_ra_state *ra = &filp->f_ra;
47c27bc46   Christoph Hellwig   fs: pass iocb to ...
2119
  	loff_t *ppos = &iocb->ki_pos;
57f6b96c0   Fengguang Wu   filemap: convert ...
2120
2121
2122
2123
  	pgoff_t index;
  	pgoff_t last_index;
  	pgoff_t prev_index;
  	unsigned long offset;      /* offset into pagecache page */
ec0f16372   Jan Kara   readahead: improv...
2124
  	unsigned int prev_offset;
6e58e79db   Al Viro   introduce copy_pa...
2125
  	int error = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2126

c2a9737f4   Wei Fang   vfs,mm: fix a dea...
2127
  	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
d05c5f7ba   Linus Torvalds   vfs,mm: fix retur...
2128
  		return 0;
c2a9737f4   Wei Fang   vfs,mm: fix a dea...
2129
  	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2130
2131
2132
2133
2134
  	index = *ppos >> PAGE_SHIFT;
  	prev_index = ra->prev_pos >> PAGE_SHIFT;
  	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
  	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
  	offset = *ppos & ~PAGE_MASK;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2135

13bd69142   Jens Axboe   mm: mark async io...
2136
2137
2138
2139
2140
2141
2142
  	/*
  	 * If we've already successfully copied some data, then we
  	 * can no longer safely return -EIOCBQUEUED. Hence mark
  	 * an async read NOWAIT at that point.
  	 */
  	if (written && (iocb->ki_flags & IOCB_WAITQ))
  		iocb->ki_flags |= IOCB_NOWAIT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2143
2144
  	for (;;) {
  		struct page *page;
57f6b96c0   Fengguang Wu   filemap: convert ...
2145
  		pgoff_t end_index;
a32ea1e1f   NeilBrown   Fix read/truncate...
2146
  		loff_t isize;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2147
  		unsigned long nr, ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2148
  		cond_resched();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2149
  find_page:
5abf186a3   Michal Hocko   mm, fs: check for...
2150
2151
2152
2153
  		if (fatal_signal_pending(current)) {
  			error = -EINTR;
  			goto out;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2154
  		page = find_get_page(mapping, index);
3ea89ee86   Fengguang Wu   readahead: conver...
2155
  		if (!page) {
cdc8fcb49   Linus Torvalds   Merge tag 'for-5....
2156
  			if (iocb->ki_flags & IOCB_NOIO)
3239d8348   Milosz Tanski   fs: support IOCB_...
2157
  				goto would_block;
cf914a7d6   Rusty Russell   readahead: split ...
2158
  			page_cache_sync_readahead(mapping,
7ff81078d   Fengguang Wu   readahead: remove...
2159
  					ra, filp,
3ea89ee86   Fengguang Wu   readahead: conver...
2160
2161
2162
2163
2164
2165
  					index, last_index - index);
  			page = find_get_page(mapping, index);
  			if (unlikely(page == NULL))
  				goto no_cached_page;
  		}
  		if (PageReadahead(page)) {
41da51bce   Andreas Gruenbacher   fs: Add IOCB_NOIO...
2166
2167
2168
2169
  			if (iocb->ki_flags & IOCB_NOIO) {
  				put_page(page);
  				goto out;
  			}
cf914a7d6   Rusty Russell   readahead: split ...
2170
  			page_cache_async_readahead(mapping,
7ff81078d   Fengguang Wu   readahead: remove...
2171
  					ra, filp, page,
3ea89ee86   Fengguang Wu   readahead: conver...
2172
  					index, last_index - index);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2173
  		}
8ab22b9ab   Hisashi Hifumi   vfs: pagecache us...
2174
  		if (!PageUptodate(page)) {
ebded0278   Mel Gorman   mm: filemap: avoi...
2175
2176
2177
2178
2179
  			/*
  			 * See comment in do_read_cache_page on why
  			 * wait_on_page_locked is used to avoid unnecessarily
  			 * serialisations and why it's safe.
  			 */
1a0a7853b   Jens Axboe   mm: support async...
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
  			if (iocb->ki_flags & IOCB_WAITQ) {
  				if (written) {
  					put_page(page);
  					goto out;
  				}
  				error = wait_on_page_locked_async(page,
  								iocb->ki_waitq);
  			} else {
  				if (iocb->ki_flags & IOCB_NOWAIT) {
  					put_page(page);
  					goto would_block;
  				}
  				error = wait_on_page_locked_killable(page);
  			}
c4b209a42   Bart Van Assche   do_generic_file_r...
2194
2195
  			if (unlikely(error))
  				goto readpage_error;
ebded0278   Mel Gorman   mm: filemap: avoi...
2196
2197
  			if (PageUptodate(page))
  				goto page_ok;
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2198
  			if (inode->i_blkbits == PAGE_SHIFT ||
8ab22b9ab   Hisashi Hifumi   vfs: pagecache us...
2199
2200
  					!mapping->a_ops->is_partially_uptodate)
  				goto page_not_up_to_date;
6d6d36bc6   Eryu Guan   mm/filemap: don't...
2201
  			/* pipes can't handle partially uptodate pages */
00e237074   David Howells   iov_iter: Use acc...
2202
  			if (unlikely(iov_iter_is_pipe(iter)))
6d6d36bc6   Eryu Guan   mm/filemap: don't...
2203
  				goto page_not_up_to_date;
529ae9aaa   Nick Piggin   mm: rename page t...
2204
  			if (!trylock_page(page))
8ab22b9ab   Hisashi Hifumi   vfs: pagecache us...
2205
  				goto page_not_up_to_date;
8d056cb96   Dave Hansen   mm/vfs: revalidat...
2206
2207
2208
  			/* Did it get truncated before we got the lock? */
  			if (!page->mapping)
  				goto page_not_up_to_date_locked;
8ab22b9ab   Hisashi Hifumi   vfs: pagecache us...
2209
  			if (!mapping->a_ops->is_partially_uptodate(page,
6e58e79db   Al Viro   introduce copy_pa...
2210
  							offset, iter->count))
8ab22b9ab   Hisashi Hifumi   vfs: pagecache us...
2211
2212
2213
  				goto page_not_up_to_date_locked;
  			unlock_page(page);
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2214
  page_ok:
a32ea1e1f   NeilBrown   Fix read/truncate...
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
  		/*
  		 * i_size must be checked after we know the page is Uptodate.
  		 *
  		 * Checking i_size after the check allows us to calculate
  		 * the correct value for "nr", which means the zero-filled
  		 * part of the page is not copied back to userspace (unless
  		 * another truncate extends the file - this is desired though).
  		 */
  
  		isize = i_size_read(inode);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2225
  		end_index = (isize - 1) >> PAGE_SHIFT;
a32ea1e1f   NeilBrown   Fix read/truncate...
2226
  		if (unlikely(!isize || index > end_index)) {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2227
  			put_page(page);
a32ea1e1f   NeilBrown   Fix read/truncate...
2228
2229
2230
2231
  			goto out;
  		}
  
  		/* nr is the maximum number of bytes to copy from this page */
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2232
  		nr = PAGE_SIZE;
a32ea1e1f   NeilBrown   Fix read/truncate...
2233
  		if (index == end_index) {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2234
  			nr = ((isize - 1) & ~PAGE_MASK) + 1;
a32ea1e1f   NeilBrown   Fix read/truncate...
2235
  			if (nr <= offset) {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2236
  				put_page(page);
a32ea1e1f   NeilBrown   Fix read/truncate...
2237
2238
2239
2240
  				goto out;
  			}
  		}
  		nr = nr - offset;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2241
2242
2243
2244
2245
2246
2247
2248
2249
  
  		/* If users can be writing to this page using arbitrary
  		 * virtual addresses, take care about potential aliasing
  		 * before reading the page on the kernel side.
  		 */
  		if (mapping_writably_mapped(mapping))
  			flush_dcache_page(page);
  
  		/*
ec0f16372   Jan Kara   readahead: improv...
2250
2251
  		 * When a sequential read accesses a page several times,
  		 * only mark it as accessed the first time.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2252
  		 */
ec0f16372   Jan Kara   readahead: improv...
2253
  		if (prev_index != index || offset != prev_offset)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2254
2255
2256
2257
2258
2259
  			mark_page_accessed(page);
  		prev_index = index;
  
  		/*
  		 * Ok, we have the page, and it's up-to-date, so
  		 * now we can copy it to user space...
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2260
  		 */
6e58e79db   Al Viro   introduce copy_pa...
2261
2262
  
  		ret = copy_page_to_iter(page, offset, nr, iter);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2263
  		offset += ret;
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2264
2265
  		index += offset >> PAGE_SHIFT;
  		offset &= ~PAGE_MASK;
6ce745ed3   Jan Kara   readahead: code c...
2266
  		prev_offset = offset;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2267

09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2268
  		put_page(page);
6e58e79db   Al Viro   introduce copy_pa...
2269
2270
2271
2272
2273
2274
2275
2276
  		written += ret;
  		if (!iov_iter_count(iter))
  			goto out;
  		if (ret < nr) {
  			error = -EFAULT;
  			goto out;
  		}
  		continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2277
2278
2279
  
  page_not_up_to_date:
  		/* Get exclusive access to the page ... */
0abed7c69   Jens Axboe   mm: never attempt...
2280
2281
2282
2283
2284
  		if (iocb->ki_flags & IOCB_WAITQ) {
  			if (written) {
  				put_page(page);
  				goto out;
  			}
1a0a7853b   Jens Axboe   mm: support async...
2285
  			error = lock_page_async(page, iocb->ki_waitq);
0abed7c69   Jens Axboe   mm: never attempt...
2286
  		} else {
1a0a7853b   Jens Axboe   mm: support async...
2287
  			error = lock_page_killable(page);
0abed7c69   Jens Axboe   mm: never attempt...
2288
  		}
854623235   Oleg Nesterov   do_generic_file_r...
2289
2290
  		if (unlikely(error))
  			goto readpage_error;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2291

8ab22b9ab   Hisashi Hifumi   vfs: pagecache us...
2292
  page_not_up_to_date_locked:
da6052f7b   Nick Piggin   [PATCH] update so...
2293
  		/* Did it get truncated before we got the lock? */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2294
2295
  		if (!page->mapping) {
  			unlock_page(page);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2296
  			put_page(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
  			continue;
  		}
  
  		/* Did somebody else fill it already? */
  		if (PageUptodate(page)) {
  			unlock_page(page);
  			goto page_ok;
  		}
  
  readpage:
cdc8fcb49   Linus Torvalds   Merge tag 'for-5....
2307
  		if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
41da51bce   Andreas Gruenbacher   fs: Add IOCB_NOIO...
2308
2309
2310
2311
  			unlock_page(page);
  			put_page(page);
  			goto would_block;
  		}
91803b499   Jeff Moyer   do_generic_file_r...
2312
2313
2314
2315
2316
2317
  		/*
  		 * A previous I/O error may have been due to temporary
  		 * failures, eg. multipath errors.
  		 * PG_error will be set again if readpage fails.
  		 */
  		ClearPageError(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2318
2319
  		/* Start the actual read. The read will unlock the page. */
  		error = mapping->a_ops->readpage(filp, page);
994fc28c7   Zach Brown   [PATCH] add AOP_T...
2320
2321
  		if (unlikely(error)) {
  			if (error == AOP_TRUNCATED_PAGE) {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2322
  				put_page(page);
6e58e79db   Al Viro   introduce copy_pa...
2323
  				error = 0;
994fc28c7   Zach Brown   [PATCH] add AOP_T...
2324
2325
  				goto find_page;
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2326
  			goto readpage_error;
994fc28c7   Zach Brown   [PATCH] add AOP_T...
2327
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2328
2329
  
  		if (!PageUptodate(page)) {
0abed7c69   Jens Axboe   mm: never attempt...
2330
2331
2332
2333
2334
  			if (iocb->ki_flags & IOCB_WAITQ) {
  				if (written) {
  					put_page(page);
  					goto out;
  				}
c8d317aa1   Hao Xu   io_uring: fix asy...
2335
  				error = lock_page_async(page, iocb->ki_waitq);
0abed7c69   Jens Axboe   mm: never attempt...
2336
  			} else {
c8d317aa1   Hao Xu   io_uring: fix asy...
2337
  				error = lock_page_killable(page);
0abed7c69   Jens Axboe   mm: never attempt...
2338
  			}
c8d317aa1   Hao Xu   io_uring: fix asy...
2339

854623235   Oleg Nesterov   do_generic_file_r...
2340
2341
  			if (unlikely(error))
  				goto readpage_error;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2342
2343
2344
  			if (!PageUptodate(page)) {
  				if (page->mapping == NULL) {
  					/*
2ecdc82ef   Christoph Hellwig   kill unused inval...
2345
  					 * invalidate_mapping_pages got it
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2346
2347
  					 */
  					unlock_page(page);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2348
  					put_page(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2349
2350
2351
  					goto find_page;
  				}
  				unlock_page(page);
0f8e2db4e   Souptick Joarder   mm/filemap.c: rem...
2352
  				shrink_readahead_size_eio(ra);
854623235   Oleg Nesterov   do_generic_file_r...
2353
2354
  				error = -EIO;
  				goto readpage_error;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2355
2356
2357
  			}
  			unlock_page(page);
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2358
2359
2360
2361
  		goto page_ok;
  
  readpage_error:
  		/* UHHUH! A synchronous read error occurred. Report it */
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2362
  		put_page(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2363
2364
2365
2366
2367
2368
2369
  		goto out;
  
  no_cached_page:
  		/*
  		 * Ok, it wasn't cached, so we need to create a new
  		 * page..
  		 */
453f85d43   Mel Gorman   mm: remove __GFP_...
2370
  		page = page_cache_alloc(mapping);
eb2be1893   Nick Piggin   mm: buffered writ...
2371
  		if (!page) {
6e58e79db   Al Viro   introduce copy_pa...
2372
  			error = -ENOMEM;
eb2be1893   Nick Piggin   mm: buffered writ...
2373
  			goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2374
  		}
6afdb859b   Michal Hocko   mm: do not ignore...
2375
  		error = add_to_page_cache_lru(page, mapping, index,
c62d25556   Michal Hocko   mm, fs: introduce...
2376
  				mapping_gfp_constraint(mapping, GFP_KERNEL));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2377
  		if (error) {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2378
  			put_page(page);
6e58e79db   Al Viro   introduce copy_pa...
2379
2380
  			if (error == -EEXIST) {
  				error = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2381
  				goto find_page;
6e58e79db   Al Viro   introduce copy_pa...
2382
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2383
2384
  			goto out;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2385
2386
  		goto readpage;
  	}
3239d8348   Milosz Tanski   fs: support IOCB_...
2387
2388
  would_block:
  	error = -EAGAIN;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2389
  out:
7ff81078d   Fengguang Wu   readahead: remove...
2390
  	ra->prev_pos = prev_index;
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2391
  	ra->prev_pos <<= PAGE_SHIFT;
7ff81078d   Fengguang Wu   readahead: remove...
2392
  	ra->prev_pos |= prev_offset;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2393

09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2394
  	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
0c6aa2639   Krishna Kumar   mm: do_generic_fi...
2395
  	file_accessed(filp);
6e58e79db   Al Viro   introduce copy_pa...
2396
  	return written ? written : error;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2397
  }
d85dc2e11   Goldwyn Rodrigues   fs: export generi...
2398
  EXPORT_SYMBOL_GPL(generic_file_buffered_read);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2399

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2400
/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:	kernel I/O control block
 * @iter:	destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t count = iov_iter_count(iter);
	ssize_t retval = 0;

	if (!count)
		goto out; /* skip atime */

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct file *file = iocb->ki_filp;
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		loff_t size;

		size = i_size_read(inode);
		if (iocb->ki_flags & IOCB_NOWAIT) {
			/*
			 * Any cached page in the range would force us to
			 * write it back and wait; -EAGAIN instead.
			 */
			if (filemap_range_has_page(mapping, iocb->ki_pos,
						   iocb->ki_pos + count - 1))
				return -EAGAIN;
		} else {
			/*
			 * Flush dirty page cache for the range so the
			 * direct read sees up-to-date data on disk.
			 */
			retval = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
					        iocb->ki_pos + count - 1);
			if (retval < 0)
				goto out;
		}

		file_accessed(file);

		retval = mapping->a_ops->direct_IO(iocb, iter);
		if (retval >= 0) {
			iocb->ki_pos += retval;
			count -= retval;
		}
		/*
		 * ->direct_IO may have advanced the iterator past what
		 * was actually read; rewind it to match the bytes
		 * accounted for in @count.
		 */
		iov_iter_revert(iter, count - iov_iter_count(iter));

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !count || iocb->ki_pos >= size ||
		    IS_DAX(inode))
			goto out;
	}

	/*
	 * Buffered fallback: pass the bytes already read so a partial
	 * direct read still returns a correct total.
	 */
	retval = generic_file_buffered_read(iocb, iter, retval);
out:
	return retval;
}
EXPORT_SYMBOL(generic_file_read_iter);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2475

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2476
#ifdef CONFIG_MMU
/*
 * Number of page-fault cache misses after which mmap readahead is
 * considered pointless (see the mmap_miss checks in the fault-time
 * readahead helpers below).
 */
#define MMAP_LOTSAMISS  (100)
6b4c9f446   Josef Bacik   filemap: drop the...
2478
/*
 * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
 * @vmf - the vm_fault for this fault.
 * @page - the page to lock.
 * @fpin - the pointer to the file we may pin (or is already pinned).
 *
 * This works similar to lock_page_or_retry in that it can drop the mmap_lock.
 * It differs in that it actually returns the page locked if it returns 1 and 0
 * if it couldn't lock the page.  If we did have to drop the mmap_lock then fpin
 * will point to the pinned file and needs to be fput()'ed at a later point.
 */
static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
				     struct file **fpin)
{
	/* Fast path: uncontended page lock, nothing dropped. */
	if (trylock_page(page))
		return 1;

	/*
	 * NOTE! This will make us return with VM_FAULT_RETRY, but with
	 * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
	 * is supposed to work. We have way too many special cases..
	 */
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		return 0;

	/* Pin the file and drop mmap_lock (if flags allow) before sleeping. */
	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
	if (vmf->flags & FAULT_FLAG_KILLABLE) {
		if (__lock_page_killable(page)) {
			/*
			 * We didn't have the right flags to drop the mmap_lock,
			 * but all fault_handlers only check for fatal signals
			 * if we return VM_FAULT_RETRY, so we need to drop the
			 * mmap_lock here and return 0 if we don't have a fpin.
			 */
			if (*fpin == NULL)
				mmap_read_unlock(vmf->vma->vm_mm);
			return 0;
		}
	} else
		__lock_page(page);
	return 1;
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2519

ef00e08e2   Linus Torvalds   readahead: clean ...
2520
/*
 * Synchronous readahead happens when we don't even find a page in the page
 * cache at all.  We don't want to perform IO under the mmap sem, so if we have
 * to drop the mmap sem we return the file that was pinned in order for us to do
 * that.  If we didn't pin a file then we return NULL.  The file that is
 * returned needs to be fput()'ed when we're done with it.
 */
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	struct address_space *mapping = file->f_mapping;
	DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
	struct file *fpin = NULL;
	unsigned int mmap_miss;

	/* If we don't want any read-ahead, don't bother */
	if (vmf->vma->vm_flags & VM_RAND_READ)
		return fpin;
	if (!ra->ra_pages)
		return fpin;

	/* Sequential mappings: kick off a full readahead window. */
	if (vmf->vma->vm_flags & VM_SEQ_READ) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_sync_ra(&ractl, ra, ra->ra_pages);
		return fpin;
	}

	/* Avoid banging the cache line if not needed */
	mmap_miss = READ_ONCE(ra->mmap_miss);
	if (mmap_miss < MMAP_LOTSAMISS * 10)
		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.
	 */
	if (mmap_miss > MMAP_LOTSAMISS)
		return fpin;

	/*
	 * mmap read-around: a window of ra_pages centred on the
	 * faulting offset (clamped at the start of the file).
	 */
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
	ra->size = ra->ra_pages;
	ra->async_size = ra->ra_pages / 4;
	ractl._index = ra->start;
	do_page_cache_ra(&ractl, ra->size, ra->async_size);
	return fpin;
}
  
/*
 * Asynchronous readahead happens when we find the page and PG_readahead,
 * so we want to possibly extend the readahead further.  We return the file that
 * was pinned if we have to drop the mmap_lock in order to do IO.
 */
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
					    struct page *page)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	struct address_space *mapping = file->f_mapping;
	struct file *fpin = NULL;
	unsigned int mmap_miss;
	pgoff_t offset = vmf->pgoff;

	/* If we don't want any read-ahead, don't bother */
	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
		return fpin;
	/*
	 * A cache hit: decay the miss counter (racy but harmless;
	 * READ_ONCE/WRITE_ONCE only guard against torn accesses).
	 */
	mmap_miss = READ_ONCE(ra->mmap_miss);
	if (mmap_miss)
		WRITE_ONCE(ra->mmap_miss, --mmap_miss);
	/* PG_readahead marks the trigger page for extending the window. */
	if (PageReadahead(page)) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_async_readahead(mapping, ra, file,
					   page, offset, ra->ra_pages);
	}
	return fpin;
}
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2599
/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:	struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_lock must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
 * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 *
 * Return: bitwise-OR of %VM_FAULT_ codes.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
	int error;
	struct file *file = vmf->vma->vm_file;
	struct file *fpin = NULL;	/* non-NULL iff we dropped mmap_lock */
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	pgoff_t max_off;
	struct page *page;
	vm_fault_t ret = 0;

	/* Fault beyond current EOF is always SIGBUS. */
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(offset >= max_off))
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
		/*
		 * We found the page, so try async readahead before
		 * waiting for the lock.
		 */
		fpin = do_async_mmap_readahead(vmf, page);
	} else if (!page) {
		/* No page in the page cache at all */
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
		fpin = do_sync_mmap_readahead(vmf);
retry_find:
		/* Look up again (readahead may have populated it) or allocate. */
		page = pagecache_get_page(mapping, offset,
					  FGP_CREAT|FGP_FOR_MMAP,
					  vmf->gfp_mask);
		if (!page) {
			if (fpin)
				goto out_retry;
			return VM_FAULT_OOM;
		}
	}

	if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
		goto out_retry;

	/* Did it get truncated? */
	if (unlikely(compound_head(page)->mapping != mapping)) {
		unlock_page(page);
		put_page(page);
		goto retry_find;
	}
	VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);

	/*
	 * We have a locked page in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error.
	 */
	if (unlikely(!PageUptodate(page)))
		goto page_not_uptodate;

	/*
	 * We've made it this far and we had to drop our mmap_lock, now is the
	 * time to return to the upper layer and have it re-find the vma and
	 * redo the fault.
	 */
	if (fpin) {
		unlock_page(page);
		goto out_retry;
	}

	/*
	 * Found the page and have a reference on it.
	 * We must recheck i_size under page lock.
	 */
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(offset >= max_off)) {
		unlock_page(page);
		put_page(page);
		return VM_FAULT_SIGBUS;
	}

	/* Hand the locked, referenced page to the fault core. */
	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

page_not_uptodate:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	ClearPageError(page);
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
	error = mapping->a_ops->readpage(file, page);
	if (!error) {
		/* readpage unlocks the page on completion. */
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			error = -EIO;
	}
	if (fpin)
		goto out_retry;
	put_page(page);

	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;

	/* Persistent read error: shrink readahead and report SIGBUS. */
	shrink_readahead_size_eio(ra);
	return VM_FAULT_SIGBUS;

out_retry:
	/*
	 * We dropped the mmap_lock, we need to return to the fault handler to
	 * re-find the vma and come back and find our hopefully still populated
	 * page.
	 */
	if (page)
		put_page(page);
	if (fpin)
		fput(fpin);
	return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
82b0f8c39   Jan Kara   mm: join struct f...
2743
/*
 * filemap_map_pages - the ->map_pages handler installed by generic_file_vm_ops.
 *
 * Opportunistically map already-cached, uptodate pages in the range
 * [start_pgoff, end_pgoff] around a fault.  The whole walk runs under
 * rcu_read_lock(), so pages are only trylock'ed and anything that would
 * need blocking work (locked page, failed speculative ref, non-uptodate,
 * readahead or poisoned page) is simply skipped rather than waited on.
 */
void filemap_map_pages(struct vm_fault *vmf,
		pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t last_pgoff = start_pgoff;
	unsigned long max_idx;
	XA_STATE(xas, &mapping->i_pages, start_pgoff);
	struct page *head, *page;
	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);

	rcu_read_lock();
	xas_for_each(&xas, head, end_pgoff) {
		if (xas_retry(&xas, head))
			continue;
		/* Value entries (shadow/swap) carry no page to map. */
		if (xa_is_value(head))
			goto next;

		/*
		 * Check for a locked page first, as a speculative
		 * reference may adversely influence page migration.
		 */
		if (PageLocked(head))
			goto next;
		/* Failed speculative ref: the page is on its way out. */
		if (!page_cache_get_speculative(head))
			goto next;

		/* Has the page moved or been split? */
		if (unlikely(head != xas_reload(&xas)))
			goto skip;
		page = find_subpage(head, xas.xa_index);

		if (!PageUptodate(head) ||
				PageReadahead(page) ||
				PageHWPoison(page))
			goto skip;
		/* Must not sleep under RCU, so only try to lock. */
		if (!trylock_page(head))
			goto skip;
		/* Recheck under the page lock: truncation clears ->mapping. */
		if (head->mapping != mapping || !PageUptodate(head))
			goto unlock;
		/* Do not map pages beyond the current i_size. */
		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (xas.xa_index >= max_idx)
			goto unlock;
		/* Cache hit: decay the mmap miss counter. */
		if (mmap_miss > 0)
			mmap_miss--;

		vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
		if (vmf->pte)
			vmf->pte += xas.xa_index - last_pgoff;
		last_pgoff = xas.xa_index;
		/* Install the PTE; on failure just drop this page. */
		if (alloc_set_pte(vmf, page))
			goto unlock;
		unlock_page(head);
		goto next;
unlock:
		unlock_page(head);
skip:
		put_page(head);
next:
		/* Huge page is mapped? No need to proceed. */
		if (pmd_trans_huge(*vmf->pmd))
			break;
	}
	rcu_read_unlock();
	/* Publish the (possibly decayed) miss counter back to the file. */
	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
}
EXPORT_SYMBOL(filemap_map_pages);
2bcd6454b   Souptick Joarder   mm: use new retur...
2810
/*
 * Generic ->page_mkwrite handler: a read-only pagecache page is about to
 * be made writable.  On success the page is returned locked and dirty
 * (VM_FAULT_LOCKED); if the page was truncated/invalidated meanwhile the
 * caller is told to retry the fault (VM_FAULT_NOPAGE).
 */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret = VM_FAULT_LOCKED;

	/* Block superblock freezing for the duration of the fault. */
	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	lock_page(page);
	/* Page was truncated or invalidated while we slept on the lock. */
	if (page->mapping != inode->i_mapping) {
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	/*
	 * We mark the page dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty page and writeprotect it again.
	 */
	set_page_dirty(page);
	wait_for_stable_page(page);
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
}
4fcf1c620   Jan Kara   mm: Make default ...
2835

f0f37e2f7   Alexey Dobriyan   const: mark struc...
2836
/*
 * Default vm_operations for mmap()ed pagecache files: faults are served
 * from the page cache (filemap_fault), surrounding pages are mapped in
 * batches (filemap_map_pages), and write faults on read-only pages go
 * through filemap_page_mkwrite.
 */
const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};
  
  /* This is used for a general mmap of a disk file */
  
  int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
  {
  	struct address_space *mapping = file->f_mapping;
  
  	if (!mapping->a_ops->readpage)
  		return -ENOEXEC;
  	file_accessed(file);
  	vma->vm_ops = &generic_file_vm_ops;
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
  
  /*
   * This is for filesystems which do not implement ->writepage.
   */
  int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
  		return -EINVAL;
  	return generic_file_mmap(file, vma);
  }
#else	/* !CONFIG_MMU */

/* Without an MMU, write-notify faults cannot happen. */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}
/* Without an MMU there is no generic way to mmap pagecache files. */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
67f9fd91f   Sasha Levin   mm: remove read_c...
2881
2882
2883
2884
2885
  static struct page *wait_on_page_read(struct page *page)
  {
  	if (!IS_ERR(page)) {
  		wait_on_page_locked(page);
  		if (!PageUptodate(page)) {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2886
  			put_page(page);
67f9fd91f   Sasha Levin   mm: remove read_c...
2887
2888
2889
2890
2891
  			page = ERR_PTR(-EIO);
  		}
  	}
  	return page;
  }
32b635298   Mel Gorman   mm: filemap: remo...
2892
/*
 * Common implementation behind read_cache_page() and read_cache_page_gfp():
 * look up @index in @mapping and, if the page is absent or not uptodate,
 * read it in via @filler (or ->readpage when @filler is NULL).  Returns
 * the uptodate page with an elevated refcount, or an ERR_PTR() on failure.
 */
static struct page *do_read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, gfp);
		if (unlikely(err)) {
			put_page(page);
			/* -EEXIST: somebody else added it first; use theirs. */
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for xarray node */
			return ERR_PTR(err);
		}

filler:
		if (filler)
			err = filler(data, page);
		else
			err = mapping->a_ops->readpage(data, page);
		if (err < 0) {
			put_page(page);
			return ERR_PTR(err);
		}

		page = wait_on_page_read(page);
		if (IS_ERR(page))
			return page;
		goto out;
	}
	if (PageUptodate(page))
		goto out;
	/*
	 * Page is not up to date and may be locked due to one of the following
	 * case a: Page is being filled and the page lock is held
	 * case b: Read/write error clearing the page uptodate status
	 * case c: Truncation in progress (page locked)
	 * case d: Reclaim in progress
	 *
	 * Case a, the page will be up to date when the page is unlocked.
	 *    There is no need to serialise on the page lock here as the page
	 *    is pinned so the lock gives no additional protection. Even if the
	 *    page is truncated, the data is still valid if PageUptodate as
	 *    it's a race vs truncate race.
	 * Case b, the page will not be up to date
	 * Case c, the page may be truncated but in itself, the data may still
	 *    be valid after IO completes as it's a read vs truncate race. The
	 *    operation must restart if the page is not uptodate on unlock but
	 *    otherwise serialising on page lock to stabilise the mapping gives
	 *    no additional guarantees to the caller as the page lock is
	 *    released before return.
	 * Case d, similar to truncation. If reclaim holds the page lock, it
	 *    will be a race with remove_mapping that determines if the mapping
	 *    is valid on unlock but otherwise the data is valid and there is
	 *    no need to serialise with page lock.
	 *
	 * As the page lock gives no additional guarantee, we optimistically
	 * wait on the page to be unlocked and check if it's up to date and
	 * use the page if it is. Otherwise, the page lock is required to
	 * distinguish between the different cases. The motivation is that we
	 * avoid spurious serialisations and wakeups when multiple processes
	 * wait on the same page for IO to complete.
	 */
	wait_on_page_locked(page);
	if (PageUptodate(page))
		goto out;

	/* Distinguish between all the cases under the safety of the lock */
	lock_page(page);

	/* Case c or d, restart the operation */
	if (!page->mapping) {
		unlock_page(page);
		put_page(page);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}

	/*
	 * A previous I/O error may have been due to temporary
	 * failures.
	 * Clear page error before actual read, PG_error will be
	 * set again if read page fails.
	 */
	ClearPageError(page);
	/* Case b: retry the read with the page still locked. */
	goto filler;
out:
	mark_page_accessed(page);
	return page;
}
0531b2aac   Linus Torvalds   mm: add new 'read...
2995
2996
  
  /**
67f9fd91f   Sasha Levin   mm: remove read_c...
2997
   * read_cache_page - read into page cache, fill it if needed
0531b2aac   Linus Torvalds   mm: add new 'read...
2998
2999
3000
   * @mapping:	the page's address_space
   * @index:	the page index
   * @filler:	function to perform the read
5e5358e7c   Hugh Dickins   mm: cleanup descr...
3001
   * @data:	first arg to filler(data, page) function, often left as NULL
0531b2aac   Linus Torvalds   mm: add new 'read...
3002
   *
0531b2aac   Linus Torvalds   mm: add new 'read...
3003
   * Read into the page cache. If a page already exists, and PageUptodate() is
67f9fd91f   Sasha Levin   mm: remove read_c...
3004
   * not set, try to fill the page and wait for it to become unlocked.
0531b2aac   Linus Torvalds   mm: add new 'read...
3005
3006
   *
   * If the page does not get brought uptodate, return -EIO.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
3007
3008
   *
   * Return: up to date page on success, ERR_PTR() on failure.
0531b2aac   Linus Torvalds   mm: add new 'read...
3009
   */
67f9fd91f   Sasha Levin   mm: remove read_c...
3010
  struct page *read_cache_page(struct address_space *mapping,
0531b2aac   Linus Torvalds   mm: add new 'read...
3011
  				pgoff_t index,
5e5358e7c   Hugh Dickins   mm: cleanup descr...
3012
  				int (*filler)(void *, struct page *),
0531b2aac   Linus Torvalds   mm: add new 'read...
3013
3014
  				void *data)
  {
d322a8e5e   Christoph Hellwig   mm/filemap.c: fix...
3015
3016
  	return do_read_cache_page(mapping, index, filler, data,
  			mapping_gfp_mask(mapping));
0531b2aac   Linus Torvalds   mm: add new 'read...
3017
  }
67f9fd91f   Sasha Levin   mm: remove read_c...
3018
  EXPORT_SYMBOL(read_cache_page);
0531b2aac   Linus Torvalds   mm: add new 'read...
3019
3020
3021
3022
3023
3024
3025
3026
  
/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:	the page's address_space
 * @index:	the page index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 * A NULL filler means the read is performed via ->readpage.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
afddba49d   Nick Piggin   fs: introduce wri...
3040
3041
3042
3043
3044
  int pagecache_write_begin(struct file *file, struct address_space *mapping,
  				loff_t pos, unsigned len, unsigned flags,
  				struct page **pagep, void **fsdata)
  {
  	const struct address_space_operations *aops = mapping->a_ops;
4e02ed4b4   Nick Piggin   fs: remove prepar...
3045
  	return aops->write_begin(file, mapping, pos, len, flags,
afddba49d   Nick Piggin   fs: introduce wri...
3046
  							pagep, fsdata);
afddba49d   Nick Piggin   fs: introduce wri...
3047
3048
3049
3050
3051
3052
3053
3054
  }
  EXPORT_SYMBOL(pagecache_write_begin);
  
  int pagecache_write_end(struct file *file, struct address_space *mapping,
  				loff_t pos, unsigned len, unsigned copied,
  				struct page *page, void *fsdata)
  {
  	const struct address_space_operations *aops = mapping->a_ops;
afddba49d   Nick Piggin   fs: introduce wri...
3055

4e02ed4b4   Nick Piggin   fs: remove prepar...
3056
  	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
afddba49d   Nick Piggin   fs: introduce wri...
3057
3058
  }
  EXPORT_SYMBOL(pagecache_write_end);
a92853b67   Konstantin Khlebnikov   fs/direct-io.c: k...
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
  /*
   * Warn about a page cache invalidation failure during a direct I/O write.
   */
  void dio_warn_stale_pagecache(struct file *filp)
  {
  	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
  	char pathname[128];
  	struct inode *inode = file_inode(filp);
  	char *path;
  
  	errseq_set(&inode->i_mapping->wb_err, -EIO);
  	if (__ratelimit(&_rs)) {
  		path = file_path(filp, pathname, sizeof(pathname));
  		if (IS_ERR(path))
  			path = "(unknown)";
  		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!
  ");
  		pr_crit("File: %s PID: %d Comm: %.20s
  ", path, current->pid,
  			current->comm);
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3081
/*
 * Write @from to storage with direct I/O, bypassing the page cache.
 * Cached pages over the target range are flushed and invalidated before
 * the write (and again afterwards, best-effort).  Returns bytes written,
 * 0 to request fallback to buffered write (a page could not be
 * invalidated), or a negative error.
 */
ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file	*file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode	*inode = mapping->host;
	loff_t		pos = iocb->ki_pos;
	ssize_t		written;
	size_t		write_len;
	pgoff_t		end;

	write_len = iov_iter_count(from);
	end = (pos + write_len - 1) >> PAGE_SHIFT;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* If there are pages to writeback, return */
		if (filemap_range_has_page(inode->i_mapping, pos,
					   pos + write_len - 1))
			return -EAGAIN;
	} else {
		/* Flush any dirty pagecache over the range before writing. */
		written = filemap_write_and_wait_range(mapping, pos,
							pos + write_len - 1);
		if (written)
			goto out;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached page from the region we're
	 * about to write.  We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	written = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_SHIFT, end);
	/*
	 * If a page can not be invalidated, return 0 to fall back
	 * to buffered write.
	 */
	if (written) {
		if (written == -EBUSY)
			return 0;
		goto out;
	}
	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing.  Either one is a pretty crazy thing to do,
	 * so we don't support it 100%.  If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However there are some file systems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely.
	 *
	 * Noticeable example is a blkdev_direct_IO().
	 *
	 * Skip invalidation for async writes or if mapping has no pages.
	 */
	if (written > 0 && mapping->nrpages &&
	    invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
		dio_warn_stale_pagecache(file);

	if (written > 0) {
		pos += written;
		write_len -= written;
		/* Extend i_size for regular files; block devices have none. */
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	/* Rewind the iterator over any tail that was not consumed. */
	iov_iter_revert(from, write_len - iov_iter_count(from));
out:
	return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
eb2be1893   Nick Piggin   mm: buffered writ...
3161
3162
3163
3164
  /*
   * Find or create a page at the given pagecache position. Return the locked
   * page. This function is specifically for buffered writes.
   */
54566b2c1   Nick Piggin   fs: symlink write...
3165
3166
  struct page *grab_cache_page_write_begin(struct address_space *mapping,
  					pgoff_t index, unsigned flags)
eb2be1893   Nick Piggin   mm: buffered writ...
3167
  {
eb2be1893   Nick Piggin   mm: buffered writ...
3168
  	struct page *page;
bbddabe2e   Johannes Weiner   mm: filemap: only...
3169
  	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
0faa70cb0   Johannes Weiner   mm: filemap: pass...
3170

54566b2c1   Nick Piggin   fs: symlink write...
3171
  	if (flags & AOP_FLAG_NOFS)
2457aec63   Mel Gorman   mm: non-atomicall...
3172
3173
3174
  		fgp_flags |= FGP_NOFS;
  
  	page = pagecache_get_page(mapping, index, fgp_flags,
45f87de57   Michal Hocko   mm: get rid of ra...
3175
  			mapping_gfp_mask(mapping));
c585a2678   Steven Rostedt   mm: remove likely...
3176
  	if (page)
2457aec63   Mel Gorman   mm: non-atomicall...
3177
  		wait_for_stable_page(page);
eb2be1893   Nick Piggin   mm: buffered writ...
3178

eb2be1893   Nick Piggin   mm: buffered writ...
3179
3180
  	return page;
  }
54566b2c1   Nick Piggin   fs: symlink write...
3181
  EXPORT_SYMBOL(grab_cache_page_write_begin);
eb2be1893   Nick Piggin   mm: buffered writ...
3182

3b93f911d   Al Viro   export generic_pe...
3183
/*
 * Buffered-write loop: copy data from @i into the pagecache one page at a
 * time via the mapping's ->write_begin/->write_end aops, starting at @pos.
 * Returns the number of bytes written, or, if nothing was written at all,
 * the first error/status encountered.
 */
ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		flush_dcache_page(page);

		/* ->write_end returns the bytes it accepted (or an error). */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));

	return written ? written : status;
}
EXPORT_SYMBOL(generic_perform_write);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3260

e4dd9de3c   Jan Kara   vfs: Export __gen...
3261
  /**
8174202b3   Al Viro   write_iter varian...
3262
   * __generic_file_write_iter - write data to a file
e4dd9de3c   Jan Kara   vfs: Export __gen...
3263
   * @iocb:	IO state structure (file, offset, etc.)
8174202b3   Al Viro   write_iter varian...
3264
   * @from:	iov_iter with data to write
e4dd9de3c   Jan Kara   vfs: Export __gen...
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
   *
   * This function does all the work needed for actually writing data to a
   * file. It does all basic checks, removes SUID from the file, updates
   * modification times and calls proper subroutines depending on whether we
   * do direct IO or a standard buffered write.
   *
   * It expects i_mutex to be grabbed unless we work on a block device or similar
   * object which does not need locking at all.
   *
   * This function does *not* take care of syncing data in case of O_SYNC write.
   * A caller has to handle it. This is mainly due to the fact that we want to
   * avoid syncing under i_mutex.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
3277
3278
3279
3280
   *
   * Return:
   * * number of bytes written, even for truncated writes
   * * negative error code if no data has been written at all
e4dd9de3c   Jan Kara   vfs: Export __gen...
3281
   */
8174202b3   Al Viro   write_iter varian...
3282
  ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3283
3284
  {
  	struct file *file = iocb->ki_filp;
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3285
  	struct address_space * mapping = file->f_mapping;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3286
  	struct inode 	*inode = mapping->host;
3b93f911d   Al Viro   export generic_pe...
3287
  	ssize_t		written = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3288
  	ssize_t		err;
3b93f911d   Al Viro   export generic_pe...
3289
  	ssize_t		status;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3290

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3291
  	/* We can write back this queue in page reclaim */
de1414a65   Christoph Hellwig   fs: export inode_...
3292
  	current->backing_dev_info = inode_to_bdi(inode);
5fa8e0a1c   Jan Kara   fs: Rename file_r...
3293
  	err = file_remove_privs(file);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3294
3295
  	if (err)
  		goto out;
c3b2da314   Josef Bacik   fs: introduce ino...
3296
3297
3298
  	err = file_update_time(file);
  	if (err)
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3299

2ba48ce51   Al Viro   mirror O_APPEND a...
3300
  	if (iocb->ki_flags & IOCB_DIRECT) {
0b8def9d6   Al Viro   __generic_file_wr...
3301
  		loff_t pos, endbyte;
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3302

1af5bb491   Christoph Hellwig   filemap: remove t...
3303
  		written = generic_file_direct_write(iocb, from);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3304
  		/*
fbbbad4bc   Matthew Wilcox   vfs,ext2: introdu...
3305
3306
3307
3308
3309
  		 * If the write stopped short of completing, fall back to
  		 * buffered writes.  Some filesystems do this for writes to
  		 * holes, for example.  For DAX files, a buffered write will
  		 * not succeed (even if it did, DAX does not handle dirty
  		 * page-cache pages correctly).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3310
  		 */
0b8def9d6   Al Viro   __generic_file_wr...
3311
  		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
fbbbad4bc   Matthew Wilcox   vfs,ext2: introdu...
3312
  			goto out;
0b8def9d6   Al Viro   __generic_file_wr...
3313
  		status = generic_perform_write(file, from, pos = iocb->ki_pos);
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3314
  		/*
3b93f911d   Al Viro   export generic_pe...
3315
  		 * If generic_perform_write() returned a synchronous error
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3316
3317
3318
3319
3320
  		 * then we want to return the number of bytes which were
  		 * direct-written, or the error code if that was zero.  Note
  		 * that this differs from normal direct-io semantics, which
  		 * will return -EFOO even if some bytes were written.
  		 */
60bb45297   Al Viro   __generic_file_wr...
3321
  		if (unlikely(status < 0)) {
3b93f911d   Al Viro   export generic_pe...
3322
  			err = status;
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3323
3324
  			goto out;
  		}
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3325
3326
3327
3328
3329
  		/*
  		 * We need to ensure that the page cache pages are written to
  		 * disk and invalidated to preserve the expected O_DIRECT
  		 * semantics.
  		 */
3b93f911d   Al Viro   export generic_pe...
3330
  		endbyte = pos + status - 1;
0b8def9d6   Al Viro   __generic_file_wr...
3331
  		err = filemap_write_and_wait_range(mapping, pos, endbyte);
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3332
  		if (err == 0) {
0b8def9d6   Al Viro   __generic_file_wr...
3333
  			iocb->ki_pos = endbyte + 1;
3b93f911d   Al Viro   export generic_pe...
3334
  			written += status;
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3335
  			invalidate_mapping_pages(mapping,
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
3336
3337
  						 pos >> PAGE_SHIFT,
  						 endbyte >> PAGE_SHIFT);
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3338
3339
3340
3341
3342
3343
3344
  		} else {
  			/*
  			 * We don't know how much we wrote, so just return
  			 * the number of bytes which were direct-written
  			 */
  		}
  	} else {
0b8def9d6   Al Viro   __generic_file_wr...
3345
3346
3347
  		written = generic_perform_write(file, from, iocb->ki_pos);
  		if (likely(written > 0))
  			iocb->ki_pos += written;
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3348
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3349
3350
3351
3352
  out:
  	current->backing_dev_info = NULL;
  	return written ? written : err;
  }
8174202b3   Al Viro   write_iter varian...
3353
  EXPORT_SYMBOL(__generic_file_write_iter);
e4dd9de3c   Jan Kara   vfs: Export __gen...
3354

e4dd9de3c   Jan Kara   vfs: Export __gen...
3355
  /**
8174202b3   Al Viro   write_iter varian...
3356
   * generic_file_write_iter - write data to a file
e4dd9de3c   Jan Kara   vfs: Export __gen...
3357
   * @iocb:	IO state structure
8174202b3   Al Viro   write_iter varian...
3358
   * @from:	iov_iter with data to write
e4dd9de3c   Jan Kara   vfs: Export __gen...
3359
   *
8174202b3   Al Viro   write_iter varian...
3360
   * This is a wrapper around __generic_file_write_iter() to be used by most
e4dd9de3c   Jan Kara   vfs: Export __gen...
3361
3362
   * filesystems. It takes care of syncing the file in case of O_SYNC file
   * and acquires i_mutex as needed.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
3363
3364
3365
3366
   * Return:
   * * negative error code if no data has been written at all of
   *   vfs_fsync_range() failed for a synchronous write
   * * number of bytes written, even for truncated writes
e4dd9de3c   Jan Kara   vfs: Export __gen...
3367
   */
8174202b3   Al Viro   write_iter varian...
3368
  ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3369
3370
  {
  	struct file *file = iocb->ki_filp;
148f948ba   Jan Kara   vfs: Introduce ne...
3371
  	struct inode *inode = file->f_mapping->host;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3372
  	ssize_t ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3373

5955102c9   Al Viro   wrappers for ->i_...
3374
  	inode_lock(inode);
3309dd04c   Al Viro   switch generic_wr...
3375
3376
  	ret = generic_write_checks(iocb, from);
  	if (ret > 0)
5f380c7fa   Al Viro   lift generic_writ...
3377
  		ret = __generic_file_write_iter(iocb, from);
5955102c9   Al Viro   wrappers for ->i_...
3378
  	inode_unlock(inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3379

e25922176   Christoph Hellwig   fs: simplify the ...
3380
3381
  	if (ret > 0)
  		ret = generic_write_sync(iocb, ret);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3382
3383
  	return ret;
  }
8174202b3   Al Viro   write_iter varian...
3384
  EXPORT_SYMBOL(generic_file_write_iter);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3385

cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3386
3387
3388
3389
3390
3391
3392
  /**
   * try_to_release_page() - release old fs-specific metadata on a page
   *
   * @page: the page which the kernel is trying to free
   * @gfp_mask: memory allocation flags (and I/O mode)
   *
   * The address_space is to try to release any data against the page
a862f68a8   Mike Rapoport   docs/core-api/mm:...
3393
   * (presumably at page->private).
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3394
   *
266cf658e   David Howells   FS-Cache: Recruit...
3395
3396
3397
   * This may also be called if PG_fscache is set on a page, indicating that the
   * page is known to the local caching routines.
   *
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3398
   * The @gfp_mask argument specifies whether I/O may be performed to release
71baba4b9   Mel Gorman   mm, page_alloc: r...
3399
   * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3400
   *
a862f68a8   Mike Rapoport   docs/core-api/mm:...
3401
   * Return: %1 if the release was successful, otherwise return zero.
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
   */
  int try_to_release_page(struct page *page, gfp_t gfp_mask)
  {
  	struct address_space * const mapping = page->mapping;
  
  	BUG_ON(!PageLocked(page));
  	if (PageWriteback(page))
  		return 0;
  
  	if (mapping && mapping->a_ops->releasepage)
  		return mapping->a_ops->releasepage(page, gfp_mask);
  	return try_to_free_buffers(page);
  }
  
  EXPORT_SYMBOL(try_to_release_page);